In [1]:
import sys, os, numpy as np, pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional, GlobalMaxPool1D, SimpleRNN, GRU
from keras.layers import SpatialDropout1D, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate
from keras.models import Model

import gc
from sklearn.preprocessing import StandardScaler
from keras import optimizers

from joblib import dump, load

import keras
keras.config.disable_traceback_filtering()

2024-05-28 03:06:38.499693: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-28 03:06:38.535745: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# quick and dirty way to change the current working directory to root (/toxic-comment-classification)
# you should run this at least once just to be certain
from os import chdir, path, getcwd
if getcwd().endswith("src"):
    chdir(path.pardir)
if path.isfile("checkcwd"):
    print("Success")
else:
    raise Exception("Something went wrong. cwd=" + getcwd())
root_path = os.getcwd()

Success


In [3]:
from src import constants
constants.MAX_FEATURES

5000

In [4]:
path = 'kaggle/input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
clean_data_path = 'clean_data/'
EMBEDDING_GLOVE = f'{path}glove_embeddings/glove.6B.300d.txt'
EMBEDDING_FT = f'{path}fasttext_embeddings/wiki-news-300d-1M.vec'
TRAIN_DATA_FILE = f'{path}{comp}train.csv.zip'
TEST_DATA_FILE = f'{path}{comp}test.csv.zip'
CLEAN_TRAIN_DATA_FILE = f'{clean_data_path}data_train_cleaned_light_allcase.txt'
CLEAN_TEST_DATA_FILE = f'{clean_data_path}data_test_cleaned_light_allcase.txt'
SAMPLE_SUBMISSION = f'{path}{comp}sample_submission.csv.zip'

In [5]:
save_path = 'src/hybrid-rnn/'

Embedding parameter

In [6]:
max_features = constants.MAX_FEATURES # some big number, bigger than number of unique words(?)
maxlen = constants.MAXLEN # max number of words in a comment to use

Read data

In [7]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

def read_from_file(filename):
    with open(filename, 'r') as f:
        return f.read().splitlines()
    
list_sentences_train = read_from_file(CLEAN_TRAIN_DATA_FILE)
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = read_from_file(CLEAN_TEST_DATA_FILE)

train = train.assign(comment_text=list_sentences_train)
test = test.assign(comment_text=list_sentences_test)

In [8]:
import re


def add_features(df):
    # work with original text (before preprocessing and cleaning)
    df['comment_text'] = df['comment_text'].apply(lambda x:str(x))
    df['total_length'] = df['comment_text'].apply(len)
    df['capitals'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    df['num_words'] = df['comment_text'].apply(lambda comment: len(re.findall(r'\S+', comment)))
    df['num_unique_words'] = df['comment_text'].apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']  

    return df

train = add_features(train)
test = add_features(test)

# extract features
features = train[['caps_vs_length', 'words_vs_unique']].fillna(0)
test_features = test[['caps_vs_length', 'words_vs_unique']].fillna(0)

# normalize features
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))

dump(ss, save_path + 'scaler.bin', compress=True)

features = ss.transform(features)
test_features = ss.transform(test_features)



Standard keras preprocessing, to turn each comment into a list of word indexes of equal length (with truncation or padding as needed).

In [9]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train) + list(list_sentences_test))

dump(tokenizer, save_path + 'tokenizer.bin', compress=True)

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [10]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
embeddings_index_gl = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_GLOVE))
# embeddings_index_ft = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FT))

vector: 300 glove + 300 fasttext + 1 allcap

In [11]:
# word_index = tokenizer.word_index
# nb_words = min(max_features, len(word_index))
# embedding_matrix = np.zeros((nb_words, 601))


# # something: filler word for empty comment

# # word2vec of 'something'
# something_gl = embeddings_index_gl.get("something")
# something_ft = embeddings_index_ft.get("something")

# something = np.zeros((601,))
# something[:300, ] = something_gl
# something[300:600, ] = something_ft
# something[600, ] = 0

In [12]:
# def all_caps(word: str) -> bool:
#     return len(word) > 1 and word.isupper()

# def embed_word(embedding_matrix, i, word):
#     embedding_vector_ft = embeddings_index_ft.get(word)
#     if embedding_vector_ft is not None:
#         # embed word if is exists in fasttext dict
#         if all_caps(word):
#             last_value = np.array([1])
#         else:
#             last_value = np.array([0])
#         embedding_vector_gl = embeddings_index_gl.get(word)
#         if embedding_vector_gl is not None:
#             embedding_matrix[i, :300] = embedding_vector_gl
#         embedding_matrix[i, 300:600] = embedding_vector_ft
#         embedding_matrix[i, 600] = last_value
#     else:
#         # embed word with filler word
#         embedding_matrix[i] = something


# for word, i in word_index.items():
#     if i >= max_features:
#         continue
#     embed_word(embedding_matrix, i, word)

Ver 2: 300 glove + 1 allcap

In [13]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, 301))


# something: filler word for empty comment

# word2vec of 'something'
something_gl = embeddings_index_gl.get("something")
# something_ft = embeddings_index_ft.get("something")

something = np.zeros((301,))
something[:300, ] = something_gl
# something[300:600, ] = something_ft
something[300, ] = 0

In [14]:
def all_caps(word: str) -> bool:
    return len(word) > 1 and word.isupper()

def embed_word(embedding_matrix, i, word):
    if all_caps(word):
        last_value = np.array([1])
    else:
        last_value = np.array([0])
    embedding_vector_gl = embeddings_index_gl.get(word)
    if embedding_vector_gl is not None:
        embedding_matrix[i, :300] = embedding_vector_gl
        embedding_matrix[i, 300] = last_value
    else:
        # embed word with filler word
        embedding_matrix[i] = something


for word, i in word_index.items():
    if i >= max_features:
        continue
    embed_word(embedding_matrix, i, word)

  embedding_matrix[i, 300] = last_value


In [15]:
# embeddings_index_ft = None
embeddings_index_gl = None
gc.collect()

0

In [16]:
from keras.metrics import AUC

def get_model(features, clipvalue=1., num_filters=40, dropout=0.5, embed_size=301):

    inp = Input(shape=(maxlen,))
    print(inp.shape)
    
    # Layer 1: concatenated fasttext and glove twitter embeddings
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    print(x.shape)
    
    # Layer 2: SpatialDropout1D(0.5)
    x = SpatialDropout1D(dropout)(x)
    
    # Layer 3: Bidirectional CuDNNLSTM
    x = Bidirectional(LSTM(num_filters, return_sequences=True))(x)


    # Layer 4: Bidirectional CuDNNGRU
    x = Bidirectional(GRU(num_filters, return_sequences=True))(x)
    
    # Layer 5: A concatenation of maximum pool, average pool and 
    # two features: "Unique words rate" and "Rate of all-caps words"
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    features_input = Input(shape=(features.shape[1],))
    
    x = concatenate([avg_pool, max_pool, features_input])
    
    # Layer 6: output dense layer.
    outp = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=[inp, features_input], outputs=outp)

    adam = optimizers.Adam(clipvalue=clipvalue)

    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=[AUC(name='auc')])
    return model

In [17]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(save_path + 'hybrid.keras', monitor='val_auc', mode='max', save_best_only=True, verbose=1)

In [18]:
model = get_model(features)

batch_size = constants.BATCH_SIZE
epochs = constants.EPOCHS

gc.collect()
model.fit([X_t, features], y, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_split=0.1)

(None, 100)
(None, 100, 301)


2024-05-28 03:07:10.989996: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-28 03:07:11.022484: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-28 03:07:11.022522: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-28 03:07:11.026303: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-28 03:07:11.026465: I external/local_xla/xla/stream_executor

Epoch 1/10


2024-05-28 03:07:14.171643: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m1495/1496[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 36ms/step - auc: 0.9059 - loss: 0.1050
Epoch 1: val_auc improved from -inf to 0.97994, saving model to src/hybrid-rnn/hybrid.keras
[1m1496/1496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 37ms/step - auc: 0.9060 - loss: 0.1050 - val_auc: 0.9799 - val_loss: 0.0522
Epoch 2/10
[1m1496/1496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - auc: 0.9771 - loss: 0.0521
Epoch 2: val_auc did not improve from 0.97994
[1m1496/1496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 36ms/step - auc: 0.9771 - loss: 0.0521 - val_auc: 0.9784 - val_loss: 0.0502
Epoch 3/10
[1m1495/1496[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 35ms/step - auc: 0.9803 - loss: 0.0487
Epoch 3: val_auc improved from 0.97994 to 0.98168, saving model to src/hybrid-rnn/hybrid.keras
[1m1496/1496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 36ms/step - auc: 0.9803 - loss: 0.0487 - val_auc: 0.9817 - val_lo

KeyboardInterrupt: 

In [19]:
model.load_weights(save_path + 'hybrid.keras')
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
predict = model.predict([X_te, test_features], batch_size=1024, verbose=1)
sample_submission[list_classes] = predict
sample_submission.to_csv(root_path + '/kaggle/working/' + '1fold-hybrid-rnn.csv', index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 39ms/step


# RERUN THIS