In [57]:
import sys, os, numpy as np, pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional, GlobalMaxPool1D, SimpleRNN, GRU
from keras.layers import SpatialDropout1D, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate
from keras.models import Model

import gc
from sklearn.preprocessing import StandardScaler
from keras import optimizers

import keras
keras.config.disable_traceback_filtering()

In [2]:
# quick and dirty way to change the current working directory to root (/toxic-comment-classification)
# you should run this at least once just to be certain
from os import chdir, path, getcwd
if getcwd().endswith("src"):
    chdir(path.pardir)
if path.isfile("checkcwd"):
    print("Success")
else:
    raise Exception("Something went wrong. cwd=" + getcwd())
root_path = os.getcwd()

Success


In [3]:
path = 'kaggle/input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
clean_data_path = 'clean_data/'
EMBEDDING_GLOVE = f'{path}glove_embeddings/glove.6B.300d.txt'
EMBEDDING_FT = f'{path}fasttext_embeddings/wiki-news-300d-1M.vec'
TRAIN_DATA_FILE = f'{path}{comp}train.csv.zip'
TEST_DATA_FILE = f'{path}{comp}test.csv.zip'
CLEAN_TRAIN_DATA_FILE = f'{clean_data_path}data_train_cleaned_light_allcase.txt'
CLEAN_TEST_DATA_FILE = f'{clean_data_path}data_test_cleaned_light_allcase.txt'
SAMPLE_SUBMISSION = f'{path}{comp}sample_submission.csv.zip'

Embedding parameter

In [4]:
max_features = 100000 # some big number, bigger than number of unique words
maxlen = 900 # max number of words in a comment to use

Read data

In [5]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

def read_from_file(filename):
    with open(filename, 'r') as f:
        return f.read().splitlines()
    
list_sentences_train = read_from_file(CLEAN_TRAIN_DATA_FILE)
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = read_from_file(CLEAN_TEST_DATA_FILE)

In [6]:
def add_features(df):
    # work with original text (before preprocessing and cleaning)
    df['comment_text'] = df['comment_text'].apply(lambda x:str(x))
    df['total_length'] = df['comment_text'].apply(len)
    df['capitals'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    df['num_words'] = df.comment_text.str.count('\S+')
    df['num_unique_words'] = df['comment_text'].apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']  

    return df

train = add_features(train)
test = add_features(test)

# extract features
features = train[['caps_vs_length', 'words_vs_unique']].fillna(0)
test_features = test[['caps_vs_length', 'words_vs_unique']].fillna(0)

# normalize features
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
features = ss.transform(features)
test_features = ss.transform(test_features)

  df['num_words'] = df.comment_text.str.count('\S+')


Standard keras preprocessing, to turn each comment into a list of word indexes of equal length (with truncation or padding as needed).

In [7]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train) + list(list_sentences_test))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [8]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
embeddings_index_gl = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_GLOVE))
embeddings_index_ft = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FT))

vector: 300 glove + 300 fasttext + 1 allcap

In [9]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, 601))


# something: filler word for empty comment

# word2vec of 'something'
something_gl = embeddings_index_gl.get("something")
something_ft = embeddings_index_ft.get("something")

something = np.zeros((601,))
something[:300, ] = something_gl
something[300:600, ] = something_ft
something[600, ] = 0

In [10]:
def all_caps(word: str) -> bool:
    return len(word) > 1 and word.isupper()

def embed_word(embedding_matrix, i, word):
    embedding_vector_ft = embeddings_index_ft.get(word)
    if embedding_vector_ft is not None:
        # embed word if is exists in fasttext dict
        if all_caps(word):
            last_value = np.array([1])
        else:
            last_value = np.array([0])
        embedding_vector_gl = embeddings_index_gl.get(word)
        if embedding_vector_gl is not None:
            embedding_matrix[i, :300] = embedding_vector_gl
        embedding_matrix[i, 300:600] = embedding_vector_ft
        embedding_matrix[i, 600] = last_value
    else:
        # embed word with filler word
        embedding_matrix[i] = something


for word, i in word_index.items():
    if i >= max_features:
        continue
    embed_word(embedding_matrix, i, word)

  embedding_matrix[i, 600] = last_value


In [11]:
embeddings_index_ft = None
embeddings_index_gl = None
gc.collect()

0

In [64]:
def get_model(features, clipvalue=1., num_filters=40, dropout=0.5, embed_size=601):

    inp = Input(shape=(maxlen,))
    print(inp.shape)
    
    # Layer 1: concatenated fasttext and glove twitter embeddings
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    print(x.shape)
    
    # Layer 2: SpatialDropout1D(0.5)
    x = SpatialDropout1D(dropout)(x)
    
    # Layer 3: Bidirectional CuDNNLSTM
    x = Bidirectional(LSTM(num_filters, return_sequences=True))(x)


    # Layer 4: Bidirectional CuDNNGRU
    x, x_copy, ignore = Bidirectional(GRU(num_filters, return_sequences=True, return_state = True))(x)  
    
    # Layer 5: A concatenation of the last state, maximum pool, average pool and 
    # two features: "Unique words rate" and "Rate of all-caps words"
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    features_input = Input(shape=(features.shape[1],))
    
    x = concatenate([avg_pool, x_copy, max_pool, features_input])
    
    # Layer 6: output dense layer.
    outp = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=[inp,features_input], outputs=outp)
    adam = optimizers.Adam(clipvalue=clipvalue)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    return model

In [65]:
from keras.callbacks import ModelCheckpoint, Callback
from sklearn.metrics import roc_auc_score

checkpoint_path = 'model_checkpoint/'

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.max_score = 0
        self.not_better_count = 0

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=1)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
            if (score > self.max_score):
                print("*** New High Score (previous: %.6f) \n" % self.max_score)
                self.model.save_weights(checkpoint_path + "best_weights.h5")
                self.max_score=score
                self.not_better_count = 0
            else:
                self.not_better_count += 1
                if self.not_better_count > 5:
                    print("Epoch %05d: early stopping, high score = %.6f" % (epoch,self.max_score))
                    self.model.stop_training = True

In [66]:
from keras import backend as K
from sklearn.model_selection import KFold

model = get_model(features)

batch_size = 32
epochs = 100
num_folds = 10

gc.collect()
K.clear_session()

# Uncomment for out-of-fold predictions
scores = []
oof_predict = np.zeros((train.shape[0],6))

predict = np.zeros((test.shape[0],6))
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X_t):
    
    kfold_y_train, kfold_y_test = y[train_index], y[test_index]
    kfold_X_train = X_t[train_index]
    kfold_X_features = features[train_index]
    kfold_X_valid = X_t[test_index]
    kfold_X_valid_features = features[test_index] 
    
    gc.collect()
    K.clear_session()
    
    model = get_model(features)
    
    
    ra_val = RocAucEvaluation(validation_data=([kfold_X_valid,kfold_X_valid_features], kfold_y_test), interval = 1)
    
    model.fit([kfold_X_train,kfold_X_features], kfold_y_train, batch_size=batch_size, epochs=epochs, verbose=1,
             callbacks = [ra_val])
    
    gc.collect()
    
    #model.load_weights(bst_model_path)
    model.load_weights(checkpoint_path + "best_weights.h5")
    
    predict += model.predict([X_te, test_features], batch_size=batch_size, verbose=1) / num_folds
    
    gc.collect()
    # uncomment for out of fold predictions
    oof_predict[test_index] = model.predict([kfold_X_valid, kfold_X_valid_features],batch_size=batch_size, verbose=1)
    cv_score = roc_auc_score(kfold_y_test, oof_predict[test_index])
    
    scores.append(cv_score)
    print('score: ', cv_score)


print("Done")
#
print('Total CV score is {}'.format(np.mean(scores)))

sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
sample_submission[list_classes] = predict
sample_submission.to_csv(root_path + '/kaggle/working/' + 'rnn_glove_submission_thanh309.csv', index=False)

# uncomment for out of fold predictions
oof = pd.DataFrame.from_dict({'id': train['id']})
for c in list_classes:
   oof[c] = np.zeros(len(train))
   
oof[list_classes] = oof_predict
for c in list_classes:
   oof['prediction_' +c] = oof[c]
oof.to_csv(root_path + '/kaggle/working/' + 'oof.csv', index=False)

(None, 900)
(None, 900, 601)
(None, 900)
(None, 900, 601)
Epoch 1/100


TypeError: Exception encountered when calling GRU.call().

[1mlen is not well defined for a symbolic Tensor (functional_1_1/bidirectional_1_2/forward_gru_1/Squeeze:0). Please call `x.shape` rather than `len(x)` for shape information.[0m

Arguments received by GRU.call():
  • sequences=tf.Tensor(shape=(None, 900, 80), dtype=float32)
  • initial_state=None
  • mask=None
  • training=True