In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Input, Dense, Embedding, SpatialDropout1D, concatenate, GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

In [5]:
# Load the labelled training comments, the unlabelled test comments and the
# submission template, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

In [6]:
# preview the first five training rows (five is the default for head())
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
# (rows, columns) of the training frame
train.shape

(159571, 8)

In [8]:
# Per-column count of missing values in each data set.
for heading, frame in (
    ('Check for null values in train set:\n', train),
    ('Check for null values in test set:\n', test),
):
    print(heading, frame.isnull().sum())

Check for null values in train set:
 id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64
Check for null values in test set:
 id              0
comment_text    0
dtype: int64


In [9]:
# Sum of each label column, printed one per line. The heading is left-justified
# to 31 characters so the totals line up in a single column.
for label_name in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'):
    print('Total {} comments:'.format(label_name).ljust(31), np.sum(train[label_name]))

Total toxic comments:           15294
Total severe_toxic comments:    1595
Total obscene comments:         8449
Total threat comments:          478
Total insult comments:          7877
Total identity_hate comments:   1405


In [10]:
## Word counts of the longest and shortest comments across train + test.
## NOTE: "longest"/"shortest" are chosen by CHARACTER length (key = len on the
## raw string); the word count of that comment is what gets reported.
## max_len is used later to set the input shape.

all_comments = pd.concat([train['comment_text'], test['comment_text']])
longest_comment = max(all_comments, key = len)
shortest_comment = min(all_comments, key = len)

max_len = len(longest_comment.split())
min_len = len(shortest_comment.split())

print('Shortest comment: {} words'.format(min_len))
print('Longest comment:  {} words'.format(max_len))

Shortest comment: 1 words
Longest comment:  773 words


In [11]:
# ROC-AUC accuracy measure during model training - provided by Kaggle

class RocAucEvaluation(Callback):
    '''Callback that prints the ROC-AUC on a held-out set during training.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair; unpacked into ``self.X_val`` / ``self.y_val``.
        The empty default cannot be unpacked, so a pair must be supplied.
    interval : int
        The score is computed on epochs where ``epoch % interval == 0``
        (``epoch`` is the zero-based index passed by the training loop).
    '''

    def __init__(self, validation_data=(), interval=1):
        # Zero-argument super() runs Callback.__init__. The previous
        # super(Callback, self).__init__() started the lookup *after* Callback
        # in the MRO, so the base class was never initialised.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        '''Predict on the validation inputs and print the ROC-AUC score.'''
        # `logs` defaults to None instead of a shared mutable dict; it is not
        # read here.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch + 1 so the printed number is one-based
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [12]:
# define functions to create lists with all unique characters and words 

def unique_char_count(text):
    '''Return a list of every distinct character found in ``text``.

    Parameters
    ----------
    text : iterable of str
        The comments to scan.

    Returns
    -------
    list of str
        One entry per distinct character, in order of first appearance.

    A progress line is printed after every 50000 comments.
    '''

    seen = set()            # constant-time membership test
    character_list = []     # preserves first-appearance order for the result

    for index, comment in enumerate(text, start=1):
        if index % 50000 == 0:
            print('{} rows parsed...'.format(index))
        # dict.fromkeys de-duplicates the comment's characters while keeping
        # their order, so the result does not depend on set iteration order.
        for char in dict.fromkeys(comment):
            if char not in seen:
                seen.add(char)
                character_list.append(char)

    return character_list

def unique_word_count(text):
    '''Return the set of distinct whitespace-separated words in ``text``.

    ``text`` is an iterable of strings; each one is split with ``str.split()``
    and its tokens are merged into a single set. A progress line is printed
    after every 50000 comments.
    '''

    vocabulary = set()

    for row_number, comment in enumerate(text, start=1):
        tokens = comment.split()
        if row_number % 50000 == 0:
            print('{} rows parsed...'.format(row_number))
        vocabulary.update(tokens)

    return vocabulary

In [13]:
# Count the distinct characters in the train and test comments, and how many
# test-set characters never occur in the train set.

train_char = unique_char_count(train['comment_text'])
test_char = unique_char_count(test['comment_text'])

differences = [char for char in test_char if char not in train_char]

print('There are {} unique characters in the train set.'.format(len(train_char)))
print('There are {} unique characters in the test set.'.format(len(test_char)))
print('{} characters in test set that are not in train set.'.format(len(differences)))

50000 rows parsed...
100000 rows parsed...
150000 rows parsed...
50000 rows parsed...
100000 rows parsed...
150000 rows parsed...
There are 2335 unique characters in the train set.
There are 5112 unique characters in the test set.
3207 characters in test set that are not in train set.


In [14]:
# Count the distinct words in the train and test comments, and how many
# test-set words never occur in the train set.

train_words = unique_word_count(train['comment_text'])
test_words = unique_word_count(test['comment_text'])

# both vocabularies are sets, so the unseen words are a plain set difference
differences = list(test_words - train_words)

print('There are {} unique words in the train set.'.format(len(train_words)))
print('There are {} unique words in the test set.'.format(len(test_words)))
print('{} words in test set that are not in train set.'.format(len(differences)))

50000 rows parsed...
100000 rows parsed...
150000 rows parsed...
50000 rows parsed...
100000 rows parsed...
150000 rows parsed...
There are 532299 unique words in the train set.
There are 611496 unique words in the test set.
429930 words in test set that are not in train set.


In [15]:
def read_embed_vecs(file):
    '''Read a pre-trained embedding text file.

    Each data line is expected to be ``<word> <v1> <v2> ...`` separated by
    whitespace. The file is opened as UTF-8 (FastText files are UTF-8 encoded).

    A first line consisting of exactly two integers is treated as the
    ``<vocabulary size> <dimension>`` header that FastText ``.vec`` files
    start with, and is skipped; previously it was stored as a "word" with a
    one-element vector. Blank lines are skipped instead of raising IndexError.

    Parameters
    ----------
    file : str
        Path to the embedding file.

    Returns
    -------
    words_to_index : dict
        word -> index, indices starting at 1 and assigned in sorted word order
    index_to_words : dict
        index -> word (inverse of ``words_to_index``)
    word_to_vec_map : dict
        word -> float64 numpy array holding that word's vector
    '''

    word_to_vec_map = {}

    with open(file, 'r', encoding="utf8") as f:
        for line_no, line in enumerate(f):
            fields = line.strip().split()
            if not fields:
                continue
            is_header = (
                line_no == 0
                and len(fields) == 2
                and all(field.isdigit() for field in fields)
            )
            if is_header:
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}

    # index 0 is left unused; numbering starts at 1
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w

    return words_to_index, index_to_words, word_to_vec_map

In [16]:
# Pre-trained word vectors from:
# @inproceedings{mikolov2018advances,
#   title={Advances in Pre-Training Distributed Word Representations},
#   author={Mikolov, Tomas and Grave, Edouard and Bojanowski, Piotr and Puhrsch, Christian and Joulin, Armand},
#   booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
#   year={2018}
# }

# NOTE(review): absolute, machine-specific path; adjust to the local copy of the vectors.
embedding_path = 'D:\\FastText\\crawl-300d-2M.vec'
words_to_index, index_to_words, word_to_vec_map = read_embed_vecs(embedding_path)

In [17]:
# select the model inputs (raw comment text) and the six label columns
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

X_train, X_test = train['comment_text'], test['comment_text']
Y_train = train[target_columns]

In [18]:
# Vocabulary budget: 5% of (unique train words + unique test words).
# NOTE(review): the two counts are simply added, so words present in both
# sets are counted twice.
max_words = int(.05 * (len(train_words) + len(test_words)))

In [19]:
# Tokenize using the max_words vocabulary budget computed above (5% of the
# summed unique-word counts, not 10%).
#
# The tokenizer is fitted and applied directly on the raw comment columns
# instead of on X_train / X_test, which this cell overwrites. Re-running the
# cell therefore always starts from text rather than from already-converted
# integer sequences.
raw_train_comments = list(train['comment_text'])
raw_test_comments = list(test['comment_text'])

tokenizer = text.Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(raw_train_comments + raw_test_comments)
X_train = tokenizer.texts_to_sequences(raw_train_comments)
X_test = tokenizer.texts_to_sequences(raw_test_comments)

In [20]:
# length of the longest token sequence in each set
print('Longest comment in train set is: {} tokens'.format(max(map(len, X_train))))
print('Longest comment in test set is:  {} tokens'.format(max(map(len, X_test))))

Longest comment in train set is: 1401 tokens
Longest comment in test set is:  2142 tokens


In [21]:
# Pad sequences to a uniform length: one third of the word count of the
# longest (by characters) raw comment.
#
# max_len is recomputed from the raw text here rather than dividing the
# existing max_len in place; the in-place division shrank max_len by another
# factor of three every time the cell was re-run.

longest_comment = max(pd.concat([train['comment_text'], test['comment_text']]), key = len)
max_len = len(longest_comment.split()) // 3
X_train = sequence.pad_sequences(X_train, maxlen = max_len)
X_test = sequence.pad_sequences(X_test, maxlen = max_len)

In [22]:
# Build the embedding matrix: one row per token index below max_words, filled
# with the pre-trained vector where one exists and left at zero otherwise.
# The matrix has max_words rows (not max_words + 1) because indices at or
# above max_words are skipped below.

embed_size = 300
embed_matrix = np.zeros((max_words, embed_size))

for word, idx in tokenizer.word_index.items():
    if idx >= max_words:
        continue
    vector = word_to_vec_map.get(word)
    # Only accept vectors of exactly embed_size values. A shorter entry (for
    # example a file header parsed as a one-element vector) would otherwise be
    # broadcast across the whole row without any error.
    if vector is not None and vector.shape == (embed_size,):
        embed_matrix[idx, :] = vector

In [23]:
def gru_model():
    '''Build and compile the bidirectional-GRU comment classifier.

    Layers, in order:
    - embedding initialised from the module-level ``embed_matrix``
    - 1D spatial dropout (rate 0.2)
    - bidirectional GRU, 80 units per direction, returning full sequences
    - global average pooling and global max pooling, concatenated
    - dense layer with 6 sigmoid outputs

    Reads ``max_len``, ``max_words``, ``embed_size`` and ``embed_matrix`` from
    the enclosing scope. Returns the model compiled with binary cross-entropy
    loss, the Adam optimizer and the accuracy metric.
    '''
    tokens_in = Input(shape = (max_len, ))

    embedded = Embedding(
        input_dim = max_words,
        output_dim = embed_size,
        weights = [embed_matrix],
    )(tokens_in)
    regularised = SpatialDropout1D(.2)(embedded)
    encoded = Bidirectional(GRU(80, return_sequences = True))(regularised)

    # summarise the sequence two ways and join the summaries
    pooled = concatenate([
        GlobalAveragePooling1D()(encoded),
        GlobalMaxPooling1D()(encoded),
    ])
    probabilities = Dense(6, activation = 'sigmoid')(pooled)

    model = Model(inputs = tokens_in, outputs = probabilities)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    return model

In [24]:
# build and compile the network defined by gru_model()
model = gru_model()

In [None]:
# training hyper-parameters
batch_size, epochs = 32, 2

# keep 95% of the labelled data for training; the rest is the validation set
# that the ROC-AUC callback scores
x_tra, x_val, y_tra, y_val = train_test_split(
    X_train,
    Y_train,
    train_size = .95,
    random_state = 1,
)
RocAuc = RocAucEvaluation(validation_data = (x_val, y_val), interval = 1)



In [None]:
# train, validating on the held-out split and printing ROC-AUC via the callback
hist = model.fit(
    x_tra,
    y_tra,
    batch_size = batch_size,
    epochs = epochs,
    validation_data = (x_val, y_val),
    callbacks = [RocAuc],
    verbose = 2,
)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2


In [None]:
# predict the six label scores for the test comments and write the submission file
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

y_pred = model.predict(X_test, batch_size = 1024)
submission[label_columns] = y_pred
submission.to_csv('submission_kf_gru_v2.csv', index=False)