In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from google.colab import auth
from google.colab import files
from google.colab import drive
from googleapiclient.discovery import build
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import pickle

Using TensorFlow backend.


In [2]:
# Ask the user to upload a file through google.colab's `files` helper; the result is kept in `uploaded`.
uploaded = files.upload()

Saving kaggle(1).json to kaggle(1).json


In [0]:
!mkdir /root/.kaggle/
!cp kaggle.json /root/.kaggle/
!rm kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [4]:
# Download the GloVe 6B 50-dimensional embeddings archive via the kaggle CLI.
!kaggle datasets download -d yliu9999/glove6b50d

Downloading glove6b50d.zip to /content
 72% 49.0M/67.7M [00:02<00:02, 8.23MB/s]
100% 67.7M/67.7M [00:02<00:00, 25.1MB/s]


In [12]:
# Download the Jigsaw toxic-comment competition files (train/test/labels/sample submission archives).
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading train.csv.zip to /content
 95% 25.0M/26.3M [00:00<00:00, 20.5MB/s]
100% 26.3M/26.3M [00:00<00:00, 38.6MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/1.39M [00:00<?, ?B/s]
100% 1.39M/1.39M [00:00<00:00, 91.0MB/s]
Downloading test_labels.csv.zip to /content
  0% 0.00/1.46M [00:00<?, ?B/s]
100% 1.46M/1.46M [00:00<00:00, 212MB/s]
Downloading test.csv.zip to /content
 38% 9.00M/23.4M [00:00<00:01, 11.8MB/s]
100% 23.4M/23.4M [00:00<00:00, 32.1MB/s]


In [5]:
!unzip glove6b50d.zip

Archive:  glove6b50d.zip
  inflating: glove.6B.50d.txt        


In [13]:
!unzip test.csv.zip test.csv
!unzip test_labels.csv.zip test_labels.csv
!unzip train.csv.zip train.csv

Archive:  test.csv.zip
  inflating: test.csv                
Archive:  test_labels.csv.zip
  inflating: test_labels.csv         
Archive:  train.csv.zip
  inflating: train.csv               


In [14]:
!unzip sample_submission.csv.zip sample_submission.csv

Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   


In [0]:
# Locations of the embedding file and the competition CSVs, all relative to `path`.
path = './'
EMBEDDING_FILE = os.path.join(path, 'glove.6B.50d.txt')
TRAIN_DATA_FILE = os.path.join(path, 'train.csv')
TEST_DATA_FILE = os.path.join(path, 'test.csv')

In [0]:
embed_size = 50 # length of each word vector (the file loaded is glove.6B.50d)
max_features = 20000 # number of unique words to use (i.e. number of rows in the embedding matrix)
maxlen = 100 # number of word indexes kept per comment after padding/truncation

In [0]:
# Load the training and test CSVs into DataFrames.
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

# Cleaning

In [0]:
# Number of missing values per column in the training set.
train.isnull().sum()

In [0]:
# Number of missing values per column in the test set.
test.isnull().sum()

In [0]:
# Number of non-null entries per column in the training set.
train.count() 

In [0]:
# Preview the first 10 training rows.
train.head(10)

In [0]:
# Preview the first 10 test rows.
test.head(10)

Dit deel gaat de meest bekende contractions veranderen naar hun voluit geschreven vorm.

In [0]:
# Comment texts of both sets as arrays, with missing comments replaced by "_na_"
list_sentences_train, list_sentences_test = (
    frame["comment_text"].fillna("_na_").values for frame in (train, test)
)

In [19]:
# Names of the six label columns
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Label matrix: one row per training comment, one column per class
y = train.loc[:, list_classes].values
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

Standard keras preprocessing, to turn each comment into a list of word indexes of equal length (with truncation or padding as needed).

In [0]:
# Fit the Keras tokenizer on the training comments, with num_words capped at max_features.
# NOTE(review): this runs on the raw comment text; the contraction expansion in the
# "Contradictions" section is executed further down, so it does not influence this
# tokenizer — confirm that is intended.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

In [0]:
# Interactive IPython source lookup for the tokenizer object; not needed by the pipeline itself.
??tokenizer

In [0]:
# Turn each comment into a list of word indexes using the fitted tokenizer.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

In [0]:


# Pad or truncate every index list so all inputs have length maxlen.
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [0]:
??pad_sequences()

In [0]:
# Display the padded training matrix.
X_t

In [0]:
# Show only the first 20 (word, index) pairs; displaying the whole vocabulary floods the notebook output.
list(tokenizer.word_index.items())[:20]

## Contractions

In [0]:
# Lookup table: English contraction -> expanded form.
# The keys are joined into one regular expression by _get_contractions.
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}

In [0]:
def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re

In [0]:
# Module-level lookup table and compiled pattern used by replace_contractions.
contractions, contractions_re = _get_contractions(contraction_dict)

In [0]:
def replace_contractions(text):
    """Return `text` with every match of `contractions_re` swapped for its entry in `contractions`."""
    return contractions_re.sub(lambda hit: contractions[hit.group(0)], text)

In [0]:
# Expand contractions in every training comment.
# Missing comments are replaced by "_na_" first (the same placeholder used for the
# tokenizer input), because the regex substitution only accepts strings.
train_comment_list = train['comment_text'].fillna("_na_").values
list_train_no_contractions = [replace_contractions(comment) for comment in train_comment_list]

In [0]:
# Expand contractions in every test comment.
# Missing comments are replaced by "_na_" first (the same placeholder used for the
# tokenizer input), because the regex substitution only accepts strings.
test_comment_list = test['comment_text'].fillna("_na_").values
list_test_no_contractions = [replace_contractions(comment) for comment in test_comment_list]

In [0]:
# Overwrite the comment_text columns with the contraction-expanded text.
# NOTE(review): this mutates `train`/`test` in place, so re-running earlier cells
# afterwards sees the expanded text rather than the raw CSV content.
train['comment_text'] = list_train_no_contractions
test['comment_text'] = list_test_no_contractions

In [0]:
# Preview the training rows after contraction expansion.
train.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I am ...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I am really not trying to edit war. I...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI cannot make any real suggestions on...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [0]:
# Preview the test rows after contraction expansion.
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you wi...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I do not anonymously edit articles at all.


Hier worden verkeerd geschreven woorden eruit gehaald. Download de GoogleNews-vectoren a.u.b. niet lokaal (1,5 GB).

## Spellingcontrole
### Heeft geen meerwaarde want schrijffouten kunnen bepalen of de comment toxic is of niet.

In [0]:
# Download the GoogleNews word2vec archive; -c resumes a partially downloaded file.
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [0]:
# Decompress the archive to GoogleNews-vectors-negative300.bin.
!gunzip GoogleNews-vectors-negative300.bin.gz

In [0]:
import re
from collections import Counter
import gensim
import heapq
from operator import itemgetter
from multiprocessing import Pool

In [0]:
# Load the GoogleNews word2vec vectors from the binary file.
# NOTE(review): the name `model` is reassigned to the Keras network in the Training section.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Map every word2vec vocabulary entry to its position in the model's word list;
# the position is used by P() as a stand-in for word frequency.
words = model.index2word
w_rank = {token: rank for rank, token in enumerate(words)}
WORDS = w_rank

In [0]:
def words(text):
    """Lower-case `text` and return its runs of word characters as a list of strings."""
    return [hit.group(0) for hit in re.finditer(r'\w+', text.lower())]

In [0]:
def P(word):
    """Score `word` for ranking spelling candidates (higher means more probable).

    The negated rank in WORDS is used as a proxy for probability, so the word
    at rank 0 gets the highest score (0).  A word missing from WORDS gets
    ``-len(WORDS)``, which is lower than the score of every known word
    (previously it received 0, i.e. the best possible score).
    """
    return -WORDS.get(word, len(WORDS))

def correction(word):
    """Return the candidate for `word` that has the highest score under P."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return possible spelling corrections for `word`.

    Tries, in order: the word itself, strings one edit away, strings two edits
    away — keeping only those found in the dictionary — and returns the first
    non-empty set.  Falls back to ``[word]`` when nothing is known.
    """
    exact = known([word])
    if exact:
        return exact
    one_away = known(edits1(word))
    if one_away:
        return one_away
    two_away = known(edits2(word))
    if two_away:
        return two_away
    return [word]

def known(words):
    """Return the set of entries of `words` that are keys of WORDS."""
    return {entry for entry in words if entry in WORDS}


In [0]:
def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    An edit is a deletion, an adjacent transposition, a replacement with a
    lower-case ASCII letter, or an insertion of a lower-case ASCII letter.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insert a letter at the cut position.
        for ch in alphabet:
            variants.add(head + ch + tail)
        if not tail:
            continue
        rest = tail[1:]
        # Delete the character right after the cut.
        variants.add(head + rest)
        # Replace the character right after the cut.
        for ch in alphabet:
            variants.add(head + ch + rest)
        # Swap the two characters right after the cut.
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants

def edits2(word):
    """Return a generator over every string reachable from `word` by two successive single edits."""
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))

def build_vocab(texts):
    """Count whitespace-separated tokens over a pandas Series of strings.

    Args:
        texts: Series whose elements are strings.

    Returns:
        dict mapping each token to the number of times it occurs.
    """
    counts = {}
    for tokens in texts.apply(lambda x: x.split()).values:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts


In [0]:
# Count token frequencies and keep the 90,000 most frequent tokens.
vocab = build_vocab(train.comment_text)
top_90k_words = dict(heapq.nlargest(90000, vocab.items(), key=itemgetter(1)))

# Spell-check the tokens with 4 worker processes.  The context manager shuts the
# pool down once the blocking map() call has returned, so no workers are leaked.
with Pool(4) as pool:
    corrected_words = pool.map(correction, list(top_90k_words.keys()))

# Keep only tokens whose suggested correction differs, as one {original: corrected} dict each.
dicts_corrected_words = [
    {word: corrected_word}
    for word, corrected_word in zip(top_90k_words, corrected_words)
    if word != corrected_word
]

In [0]:
# Show only the first 50 corrections; the full list is too long to display.
dicts_corrected_words[:50]

[{'to': 'two'},
 {'of': 'on'},
 {'and': 'an'},
 {'a': 'at'},
 {'"': 's'},
 {'-': 's'},
 {'page.': 'page'},
 {'article.': 'article'},
 {'you.': 'you'},
 {',': 's'},
 {'Wikipedia.': 'Wikipedia'},
 {'.': 's'},
 {'(UTC)': 'UTC'},
 {'article,': 'article'},
 {'page,': 'page'},
 {'it,': 'it'},
 {'|': 's'},
 {'me.': 'me'},
 {'here.': 'here'},
 {')': 's'},
 {'—': 's'},
 {'you,': 'you'},
 {'However,': 'However'},
 {'(talk)': 'talk'},
 {'that.': 'that'},
 {'Also,': 'Also'},
 {'this.': 'this'},
 {'me,': 'me'},
 {'Wikipedia,': 'Wikipedia'},
 {'that,': 'that'},
 {'this,': 'this'},
 {'(and': 'hand'},
 {"Wikipedia's": 'Wikipedians'},
 {'here,': 'here'},
 {'is,': 'is'},
 {'Thanks.': 'Thanks'},
 {'editing.': 'editing'},
 {'not.': 'not'},
 {'articles.': 'articles'},
 {'there.': 'there'},
 {'so,': 'so'},
 {'–': 's'},
 {'...': 'in.'},
 {'now.': 'now'},
 {'do.': 'do'},
 {'(I': 'I'},
 {'pages,': 'pages'},
 {'""The': 'The'},
 {'Hello,': 'Hello'},
 {'(or': 'for'},
 {'deletion,': 'deletion'},
 {'articles,': 'ar

# Training 

Read the glove word vectors (space delimited strings) into a dictionary from word->vector.

In [0]:
def get_coefs(word, *arr):
    """Pair `word` with the remaining arguments converted to a float32 NumPy array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse each line of the GloVe file (word followed by its space-separated vector values)
# into a word -> vector dictionary.  The context manager closes the file afterwards, and
# the encoding is given explicitly so parsing does not depend on the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in glove_file)

Use these vectors to create our embedding matrix, with random initialization for words that aren't in GloVe. We'll use the same mean and standard deviation as the GloVe embeddings when generating the random initialization.

In [24]:
# Stack all GloVe vectors into one array to measure their overall mean and standard deviation.
# np.stack needs a real sequence: passing the dict view directly triggers a deprecation
# warning on older NumPy and an error on newer versions.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

  if self.run_code(code, result):


(0.020940498, 0.6441043)

In [29]:
word_index = tokenizer.word_index
# Number of rows in the embedding matrix.  Keras Tokenizer indexes start at 1 (0 is
# reserved), so a vocabulary of N words needs N + 1 rows; the count is still capped
# at max_features.
nb_words = min(max_features, len(word_index) + 1)
nb_words

20000

In [0]:
# Start from random vectors drawn with GloVe's mean/std, then overwrite the rows of
# words that have a GloVe vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip indexes that do not fit in the matrix (bounded by its row count, nb_words).
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Simple bidirectional LSTM with two fully connected layers. We add some dropout to the LSTM since even 2 epochs is enough to overfit.

In [0]:
# Input: one sequence of maxlen word indexes per comment.
inp = Input(shape=(maxlen,))
# Embedding initialised from the GloVe-based matrix; the vocabulary size is read from the
# matrix itself so the layer shape always matches the supplied weights.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
# Bidirectional LSTM returning the full sequence, with dropout on inputs and recurrent state.
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# Max over the time axis, followed by two fully connected layers.
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
# Six independent sigmoid outputs, one per label.
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
??LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)

In [35]:
# Train for 2 epochs with batches of 32, holding out 10% of the data for validation.
model.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1)



Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f694423e668>

In [0]:
# Save the trained Keras model to disk.
# NOTE(review): ".pth" is conventionally a PyTorch extension — confirm that consumers
# of this file expect a Keras model under this name.
model.save('LSTM.pth')

In [0]:
# Pickle the fitted tokenizer to tokenizer.pickle.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
# Predict the six label scores for every padded test comment.
y_preds_np = model.predict([X_te], batch_size=1024, verbose=1)



In [0]:
# Fill the sample submission's label columns with the test predictions and write submission.csv.
sample_submission = pd.read_csv(f'{path}sample_submission.csv')
sample_submission[list_classes] = y_preds_np
sample_submission.to_csv('submission.csv', index=False)

In [0]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
# `y` holds the training labels, so it must be compared with predictions for the training
# inputs (X_t).  y_preds_np was computed from the test inputs (X_te), whose rows do not
# correspond to the rows of `y`.
y_train_preds_np = model.predict(X_t, batch_size=1024, verbose=1)
# NOTE(review): argmax reduces each multi-label row to a single class, and a row without
# any positive label maps to class 0 — confirm this is the intended summary.
conMat = confusion_matrix(
    y_true=y.argmax(axis=1),
    y_pred=y_train_preds_np.argmax(axis=1),
    # Fix the label set so the matrix always has one row/column per class.
    labels=list(range(len(list_classes))),
)

In [0]:
# Label the confusion matrix with the class names defined in list_classes
# (`target_columns` is not defined anywhere in this notebook).
df_cm = pd.DataFrame(conMat, index=list_classes, columns=list_classes)
# seaborn returns the Axes it drew on; resize through its figure instead of relying on
# `plt`, which is never imported here.  fmt='d' prints the counts as plain integers.
ax = sn.heatmap(df_cm, annot=True, fmt='d')
ax.figure.set_size_inches(10, 7)

In [0]:
# Score a hand-written comment: tokenize, pad to maxlen, predict the six label scores.
custom_test = tokenizer.texts_to_sequences(['Nonsense? kiss off, geek. What I said is true.. I\'ll have your account terminated.'])
custom_test_ter = pad_sequences(custom_test, maxlen=maxlen)
model.predict(custom_test_ter)

array([[0.5177543 , 0.00234741, 0.05660301, 0.01019084, 0.11131756,
        0.00309392]], dtype=float32)

In [0]:
# Score a hand-written comment: tokenize, pad to maxlen, predict the six label scores.
custom_test = tokenizer.texts_to_sequences(['I love PXL.'])
custom_test_ter = pad_sequences(custom_test, maxlen=maxlen)
model.predict(custom_test_ter)

array([[4.85095382e-03, 6.64591789e-06, 6.83635473e-04, 3.62992287e-05,
        5.73695637e-04, 1.07490516e-04]], dtype=float32)

In [0]:
# Score a hand-written comment: tokenize, pad to maxlen, predict the six label scores.
custom_test = tokenizer.texts_to_sequences(['I love my mom.'])
custom_test_ter = pad_sequences(custom_test, maxlen=maxlen)
model.predict(custom_test_ter)

array([[0.12829536, 0.00045675, 0.0118587 , 0.00201872, 0.02777336,
        0.00158019]], dtype=float32)