# Modules

In [1]:
import pandas as pd
import numpy as np
import re
import gensim
from gensim.models import Word2Vec
from gensim.models import doc2vec
from scipy.spatial import distance
from nltk.corpus import stopwords
from gensim.models.doc2vec import LabeledSentence

from multiprocessing import Pool

# Data

In [2]:
# Directory holding the Toxic Comment CSV files. The trailing slash matters:
# file names are appended to it by plain string concatenation.
path = '/home/hugoperrin/Bureau/Datasets/ToxicComment/'

# Read the training set first, then the test set, from that directory.
train, test = (pd.read_csv(path + file_name) for file_name in ('train.csv', 'test.csv'))

In [3]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (train, test))

((159571, 8), (153164, 2))

# Exploration

In [4]:
# Total number of comments, then the number of comments flagged (== 1) for each label.
print('Nombre de commentaires:', len(train))

# (text shown in the message, column name in `train`) -- they differ for identity_hate.
for label, column in [
    ('toxic', 'toxic'),
    ('severe_toxic', 'severe_toxic'),
    ('obscene', 'obscene'),
    ('threat', 'threat'),
    ('identity hate', 'identity_hate'),
    ('insult', 'insult'),
]:
    print('Nombre de ' + label + ':', int((train[column] == 1).sum()))

Nombre de commentaires: 159571
Nombre de toxic: 15294
Nombre de severe_toxic: 1595
Nombre de obscene: 8449
Nombre de threat: 478
Nombre de identity hate: 1405
Nombre de insult: 7877


In [5]:
# First rows of the comments flagged as insults.
train.loc[train['insult'] == 1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
55,0020e7119b96eeeb,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0
56,0020fd96ed3b8c8b,=Tony Sidaway is obviously a fistfuckee. He lo...,1,0,1,0,1,0


In [6]:
# Rows flagged as insult (== 1) whose toxic flag is 0.
insult_not_toxic = (train['insult'] == 1) & (train['toxic'] == 0)
print('Nombre de commentaires considéré comme insulte non toxiques:', int(insult_not_toxic.sum()))
print('On en déduit que tous les commentaires classé dans une catégories ne sont pas nécessairement toxiques ')

Nombre de commentaires considéré comme insulte non toxiques: 533
On en déduit que tous les commentaires classé dans une catégories ne sont pas nécessairement toxiques 


# Data preprocessing

In [7]:
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, built only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        # Building the set for every comment is wasted work; keep it on the
        # function object and reuse it for all later calls.
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def text_to_words(raw_text, remove_stopwords=False):
    """Split a raw comment into lower-case alphanumeric tokens.

    Parameters
    ----------
    raw_text : str
        The comment text.
    remove_stopwords : bool, default False
        When true, tokens found in the NLTK English stop-word list are dropped.

    Returns
    -------
    list of str
        The tokens in their original order; empty when the text contains no
        ASCII letter or digit.
    """
    # 1. Replace every character that is not an ASCII letter or digit by a space
    #    (digits are kept, punctuation and non-ASCII characters are not).
    alphanumeric_only = re.sub("[^0-9a-zA-Z]", " ", raw_text)

    # 2. Convert to lower case and split on whitespace.
    words = alphanumeric_only.lower().split()

    if remove_stopwords:
        stops = _english_stopwords()  # set membership test is O(1)
        words = [w for w in words if w not in stops]
    return words

# Tokenise every comment of both sets, keeping the stop words
# (remove_stopwords is passed positionally as False).
sentences_train, sentences_test = (
    frame['comment_text'].apply(text_to_words, args=(False,))
    for frame in (train, test)
)

In [8]:
# Number of tokens in each training comment. `len` is far cheaper than shipping
# every token list to a worker process, so this is computed in-process rather
# than through a multiprocessing Pool (which was also never joined).
length = np.array([len(tokens) for tokens in sentences_train])
print('Nombre de mots moyen: %d, nombre de mots médian: %d' % (np.mean(length), np.median(length)))

Nombre de mots moyen: 69, nombre de mots médian: 37


In [9]:
# Combined number of tokenised comments in the train and test sets.
print('Nombre total de commentaires: %d' % (len(sentences_train) + len(sentences_test)))

Nombre total de commentaires: 312735


# Word embeddings

In [10]:
%%time 
# Word2Vec on train & test comments
model = Word2Vec(sentences_train.tolist()+sentences_test.tolist(), min_count=1, size=100, workers=8)

CPU times: user 3min 49s, sys: 1.31 s, total: 3min 50s
Wall time: 57.9 s


In [11]:
# Number of distinct tokens known to the trained model.
vocab_size = len(model.wv.vocab)
print('Vocab size: %d' % vocab_size)

Vocab size: 309869


In [12]:
# Sanity check of the embeddings: nearest neighbours of a frequent toxic word.
# Queried through `model.wv`, like the other lookups in this notebook; calling
# `most_similar` directly on the Word2Vec model is deprecated.
model.wv.most_similar('fuck')

[('fucking', 0.6382911801338196),
 ('cunt', 0.6099342107772827),
 ('hell', 0.6068971157073975),
 ('bitch', 0.585590124130249),
 ('piss', 0.5641528964042664),
 ('damn', 0.5607340335845947),
 ('shit', 0.5599602460861206),
 ('motherfucker', 0.542536199092865),
 ('dumbass', 0.5193943977355957),
 ('asshole', 0.5075051784515381)]

# Final matrix embedding 

In [13]:
def get_matrix(comment, max_word=30, wv=None):
    """Build a fixed-size embedding matrix for one tokenised comment.

    Row i (for i < max_word - 1) holds the vector of the i-th word of the
    comment, or zeros when the comment has fewer words. The last row
    (index max_word - 1) is always overwritten with the mean vector of ALL
    words of the comment, not only the first `max_word`.

    Parameters
    ----------
    comment : list of str
        Tokens of the comment.
    max_word : int, default 30
        Number of rows of the returned matrix; must be at least 1.
    wv : keyed vectors, optional
        Object supporting `wv[list_of_words]` and exposing `vector_size`.
        Defaults to the vectors of the notebook-level `model`.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (max_word, wv.vector_size). It is all zeros
        when the comment is empty or contains a word unknown to `wv`.

    Raises
    ------
    ValueError
        If `max_word` is smaller than 1.
    """
    if max_word < 1:
        raise ValueError("max_word must be at least 1")
    if wv is None:
        wv = model.wv  # Word2Vec model trained earlier in the notebook

    mat = np.zeros((max_word, wv.vector_size), dtype="float32")

    # An empty comment has no vectors to stack or average: keep the zeros.
    if len(comment) == 0:
        return mat

    try:
        vectors = wv[comment]  # one row per word of the comment
    except KeyError:
        # At least one word is out of vocabulary: fall back to zeros.
        return mat

    head = vectors[:max_word]
    mat[:len(head)] = head
    # The last row carries the average embedding of the whole comment.
    mat[max_word - 1] = vectors.sum(0) / len(comment)
    return mat

In [14]:
# Build one embedding matrix per training comment in parallel. The `with`
# block releases the worker processes even if a worker raises.
with Pool() as pool:
    train_matrices = pool.map(get_matrix, sentences_train)

# Save next to the input CSV files, reusing the notebook-level `path`.
np.save(path + 'Comment2Vec_train_vM.npy', np.array(train_matrices))

# Free the memory held by the training data before processing the test set.
del train, sentences_train, train_matrices

In [15]:
# Progress marker printed once this point of the notebook is reached.
print('Train done')

Train done


In [14]:
# Build one embedding matrix per test comment in parallel. The `with`
# block releases the worker processes even if a worker raises.
with Pool() as pool:
    test_matrices = pool.map(get_matrix, sentences_test)

# Save next to the input CSV files, reusing the notebook-level `path`.
np.save(path + 'Comment2Vec_test_vM.npy', np.array(test_matrices))

# Free the memory held by the test data.
del test, sentences_test, test_matrices

In [15]:
# Progress marker printed once this point of the notebook is reached.
print('Test done')

Test done


In [None]:
# Stop here