# Toxic Comment

## Chargement des données

In [1]:
import data.read_data as read
import data.transform_data as transform
from numpy import concatenate


In [2]:
train_csv_file_name = "res/train.csv"
test_csv_file_name = "res/test.csv"
test_label_csv_file_name = "res/test_labels.csv"
submission_csv_file_name = "res/sample_submission.csv"


In [3]:
df_train = read.load_train_csv(train_csv_file_name)


In [4]:
print(df_train)


                      id                                       comment_text  \
0       0000997932d777bf  Explanation\nWhy the edits made under my usern...   
1       000103f0d9cfb60f  D'aww! He matches this background colour I'm s...   
2       000113f07ec002fd  Hey man, I'm really not trying to edit war. It...   
3       0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...   
4       0001d958c54c6e35  You, sir, are my hero. Any chance you remember...   
5       00025465d4725e87  "\n\nCongratulations from me as well, use the ...   
6       0002bcb3da6cb337       COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK   
7       00031b1e95af7921  Your vandalism to the Matt Shirvington article...   
8       00037261f536c51d  Sorry if the word 'nonsense' was offensive to ...   
9       00040093b2687caa  alignment on this subject and which are contra...   
10      0005300084f90edc  "\nFair use rationale for Image:Wonju.jpg\n\nT...   
11      00054a5e18b50dd4  bbq \n\nbe a man and lets 

In [5]:
for col in df_train.iloc[:, 2:]:
    print(col, ":", df_train[col].sum(), "/", len(df_train))


toxic : 15294 / 159571
severe_toxic : 1595 / 159571
obscene : 8449 / 159571
threat : 478 / 159571
insult : 7877 / 159571
identity_hate : 1405 / 159571


In [6]:
print(df_train.dtypes)


id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object


In [7]:
limit_train = 30000


In [8]:
X_train = df_train.iloc[:limit_train, :2].values
Y_train = df_train.iloc[:limit_train, 2:].values

X_test = df_train.iloc[limit_train:, :2].values
Y_test = df_train.iloc[limit_train:, 2:].values

X_train = transform.split_comment(X_train)
X_test = transform.split_comment(X_test)

count, vocab = transform.mk_vocab(X_train + X_test)


In [9]:
print(len(vocab))


25385


In [10]:
print(X_train[0])


('0000997932d777bf', array(['explanation', 'why', 'the', 'edits', 'made', 'under', 'my',
       'username', 'hardcore', 'metallica', 'fan', 'were', 'reverted',
       'they', 'weren', 't', 'vandalisms', 'just', 'closure', 'on',
       'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'dolls',
       'fac', 'and', 'please', 'don', 't', 'remove', 'the', 'template',
       'from', 'the', 'talk', 'page', 'since', 'i', 'm', 'retired', 'now',
       '89', '205', '38', '27'], dtype='<U11'))


In [11]:
ident_train, X_train = transform.pass_data_to_word_idx(X_train, vocab, count)
ident_test, X_test = transform.pass_data_to_word_idx(X_test, vocab, count)


In [12]:
max_len = transform.get_max_len_sent(concatenate((X_train, X_test), axis=0))
print(max_len)
print(concatenate((X_train, X_test), axis=0))
print(X_train)


1400
[array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 16,
       34,  3, 35, 36,  3, 37, 38, 39, 24, 40, 41, 42, 43, 44, 45, 46])
 array([47, 48, 49, 50, 51, 52, 53, 24, 40, 54, 55, 56, 57, 37, 58, 59, 60,
       61, 62, 63])
 array([64, 65, 24, 40, 66, 67, 68, 69, 70, 71, 72, 73, 18, 74, 51, 75, 76,
       77, 78, 79, 80, 31, 81, 69, 82, 83,  4, 84, 85,  7, 37, 38, 49, 86,
       69, 87, 88, 89,  3, 90, 91,  3, 92, 93])
 ...
 array([ 8954,  2380,   128,    92,   176,   136,  6633,  2059, 15974,
        3645])
 array([  31,   72, 1415,  384,   72,  185,  893,  133,  294, 1130,   20,
          3,  279,   69,  134,    3,  132,  616,  275,   42,   74,   24,
        383,   26,   72])
 array([  31,   24,   66,   33,   16,  112,  133,  577,   24,  446,  506,
         31,    7, 2501,  185,  529, 1107, 1836,  162, 1839,   85, 1771,
       1062,  133,  134,  529, 2580,  242, 1836,   84,   85, 6725,

In [13]:
mean = 0
total = 0
for s in concatenate((X_train, X_test), axis=0):
    total += 1
    mean += s.shape[0]
print(mean / total)


67.03647905947822


In [14]:
X_train = transform.pad_sentences(X_train, max_len, vocab[transform.padding])
X_test = transform.pad_sentences(X_test, max_len, vocab[transform.padding])


In [15]:
print(X_train[0:10])


[[  1.   2.   3. ...   0.   0.   0.]
 [ 47.  48.  49. ...   0.   0.   0.]
 [ 64.  65.  24. ...   0.   0.   0.]
 ...
 [173. 174.  69. ...   0.   0.   0.]
 [182. 101.   3. ...   0.   0.   0.]
 [218.  20.  51. ...   0.   0.   0.]]


In [16]:
print(X_train.shape)


(30000, 1400)


In [17]:
print(Y_test[0])
print(Y_test.shape)


[0 0 0 0 0 0]
(129571, 6)


## Apprentissage

In [18]:
from model.conv_model import ConvModel
from model.recurrent_model import RecurrentModel
import torch as th
import torch.nn as nn
from utils.cuda import use_cuda
from tqdm import tqdm
from torchnet.meter import AUCMeter
import sys


In [19]:
th.cuda.is_available()


True

In [20]:
model = RecurrentModel(len(vocab), 32, max_len)
loss_fn = nn.BCELoss()

if use_cuda():
    model.cuda()
    loss_fn.cuda()

opt = th.optim.Adagrad(model.parameters(), lr=1e-3)

nb_epoch = 10

batch_size = 32
nb_batch = int(X_train.shape[0] / batch_size)

auc_meters = {0:AUCMeter(), 1:AUCMeter(), 2:AUCMeter(), 3:AUCMeter(), 4:AUCMeter(), 5:AUCMeter()}

for e in range(nb_epoch):
    model.train()
        
    sum_loss = 0
    nb_sent = 0
    
    for i in tqdm(range(nb_batch)):
        i_min = batch_size * i
        i_max = batch_size * (i + 1) if batch_size * (i + 1) < X_train.shape[0] else X_train.shape[0]
                
        x = th.Tensor(X_train[i_min:i_max]).long()
        y = th.Tensor(Y_train[i_min:i_max])
        
        if use_cuda():
            x, y = x.cuda(), y.cuda()
        
        out = model(x)
        loss = loss_fn(out, y)
        loss.backward()
        opt.step()
        
        sum_loss += loss.item()
        nb_sent += 1
    
    sum_loss /= nb_sent
    
    print("Epoch %d : loss = %f" % (e, sum_loss))
    
    model.eval()
    
    for i in range(6):
        auc_meters[i].reset()
    
    x_test = th.Tensor(X_test).long()
    y_test = th.Tensor(Y_test)
    
    batch_size_test = 32
    nb_batch_test = int(x_test.size(0) / batch_size_test)
    
    for i in tqdm(range(nb_batch_test)):
        i_min = batch_size_test * i
        i_max = batch_size_test * (i + 1) if batch_size_test * (i + 1) < x_test.size(0) else x_test.size(0)
        
        x = x_test[i_min:i_max]
        y = y_test[i_min:i_max]
        
        if use_cuda():
            x, y = x.cuda(), y.cuda()
        
        out_test = model(x)
        
        for i in range(6):
            auc_meters[i].add(out_test[:, i].detach(), y[:, i].detach())
        
    roc_auc_scores = {
        "toxic": auc_meters[0].value()[0],
        "severe_toxic": auc_meters[1].value()[0],
        "obscene": auc_meters[2].value()[0],
        "threat": auc_meters[3].value()[0],
        "insult": auc_meters[4].value()[0],
        "identity_hate": auc_meters[5].value()[0]
    }
    
    print(roc_auc_scores)
    print("Mean ROC AUC : %f" % (sum([v for _, v in roc_auc_scores.items()]) / 6))
    sys.stdout.flush()


100%|██████████| 937/937 [01:04<00:00, 14.43it/s]


Epoch 0 : loss = 0.193588


100%|██████████| 4049/4049 [01:29<00:00, 45.25it/s]


{'toxic': 0.6695582176552952, 'severe_toxic': 0.7682088372990039, 'obscene': 0.6907660469504885, 'threat': 0.6716507738319862, 'insult': 0.6943585908527442, 'identity_hate': 0.6965577613560796}
Mean ROC AUC : 0.698517


100%|██████████| 937/937 [01:05<00:00, 14.40it/s]


Epoch 1 : loss = 0.146439


100%|██████████| 4049/4049 [01:29<00:00, 43.40it/s]


{'toxic': 0.7130422352507916, 'severe_toxic': 0.815970083673153, 'obscene': 0.7366989456025935, 'threat': 0.7204367551245356, 'insult': 0.7470842649479166, 'identity_hate': 0.745167857039428}
Mean ROC AUC : 0.746400


100%|██████████| 937/937 [01:08<00:00, 14.45it/s]


Epoch 2 : loss = 0.129951


100%|██████████| 4049/4049 [01:29<00:00, 45.38it/s]


{'toxic': 0.7578895577186178, 'severe_toxic': 0.8647738868230734, 'obscene': 0.7821151627152639, 'threat': 0.7846298333110409, 'insult': 0.7915855606109675, 'identity_hate': 0.7895712950501583}
Mean ROC AUC : 0.795094


100%|██████████| 937/937 [01:05<00:00, 14.40it/s]


Epoch 3 : loss = 0.119391


100%|██████████| 4049/4049 [01:29<00:00, 45.25it/s]


{'toxic': 0.7954045909006139, 'severe_toxic': 0.8941974736667988, 'obscene': 0.8160701879125256, 'threat': 0.8274261650686214, 'insult': 0.8294753054011997, 'identity_hate': 0.8223546376452859}
Mean ROC AUC : 0.830821


100%|██████████| 937/937 [01:04<00:00, 14.44it/s]


Epoch 4 : loss = 0.108279


100%|██████████| 4049/4049 [01:29<00:00, 45.47it/s]


{'toxic': 0.8235734515378332, 'severe_toxic': 0.9126102027500047, 'obscene': 0.8397888121149358, 'threat': 0.8575590433447092, 'insult': 0.8552749348350253, 'identity_hate': 0.8449628749592235}
Mean ROC AUC : 0.855628


100%|██████████| 937/937 [01:05<00:00, 14.32it/s]


Epoch 5 : loss = 0.100156


100%|██████████| 4049/4049 [01:29<00:00, 45.47it/s]


{'toxic': 0.8394261677748119, 'severe_toxic': 0.922454194947358, 'obscene': 0.8543097107904822, 'threat': 0.8744635679402113, 'insult': 0.8697847993540381, 'identity_hate': 0.8549224932640256}
Mean ROC AUC : 0.869227


100%|██████████| 937/937 [01:06<00:00, 14.02it/s]


Epoch 6 : loss = 0.094013


100%|██████████| 4049/4049 [01:32<00:00, 43.97it/s]


{'toxic': 0.8493789663315767, 'severe_toxic': 0.9281840121370168, 'obscene': 0.8643769912241245, 'threat': 0.8827708625380035, 'insult': 0.8792793669264153, 'identity_hate': 0.861864979708748}
Mean ROC AUC : 0.877643


100%|██████████| 937/937 [01:05<00:00, 14.36it/s]


Epoch 7 : loss = 0.088503


100%|██████████| 4049/4049 [01:30<00:00, 44.87it/s]


{'toxic': 0.8705496806815598, 'severe_toxic': 0.9384897845513593, 'obscene': 0.8822263476918245, 'threat': 0.8936757912376074, 'insult': 0.8972977341570413, 'identity_hate': 0.8804838358471319}
Mean ROC AUC : 0.893787


100%|██████████| 937/937 [01:05<00:00, 14.30it/s]


Epoch 8 : loss = 0.081571


100%|██████████| 4049/4049 [01:30<00:00, 44.78it/s]


{'toxic': 0.881550757062576, 'severe_toxic': 0.9448820960336464, 'obscene': 0.8923357611945941, 'threat': 0.9014139579223186, 'insult': 0.9068178303681139, 'identity_hate': 0.8909022339775061}
Mean ROC AUC : 0.902984


100%|██████████| 937/937 [01:05<00:00, 14.33it/s]


Epoch 9 : loss = 0.076111


100%|██████████| 4049/4049 [01:30<00:00, 44.99it/s]


{'toxic': 0.8876196499554122, 'severe_toxic': 0.9501249004145171, 'obscene': 0.8993151102754469, 'threat': 0.9080386836238437, 'insult': 0.9129971214926218, 'identity_hate': 0.8955558551750478}
Mean ROC AUC : 0.908942


In [21]:
print(model)


ConvModel(
  (emb): Embedding(25385, 16)
  (seq1): Sequential(
    (0): Conv1d(16, 24, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): Conv1d(24, 32, kernel_size=(3,), stride=(1,))
    (3): ReLU()
    (4): MaxPool1d(kernel_size=1394, stride=1394, padding=0, dilation=1, ceil_mode=False)
  )
  (seq2): Sequential(
    (0): Linear(in_features=32, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Linear(in_features=128, out_features=6, bias=True)
    (3): Sigmoid()
  )
)


## Sauvegarde du modèle, vocabulaire etc.

In [21]:
import pickle


In [23]:
saved_model = model.cpu()
saved_loss = loss_fn.cpu()


In [26]:
import datetime
dt = datetime.datetime.now()
saved_identity = "_lstm_30k_" + dt.strftime("%Hh_%Mm_%Ss_%d_%m_%Y")


In [27]:
pickle.dump(model, open("saved/model" + saved_identity + ".p", "wb"))
pickle.dump(loss_fn, open("saved/loss_fn" + saved_identity + ".p", "wb"))
pickle.dump(opt, open("saved/opt" + saved_identity + ".p", "wb"))
pickle.dump(vocab, open("saved/vocab" + saved_identity + ".p", "wb"))
pickle.dump(count, open("saved/count" + saved_identity + ".p", "wb"))
