In [1]:
import data.read_data as read
import data.transform_data as transform
from numpy import concatenate


In [2]:
train_csv_file_name = "res/train.csv"
test_csv_file_name = "res/test.csv"
test_label_csv_file_name = "res/test_labels.csv"
submission_csv_file_name = "res/sample_submission.csv"


In [3]:
df_train = read.load_train_csv(train_csv_file_name).head(41000)


In [4]:
print(df_train)


                     id                                       comment_text  \
0      0000997932d777bf  Explanation\nWhy the edits made under my usern...   
1      000103f0d9cfb60f  D'aww! He matches this background colour I'm s...   
2      000113f07ec002fd  Hey man, I'm really not trying to edit war. It...   
3      0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...   
4      0001d958c54c6e35  You, sir, are my hero. Any chance you remember...   
5      00025465d4725e87  "\n\nCongratulations from me as well, use the ...   
6      0002bcb3da6cb337       COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK   
7      00031b1e95af7921  Your vandalism to the Matt Shirvington article...   
8      00037261f536c51d  Sorry if the word 'nonsense' was offensive to ...   
9      00040093b2687caa  alignment on this subject and which are contra...   
10     0005300084f90edc  "\nFair use rationale for Image:Wonju.jpg\n\nT...   
11     00054a5e18b50dd4  bbq \n\nbe a man and lets discuss it-ma

In [5]:
print(df_train.dtypes)


id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object


In [6]:
limit_train = 40000


In [7]:
X_train = df_train.iloc[:limit_train, :2].values
Y_train = df_train.iloc[:limit_train, 2:].values

X_test = df_train.iloc[limit_train:, :2].values
Y_test = df_train.iloc[limit_train:, 2:].values

X_train = transform.split_comment(X_train)
X_test = transform.split_comment(X_test)

count, vocab = transform.mk_vocab(X_train + X_test)


In [8]:
print(len(vocab))


11154


In [9]:
print(X_train[0])


('0000997932d777bf', array(['explanation', 'why', 'the', 'edits', 'made', 'under', 'my',
       'username', 'hardcore', 'metallica', 'fan', 'were', 'reverted',
       'they', 'weren', 't', 'vandalisms', 'just', 'closure', 'on',
       'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'dolls',
       'fac', 'and', 'please', 'don', 't', 'remove', 'the', 'template',
       'from', 'the', 'talk', 'page', 'since', 'i', 'm', 'retired', 'now',
       '89', '205', '38', '27'], dtype='<U11'))


In [10]:
ident_train, X_train = transform.pass_data_to_word_idx(X_train, vocab, count)
ident_test, X_test = transform.pass_data_to_word_idx(X_test, vocab, count)


In [11]:
max_len = transform.get_max_len_sent(concatenate((X_train, X_test), axis=0))
print(max_len)
print(concatenate((X_train, X_test), axis=0))
print(X_train)


1400
[array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 16, 33,
        3, 34, 35,  3, 36, 37, 38, 24, 39, 40, 41, 42, 43, 44, 45])
 array([46, 47, 48, 49, 50, 51, 24, 39, 52, 53, 54, 55, 36, 56, 57, 58, 59,
       60, 61])
 array([62, 63, 24, 39, 64, 65, 66, 67, 68, 69, 70, 71, 18, 72, 49, 73, 74,
       75, 76, 77, 78, 30, 79, 67, 80, 81,  4, 82, 83,  7, 36, 37, 47, 84,
       67, 85, 86, 87,  3, 88, 89,  3, 90, 91])
 ...
 array([6741,   30,    3, 5566, 3793,   67,    3, 3114,  130,  112,  103,
        718,  531,  532,  724,  552, 6741,   30,    3, 5566, 3793,   67,
          3, 3114,  822,   36])
 array([ 2120,  2454,  4244,    61,    65,   114,  6074,  2559, 10164,
         404,  8703,    74,    67,  4178,   269,    81,   106,  2586,
        1569,    83,  1145,   203,    74,  6108,    26,  1240,   502,
           3,    74,   162,  3500,    49,  1569,    83,  1145,   250,
         627,  9444,

In [12]:
X_train = transform.pad_sentences(X_train, max_len, vocab[transform.padding])
X_test = transform.pad_sentences(X_test, max_len, vocab[transform.padding])


In [13]:
print(X_train[0:10])


[[  1.   2.   3. ...   0.   0.   0.]
 [ 46.  47.  48. ...   0.   0.   0.]
 [ 62.  63.  24. ...   0.   0.   0.]
 ...
 [170. 171.  67. ...   0.   0.   0.]
 [179.  99.   3. ...   0.   0.   0.]
 [ 20.  49. 214. ...   0.   0.   0.]]


In [14]:
print(X_train.shape)


(40000, 1400)


In [15]:
print(Y_test[0])
print(Y_test.shape)


[0 0 0 0 0 0]
(1000, 6)


In [16]:
from model.conv_model import ConvModel
import torch as th
import torch.nn as nn
from utils.cuda import use_cuda
from sklearn.metrics import roc_auc_score


In [17]:
th.cuda.is_available()


True

In [18]:
model = ConvModel(len(vocab), max_len)
loss_fn = nn.BCELoss()

if use_cuda():
    model.cuda()
    loss_fn.cuda()

opt = th.optim.Adagrad(model.parameters(), lr=5e-3)

nb_epoch = 3

batch_size = 32
nb_batch = int(X_train.shape[0] / batch_size)

for e in range(nb_epoch):
    model.train()
    
    if use_cuda():
        model.cuda()
        
    sum_loss = 0
    nb_sent = 0
    
    for i in range(nb_batch):
        i_min = batch_size * i
        i_max = batch_size * (i + 1) if batch_size * (i + 1) < X_train.shape[0] else X_train.shape[0]
                
        x = th.Tensor(X_train[i_min:i_max]).long()
        y = th.Tensor(Y_train[i_min:i_max])
        if use_cuda():
            x = x.cuda()
            y = y.cuda()
        out = model(x)
        loss = loss_fn(out, y)
        loss.backward()
        opt.step()
        
        sum_loss += loss.item()
        nb_sent += 1
    
    sum_loss /= nb_sent
    
    print("Epoch %d : loss = %f" % (e, sum_loss))
    
    model.eval()
    
    model.cpu()
    
    x_test = th.Tensor(X_test).long()
    y_test = th.Tensor(Y_test)
    res = th.zeros(y_test.size())
    
    batch_size_test = 32
    nb_batch_test = int(x_test.size(0) / batch_size_test)
    
    for i in range(nb_batch_test):
        i_min = batch_size_test * i
        i_max = batch_size_test * (i + 1) if batch_size_test * (i + 1) < x_test.size(0) else x_test.size(0)
        
        x = x_test[i_min:i_max]
        y = y_test[i_min:i_max]
        
        out_test = model(x)
        # diff = th.abs(y - out_test)
        res[i_min:i_max] = out_test
    
    y_test = y_test.detach().numpy()
    res = res.detach().numpy()
    roc_auc_scores = {
        "toxic": roc_auc_score(y_test[:, 0], res[:, 0]),
        "severe_toxic": roc_auc_score(y_test[:, 1], res[:, 1]),
        "obscene": roc_auc_score(y_test[:, 2], res[:, 2]),
        "threat": roc_auc_score(y_test[:, 3], res[:, 3]),
        "insult": roc_auc_score(y_test[:, 4], res[:, 4]),
        "identity_hate": roc_auc_score(y_test[:, 5], res[:, 5])
    }
    
    print(roc_auc_scores)
    print(sum([v for _, v in roc_auc_scores.items()]) / 6)
    """
    diff = res.mean(dim=0)
    differences = {
        "toxic": diff[0].item(),
        "severe_toxic": diff[1].item(),
        "obscene": diff[2].item(),
        "threat": diff[3].item(),
        "insult": diff[4].item(),
        "identity_hate": diff[5].item()}"""


Epoch 0 : loss = 0.123167


{'toxic': 0.8511110389540106, 'severe_toxic': 0.9004040404040404, 'obscene': 0.8604491079235735, 'threat': 0.9585422935473086, 'insult': 0.8737955750123394, 'identity_hate': 0.8753976311336719}
0.8866166144958242


Epoch 1 : loss = 0.087995


{'toxic': 0.92222186143672, 'severe_toxic': 0.9615151515151515, 'obscene': 0.9302710647243773, 'threat': 0.9689067201604815, 'insult': 0.9443979484538294, 'identity_hate': 0.9275126903553299}
0.9424709061076483


Epoch 2 : loss = 0.064401


{'toxic': 0.9522356073643536, 'severe_toxic': 0.9739393939393939, 'obscene': 0.9657308701233466, 'threat': 0.9725844199264461, 'insult': 0.9649134101590163, 'identity_hate': 0.9442301184433164}
0.9622723033259789


In [18]:
print(model)


ConvModel(
  (emb): Embedding(25385, 16)
  (seq1): Sequential(
    (0): Conv1d(16, 24, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): Conv1d(24, 32, kernel_size=(3,), stride=(1,))
    (3): ReLU()
    (4): MaxPool1d(kernel_size=1394, stride=1394, padding=0, dilation=1, ceil_mode=False)
  )
  (seq2): Sequential(
    (0): Linear(in_features=32, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=6, bias=True)
    (3): Sigmoid()
  )
)


In [19]:
import pickle


In [25]:
saved_model = model.cpu()
saved_loss = loss_fn.cpu()


In [26]:
import datetime
dt = datetime.datetime.now()
saved_identity = "_lowercase_" + dt.strftime("%Hh_%Mm_%Ss_%d_%m_%Y")


In [27]:
pickle.dump(model, open("saved/model" + saved_identity + ".p", "wb"))
pickle.dump(loss_fn, open("saved/loss_fn" + saved_identity + ".p", "wb"))
pickle.dump(opt, open("saved/opt" + saved_identity + ".p", "wb"))
pickle.dump(vocab, open("saved/vocab" + saved_identity + ".p", "wb"))
pickle.dump(count, open("saved/count" + saved_identity + ".p", "wb"))
