In [1]:
import data.read_data as read
import data.transform_data as transform
from numpy import concatenate


In [2]:
train_csv_file_name = "res/train.csv"
test_csv_file_name = "res/test.csv"
test_label_csv_file_name = "res/test_labels.csv"
submission_csv_file_name = "res/sample_submission.csv"


In [3]:
df_train = read.load_train_csv(train_csv_file_name).head(41000)


In [4]:
print(df_train)


                     id                                       comment_text  \
0      0000997932d777bf  Explanation\nWhy the edits made under my usern...   
1      000103f0d9cfb60f  D'aww! He matches this background colour I'm s...   
2      000113f07ec002fd  Hey man, I'm really not trying to edit war. It...   
3      0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...   
4      0001d958c54c6e35  You, sir, are my hero. Any chance you remember...   
5      00025465d4725e87  "\n\nCongratulations from me as well, use the ...   
6      0002bcb3da6cb337       COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK   
7      00031b1e95af7921  Your vandalism to the Matt Shirvington article...   
8      00037261f536c51d  Sorry if the word 'nonsense' was offensive to ...   
9      00040093b2687caa  alignment on this subject and which are contra...   
10     0005300084f90edc  "\nFair use rationale for Image:Wonju.jpg\n\nT...   
11     00054a5e18b50dd4  bbq \n\nbe a man and lets discuss it-ma

In [5]:
print(df_train.dtypes)


id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object


In [6]:
limit_train = 40000


In [7]:
X_train = df_train.iloc[:limit_train, :2].values
Y_train = df_train.iloc[:limit_train, 2:].values

X_test = df_train.iloc[limit_train:, :2].values
Y_test = df_train.iloc[limit_train:, 2:].values

X_train = transform.split_comment(X_train)
X_test = transform.split_comment(X_test)

count, vocab = transform.mk_vocab(X_train + X_test)


In [8]:
print(len(vocab))


12501


In [9]:
print(X_train[0])


In [10]:
ident_train, X_train = transform.pass_data_to_word_idx(X_train, vocab, count)
ident_test, X_test = transform.pass_data_to_word_idx(X_test, vocab, count)


In [11]:
max_len = transform.get_max_len_sent(concatenate((X_train, X_test), axis=0))
print(max_len)
print(concatenate((X_train, X_test), axis=0))


1399
[array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 14, 30,  2, 31, 32,
        2, 33, 34, 35, 21, 36, 37, 38, 39, 40, 41, 42])
 array([43, 44, 45, 46, 47, 48, 21, 36, 49, 50, 51, 52, 33, 53, 54, 55, 56,
       57, 58])
 array([59, 60, 21, 36, 61, 62, 63, 64, 65, 66, 67, 68, 16, 69, 46, 70, 71,
       72, 73, 74, 75, 76, 77, 64, 78, 79,  3, 80, 81,  6, 33, 34, 44, 82,
       64, 83, 84, 85,  2, 86, 87,  2, 88, 89])
 ...
 array([8863,   76,    2, 6154, 4131,   64,    2, 3383,  156,  110,  102,
        764,  567,  568,  769,  588, 7462,   76,    2, 4131,   64,    2,
       3629,  872,   33])
 array([ 2263,  2629,  4652,    58,    62,   112,  7602,  5382, 11352,
         424,  9669,    71,    64,  4574,   282,    79,   105,  2769,
        1688,    81,  1209,   208,    71,  6724,    23,  1317,   532,
           2,    71,   165,  3812,    46,  1688,    81,  1209,   262,
         911, 10536,  1183,  2305,   1

In [12]:
X_train = transform.pad_sentences(X_train, max_len, vocab[transform.padding])
X_test = transform.pad_sentences(X_test, max_len, vocab[transform.padding])


In [13]:
print(X_train[0:10])


[[  1.   2.   3. ...   0.   0.   0.]
 [ 43.  44.  45. ...   0.   0.   0.]
 [ 59.  60.  21. ...   0.   0.   0.]
 ...
 [174. 175.  64. ...   0.   0.   0.]
 [184.  98.   2. ...   0.   0.   0.]
 [ 18.  46. 220. ...   0.   0.   0.]]


In [14]:
print(X_train.shape)


(40000, 1399)


In [15]:
print(Y_test[0])
print(Y_test.shape)


[0 0 0 0 0 0]
(1000, 6)


In [16]:
from model.conv_model import ConvModel
import torch as th
import torch.nn as nn
from utils.cuda import use_cuda


In [17]:
model = ConvModel(len(vocab))
loss_fn = nn.BCELoss()

if use_cuda():
    model.cuda()
    loss_fn.cuda()

opt = th.optim.Adagrad(model.parameters(), lr=1e-2)

nb_epoch = 10

batch_size = 32
nb_batch = int(X_train.shape[0] / batch_size)

for e in range(nb_epoch):
    model.train()
    
    if use_cuda():
        model.cuda()
        
    sum_loss = 0
    nb_sent = 0
    
    for i in range(nb_batch):
        i_min = batch_size * i
        i_max = batch_size * (i + 1) if batch_size * (i + 1) < X_train.shape[0] else X_train.shape[0]
                
        x = th.Tensor(X_train[i_min:i_max]).long()
        y = th.Tensor(Y_train[i_min:i_max])
        if use_cuda():
            x = x.cuda()
            y = y.cuda()
        out = model(x)
        loss = loss_fn(out, y)
        loss.backward()
        opt.step()
        
        sum_loss += loss.item()
        nb_sent += 1
    
    sum_loss /= nb_sent
    
    print("Epoch %d : loss = %f" % (e, sum_loss))
    
    model.eval()
    
    model.cpu()
    
    x_test = th.Tensor(X_test).long()
    y_test = th.Tensor(Y_test)
    res = th.zeros(y_test.size())
    
    batch_size_test = 32
    nb_batch_test = int(x_test.size(0) / batch_size_test)
    
    for i in range(nb_batch_test):
        i_min = batch_size_test * i
        i_max = batch_size_test * (i + 1) if batch_size_test * (i + 1) < x_test.size(0) else x_test.size(0)
        
        x = x_test[i_min:i_max]
        y = y_test[i_min:i_max]
        
        out_test = model(x)
        diff = th.abs(y - out_test)
        res[i_min:i_max] = diff
    
    diff = res.mean(dim=0)
    differences = {
        "toxic": diff[0].item(),
        "severe_toxic": diff[1].item(),
        "obscene": diff[2].item(),
        "threat": diff[3].item(),
        "insult": diff[4].item(),
        "identity_hate": diff[5].item()}
    
    print(differences)


Epoch 0 : loss = 0.132084


Epoch 1 : loss = 0.095171


Epoch 2 : loss = 0.072656


Epoch 3 : loss = 0.058912


Epoch 4 : loss = 0.052584


Epoch 5 : loss = 0.048456


Epoch 6 : loss = 0.046417


Epoch 7 : loss = 0.043130


Epoch 8 : loss = 0.040100


Epoch 9 : loss = 0.040141


In [18]:
import pickle


In [21]:
saved_model = model.cpu()
saved_loss = loss_fn.cpu()


In [22]:
pickle.dump(model, open("saved/model.p", "wb"))
pickle.dump(loss_fn, open("saved/loss_fn.p", "wb"))
pickle.dump(opt, open("saved/opt.p", "wb"))
pickle.dump(vocab, open("saved/vocab.p", "wb"))
pickle.dump(count, open("saved/count.p", "wb"))
