In [None]:
# coding: utf-8
import pandas as pd
import numpy as np
import re
import logging
import torch
from torchtext import data
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import pickle
import io
import time
import sys
import model
import datahelper
# Module-level logger (not referenced by the cells visible in this notebook).
LOGGER = logging.getLogger("toxic_dataset")


# --- Run configuration -------------------------------------------------------
device = -1 # 0 for gpu, -1 for cpu
batch_size = 16
test_mode = 0  # 0 for train+test 1 for test
embedding_dim = 100
hidden_dim = 64
epochs = 4


# --- Field definitions -------------------------------------------------------
print('Reading data..')
# Every token of a review is passed through datahelper.normalizeString before
# the vocabulary is built.
normalize_pipeline = data.Pipeline(convert_token=datahelper.normalizeString)
ID = data.Field(sequential=False, batch_first=True)
TEXT = data.Field(sequential=True, lower=True, eos_token='<EOS>', init_token='<BOS>',
                  pad_token='<PAD>', fix_length=None, batch_first=True, preprocessing=normalize_pipeline)
LABEL = data.Field(sequential=False, batch_first=True)


# --- Datasets ----------------------------------------------------------------
# train.tsv columns: Id, Label, Review.  test.tsv columns: Id, Review (no label).
train = data.TabularDataset(
        path='../data/train.tsv', format='tsv',
        fields=[('Id', ID), ('Label', LABEL), ('Review', TEXT)], skip_header=True)
test = data.TabularDataset(
        path='../data/test.tsv', format='tsv',
        fields=[('Id', ID), ('Review', TEXT)], skip_header=True)


# --- Vocabularies ------------------------------------------------------------
# Text and id vocabularies cover both splits.
TEXT.build_vocab(train.Review, test.Review)
ID.build_vocab(train.Id, test.Id)
# The test dataset declares no 'Label' field, so the label vocabulary is built
# from the training split only.
LABEL.build_vocab(train.Label)


print('Build Finished.')


Reading data..


In [None]:
def _review_length(example):
    """Sort key for the training iterator: length of the example's Review field."""
    return len(example.Review)


# Training iterator uses the review length as its sort key; the test iterator
# is created with shuffling disabled. Neither iterator repeats.
train_iter = data.BucketIterator(
    dataset=train,
    batch_size=batch_size,
    sort_key=_review_length,
    device=device,
    repeat=False,
)
test_iter = data.Iterator(
    dataset=test,
    batch_size=batch_size,
    device=device,
    shuffle=False,
    repeat=False,
)


In [None]:
# Wrap both torchtext iterators with datahelper.BatchWrapper, naming "Review"
# as the input field and passing the listed fields as targets.
# NOTE(review): presumably the wrapper yields (input, targets) pairs, as the
# training loop unpacks them that way -- confirm in datahelper.BatchWrapper.
train_dl, test_dl = (
    datahelper.BatchWrapper(source_iter, "Review", target_fields)
    for source_iter, target_fields in (
        (train_iter, ["Id", "Label"]),
        (test_iter, ["Id"]),
    )
)
print('Reading data done.')

In [20]:
# Draw a single batch from the wrapped training iterator to inspect what it yields.
batch_stream = iter(train_dl)
next(batch_stream)

(Variable containing:
  2.0000e+00  4.2830e+03  1.9350e+03  ...   2.3442e+04  3.0000e+00  1.0000e+00
  2.0000e+00  1.9800e+03  4.2000e+01  ...   1.6149e+05  3.0000e+00  1.0000e+00
  2.0000e+00  7.2000e+01  1.6830e+03  ...   2.1580e+03  3.0000e+00  1.0000e+00
                 ...                   ⋱                   ...                
  2.0000e+00  4.3430e+03  3.6198e+04  ...   5.0000e+00  2.7759e+04  3.0000e+00
  2.0000e+00  1.3799e+05  1.7096e+04  ...   8.4360e+03  1.8000e+01  3.0000e+00
  2.0000e+00  1.0600e+02  2.3000e+01  ...   4.9100e+02  1.1880e+03  3.0000e+00
 [torch.LongTensor of size 16x130], Variable containing:
  29979      2
    499      1
  30440      1
   9760      1
  22513      2
  30111      2
  14624      2
  20991      2
  41146      1
   2968      2
  29242      1
   2382      2
  38307      2
  41220      2
  38982      2
   5564      1
 [torch.FloatTensor of size 16x2])

In [None]:



print('Initialing model..')
MODEL = model.lstm_model(len(TEXT.vocab), embedding_dim, hidden_dim, batch_size)
if device == 0:
    MODEL.cuda()

# Train
if not test_mode:
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(MODEL.parameters(), lr=1e-3)
    print('Start training..')

    # The iterator reports its number of batches per epoch through len(), so
    # there is no need to materialise every batch just to count them.
    batch_num = len(train_iter)

    for i in range(epochs):
        train_iter.init_epoch()
        batch_count = 0
        for batch, label in train_dl:
            batch_start = time.time()
            y_pred, _ = MODEL(batch)
            loss = loss_function(y_pred, label)
            MODEL.zero_grad()
            loss.backward()
            optimizer.step()
            batch_count += 1
            batch_end = time.time()
            print('Finish {}/{} batch, {}/{} epoch. Time consuming {}s, loss is {}'.format(batch_count, batch_num, i+1, epochs, round(batch_end - batch_start, 2), float(loss)))
        # One checkpoint per epoch: model1.pth, model2.pth, ...
        torch.save(MODEL.state_dict(), 'model' + str(i+1)+'.pth')

# Test
print('Start predicting...')
# Load the checkpoint written after the last epoch. Tensors are first mapped to
# CPU storage so a checkpoint saved from a GPU run also loads on a CPU-only
# machine; load_state_dict then copies them onto whatever device MODEL is on.
MODEL.load_state_dict(
    torch.load('model{}.pth'.format(epochs), map_location=lambda storage, loc: storage))
# Switch to inference mode so that train-only layers (e.g. dropout, if the
# model has any) are disabled while predicting.
MODEL.eval()

final_res = []
for batch in iter(test_dl):
    # Return value is not needed here; the call is kept in case init_hidden
    # resets state held on the model -- TODO confirm in model.lstm_model.
    MODEL.init_hidden()
    y_pred, _ = MODEL(batch)
    # Index of the highest-scoring class for every row of the batch.
    pred_res = y_pred.data.max(1)[1].cpu().numpy()
    final_res.extend(pred_res)

print('Prediction done...')
# Predictions are matched to ids by position, which relies on test_iter having
# been created with shuffle=False.
with open('submission.csv', 'w') as f1:
    f1.write('"id","sentiment"'+'\n')
    for idx, res in enumerate(final_res):
        text_id = test_iter.dataset.examples[idx].Id
        f1.write(text_id + ',' + str(res)+'\n')
print('Results dumping done...')





