In [1]:
# coding: utf-8
import pandas as pd
import numpy as np
import re
import logging
import torch
from torchtext import data
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import pickle
import io
import time
import sys
import model
import datahelper


In [17]:
# --- Runtime -----------------------------------------------------------------
# Compute device selector: 0 selects the GPU, -1 selects the CPU.
device = -1
# Run mode: 0 runs training followed by prediction, 1 runs prediction only.
test_mode = 0

# --- Model size --------------------------------------------------------------
embedding_dim = 100
hidden_dim = 64

# --- Optimisation schedule ---------------------------------------------------
batch_size = 16
epochs = 1

In [2]:




print('Reading data..')

# Token-level normalisation applied to every review token during preprocessing.
normalize_pipeline = data.Pipeline(convert_token=datahelper.normalizeString)

# Non-sequential fields: one value per example (row id and class label).
ID = data.Field(sequential=False, batch_first=True)
LABEL = data.Field(sequential=False, batch_first=True)

# Review text: lower-cased, wrapped in <BOS>/<EOS>, padded with <PAD> to the
# longest sequence of each batch (fix_length=None), batch dimension first.
TEXT = data.Field(sequential=True, lower=True, eos_token='<EOS>', init_token='<BOS>',
                  pad_token='<PAD>', fix_length=None, batch_first=True, preprocessing=normalize_pipeline)


# Training file columns: Id, Label, Review. The test file has no Label column.
train = data.TabularDataset(
        path='../data/train.tsv', format='tsv',
        fields=[('Id', ID), ('Label', LABEL), ('Review', TEXT)], skip_header=True)
test = data.TabularDataset(
        path='../data/test.tsv', format='tsv',
        fields=[('Id', ID), ('Review', TEXT)], skip_header=True)


# The text and id vocabularies cover both splits so that test examples can be
# numericalised with the same index mapping as the training examples.
TEXT.build_vocab(train.Review, test.Review)
ID.build_vocab(train.Id, test.Id)
# Labels exist only in the training split: the test dataset is declared without
# a Label field, so it has nothing to contribute to this vocabulary.
LABEL.build_vocab(train.Label)


print('Build Finished.')

Reading data..
Build Finished.


In [3]:
# Training batches: bucketed by review length (token count) to limit padding.
train_iter = data.BucketIterator(
    dataset=train,
    batch_size=batch_size,
    device=device,
    repeat=False,
    sort_key=lambda example: len(example.Review),
)
# Test batches: shuffling disabled so predictions can be matched back to the
# dataset's examples by position.
test_iter = data.Iterator(
    dataset=test,
    batch_size=batch_size,
    device=device,
    repeat=False,
    shuffle=False,
)


In [7]:
# Wrap each torchtext iterator with the project's BatchWrapper, naming "Review"
# as the input field and a single target field per split ("Label" for training,
# "Id" for test). Downstream loops unpack each item as an (input, target) pair.
train_dl, test_dl = (
    datahelper.BatchWrapper(iterator, "Review", [target_field])
    for iterator, target_field in ((train_iter, "Label"), (test_iter, "Id"))
)
print('Reading data done.')

Reading data done.


In [9]:
# Sanity check: pull one batch from the wrapped training iterator and display it.
# The displayed value is a pair of tensors (numericalised reviews, labels).
next(iter(train_dl))

in


(Variable containing:
  2.0000e+00  7.2000e+01  2.0200e+02  ...   1.0000e+00  1.0000e+00  1.0000e+00
  2.0000e+00  7.2000e+01  4.6200e+02  ...   1.0000e+00  1.0000e+00  1.0000e+00
  2.0000e+00  2.9482e+04  3.3640e+03  ...   1.0000e+00  1.0000e+00  1.0000e+00
                 ...                   ⋱                   ...                
  2.0000e+00  1.0600e+02  1.8800e+02  ...   2.0816e+05  3.0000e+00  1.0000e+00
  2.0000e+00  7.2000e+01  1.0100e+02  ...   8.4000e+01  2.8033e+04  3.0000e+00
  2.0000e+00  1.4120e+04  8.2100e+02  ...   5.6900e+02  2.7545e+05  3.0000e+00
 [torch.LongTensor of size 16x243], Variable containing:
     1
     2
     1
     2
     2
     1
     1
     1
     2
     1
     2
     1
     2
     2
     2
     1
 [torch.LongTensor of size 16x1])

In [23]:
print('Initialing model..')
# Build the LSTM classifier defined in the project's `model` module; the
# vocabulary size fixes how many tokens the model must be able to index.
MODEL = model.lstm_model(len(TEXT.vocab), embedding_dim, hidden_dim, batch_size)
if device == 0:
    MODEL.cuda()

# Train
if not test_mode:
    # NLLLoss is applied directly to the model's first output.
    # NOTE(review): this presumes the model returns log-probabilities - confirm in model.py.
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(MODEL.parameters(), lr=1e-3)
    print('Start training..')

    # Number of batches per epoch, used only for progress reporting. Asking the
    # iterator for its length avoids materialising every batch just to count them.
    batch_num = len(train_iter)

    # Make sure training-only layers are active even if the model was
    # previously switched to evaluation mode in this kernel.
    MODEL.train()

    for i in range(epochs):
        train_iter.init_epoch()
        total_loss = 0.0
        batch_count = 0
        try:
            for batch, label in train_dl:
                batch_start = time.time()
                y_pred, _ = MODEL(batch)
                # Label vocabulary indices start at 1 (index 0 is the unknown
                # token), so shift them to 0-based class ids and flatten.
                label = (label - 1).view(-1)
                loss = loss_function(y_pred, label)
                MODEL.zero_grad()
                loss.backward()
                optimizer.step()
                batch_count += 1
                batch_loss = float(loss)
                total_loss += batch_loss
                batch_end = time.time()
                print('Finish {}/{} batch, {}/{} epoch. Time consuming {}s, loss is {}'.format(batch_count, batch_num, i+1, epochs, round(batch_end - batch_start, 2), batch_loss))
        finally:
            # Write the checkpoint once per epoch instead of after every batch
            # (the file name depends only on the epoch). The `finally` clause
            # still leaves a checkpoint behind when the epoch is cut short,
            # e.g. by a keyboard interrupt.
            torch.save(MODEL.state_dict(), 'model' + str(i+1) + '.pth')

        avg_loss = total_loss / max(batch_count, 1)
        print('Epoch {}/{} done, average loss is {}'.format(i+1, epochs, avg_loss))

Initialing model..
Start training..
in
Finish 1/1563 batch, 1/1 epoch. Time consuming 0.89s, loss is 0.6891403198242188
in


KeyboardInterrupt: 

In [25]:
# Test
print('Start predicting...')
# Restore the weights saved for the last configured epoch.
MODEL.load_state_dict(torch.load('model{}.pth'.format(epochs)))
# Switch to evaluation mode so training-only layers do not perturb predictions.
MODEL.eval()

final_res = []
for batch, _ in test_dl:
    # NOTE(review): the returned hidden state was never used here; the call is
    # kept in case init_hidden() also resets state inside the model - confirm
    # in model.py and drop it if it has no side effects.
    MODEL.init_hidden()
    y_pred, _ = MODEL(batch)
    # Index of the highest-scoring class for every example in the batch.
    pred_res = y_pred.data.max(1)[1].cpu().numpy()
    final_res.extend(pred_res)

print('Prediction done...')
# The context manager guarantees the file is flushed and closed, even if a
# write fails part-way through.
with open('submission.csv', 'w') as f1:
    f1.write('"id","sentiment"' + '\n')
    for idx, res in enumerate(final_res):
        # The test iterator does not shuffle, so the idx-th prediction is
        # matched to the idx-th example of the test dataset.
        text_id = test_iter.dataset.examples[idx].Id
        # Training feeds the model `label_index - 1`, so class k corresponds to
        # entry k + 1 of the label vocabulary. Mapping back through `itos`
        # writes the original label value rather than relying on the
        # vocabulary's ordering happening to match it.
        sentiment = LABEL.vocab.itos[int(res) + 1]
        f1.write(text_id + ',' + str(sentiment) + '\n')
print('Results dumping done...')


Start predicting...
in
in
in


Exception ignored in: 'zmq.backend.cython.message.Frame.__dealloc__'
Traceback (most recent call last):
  File "zmq/backend/cython/checkrc.pxd", line 12, in zmq.backend.cython.checkrc._check_rc (zmq/backend/cython/message.c:4018)
KeyboardInterrupt


in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
i