In [3]:
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data as td
import torchtext
from torchtext.data import Iterator, BucketIterator
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import copy

In [4]:
manualSeed = random.randint(1, 10000) # use if you want new results
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)

Random Seed:  3759


<torch._C.Generator at 0x1a8d56ace50>

In [5]:
# custom weights initialization called on netG and netD
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

In [6]:
def split_text(text):
    return text.split(' ')

In [15]:
def prepare_sequences():
    tokenizer = split_text # the function above is the function we will be using to tokenize the text
    TEXT = torchtext.data.ReversibleField(sequential=True, tokenize=tokenizer, lower=True)
    LABEL = torchtext.data.Field(sequential=False, use_vocab=False) # sequential and use_vocab=False since no text (binary)
    QID = torchtext.data.Field(sequential=False, use_vocab=False)
    train_datafields = [("qid", None), ("question_text", TEXT), ("target", LABEL)]
    train = torchtext.data.TabularDataset( # If we had a validation set as well, we would add an additional .splits(...)
                        path="data/sample/training/train_train_sample_cleaned.csv", # the root directory where the data lies
                        format='csv',
                        # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
                        skip_header=True, 
                        fields=train_datafields)
    val_datafields = [("qid", None),
                     ("question_text", TEXT), ("target", LABEL)] 
    val = torchtext.data.TabularDataset( 
                path="data/sample/test/train_test_sample_cleaned.csv",
                format="csv",
                skip_header=True,
                fields=val_datafields
    )
    test_datafields = [("qid", QID),
                       ("question_text", TEXT)]
    test = torchtext.data.TabularDataset( 
                path="data/test_cleaned.csv",
                format="csv",
                skip_header=True,
                fields=test_datafields
    )
    return TEXT, LABEL, train, val, test

In [16]:
# TEXT, LABEL, folds, test = prepare_sequences()
TEXT, LABEL, train, val, test = prepare_sequences()

In [17]:

TEXT.build_vocab(train, vectors="glove.6B.100d")
TEXT.build_vocab(val, vectors="glove.6B.100d")
TEXT.build_vocab(test, vectors="glove.6B.100d")

.vector_cache\glove.6B.zip: 862MB [11:43, 1.23MB/s]                               
100%|█████████▉| 398632/400000 [00:40<00:00, 18837.44it/s]

In [28]:
train_iter = BucketIterator(
     train, # we pass in the datasets we want the iterator to draw data from
     batch_size=64, 
     sort_key=lambda x: len(x.question_text), # the BucketIterator needs to be told what function it should use to group the data.
     sort_within_batch=False, # sorting would add bias
     repeat=False 
)

ngpu = 1  
val_iter = Iterator(
    val,
    batch_size=64,
    sort=False,
    sort_within_batch=False,
    repeat=False
)

test_iter = Iterator(
    test,
    batch_size=64,
    sort=False,
    sort_within_batch=False,
    repeat=False
)
device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0) else "cpu")

In [1]:
class BiLSTM(nn.Module):
    def __init__(self, hidden_dim, emb_dim, num_linear, num_lstm, bidirectional, lstm_dropout, lin_layer_dropout):
        super(BiLSTM, self).__init__() # gives us access to nn.Module methods and attributes
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim) # similar to word2vec model
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=num_lstm, dropout=lstm_dropout, bidirectional=bidirectional)
        self.linear_layers = []
        self.lin_layer_dropout = lin_layer_dropout
        if bidirectional == True:
            scale = 2 # Twice as many units in case of bidirectional
        else:
            scale = 1
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim * scale, hidden_dim * scale))
            self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim * scale, 2)
        
    def forward(self, seq):
        out, _ = self.encoder(self.embedding(seq))
        feature = out[-1, :, :] # Keep the hidden layer from the last LSTM iteration
        print(feature.shape)
        
        for layer in self.linear_layers:
            feature = layer(feature) 
            feature = F.relu(feature)
            # feature = F.dropout(feature, p=self.lin_layer_dropout, training=True)
            preds = self.predictor(feature) # Preds is current predictions at timestep t
        
        feature = F.relu(feature)
        preds = self.predictor(feature)
        # return nn.Softmax(dim=1)(preds)
        return nn.LogSoftmax(dim = 1)(preds)

NameError: name 'nn' is not defined

In [2]:
# Initialize all network parameters
nh = 500
em_sz = 50
nl = 1
nlstm = 2
bidirectional = True
lstm_drop = 0.1
lin_drop = 0.1

In [None]:
# Initialize all training parameters
num_epochs = 5


In [None]:
lstm = BiLSTM(nh, em_sz, nl, nlstm, bidirectional, lstm_drop, lin_drop).cuda()
lstm.apply(weights_init)
print(lstm)

In [None]:
for data in train_iter:
    optimizer.zero_grad()
    lstm.zero_grad()
    predicted = lstm.forward(data.question_text.to(device).long())
    break

In [None]:
criterion = nn.NLLLoss()
lr = 0.2
optimizer = optim.SGD(lstm.parameters(), lr = lr, momentum=0.9)
errors = []
for epoch in range(num_epochs):
    print("Epoch={}".format(epoch))
    total_error = 0
    progress = 0
    for data in train_iter:
        optimizer.zero_grad()
        lstm.zero_grad()
        predicted = lstm.forward(data.question_text.to(device).long())
        loss = criterion(predicted.squeeze(), data.target.to(device).long())
        total_error += loss
        loss.backward()
        optimizer.step()
        if progress % 100 == 0: 
            print("loss -> {}".format(loss.item()))
        progress += 1
    print("Total Batch error={}".format(total_error))
    errors.append(total_error)
torch.save(lstm.state_dict(),'bi_lstm_cleaned_v1.1.pt')

In [15]:
lstm = BiLSTM(nh, em_sz, nl, nlstm, bidirectional, lstm_drop, lin_drop).cuda()
lstm.load_state_dict(torch.load('bi_lstm_cleaned_v1.1.pt'))
lstm.eval()

BiLSTM(
  (embedding): Embedding(113030, 50)
  (encoder): LSTM(50, 500, num_layers=2, dropout=0.1, bidirectional=True)
  (predictor): Linear(in_features=1000, out_features=2, bias=True)
)

In [16]:
accuracy = 0
batch_size = 64
num_batches = 0
for batch in val_iter:
    actual = batch.target.to(device)
    question = batch.question_text.to(device).long()
    preds = lstm.forward(question)
    actual = actual.cpu().detach().numpy()
    preds = preds.cpu().detach().numpy()
    preds = np.array([np.argmax(row) for row in preds])
    total_correct = sum(actual == preds)
    accuracy += total_correct
    num_batches += 1
accuracy / (num_batches * batch_size)

0.8737030632411067

In [29]:
# Prepare submission for kaggle
predictions = {}
for batch in test_iter:
    qid = batch.qid
    question = batch.question_text.to(device).long()
    preds = lstm.forward(question)
    preds = preds.cpu().detach().numpy()
    preds = np.array([np.argmax(row) for row in preds])
    for idx, ID in enumerate(qid):
        predictions[ID.item()] = preds[idx]

In [30]:
import pandas as pd

In [69]:
test_qid = pd.read_csv('test_qid.csv', header = None)

In [70]:
submission = {'qid' : [], 'prediction' : []}

In [71]:
test_qid.columns = ['indx', 'qid']
for indx in test_qid.indx: 
    submission['qid'].append(test_qid.qid[test_qid.indx == indx].item())
    submission['prediction'].append(predictions[indx])

In [72]:
len(submission['qid'])

375806

In [74]:
submission_ = pd.DataFrame.from_dict(submission)

In [75]:
submission_.head()

Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,1
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0


In [76]:
submission_.to_csv('submission.csv', index = False)