In [111]:
import pandas as pd
import pickle as pkl
import spacy
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [32]:
train = pd.read_csv('data/snli_train.tsv', sep='\t')
val = pd.read_csv('data/snli_val.tsv', sep='\t')

train.columns = ['premise', 'hypothesis', 'label']
val.columns = ['premise', 'hypothesis', 'label']

In [40]:
print('train data size: {}'.format(len(train)))
print('validation data size: {}'.format(len(val)))

train data size: 100000
validation data size: 1000


In [52]:
train_sentence = pd.concat([train['premise'], train['hypothesis']])

In [137]:
y_train = train['label'].replace('entailment',1).replace('neutral',0).replace('contradiction',-1)

In [57]:
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

def tokenize(sent):
    tokens = tokenizer(sent)
    return [token.text.lower() for token in tokens if (token.text not in punctuations)]

def tokenize_dataset(dataset):
    token_dataset = []
    all_tokens = []
    count = 0
    for sample in dataset:
        count += 1
        tokens = tokenize(sample)
        token_dataset.append(tokens)
        all_tokens += tokens
        if count%10000 == 0:
            print('10000 tokenized.')
    return token_dataset, all_tokens

train_tokens, all_tokens = tokenize_dataset(train_sentence)

In [59]:
print ("Total number of tokens in train dataset is {}".format(len(all_tokens)))
print ("Total number of *unique* tokens in train dataset is {}".format(len(set(all_tokens))))

Total number of tokens in train dataset is 2038281
Total number of *unique* tokens in train dataset is 19643


In [60]:
pkl.dump(train_tokens, open("train_tokens.p", "wb"))
pkl.dump(all_tokens, open("all_tokens.p", "wb"))

In [61]:
from collections import Counter

max_vocab_size = 2000
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens):
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

def token2index(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

token2id, id2token = build_vocab(all_tokens)
train_indices = token2index(train_tokens)

In [None]:
embed = nn.Embedding()

In [176]:
MAX_SENTENCE_LENGTH = 15

import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    def __init__(self, pre_list, hypo_list, target_list):
        self.pre_list = pre_list
        self.hypo_list = hypo_list
        self.target_list = target_list
        assert (len(self.pre_list) == len(self.target_list))
        assert (len(self.pre_list) == len(self.target_list))
        
    def __len__(self):
        return len(self.pre_list)
        
    def __getitem__(self, key):
        pre_idx = self.pre_list[key][:MAX_SENTENCE_LENGTH]
        hypo_idx = self.hypo_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [pre_idx, hypo_idx, len(pre_idx), label]


In [177]:
train_data = NewsGroupDataset(train_indices[:100000], train_indices[100000:], y_train)

In [1]:
train_data

NameError: name 'train_data' is not defined

In [185]:
def newsgroup_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    for datum in batch:
        label_list.append(datum[3])
        length_list.append(2*datum[2])
    for datum in batch:
        pre_padded = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-len(datum[0]))), 
                                mode="constant", constant_values=0)  
        hypo_padded = np.pad(np.array(datum[1]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-len(datum[1]))), 
                                mode="constant", constant_values=0)
        
        data_list.append(np.concatenate([pre_padded, hypo_padded]))
        
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

In [186]:
BATCH_SIZE = 10

train_loader = torch.utils.data.DataLoader(dataset=train_data, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

In [187]:
class LogisticRegressionPyTorch(nn.Module):
    def __init__(self, n_in, n_out):
        """
        n_in: Number of features
        n_out: Number of output classes
        """
        # Initialize the parent class - this is a Python requirement
        super().__init__()
        
        # Set up out linear layer. This initializes the weights
        # Note that self.linear is itself a nn.Module, nested within
        #   this module
        self.linear = nn.Linear(n_in, n_out)
        
        # Explicitly initialize the weights with the initialization
        #   scheme we want.
        self.init_weights()
        
    def forward(self, x):
        """
        x: Input data [N, k]
        ---
        Returns: log probabilities of each class [N, c]
        """
        # Apply the linear function to get our logit (real numbers)
        logit = self.linear(x)
        
        # Apply log_softmax to get logs of normalized probabilities
        return F.log_softmax(logit, dim=1)
    
    def init_weights(self):
        # Use some specific initialization schemes
        nn.init.xavier_normal_(self.linear.weight)
        nn.init.uniform_(self.linear.bias)

In [188]:
model = LogisticRegressionPyTorch(n_in=2*MAX_SENTENCE_LENGTH, n_out=3)
criterion = torch.nn.CrossEntropyLoss()  
learning_rate = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [189]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [191]:
# Run over many iterations!
for i in range(10000):
    
    # Always zero-out the gradients managed by your optimizer
    # PyTorch does not automatically zero-out your gradients
    #   You can also do pt_model.zero_grad() in this case.
    optimizer.zero_grad()
    
    # !! Put model into training mode. This does not do anything
    #   in a simple Logistic Regression model, but will be important 
    #   later. (See: Dropout)
    pt_model.train()
    
    # Compute the predicted log-probabilities
    y_hat = pt_model(x)
    
    # Compute the loss
    train_loss = criterion(y_hat, y)
    
    # Back-propagate the gradients to the parameters
    train_loss.backward()
    
    # Apply the gradient updates to the parameters
    optimizer.step()
    
    # Recompute the loss in evaluation mode, and record it.
    # Again, this does not do anything here, but will be important later.
    # Since we are evaluating, we will also tell PyTorch not to
    #   compute gradients.
    pt_model.eval()
    with torch.no_grad():
        y_hat = pt_model(x)
        eval_loss = criterion(y_hat, y)
        
    # Record the loss
    # Note that 'loss' is a Tensor, but loss.item() is a number
    loss_val_ls.append(eval_loss.item())

RuntimeError: Expected object of scalar type Float but got scalar type Long for argument #4 'mat1'