In [174]:
import time
import json
import math
import csv


import pandas as pd
import numpy as np

import tqdm

import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset
from torch import optim

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from sklearn.model_selection import train_test_split

from pytorch_pretrained_bert.tokenization import BertTokenizer

from flair.embeddings import BertEmbeddings, FlairEmbeddings, StackedEmbeddings 
from flair.data import Sentence

# Select the CUDA device when torch reports one as available, otherwise the CPU.
# NOTE(review): `device` is not referenced by any cell visible in this notebook;
# tensors and the model are never moved to it here -- confirm against the model cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
def rescue_code(function):
    """Pre-fill the next notebook input cell with the source of `function`.

    The source is obtained with `inspect.getsourcelines` and handed to the
    running IPython shell through `set_next_input`, so this only works inside
    an IPython/Jupyter session (where `get_ipython` exists).
    """
    from inspect import getsourcelines

    # getsourcelines returns (list_of_lines, starting_line_number); only the
    # lines are needed.
    source_lines, _start = getsourcelines(function)
    get_ipython().set_next_input("".join(source_lines))

In [7]:
# Read the train / test CSV files and discard the 'Unnamed: 0' column
# (presumably the index written by an earlier DataFrame.to_csv -- confirm).

train = pd.read_csv("../data/train.csv").drop(columns=['Unnamed: 0'])
test = pd.read_csv("../data/test.csv").drop(columns=['Unnamed: 0'])

# Retain the right columns: everything except the id / answer bookkeeping
# columns stays in the feature frames.

X_train = train.drop(columns=['id', 'answer_start', 'answer_stop', 'text', 'is_impossible'])
X_test = test.drop(columns=['id', 'answer_start', 'answer_stop', 'text', 'is_impossible'])

# The answer text is the prediction target.
y_train = train['text']
y_test = test['text']

In [8]:
# Add the Sentence start and end markers

def separators(string):
    """Return `string` wrapped as ``"[CLS] <string> [EOS]"``.

    `string` is rendered with ``str.format``, so non-string values are
    converted the same way ``"{}".format(value)`` would convert them.
    """
    wrapped = "[CLS] " + "{}".format(string) + " [EOS]"
    return wrapped

# Wrap both text columns of the training frame with the sentence markers.
# NOTE(review): only X_train is wrapped (X_test is left as-is), and re-running
# this cell wraps the text a second time -- confirm both are intended.
for column in ('context', 'question'):
    X_train[column] = X_train[column].apply(separators)

In [9]:
# Obtain the token lists for contexts, questions and answers.
# TOKENS_COMPUTED = True  -> read the token lists previously stored as JSON
# TOKENS_COMPUTED = False -> tokenize from scratch with the BERT tokenizer

TOKENS_COMPUTED = True


def _load_tokens(path):
    """Read one JSON file and return its decoded content."""
    with open(path) as handle:
        return json.load(handle)


if TOKENS_COMPUTED:
    # Load the tokens for train
    tok_context_train = _load_tokens('../data/train/train_context_tok.json')
    tok_qs_train = _load_tokens('../data/train/train_qs_tok.json')
    tok_answer_train = _load_tokens('../data/train/train_answer_tok.json')

    # Load the tokens for test
    tok_context_test = _load_tokens('../data/test/test_context_tok.json')
    tok_qs_test = _load_tokens('../data/test/test_qs_tok.json')
    tok_answer_test = _load_tokens('../data/test/test_answer_tok.json')
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    tok_context_train = list(map(tokenizer.tokenize, X_train.context))
    tok_context_test = list(map(tokenizer.tokenize, X_test.context))

    tok_qs_train = list(map(tokenizer.tokenize, X_train.question))
    tok_qs_test = list(map(tokenizer.tokenize, X_test.question))

    # Answers are cast to str before tokenizing (as in the original cell).
    tok_answer_train = list(map(tokenizer.tokenize, y_train.astype(str)))
    tok_answer_test = list(map(tokenizer.tokenize, y_test.astype(str)))

In [10]:
# Vocabulary class holding the token -> id dictionary and the id <-> token
# translators. A token is kept only when it occurs MORE than THRES times
# (strictly greater); every other token is encoded as the 'UNK' id, 0.
THRES = 100

class Vocab:
    """Frequency-filtered vocabulary built from tokenized sentences.

    Attributes:
        str_lst: the tokenized sentences the vocabulary was built from.
        thres: occurrence threshold; tokens need strictly more occurrences.
        vocab: dict token -> id. Kept tokens get ids 1..len_vocab in order of
            first appearance; the key 'UNK' maps to 0.
        len_vocab: number of kept tokens ('UNK' is not counted).
        reversed_vocab: dict id -> token (inverse of `vocab`).
        ids_col: pandas Series, one list of ids per sentence of `str_lst`.
        words_col: pandas Series, `ids_col` translated back to tokens
            (filtered-out tokens come back as 'UNK').
    """

    def __init__(self, str_lst, thres=None):
        """Build every dictionary and column from `str_lst`.

        Args:
            str_lst: iterable of sentences, each an iterable of tokens.
            thres: optional occurrence threshold; defaults to the module-level
                THRES so existing `Vocab(str_lst)` calls behave as before.
        """
        self.str_lst = str_lst
        self.thres = THRES if thres is None else thres
        self.vocab, self.len_vocab = self.get_vocab()
        self.reversed_vocab = self.reverse_dict()
        self.ids_col = self.str_to_int(False)
        self.words_col = self.int_to_str(False)

    def get_vocab(self):
        """Count tokens and return ``(token -> id dict, number of kept tokens)``."""
        counts = {}
        for sentence in self.str_lst:
            for token in sentence:
                counts[token] = counts.get(token, 0) + 1

        # A single pass over the counts (in first-appearance order) replaces
        # the previous per-iteration `list(vocab_dict.keys())[i]` lookup,
        # which made the construction quadratic in the vocabulary size.
        kept = [token for token, count in counts.items() if count > self.thres]
        final_dict = {token: idx for idx, token in enumerate(kept, start=1)}

        # 'UNK' is always id 0 (this overrides a corpus token spelled 'UNK').
        final_dict['UNK'] = 0
        return final_dict, len(kept)

    def str_to_int(self, other=False):
        """Encode sentences of tokens as lists of ids.

        Args:
            other: sentences to encode; the sentinel ``False`` means "encode
                the sentences this vocabulary was built from".

        Returns:
            pandas Series with one list of ids per sentence; tokens missing
            from the vocabulary are encoded as 0.
        """
        sentences = self.str_lst if other is False else other
        lookup = self.vocab.get
        return pd.Series([[lookup(word, 0) for word in sentence]
                          for sentence in sentences])

    def reverse_dict(self):
        """Return the id -> token dictionary (inverse of `self.vocab`)."""
        return {v: k for k, v in self.vocab.items()}

    def int_to_str(self, other=False):
        """Decode sentences of ids back into lists of tokens.

        Args:
            other: sentences of ids to decode; the sentinel ``False`` means
                "decode `self.ids_col`".

        Returns:
            pandas Series with one list of tokens per sentence.

        Raises:
            KeyError: if an id is not present in `self.reversed_vocab`.
        """
        sentences = self.ids_col if other is False else other
        return pd.Series([[self.reversed_vocab[ids] for ids in sentence]
                          for sentence in sentences])

class Squad:
    """Bundle a context vocabulary with the questions encoded through it.

    Attributes:
        context: `Vocab` built from the tokenized contexts.
        qs: result of encoding the tokenized questions with that same
            vocabulary (`Vocab.str_to_int`).
    """

    def __init__(self, context, qs):
        context_vocab = Vocab(context)
        self.context = context_vocab
        self.qs = context_vocab.str_to_int(qs)

In [11]:
print('Building vocabulary...')

# One vocabulary per split, each built from that split's contexts.
# NOTE(review): the test split gets its OWN vocabulary, so a given id does not
# refer to the same token in train and test -- confirm this is intended.
X_train_vocab = Squad(tok_context_train, tok_qs_train)
X_test_vocab = Squad(tok_context_test, tok_qs_test)

# Answers are encoded with the context vocabulary of their own split.
y_train = X_train_vocab.context.str_to_int(tok_answer_train)
y_test = X_test_vocab.context.str_to_int(tok_answer_test)

# Attach the encoded contexts and questions to the feature frames.
X_train["ids_context"] = X_train_vocab.context.ids_col
X_train["ids_qs"] = X_train_vocab.qs

X_test["ids_context"] = X_test_vocab.context.ids_col
X_test["ids_qs"] = X_test_vocab.qs

print('Vocab built')

Building vocabulary...
Vocab built


In [12]:
# Join ids for input prep

print('Concatenating ids')
def concat(df , col_a, col_b, name):
    """Store `df[col_a] + df[col_b]` in column `name` of `df` (in place).

    The `+` is applied element-wise by pandas, so two columns of lists (or of
    strings) are joined row by row. Nothing is returned.
    """
    left, right = df[col_a], df[col_b]
    df[name] = left + right

# For each split: join context ids and question ids (context first) into a
# single 'ids' column, then turn every joined list into a numpy array.
for frame in (X_train, X_test):
    concat(frame, 'ids_context', 'ids_qs', 'ids')
    frame['ids'] = frame['ids'].apply(np.array)

Concatenating ids


In [13]:
# Tokenize the input with Flair function

print('Building Dataset Sentences')

# 'txt' = question text immediately followed by context text.
# NOTE(review): the two strings are joined with no separating space, and the
# order (question, context) is the reverse of the 'ids' column -- confirm.
for frame in (X_train, X_test):
    concat(frame, 'question', 'context', 'txt')


print('Computing tokens for further BERT embedding')

# Replace every raw string with a flair Sentence built from it.
for frame in (X_train, X_test):
    frame['txt'] = frame['txt'].apply(Sentence)

Building Dataset Sentences
Computing tokens for further BERT embedding


In [14]:
# Get sizes for input and outputs

# Longest input: len() of each object in the 'txt' column
sizes_in = list(map(len, X_train.txt))
print(max(sizes_in))

# Longest target: number of ids in each encoded answer
sizes_out = [len(answer) for answer in y_train]
print(max(sizes_out))

669
68


In [15]:
# Delete rows with input exceeding max length

# +1 on top of the kept-token count (len_vocab does not count 'UNK').
train_vocab_size = 1 + X_train_vocab.context.len_vocab

print('Train Vocab size is: {}'.format(train_vocab_size))
print('Deleting rows with inputs too large for embedding')

# Attach the targets first so they are filtered together with their inputs.
# NOTE(review): 512 is presumably the BERT maximum sequence length, while
# len(txt) counts flair tokens rather than word pieces -- confirm.
X_train['target'] = y_train
X_train = X_train.loc[X_train['txt'].map(len).le(512)]

X_test['target'] = y_test
X_test = X_test.loc[X_test['txt'].map(len).le(512)]

Train Vocab size is: 14514
Deleting rows with inputs too large for embedding


In [16]:
# Save tokens

SAVE_VOCAB = False

if SAVE_VOCAB:
    # (destination file, object to serialize), written in this order:
    # train files first, then test files.
    json_outputs = [
        ('../data/train/train_qs_tok.json', tok_qs_train),
        ('../data/train/train_context_vocab.json', X_train_vocab.context.vocab),
        ('../data/train/train_context_tok.json', tok_context_train),
        ('../data/train/train_answer_tok.json', tok_answer_train),
        ('../data/test/test_qs_tok.json', tok_qs_test),
        ('../data/test/test_context_vocab.json', X_test_vocab.context.vocab),
        ('../data/test/test_context_tok.json', tok_context_test),
        ('../data/test/test_answer_tok.json', tok_answer_test),
    ]
    for json_path, payload in json_outputs:
        with open(json_path, 'w') as fp:
            json.dump(payload, fp)

In [17]:
# Same measurements as before, now on the length-filtered training frame.
sizes_in = X_train.txt.map(len).tolist()
print(max(sizes_in))

# y_train itself was not filtered, so this still covers every answer.
sizes_out = [len(answer) for answer in y_train]
print(max(sizes_out))

512
68


In [18]:
# Fraction of inputs that are still longer than 512 after filtering
oversized = [int(size > 512) for size in sizes_in]
sum(oversized) / len(sizes_in)

0.0

In [19]:
# Make all outputs the same size

def padding(series, max_size):
    """Right-pad every sentence of `series` with zeros up to `max_size` items.

    Returns a list with one numpy array per sentence. The padding comes from
    `np.zeros` (float64), so integer inputs come back as float arrays.
    """
    padded = []
    for sentence in series:
        n_missing = max_size - len(sentence)
        padded.append(np.concatenate((sentence, np.zeros(n_missing))))
    return padded

# Pad every training target to the longest answer length measured above.
longest_answer = max(sizes_out)
target_tensor = padding(X_train.target, longest_answer)
X_train['target'] = target_tensor

In [20]:
# Flair embeddings: 'multi-forward' and 'multi-backward' models
flair_forward_embedding = FlairEmbeddings('multi-forward')
flair_backward_embedding = FlairEmbeddings('multi-backward')

# Multilingual, cased BERT
# NOTE(review): the tokenizer cell uses 'bert-base-uncased' while this model is
# 'bert-base-multilingual-cased' -- confirm the mismatch is intended.
bert_embedding = BertEmbeddings('bert-base-multilingual-cased')

# Stack the three embeddings in this order: forward, backward, BERT.
embeddings_to_stack = [
    flair_forward_embedding,
    flair_backward_embedding,
    bert_embedding,
]
stacked_embeddings = StackedEmbeddings(embeddings=embeddings_to_stack)

In [21]:
# Use embedding on a sentence

def embedding(sentence):
    """Embed `sentence` with `stacked_embeddings` and return one tensor.

    Args:
        sentence: the object passed to `stacked_embeddings.embed`; iterating
            it must yield items exposing an `.embedding` tensor
            (presumably a flair `Sentence` of `Token`s -- confirm).

    Returns:
        A float CPU tensor without gradient history, one row per item of
        `sentence` (an empty tensor when `sentence` has no items).
    """
    stacked_embeddings.embed(sentence)

    vectors = [word.embedding for word in sentence]
    if not vectors:
        # Same result the list-based construction gave for an empty sentence.
        return torch.Tensor([])

    # Stack the per-token tensors directly instead of turning every scalar
    # into a Python object first (seq_len * dim conversions per sentence).
    # detach/cpu/float keep the result a plain float CPU tensor, which is
    # what building it through torch.Tensor(...) produced.
    return torch.stack(vectors).detach().cpu().float()

In [22]:
#trainset = torch.utils.data.TensorDataset(in_train, out_train)
#trainloader = torch.utils.data.DataLoader(trainset, batch_size=1, num_workers=2)

In [170]:
def init_weights(m):
    """Re-draw every parameter of module `m` uniformly from [-0.08, 0.08)."""
    for param in m.parameters():
        nn.init.uniform_(param.data, a=-0.08, b=0.08)

# Run init_weights on `model` through Module.apply.
# NOTE(review): `model` is not defined in any cell visible in this notebook
# (hidden kernel state) -- this cell fails on a fresh Restart & Run All.
model.apply(init_weights)

def count_parameters(model):
    """Return the total number of elements over `model`'s trainable parameters.

    A parameter counts only when its `requires_grad` flag is set.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

n_trainable = count_parameters(model)
print("The model has {} trainable parameters".format(n_trainable))

# Adam with its default hyper-parameters over all model parameters
optimizer = optim.Adam(model.parameters())

# Negative log-likelihood loss.
# NOTE(review): NLLLoss expects log-probabilities as input -- confirm that the
# model (defined outside the visible cells) ends with a log-softmax.
criterion = nn.NLLLoss()

The model has 31892658 trainable parameters


In [171]:
def stochastic_train(model, optimizer, criterion, clip, dataframe, verbose=True):
    """Train `model` for one epoch, one dataframe row (sample) at a time.

    Fixes compared with the previous version:
      * the mean loss is divided by the number of rows of `dataframe`
        (the old code divided by `len(iterator)`, an undefined name, and so
        raised NameError at the end of every epoch);
      * rows are read by POSITION (`.iloc`), so a dataframe whose index has
        gaps (e.g. after boolean row filtering) is traversed correctly
        instead of raising KeyError on a missing label.

    Args:
        model: module called as `model(src, trg)`.
        optimizer: optimizer stepping `model`'s parameters.
        criterion: loss called as `criterion(output, trg)`.
        clip: maximum gradient norm passed to `clip_grad_norm_`.
        dataframe: frame with a 'txt' column (inputs given to `embedding`)
            and a 'target' column (values convertible by `torch.LongTensor`).
        verbose: when True (default) print the same progress messages as
            before; pass False to silence them.

    Returns:
        The mean per-sample loss over the epoch (float).

    Raises:
        ValueError: if `dataframe` has no rows.
    """
    n_samples = dataframe.shape[0]
    if n_samples == 0:
        raise ValueError("stochastic_train received an empty dataframe")

    model.train()

    epoch_loss = 0

    sources = dataframe['txt']
    targets = dataframe['target']

    for sto_batch in range(n_samples):

        # Positional access: row labels are not guaranteed to be 0..n-1.
        src = sources.iloc[sto_batch]
        trg = targets.iloc[sto_batch]

        if verbose:
            print('Got the batch')
        optimizer.zero_grad()

        if verbose:
            print('Launching BERT embedding')
        # Add a leading dimension of size 1 (a batch of one sample).
        src = embedding(src).unsqueeze_(0)
        trg = torch.LongTensor(trg)

        if verbose:
            print('Launching LSTM forward')
        output = model(src, trg)
        # Drop dimension 0 again if it has size 1.
        output = output.squeeze_(0)

        loss = criterion(output, trg)
        if verbose:
            print('Loss is: ', loss.item())
            print('Launching Backprop')
        loss.backward()

        # Clip the gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / n_samples

In [172]:
def evaluate(model, iterator, criterion):
    """Return the mean loss of `model` over the batches of `iterator`.

    The model is switched to eval mode and run under `torch.no_grad()`.
    Each batch is indexed as `(src, trg) = (batch[0], batch[1])`; the model is
    called with a third positional argument 0 (teacher forcing off). The first
    position along dimension 0 of both output and target is skipped before
    the loss is computed.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src, trg = batch[0], batch[1]

            predictions = model(src, trg, 0)  # turn off teacher forcing

            # Flatten to (positions, classes) and (positions,), without the
            # first position of dimension 0.
            n_classes = predictions.shape[-1]
            flat_predictions = predictions[1:].view(-1, n_classes)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_predictions, flat_targets).item()

    return running_loss / len(iterator)

def epoch_time(start_time, end_time):
    """Split the elapsed time (seconds) into ``(whole minutes, whole seconds)``.

    Both parts are truncated with `int`, not rounded.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - (minutes * 60))
    return minutes, seconds

In [173]:
N_EPOCHS = 1
CLIP = 1  # maximum gradient norm handed to stochastic_train

# Only used by the (currently disabled) validation checkpointing below.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = stochastic_train(model, optimizer, criterion, CLIP, X_train)
    #valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    #if valid_loss < best_valid_loss:
    #    best_valid_loss = valid_loss
    #    torch.save(model.state_dict(), '../models/tut1-model.pt')

    print('Epoch: {} | Time: {}m {}s'.format(epoch+1, epoch_mins, epoch_secs))

    # Perplexity is exp(loss); math.exp raises OverflowError for very large losses.
    train_ppl = math.exp(train_loss)
    print('\nTrain Loss: {} | Train PPL: {}'.format(train_loss, train_ppl))
    #print('\n Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}'.format())


#model.load_state_dict(torch.load('../models/tut1-model.pt'))

#test_loss = evaluate(model, test_iterator, criterion)

#print('| Test Loss: {} | Test PPL: {} |'.format(test_loss, math.exp(test_loss)))

Got the batch
Launching BERT embedding
Launching LSTM forward


NameError: name 'Variable' is not defined