In [2]:
import torch
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import torchtext

In [3]:
# Change the working directory to the talk-berty-to-me repository root so the
# relative 'data/...' paths used by later cells resolve.
# The root can be overridden per machine with the TALK_BERTY_ROOT environment
# variable; the original hard-coded location is kept as the default.
import os
PROJECT_ROOT = os.environ.get("TALK_BERTY_ROOT", "D:/University/Projects/AML/talk-berty-to-me")
os.chdir(PROJECT_ROOT)

In [None]:
# data_gutenberg = pd.read_csv('data/books_and_genres.csv')
# dataset_train = data_gutenberg.sample(frac=0.6, random_state=0)
# dataset_val = data_gutenberg.drop(dataset_train.index).sample(frac=0.5, random_state=0)
# dataset_test = data_gutenberg.drop(dataset_train.index).drop(
#     dataset_val.index)
# dataset_train.to_parquet('data/datasets/train.parquet', index=False)
# dataset_test.to_parquet('data/datasets/test.parquet', index=False)
# dataset_val.to_parquet('data/datasets/val.parquet', index=False)
# data_gutenberg.sample(20).to_parquet('data/datasets/dev.parquet', index=False)

In [None]:
#Add code to switch between datasets here

In [5]:
# Load the development split. The commented-out split cell earlier in this
# notebook writes this file as a 20-row random sample of the full book table.
dev_data = pd.read_parquet('data/datasets/dev.parquet')

In [6]:
#Building vocabulary
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
# Stream every entry of the three text-bearing columns as its own item.
# Adding the Series element-wise (text + title + genres) would glue the last
# word of one column to the first word of the next, turn the whole row into
# NaN if any column is missing, and never hand yield_tokens a list entry.
vocab_iter = iter(pd.concat(
    [dev_data.loc[:, column] for column in ('text', 'title', 'genres')],
    ignore_index=True))
def yield_tokens(train_iter):
    """Yield one token list per string found in ``train_iter``.

    String entries are tokenized directly. Entries that are plain ``list``
    objects are expanded and each element is tokenized in turn. Any other
    entry (for example a missing value) is skipped.
    """
    for entry in train_iter:
        if isinstance(entry, str):
            yield tokenizer(entry)
        elif type(entry) == list:
            yield from (tokenizer(item) for item in entry)

In [7]:
# Reserve dedicated ids for the padding and sentence-boundary markers that
# collate_batch, the encoder's padding_idx and the loss's ignore_index look up.
# Without them every one of '<pad>', '<BOS>' and '<EOS>' falls back to the
# default index, i.e. the same id as '<unk>'.
# Tokens seen fewer than 1000 times are dropped and map to '<unk>'.
vocab = build_vocab_from_iterator(
    yield_tokens(vocab_iter), specials=["<unk>", "<pad>", "<BOS>", "<EOS>"], min_freq=1000)
vocab.set_default_index(vocab["<unk>"])

In [8]:
# Directory where torchtext caches the downloaded GloVe files.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
VECTOR_CACHE_DIR = '/Users/setul/mlpp23/.vector_cache'
# Load the '6B' GloVe release (vector dimension left at the library default).
glove = torchtext.vocab.GloVe('6B', cache=VECTOR_CACHE_DIR)
# One pretrained vector per vocabulary entry, in vocabulary index order.
# NOTE(review): glove_vectors is not referenced by any cell visible here —
# presumably intended to initialise the embedding layers; confirm.
glove_vectors = glove.get_vecs_by_tokens(vocab.get_itos())

In [9]:
import nltk
# Fetch the NLTK resources this notebook needs (no-op when already up to date,
# as the recorded output shows).
# NOTE(review): 'stopwords' is not referenced by any cell visible here.
nltk.download('stopwords')
# 'punkt' provides the sentence tokenizer models used by nltk.tokenize.sent_tokenize.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\setul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\setul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Turn the per-book frame into one row per sentence, keeping only the columns
# the model consumes, and renumber the rows from zero.
dev_data = (
    dev_data
    .assign(sentences=dev_data.loc[:, 'text'].apply(
        lambda doc: nltk.tokenize.sent_tokenize(str(doc))))
    .explode('sentences')
    .loc[:, ['title', 'sentences', 'genres']]
    .reset_index(drop=True)
)
# Keyword extraction is currently disabled:
#dev_data.loc[:,'keywords'] = dev_data.loc[:,'sentences'].apply(lambda x: rake_extract(x))

# The training target for each sentence is the sentence that follows it within
# the same title; the last sentence of each title has no successor and is dropped.
dev_data['label_sentences'] = dev_data.groupby('title')['sentences'].shift(-1)
dev_data = dev_data.dropna(subset=['label_sentences'])

In [11]:
# Spot-check a few consecutive rows: each row's label_sentences should equal
# the next row's sentences within the same title.
dev_data.iloc[14800:14806]

Unnamed: 0,title,sentences,genres,label_sentences
14804,autobiography and selected essays,2.,"{'biography', 'non-fiction'}",Which essay seems to you to be most successful...
14805,autobiography and selected essays,Which essay seems to you to be most successful...,"{'biography', 'non-fiction'}",3.
14806,autobiography and selected essays,3.,"{'biography', 'non-fiction'}",Has the character of the audience any influenc...
14807,autobiography and selected essays,Has the character of the audience any influenc...,"{'biography', 'non-fiction'}",4.
14808,autobiography and selected essays,4.,"{'biography', 'non-fiction'}",Compare the structure of one of Huxley's essay...
14809,autobiography and selected essays,Compare the structure of one of Huxley's essay...,"{'biography', 'non-fiction'}",5.


In [12]:
from torch.nn.utils.rnn import pad_sequence
def collate_batch(batch):
    """Collate (title, genres, sentence, label_sentence) rows into padded id tensors.

    The encoder input for a row is the token sequence
    ``genres <BOS> title <EOS> <BOS> sentence <EOS>``; the target is
    ``<BOS> label_sentence <EOS>``. Tokens are mapped to ids with ``vocab`` and
    each group is right-padded with the id of ``'<pad>'``.

    Returns:
        (encoder_tensor, label_tensor), both laid out batch-first.
    """
    titles, genres, sentences, label_sentences = zip(*batch)
    pad_index = vocab['<pad>']

    def to_padded(token_lists):
        # Look up ids sequence by sequence, then pad to the longest one.
        id_tensors = [torch.tensor(vocab.lookup_indices(tokens)) for tokens in token_lists]
        return pad_sequence(id_tensors, padding_value=pad_index, batch_first=True)

    context = []
    for title, genre, sentence in zip(titles, genres, sentences):
        context.append(
            tokenizer(genre) + ['<BOS>'] + tokenizer(title) + ['<EOS>']
            + ['<BOS>'] + tokenizer(sentence) + ['<EOS>']
        )
    label_sentence = [['<BOS>'] + tokenizer(following) + ['<EOS>']
                      for following in label_sentences]

    label_tensor = to_padded(label_sentence)
    encoder_tensor = to_padded(context)
    return encoder_tensor, label_tensor

In [13]:
#From HW4
from torch.utils.data import Sampler
class BatchSequentialSampler(Sampler):
    r"""Index sampler whose consecutive batches continue each other.

    With ``num_batches = len(data_source) // batch_size``, the sampler emits
    ``j * num_batches + i`` for every batch ``i`` and slot ``j``, so slot ``j``
    of batch ``i + 1`` is the element right after slot ``j`` of batch ``i``.
    Trailing elements that do not fill a whole batch are never emitted.

    Args:
        data_source (Dataset): dataset to sample from
        batch_size (int): number of indices emitted per batch
    """

    def __init__(self, data_source, batch_size):
        self.data_source = data_source
        self.batch_size = batch_size

    def _num_batches(self):
        # Number of complete batches available in the data source.
        return len(self.data_source) // self.batch_size

    def __iter__(self):
        stride = self._num_batches()
        for offset in range(stride):
            # offset, offset + stride, ... : exactly batch_size indices.
            yield from range(offset, offset + self.batch_size * stride, stride)

    def __len__(self):
        return self._num_batches() * self.batch_size

In [14]:
# Sampler over the four model columns, emitting indices in groups of 8.
batch_sampler = BatchSequentialSampler(
    data_source=dev_data[['title', 'genres', 'sentences', 'label_sentences']],
    batch_size=8,
)

In [15]:
# The sampler emits a flat stream of row indices that is already grouped into
# batches, so the loader must cut that stream with exactly the sampler's batch
# size. Read it from the sampler instead of repeating the literal so the two
# cannot drift apart.
# Rows are passed as a NumPy array in the column order collate_batch unpacks.
batch_dataloader = torch.utils.data.DataLoader(
    dev_data.loc[:,['title', 'genres', 'sentences', 'label_sentences']].values,
    batch_size=batch_sampler.batch_size, collate_fn=collate_batch, sampler=batch_sampler)

In [16]:
# Smoke test: pull a single batch and show the padded tensor shapes
# (batch-first, as produced by collate_batch), then stop.
for idx, (context_tensor, label_tensor) in enumerate(batch_dataloader):
    print(context_tensor.shape)
    print(label_tensor.shape)
    break

torch.Size([8, 90])
torch.Size([8, 33])


In [17]:
import torch.nn as nn
import torch.nn.functional as F
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
IS_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if IS_CUDA else "cpu")

In [18]:
class BiRNN_encoder(nn.Module):
    """Stacked, optionally bidirectional LSTM/GRU encoder returning only the final state.

    Every layer is its own single-layer ``nn.LSTM``/``nn.GRU`` kept in a
    ``ModuleList``. When ``bidirectional`` is true each direction uses
    ``2 * hidden_dim`` units, so the concatenated final state has width
    ``4 * hidden_dim`` (e.g. ``hidden_dim=600`` gives 2400).

    Args:
        embedding_dim: size of the token embeddings.
        hidden_dim: base hidden size (see the width note above).
        vocab_size: number of rows in the embedding table.
        num_layers: number of stacked recurrent layers.
        type_rnn: ``'LSTM'`` or ``'GRU'``.
        bidirectional: whether each layer runs in both directions.
        dropout: dropout probability applied to the embeddings and between
            stacked layers.
        pad_idx: token id whose embedding is held at zero.

    Raises:
        ValueError: if ``type_rnn`` is neither ``'LSTM'`` nor ``'GRU'``.
    """

    def __init__ (self, embedding_dim, hidden_dim,
                  vocab_size, num_layers=2, type_rnn = 'LSTM', bidirectional = True,
                  dropout = 0.3, pad_idx = 0):
        super(BiRNN_encoder, self).__init__()
        rnn_classes = {'LSTM': nn.LSTM, 'GRU': nn.GRU}
        if type_rnn not in rnn_classes:
            # Previously an unknown type surfaced as an unbound-variable error.
            raise ValueError(
                "type_rnn must be 'LSTM' or 'GRU', got {!r}".format(type_rnn))
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        hidden_size = hidden_dim * 2 if bidirectional else hidden_dim
        input_size = embedding_dim
        layers = []
        for _ in range(num_layers):
            # No dropout argument here: PyTorch ignores it (with a warning) on
            # single-layer RNNs. Inter-layer dropout is applied in forward().
            layers.append(rnn_classes[type_rnn](input_size, hidden_size, 1,
                                                bidirectional = bidirectional,
                                                batch_first=True))
            # The next layer consumes both directions' outputs when bidirectional.
            input_size = hidden_size*2 if bidirectional else hidden_size
        self.rnns = nn.ModuleList(layers)
        self.dropout = nn.Dropout(dropout)
        self.type_rnn = type_rnn
        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, input, hidden = None):
        """Encode a batch of token ids and return the last layer's final state.

        Args:
            input: integer token ids laid out (batch, seq_len); the recurrent
                layers are batch-first.
            hidden: optional initial state for the first layer.

        Returns:
            Bidirectional LSTM: ``(hidden_state, cell)``, each the forward and
            backward final states concatenated along the feature dimension.
            Bidirectional GRU: the single concatenated hidden tensor.
            Unidirectional: the raw final state as returned by the last layer.
        """
        rnn_input = self.dropout(self.embedding(input))
        last_layer = len(self.rnns) - 1
        for idx, rnn in enumerate(self.rnns):
            # Each layer starts from the final state of the layer below it.
            output, hidden = rnn(rnn_input, hidden)
            # Dropout between stacked layers only, never after the top layer.
            rnn_input = self.dropout(output) if idx < last_layer else output
        if self.type_rnn == 'LSTM' and self.bidirectional:
            # Rows -2 / -1 are the forward / backward directions of the layer.
            hidden_state = torch.cat((hidden[0][-2,:,:], hidden[0][-1,:,:]), dim = 1)
            cell = torch.cat((hidden[1][-2,:,:], hidden[1][-1,:,:]), dim = 1)
            hidden = (hidden_state, cell)
        elif self.type_rnn == 'GRU' and self.bidirectional:
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        return hidden

In [19]:
# Two-layer bidirectional LSTM encoder over 300-d embeddings, moved to the
# selected device.
encoder = BiRNN_encoder(
    embedding_dim=300,
    hidden_dim=600,
    vocab_size=len(vocab),
    num_layers=2,
    type_rnn='LSTM',
    bidirectional=True,
    dropout=0.3,
    pad_idx=vocab['<pad>'],
).to(device)



In [21]:
# Smoke test: run one batch through the encoder and show the shape of the
# final hidden and cell states it returns, then stop.
for i, batch in enumerate(batch_dataloader):
    context_tensor, label_tensor = batch
    context_tensor, label_tensor = context_tensor.to(device), label_tensor.to(device)
    # For the bidirectional LSTM encoder the result is a (hidden, cell) pair.
    hidden,cell = encoder(context_tensor)
    print(hidden.shape)
    print(cell.shape)
    break

torch.Size([8, 2400])
torch.Size([8, 2400])


In [22]:
class BiRNN_decoder(nn.Module):
    """LSTM decoder that maps token ids plus a recurrent state to vocabulary logits.

    The LSTM is built without ``bidirectional`` or ``batch_first``, so it
    consumes sequence-first input.

    Args:
        embedding_dim: size of the token embeddings.
        hidden_dim: LSTM hidden size; must match the width of the state it is
            seeded with.
        num_layers: number of stacked LSTM layers.
        vocab_dim: vocabulary size (embedding rows and output logits).
        dropout: dropout probability for the embeddings and, when
            ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_dim, dropout = 0.3):
        super(BiRNN_decoder, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.vocab_dim = vocab_dim
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(vocab_dim, embedding_dim)
        # PyTorch ignores inter-layer dropout on a single-layer LSTM and emits
        # a warning, so only pass it through when there is more than one layer.
        rnn_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout = rnn_dropout)
        self.fc_out = nn.Linear(hidden_dim, vocab_dim)

    def forward(self, input, hidden, context):
        """Run the LSTM over ``input`` starting from ``(hidden, context)``.

        Args:
            input: integer token ids laid out (seq_len, batch).
            hidden: initial hidden state, (num_layers, batch, hidden_dim).
            context: initial cell state, same layout as ``hidden``.

        Returns:
            ``(predictions, hidden, context)``: the logits with a leading
            dimension of size one squeezed away, and the updated LSTM state.
        """
        embedded = self.dropout(self.embedding(input))
        outputs, (hidden, context) = self.rnn(embedded, (hidden, context))
        predictions = self.fc_out(outputs).squeeze(0)
        return predictions, hidden, context
    


In [23]:
# Single-layer decoder whose hidden size (2400) equals the width of the
# encoder's concatenated final state shown in the output above.
decoder = BiRNN_decoder(
    embedding_dim=300,
    hidden_dim=2400,
    num_layers=1,
    vocab_dim=len(vocab),
    dropout=0.3,
).to(device)



In [24]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one step at a time with teacher forcing.

    Args:
        encoder: module returning a ``(hidden, cell)`` pair, each (batch, width).
        decoder: module called as ``decoder(input, hidden, context)`` that
            returns ``(logits, hidden, context)`` and exposes ``vocab_dim``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_ratio = 0.5):
        """Decode ``trg`` conditioned on ``src``.

        Args:
            src: source token ids, (batch, src_len).
            trg: target token ids, (batch, trg_len); column 0 is the first
                decoder input.
            teacher_ratio: probability, per step, of feeding the ground-truth
                token instead of the model's own prediction.

        Returns:
            Logits of shape (trg_len, batch, vocab). Row 0 is left at zero
            because decoding starts at step 1.
        """
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        # Use the decoder's own vocabulary size rather than a notebook global.
        trg_vocab_size = self.decoder.vocab_dim
        # Allocate next to the inputs to avoid a device round trip on every step.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=src.device)
        hidden, context = self.encoder(src)
        # Add the leading num_layers dimension the decoder LSTM expects.
        hidden = hidden.unsqueeze(0)
        context = context.unsqueeze(0)
        # The decoder is sequence-first: feed a length-1 sequence per step.
        input = trg[:, 0].unsqueeze(0)
        for t in range(1, trg_len):
            output, hidden, context = self.decoder(input, hidden, context)
            outputs[t] = output
            top1 = output.argmax(1)
            input = trg[:,t] if np.random.random() < teacher_ratio else top1
            input = input.unsqueeze(0)
        return outputs

In [26]:
# Wire the encoder and decoder built above into one trainable module on the selected device.
model = Seq2Seq(encoder, decoder).to(device)

In [None]:
# Train the seq2seq model with Adam and cross-entropy (padding positions are
# ignored), stopping early once the epoch loss has not improved for `patience`
# consecutive epochs.
epoch_loss = 0.0
num_epochs = 100
patience = 10
best_loss = float("inf")
best_epoch = -1
sentence1 = "Hello I am starting"
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index = vocab['<pad>'])
ts1 = []
for epoch in range(num_epochs):
    print("Epoch - {} / {}".format(epoch+1, num_epochs))
    model.train(True)
    # Start each epoch from zero; otherwise the running total only ever grows
    # and no epoch after the first can count as an improvement.
    epoch_loss = 0.0
    for batch_idx, batch in enumerate(batch_dataloader):
        input, target = batch
        input, target = input.to(device), target.to(device)
        output = model(input, target).to(device)

        # The model returns (seq_len, batch, vocab) while targets are
        # (batch, seq_len). Drop step 0 (the <BOS> slot) from both and flatten
        # them in the same time-major order so logit k is scored against token k.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[:, 1:].transpose(0, 1).reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()

        # Clip gradient >1 to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Update the weights using the gradients computed by backpropagation
        optimizer.step()
        epoch_loss += loss.item()

    # Report the mean loss over the epoch rather than the last batch's loss.
    print("Epoch_Loss - {}".format(epoch_loss / max(len(batch_dataloader), 1)))
    print()

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_epoch = epoch
    elif (epoch - best_epoch) >= patience:
        # The check has to live outside the "improved" branch to ever fire.
        print("no improvement in 10 epochs, break")
        break

print(epoch_loss / len(batch_dataloader))