In [3]:
import torch
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import torchtext

In [4]:
##Change working directory to talk-berty-to-me root
import os
os.chdir("D:/University/Projects/AML/talk-berty-to-me")

In [61]:
data_gutenberg = pd.read_csv('data/books_and_genres.csv')

In [62]:
dev_data = data_gutenberg.sample(frac=0.1, random_state=42)

In [7]:
# universal_set = set()
# def parse_set(string_set):
#     return ast.literal_eval(string_set)

# for string in list(zip(dev_data['genres'])):
#     parsed_set = parse_set(string[0])
#     universal_set = universal_set.union(parsed_set)
# universal_set_list = list(universal_set)

In [8]:
# selected_genres = ['fiction', 'classics', 'historical', '20th-century', 'non-fiction', 'literature', 'history']

In [9]:
# def OneHotEncodeGenres(genres):
#     return [1 if genre in genres else 0 for genre in selected_genres]


# dev_data.loc[:,'genre_one_hot'] = dev_data['genres'].apply(lambda x: OneHotEncodeGenres(x))

In [32]:
# dev_data.drop(columns=['genres'], inplace=True)

In [63]:
dev_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,genres
3686,3686,die ungleichen schalen,"Produced by Markus Brenner, Marina Lukas and t...",{'plays'}
3526,3526,galatea,"Produced by Carlo Traverso, Claudio Paganelli ...","{'contemporary', 'literary-fiction', 'romance'..."
5723,5723,surprising stories,"Produced by Susan Skinner, Marilynda Fraser-Cu...","{'unfinished', 'short-stories'}"
8603,8603,hypatia,Produced by P. J. Riddick\n\n\n\n\n\n\n\n\n\nH...,"{'literary-fiction', 'christian', 'history', '..."
3501,3501,the babes in the wood,"Produced by Jonathan Niehof, Suzanne Shell and...","{'picture-books', 'classics', 'fiction', '20th..."


In [65]:
#Building vocabulary
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
vocab_iter = iter(dev_data.loc[:,'text'] + dev_data.loc[:,'title'] + dev_data.loc[:,'genres'])
def yield_tokens(train_iter):
    for text in train_iter:
        if not isinstance(text, str):
            if type(text) == list:
                for t in text:
                    yield tokenizer(t)
            continue
        yield tokenizer(text)

In [66]:
vocab = build_vocab_from_iterator(
    yield_tokens(vocab_iter), specials=["<unk>"], min_freq=1000)
vocab.set_default_index(vocab["<unk>"])

In [67]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\setul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\setul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [68]:
from rake_nltk import Rake
r = Rake()
def rake_extract(text):
    r.extract_keywords_from_text(str(text))
    ranked_phrases_with_scores = r.get_ranked_phrases_with_scores()
    sorted_phrases = sorted(ranked_phrases_with_scores, key=lambda x: x[0], reverse=True)
    if len(sorted_phrases) == 0:
        return ""
    return sorted_phrases[0][1]

Possible improvement is to remove proper nouns before employing rake. Will take more effort.
Reduce dataset to just English books. Improve text cleaning to remove redundant symbols.

In [69]:
dev_data.loc[:,'sentences'] = dev_data.loc[:,'text'].apply(lambda x: nltk.tokenize.sent_tokenize(str(x)))
dev_data = dev_data.explode('sentences')
dev_data.loc[:,'keywords'] = dev_data.loc[:,'sentences'].apply(lambda x: rake_extract(x))
dev_data = dev_data.loc[:,['title', 'sentences', 'keywords', 'genres']]
dev_data.reset_index(drop=True, inplace=True)

In [70]:
dev_data['label_sentences'] = dev_data.groupby('title')['sentences'].shift(-1)

dev_data = dev_data.dropna(subset=['label_sentences'])

In [71]:
dev_data['label_keywords'] = dev_data.groupby('title')['keywords'].shift(-1)

dev_data = dev_data.dropna(subset=['label_keywords'])

In [72]:
dev_data.head()

Unnamed: 0,title,sentences,keywords,genres,label_sentences,label_keywords
0,die ungleichen schalen,"Produced by Markus Brenner, Marina Lukas and t...",net die ungleichen schalen fünf einaktige dram...,{'plays'},Den Bühnen und Vereinen gegenüber\nManuskript.,den bühnen und vereinen gegenüber manuskript
1,die ungleichen schalen,Den Bühnen und Vereinen gegenüber\nManuskript.,den bühnen und vereinen gegenüber manuskript,{'plays'},Das Recht der Aufführung ist allein durch\nS. ...,das recht der aufführung ist allein durch
2,die ungleichen schalen,Das Recht der Aufführung ist allein durch\nS. ...,das recht der aufführung ist allein durch,{'plays'},90 zu erwerben.,90 zu erwerben
3,die ungleichen schalen,90 zu erwerben.,90 zu erwerben,{'plays'},"Copyright 1912 S. Fischer, Verlag, Berlin.",copyright 1912
4,die ungleichen schalen,"Copyright 1912 S. Fischer, Verlag, Berlin.",copyright 1912,{'plays'},Inhalt\n\nRasumowsky 9\n...,inhalt rasumowsky 9 gentz und fanny elßler 59 ...


In [80]:
from torch.nn.utils.rnn import pad_sequence
def collate_sentences(batch):
    titles, genres, sentences, label_sentences = zip(*batch)
    input_sentence = [tokenizer(t) + ['<BOS>'] + tokenizer(s) + ['<EOS>'] +
                       tokenizer(g) for t, s, g in zip(titles, sentences, genres)]
    label_sentence = [tokenizer(s) for s in label_sentences]
    input_tensor = pad_sequence([torch.tensor(vocab.lookup_indices(t)) for t in input_sentence],
                                 padding_value=vocab['<pad>'], batch_first=True)
    label_tensor = pad_sequence([torch.tensor(vocab.lookup_indices(t)) for t in label_sentence],
                                    padding_value=vocab['<pad>'], batch_first=True)
    return input_tensor, label_tensor

In [85]:
def collate_keywords(batch):
    titles, genres, keywords, label_keywords = zip(*batch)
    input_keywords = [tokenizer(t) + ['<BOS>'] + tokenizer(k) + ['<EOS>'] +
                       tokenizer(g) for t, k, g in zip(titles, keywords, genres)]
    label_keywords = [tokenizer(k) for k in label_keywords]
    input_tensor = pad_sequence([torch.tensor(vocab.lookup_indices(k)) for k in input_keywords],
                                 padding_value=vocab['<pad>'], batch_first=True)
    label_tensor = pad_sequence([torch.tensor(vocab.lookup_indices(k)) for k in label_keywords],
                                    padding_value=vocab['<pad>'], batch_first=True)
    return input_tensor, label_tensor

In [81]:
#From HW4
from torch.utils.data import Sampler
class BatchSequentialSampler(Sampler):
    r"""Samples batches, s.t. the ith elements of each batch are sequential.

    Args:
        data_source (Dataset): dataset to sample from
    """

    def __init__(self, data_source, batch_size):
        self.data_source = data_source
        self.batch_size = batch_size
        
    def __iter__(self):
        num_batches = len(self.data_source)//self.batch_size
        for i in range(num_batches):
            for j in range(self.batch_size):
                yield(j * num_batches + i)

    def __len__(self):
        return (len(self.data_source)//self.batch_size) * self.batch_size

In [82]:
sent_sampler = BatchSequentialSampler(dev_data.loc[
    :,['title', 'genres', 'sentences', 'label_sentences']], 8)

In [83]:
sent_dataloader = torch.utils.data.DataLoader(dev_data.loc[
    :,['title', 'genres', 'sentences', 'label_sentences']].values, batch_size=8,
      shuffle=False, collate_fn=collate_sentences, sampler=sent_sampler)

In [84]:
for idx, (input_tensor, label_tensor) in enumerate(sent_dataloader):
    print(input_tensor.shape)
    print(label_tensor.shape)
    break

torch.Size([8, 64])
torch.Size([8, 39])


In [86]:
keyword_dataloader = torch.utils.data.DataLoader(dev_data.loc[
    :,['title', 'genres', 'keywords', 'label_keywords']].values, batch_size=8,
      shuffle=False, collate_fn=collate_keywords)

In [87]:
for idx, (input_tensor, label_tensor) in enumerate(keyword_dataloader):
    print(input_tensor.shape)
    print(label_tensor.shape)
    break

torch.Size([8, 31])
torch.Size([8, 21])


In [38]:
VECTOR_CACHE_DIR = '/Users/setul/mlpp23/.vector_cache'
glove = torchtext.vocab.GloVe('6B', cache=VECTOR_CACHE_DIR)
glove_vectors = glove.get_vecs_by_tokens(vocab.get_itos())

In [39]:
import torch.nn as nn
import torch.nn.functional as F


In [None]:
class BiRNN(nn.Module):
    def __init__ (self, input_dim, embedding_dim, hidden_dim, output_dim,
                  num_layers, vocab_size, type_rnn = 'LSTM', bidirectional = True,
                  dropout = 0.3, pad_idx = 0, activation = 'tanh'):
        super(BiRNN, self).__init__()
        self.rnns = []
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        #self.embedding = nn.Embedding.from_pretrained(glove_vectors, padding_idx = pad_idx, freeze=False)
        hidden_size = hidden_dim * 2 if bidirectional else hidden_dim
        for layer_num in range(num_layers):
            input_size = embedding_dim if layer_num == 0 else hidden_size
            hidden_size = output_dim if layer_num == num_layers - 1 else hidden_size
            if type_rnn == 'LSTM':
                rnn = nn.LSTM(input_size, hidden_size, 1, dropout = dropout, bidirectional = bidirectional)
            elif type_rnn == 'GRU':
                rnn = nn.GRU(input_size, hidden_size, 1, dropout = dropout, bidirectional = bidirectional)
            self.rnns.append(rnn)
        self.rnns = nn.ModuleList(self.rnns)
        self.linear = nn.Linear(hidden_size, output_dim) if bidirectional else nn.Linear(hidden_dim, output_dim)
        self.activation = activation
        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers
        # self.bidirectional = bidirectional
        # self.type_rnn = type_rnn
        # self.hidden_dim = hidden_dim
        # self.embedding_dim = embedding_dim
        # self.vocab_size = vocab_size
        # self.pad_idx = pad_idx
        # self.input_dim = input_dim
        # self.output_dim = output_dim
    
    def forward(self, input, hidden = None):
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        rnn_input = embedded
        for layer_num, rnn in enumerate(self.rnns):
            output, hidden_output = rnn(rnn_input, hidden)
            hidden = hidden_output
        output = self.linear(output)
        output = torch.tanh(output)
        return output, hidden

In [89]:
IS_CUDA = torch.cuda.is_available()
if IS_CUDA:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [132]:
class BiRNN_encoder(nn.Module):
    def __init__ (self, embedding_dim, hidden_dim,
                  vocab_size, num_layers=2, type_rnn = 'LSTM', bidirectional = True,
                  dropout = 0.3, pad_idx = 0):
        super(BiRNN_encoder, self).__init__()
        self.rnns = []
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        hidden_size = hidden_dim * 2 if bidirectional else hidden_dim
        input_size = embedding_dim
        for layer_num in range(num_layers):
            if type_rnn == 'LSTM':
                rnn = nn.LSTM(input_size, hidden_size, 1, dropout = dropout,
                               bidirectional = bidirectional, batch_first=True)
            elif type_rnn == 'GRU':
                rnn = nn.GRU(input_size, hidden_size, 1, dropout = dropout,
                              bidirectional = bidirectional, batch_first=True)
            self.rnns.append(rnn)
            input_size = hidden_size*2 if bidirectional else hidden_size
        self.rnns = nn.ModuleList(self.rnns)
        self.dropout = nn.Dropout(dropout)
        self.type_rnn = type_rnn
        self.bidirectional = bidirectional

    def forward(self, input, hidden = None):
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        rnn_input = embedded
        for idx, rnn in enumerate(self.rnns):
            print("rnn_layer", idx)
            output, hidden_output = rnn(rnn_input, hidden)
            hidden = hidden_output
            rnn_input = output
        if self.type_rnn == 'LSTM':
            hidden = hidden[0]
        if self.bidirectional:
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        return output, hidden

In [141]:
MAX_LENGTH = 10
class BiRNN_decoder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size,
                  output_dim, dropout = 0.3):
        super(BiRNN_decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, 1,
                            dropout = dropout, batch_first=True)
        self.linear = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.output_dim = output_dim

    def forward(self, encoder_output, encoder_hidden, target):
        batch_size = encoder_output.size(0)
        decoder_hidden = encoder_hidden
        decoder_input = torch.tensor([vocab['<BOS>']] * batch_size,
                                      device=device).view(batch_size, 1)
        decoder_outputs = []
        if target is None:
            count = MAX_LENGTH
        else:
            count = target.size(1)
        for i in range(count):
            embedded = self.embedding(decoder_input)
            embedded = self.dropout(embedded)
            decoder_output, decoder_hidden = self.rnn(embedded, decoder_hidden)
            decoder_outputs.append(decoder_output)
            _, prediction = decoder_output.topk(1)
            if target is not None:
                decoder_input = target[:, i].view(batch_size, 1)
            else:
                decoder_input = prediction.squeeze().detach().view(-1, 1)
        decoder_outputs = F.log_softmax(torch.cat(decoder_outputs, dim=1), dim=2)
        return decoder_outputs, decoder_hidden, None

        
        


### First create seq2seq model unconstrained by keywords

In [142]:
encoder = BiRNN_encoder(300, 600, len(vocab), 2, 'LSTM', True, 0.3, 0).to(device)
decoder = BiRNN_decoder(300, 300, len(vocab), 10, 0.3).to(device)

In [139]:
GRAD_CLIP = 1.
NUM_EPOCHS = 3

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if h is None:
        return None
    elif isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
    


In [145]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

GRAD_CLIP = 1.
NUM_EPOCHS = 3
TEACHER_FORCING_RATIO = 0.5
MAX_LENGTH = 12

# Assuming the existence of 'encoder' and 'decoder' instances,
# and 'sent_dataloader' as your DataLoader instance for training data.

loss_fn = nn.CrossEntropyLoss()
learning_rate = 0.001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
val_losses = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = encoder.to(device)
decoder = decoder.to(device)

for epoch in range(NUM_EPOCHS):
    encoder.train()
    decoder.train()
    total_loss = 0

    for i, batch in enumerate(sent_dataloader):
        src, trg = batch
        src, trg = src.to(device), trg.to(device)
        
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        output, hidden = encoder(src)
        hidden = repackage_hidden(hidden)  # Detach hidden state

        # Initial input to decoder is the <BOS> token
        input = trg[:, 0]  # Assuming trg[:, 0] is the <BOS> token for each sequence in the batch

        loss = 0
        for t in range(1, MAX_LENGTH):  # Skip <BOS> token
            output, hidden, _ = decoder(input, hidden, trg)
            loss += loss_fn(output.squeeze(1), trg[:, t])
            teacher_force = random.random() < TEACHER_FORCING_RATIO
            top1 = output.argmax(2)[:, 0]
            input = trg[:, t] if teacher_force else top1

        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), GRAD_CLIP)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), GRAD_CLIP)

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(sent_dataloader)
    print(f'Epoch {epoch+1}, Loss: {average_loss:.4f}')
    val_losses.append(average_loss)


rnn_layer 0
rnn_layer 1


RuntimeError: For batched 3-D input, hx and cx should also be 3-D but got (1-D, 1-D) tensors