In [1]:
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import pickle
import statistics
import sys
from functools import partial

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import tqdm
import nltk

In [2]:
bert_model_name = 'distilbert-base-uncased' 
# Bert Imports
from transformers import DistilBertTokenizer, DistilBertModel
#bert_model = DistilBertModel.from_pretrained(bert_model_name)
tokenizer = DistilBertTokenizer.from_pretrained(bert_model_name)

In [3]:
sys.path.append(os.path.join(os.path.dirname(sys.path[0]),'src'))
import retrievers

In [4]:

def make_dir_if_not_exists(directory):
	if not os.path.exists(directory):
		logging.info("Creating new directory: {}".format(directory))
		os.makedirs(directory)

def print_list(l, K=None):
	for i, e in enumerate(l):
		if i == K:
			break
		print(e)
	print()

def remove_multiple_spaces(string):
	return re.sub(r'\s+', ' ', string).strip()

def save_in_pickle(save_object, save_file):
	with open(save_file, "wb") as pickle_out:
		pickle.dump(save_object, pickle_out)

def load_from_pickle(pickle_file):
	with open(pickle_file, "rb") as pickle_in:
		return pickle.load(pickle_in)

def save_in_txt(list_of_strings, save_file):
	with open(save_file, "w") as writer:
		for line in list_of_strings:
			line = line.strip()
			writer.write(f"{line}\n")

def load_from_txt(txt_file):
	with open(txt_file, "r") as reader:
		all_lines = list()
		for line in reader:
			line = line.strip()
			all_lines.append(line)
		return all_lines

In [5]:
import pandas as pd

print(torch.cuda.is_available())
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

True
Using device: cuda


In [6]:
data_file = '../data/with_epoque.csv'
data = pd.read_csv(data_file)
print(len(data))
print(data.head())

573
                                    author  \
0                      WILLIAM SHAKESPEARE   
1  DUCHESS OF NEWCASTLE MARGARET CAVENDISH   
2                           THOMAS BASTARD   
3                           EDMUND SPENSER   
4                        RICHARD BARNFIELD   

                                             content  \
0  Let the bird of loudest lay\nOn the sole Arabi...   
1  Sir Charles into my chamber coming in,\nWhen I...   
2  Our vice runs beyond all that old men saw,\nAn...   
3  Lo I the man, whose Muse whilome did maske,\nA...   
4  Long have I longd to see my love againe,\nStil...   

                                 poem name          age                  type  
0               The Phoenix and the Turtle  Renaissance  Mythology & Folklore  
1                 An Epilogue to the Above  Renaissance  Mythology & Folklore  
2                       Book 7, Epigram 42  Renaissance  Mythology & Folklore  
3  from The Faerie Queene: Book I, Canto I  Renaissance  Mytho

In [7]:
def make_data_training(df, char_max_line = 20):
    inputs = []
    context = []
    targets = []
    previous = []
    for i,rows in df.iterrows():
        splitted = rows['content'].split('\n')
        if len(splitted) > 4:
            for i,line in enumerate(splitted): 
                if len(line.strip()) > 0 and len(line.split(' ')) <= char_max_line:
                    if i==0:
                        previous.append(' ')
                    else:
                        previous.append(splitted[i-1])
                    inputs.append(line)
                    targets.append(line)
                    context.append(' '.join([str(rows['author'])]))
        
    return pd.DataFrame(list(zip(inputs, context, targets, previous)),columns =['text', 'context','target', 'previous'])


class PoemDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.df.iloc[idx]

In [10]:
df = make_data_training(data, char_max_line = 30)

all_poems = df['text'].tolist()
context = df['context'].tolist()
previous = df['previous'].tolist()


print(df)

                                    text              context  \
0            Let the bird of loudest lay  WILLIAM SHAKESPEARE   
1               On the sole Arabian tree  WILLIAM SHAKESPEARE   
2             Herald sad and trumpet be,  WILLIAM SHAKESPEARE   
3      To whose sound chaste wings obey.  WILLIAM SHAKESPEARE   
4          But thou shrieking harbinger,  WILLIAM SHAKESPEARE   
...                                  ...                  ...   
13480              And the lisp of reeds    RICHARD ALDINGTON   
13481      And the sun upon thy breasts,    RICHARD ALDINGTON   
13482           And thou hearest me not,    RICHARD ALDINGTON   
13483                     Potuia, potuia    RICHARD ALDINGTON   
13484               Thou hearest me not.    RICHARD ALDINGTON   

                                  target                       previous  
0            Let the bird of loudest lay                                 
1               On the sole Arabian tree    Let the bird of loudest lay

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfIdfVectorizer=TfidfVectorizer()
tfIdf = tfIdfVectorizer.fit_transform(all_poems)

X = tfIdfVectorizer.transform(["Let the bird of loudest lay"])
names = np.array(tfIdfVectorizer.get_feature_names())
ind = np.array(X.indices[X.data.sort()][0][-3:][::-1])
res = names[ind]

print(' '.join(res))

bird lay let


In [10]:
df = make_data_training(data, char_max_line = 30)

all_poems = df['text'].tolist()
context = df['context'].tolist()
previous = df['previous'].tolist()


print(df)

                                    text              context  \
0            Let the bird of loudest lay  WILLIAM SHAKESPEARE   
1               On the sole Arabian tree  WILLIAM SHAKESPEARE   
2             Herald sad and trumpet be,  WILLIAM SHAKESPEARE   
3      To whose sound chaste wings obey.  WILLIAM SHAKESPEARE   
4          But thou shrieking harbinger,  WILLIAM SHAKESPEARE   
...                                  ...                  ...   
13480              And the lisp of reeds    RICHARD ALDINGTON   
13481      And the sun upon thy breasts,    RICHARD ALDINGTON   
13482           And thou hearest me not,    RICHARD ALDINGTON   
13483                     Potuia, potuia    RICHARD ALDINGTON   
13484               Thou hearest me not.    RICHARD ALDINGTON   

                                  target                       previous  
0            Let the bird of loudest lay                                 
1               On the sole Arabian tree    Let the bird of loudest lay

In [11]:
tokenizer.get_vocab()["hello"]

7592

In [12]:
    
def normalize_sentence(s):
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

class Vocabulary:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def get_ids_from_sentence(self, sentence):
        sentence = normalize_sentence(sentence)
        sent_ids = self.tokenizer(sentence)
        return sent_ids
    
    def tokenized_sentence(self, sentence):
        sent_ids = self.get_ids_from_sentence(sentence)
        return [self.tokenizer.decode(word_id) for word_id in sent_ids]
    
    def tokenized_sentence(self, sentence):
        sent_ids = self.get_ids_from_sentence(sentence)
        return tokenizer.decode(sent_ids, skip_special_tokens=True)

    def decode_sentence_from_ids(self, sent_ids):
        return 

vocab = Vocabulary(tokenizer)

In [13]:
class Poem_dataset(Dataset):

    def __init__(self, poems, context, previous, vocab, device):

        l = []
        
        for i in range(len(poems)):
            X = tfIdfVectorizer.transform([poems[i]])
            ind = np.array(X.indices[X.data.sort()][0][-3:][::-1])
            key_words = names[ind]
            l.append( (context[i] + " sep " + ' '.join(key_words), poems[i] ))
        
        self.poems = l.copy()
        self.vocab = vocab
        self.device = device

        def encode(src, tgt):
            src_ids = self.vocab.get_ids_from_sentence(src)
            tgt_ids = self.vocab.get_ids_from_sentence(tgt)
            return (src_ids, tgt_ids)

        # We will pre-tokenize the conversations and save in id lists for later use
        self.tokenized_poems = [encode(src, tgt) for src, tgt in self.poems]
        
    def __len__(self):
        return len(self.poems)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        return {"conv_ids":self.tokenized_poems[idx], "conv":self.poems[idx]}

def collate_fn(batch):
    inputs, masks_input, outputs, masks_output = [], [], [], []

    for data in batch:

        tokenizer_output = data['conv_ids'][0]
        tokenized_sent = tokenizer_output['input_ids']
        
        tokenizer_target = data['conv_ids'][1]
        tokenized_sent_target = tokenizer_target['input_ids']
        
        mask_sentence = tokenizer_output['attention_mask']
        mask_target = tokenizer_target['attention_mask']
        
        inputs.append(torch.tensor(tokenized_sent).to(device))
        outputs.append(torch.tensor(tokenized_sent_target).to(device))
        masks_input.append(torch.tensor(mask_sentence).to(device))
        masks_output.append(torch.tensor(mask_target).to(device))
    inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
    outputs = pad_sequence(outputs, batch_first=True, padding_value=0)
    masks_input = pad_sequence(masks_input, batch_first=True, padding_value=0.0)
    masks_output = pad_sequence(masks_output, batch_first=True, padding_value=0.0)
    return inputs, masks_input, outputs, masks_output

In [14]:
print(vocab.tokenizer.all_special_ids)
print(vocab.tokenizer.all_special_tokens)

[100, 102, 0, 101, 103]
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


In [15]:
dataset = Poem_dataset(all_poems, context, previous, vocab, device)

In [16]:
'''for src, tgt in dataset.poems[0:5]:
    sentence = src
    word_tokens = vocab.tokenized_sentence(sentence)
    word_ids = vocab.get_ids_from_sentence(sentence)
    print(sentence)
    print(tgt)
    print(word_tokens)
    print()

word = "the"
word_id = vocab.tokenizer(word.lower(),add_special_tokens=False)
print(f"Word = {word}")
print(f"Word ID = {word_id}")
print(f"Word decoded from ID = {vocab.decode_sentence_from_ids([word_id])}")'''

'for src, tgt in dataset.poems[0:5]:\n    sentence = src\n    word_tokens = vocab.tokenized_sentence(sentence)\n    word_ids = vocab.get_ids_from_sentence(sentence)\n    print(sentence)\n    print(tgt)\n    print(word_tokens)\n    print()\n\nword = "the"\nword_id = vocab.tokenizer(word.lower(),add_special_tokens=False)\nprint(f"Word = {word}")\nprint(f"Word ID = {word_id}")\nprint(f"Word decoded from ID = {vocab.decode_sentence_from_ids([word_id])}")'

In [17]:
def read_GloVe(filename):
  embeddings = {}
  for line in open(filename).readlines():
    fields = line.strip().split(" ")
    word = fields[0]
    embeddings[word] = [float(x) for x in fields[1:]]
  return embeddings

GloVe = read_GloVe("../data/glove.840B.300d.conll_filtered.txt")

In [18]:
def train(model, data_loader, num_epochs, model_file, learning_rate=0.0001):

    decoder_learning_ratio = 5.0
    encoder_parameter_names = ['encode_emb', 'encode_gru', 'l1', 'l2']
                           
    encoder_named_params = list(filter(lambda kv: any(key in kv[0] for key in encoder_parameter_names), model.named_parameters()))
    decoder_named_params = list(filter(lambda kv: not any(key in kv[0] for key in encoder_parameter_names), model.named_parameters()))
    encoder_params = [e[1] for e in encoder_named_params]
    decoder_params = [e[1] for e in decoder_named_params]
    optimizer = torch.optim.AdamW([{'params': encoder_params},
                {'params': decoder_params, 'lr': learning_rate * decoder_learning_ratio}], lr=learning_rate)
    
    clip = 50.0
    for epoch in tqdm.notebook.trange(num_epochs, desc="training", unit="epoch"):
        with tqdm.notebook.tqdm(
                data_loader,
                desc="epoch {}".format(epoch + 1),
                unit="batch",
                total=len(data_loader)) as batch_iterator:
            model.train()
            total_loss = 0.0
            for i, batch_data in enumerate(batch_iterator, start=1):
                source, source_mask, target, target_mask = batch_data
                optimizer.zero_grad()
                loss = model.compute_loss(source, source_mask, target, target_mask)
                total_loss += loss.item()
                loss.backward()
                _ = nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()

                batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item())
       
    torch.save(model.state_dict(), model_file)

# Define the model

In [19]:
bert = DistilBertModel.from_pretrained(bert_model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
retriever_model = retrievers.RetrieverPolyencoder(bert,device=device).to(device)

In [21]:
num_epochs = 5
batch_size = 64
learning_rate = 0.001

data_loader = DataLoader(dataset=dataset, batch_size=batch_size, 
                               shuffle=True, collate_fn=collate_fn)


train(retriever_model, data_loader, num_epochs, "baseline_model.pt",learning_rate=learning_rate)

training:   0%|          | 0/5 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/211 [00:00<?, ?batch/s]

epoch 2:   0%|          | 0/211 [00:00<?, ?batch/s]

epoch 3:   0%|          | 0/211 [00:00<?, ?batch/s]

epoch 4:   0%|          | 0/211 [00:00<?, ?batch/s]

RuntimeError: CUDA out of memory. Tried to allocate 58.00 MiB (GPU 0; 4.00 GiB total capacity; 2.28 GiB already allocated; 0 bytes free; 2.48 GiB reserved in total by PyTorch)

In [None]:
def predict_greedy(model, sentence, max_length=100):
    """Make predictions for the given input using greedy inference.
    
    Args:
        model: A sequence-to-sequence model.
        sentence: A input string.
        max_length: The maximum length at which to truncate outputs in order to
            avoid non-terminating inference.
    
    Returns:
        Model's predicted greedy response for the input, represented as string.
    """

    # You should make only one call to model.encode() at the start of the function, 
    # and make only one call to model.decode() per inference step.
    model.eval()    
    src_id = torch.tensor(vocab.get_ids_from_sentence(sentence))[:,None].to(device)
    encoder_output, encoder_mask, last_hidden = model.encode(src_id) 
    input = src_id[0,:]
    out = [bos_id]
    for t in range(max_length):
        input = input[None,:]
        out_decoder, last_hidden, _ = model.decode(input, last_hidden, encoder_output, encoder_mask)
        input = out_decoder.argmax(dim=-1)
        word = input.item()
        out.append(word)
        if word == eos_id:
            break
    
    decoded = vocab.decode_sentence_from_ids(out)
    return decoded
    

In [None]:
def predict_beam(model, sentence, k=5, max_length=100, hidden = None):

    alpha = 0.3
    model.eval()
    
    sentence_ids = torch.tensor(vocab.get_ids_from_sentence(sentence)).cuda()
    sentence_ids = sentence_ids.unsqueeze(1)
    encoder_output, encoder_mask, h = model.encode(sentence_ids)

    out_start = sentence_ids[0]
    beam = [out_start for i in range(k)]
    beam_scores = [1 for i in range(k)]
    
    if hidden:
        h = hidden
    hiddens = [h for i in range(k)]
    
    generations = []
    generations_scores = []
    curr_l = 0
    eos_tensor = torch.Tensor([eos_id]).int().cuda()
    while beam:
        logits = torch.Tensor().cuda()
        inds = torch.Tensor().int().cuda()
        curr_k = len(beam)
        if curr_l==max_length:
            for i in range(curr_k):
                  generations += [torch.cat((beam[i],eos_tensor),0)]
                  generations_scores += [new_beam_scores[i]]
            break
        else:
            for i in range(curr_k):
                out, hiddens[i], _ = model.decode(beam[i][-1].view(1,1), hiddens[i], encoder_output,
                                     encoder_mask)
                logit,ind = torch.topk(out.squeeze(), curr_k, dim=0)
                logits = torch.cat((logits,logit),0)
                inds = torch.cat((inds,ind),0)
            new_beam = []
            new_beam_scores = []
            new_hiddens = []
            if curr_l==0:
                for i in range(curr_k):
                    max_ind = torch.argmax(nn.functional.log_softmax(logit,dim=0))
                    new_beam_scores += [float(logit[max_ind])]
                    logit[max_ind] = -1e9
                    new_beam += [torch.cat((beam[0],ind[max_ind].unsqueeze(0)),0)]
                    new_hiddens += [hiddens[0]]
            else:
                top_logits,top_inds_logit = torch.topk(torch.repeat_interleave(torch.Tensor(beam_scores).cuda(),
                                                                               curr_k)\
                                                       +nn.functional.log_softmax(logits,dim=0),
                                                       curr_k, dim=0)
                for i in range(curr_k):
                    if inds[top_inds_logit[i]]==eos_id:
                        generations += [torch.cat((beam[top_inds_logit[i]//curr_k],inds[top_inds_logit[i]].unsqueeze(0)),0)]
                        generations_scores+=[float(logits[top_inds_logit[i]])/(generations[-1].shape[0]**alpha)]
                    else:
                        new_beam += [torch.cat((beam[top_inds_logit[i]//curr_k],inds[top_inds_logit[i]].unsqueeze(0)),0)]
                        new_hiddens += [hiddens[top_inds_logit[i]//curr_k]]
                        new_beam_scores += [float(logits[top_inds_logit[i]])]
            beam = new_beam
            beam_scores = new_beam_scores
            hiddens = new_hiddens
        curr_l +=1
    generations = [g for _, g in sorted(zip(generations_scores, generations))]
    generations.reverse()
    return [vocab.decode_sentence_from_ids(s.tolist()) for s in generations]

In [None]:
sentence = "WILLIAM SHAKESPEARE sep love moon bride sep tell me the love of comrades sweetens !"
print(predict_greedy(Erato_model, sentence, max_length=100))
print()
predict_beam(Erato_model, sentence, k=60, max_length=100)

In [None]:
i = 0

p = data.iloc()[i]

print(p['author'])
print(p['poem name'])
print(p['age'])
print()
print(p['content'])