In [25]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import pickle
import statistics
import sys
from functools import partial

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import tqdm
import nltk
#from google.colab import files

In [2]:
import pandas as pd

# Report whether CUDA is usable, then select the GPU when it is and the CPU otherwise.
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

True
Using device: cuda


In [3]:
# Load the poem corpus, then print its row count followed by the first rows
# as a quick sanity check of the columns.
data_file = '../data/poem/with_epoque.csv'
dataset = pd.read_csv(data_file)
print(len(dataset), dataset.head(), sep='\n')

573
                                    author  \
0                      WILLIAM SHAKESPEARE   
1  DUCHESS OF NEWCASTLE MARGARET CAVENDISH   
2                           THOMAS BASTARD   
3                           EDMUND SPENSER   
4                        RICHARD BARNFIELD   

                                             content  \
0  Let the bird of loudest lay\r\nOn the sole Ara...   
1  Sir Charles into my chamber coming in,\r\nWhen...   
2  Our vice runs beyond all that old men saw,\r\n...   
3  Lo I the man, whose Muse whilome did maske,\r\...   
4  Long have I longd to see my love againe,\r\nSt...   

                                 poem name          age                  type  
0               The Phoenix and the Turtle  Renaissance  Mythology & Folklore  
1                 An Epilogue to the Above  Renaissance  Mythology & Folklore  
2                       Book 7, Epigram 42  Renaissance  Mythology & Folklore  
3  from The Faerie Queene: Book I, Canto I  Renaissance  Mytho

In [18]:
def make_data_training(df, bos_token = '<bos> ',  eos_token = ' <bos>'):
    """Explode a poem table into one training example per non-blank line.

    Each poem's ``content`` is split on ``'\\r\\n'``. Every line that is not
    blank after stripping yields one record:

    * ``text``    -- ``bos_token`` prepended to the line,
    * ``context`` -- the poem's ``'poem name'``, ``'age'`` and ``'type'``
      values joined with single spaces,
    * ``target``  -- the line with ``eos_token`` appended.

    Args:
        df: DataFrame with the columns ``'content'``, ``'poem name'``,
            ``'age'`` and ``'type'``.
        bos_token: Prefix added to every input line.
        eos_token: Suffix added to every target line.
            NOTE(review): the default is ``' <bos>'``, which looks like a typo
            for an ``<eos>`` marker. It is left unchanged here so existing
            callers keep their behaviour -- confirm and fix at the call site.

    Returns:
        A DataFrame with the columns ``['text', 'context', 'target']``, one
        row per retained line, in poem order.
    """
    records = []
    for _, row in df.iterrows():
        content = row['content']
        # A missing poem body is read by pandas as NaN (a float); there is
        # nothing to split, so skip the row instead of failing on `.split`.
        if not isinstance(content, str):
            continue

        # Every metadata field goes through str() so that a missing value
        # (NaN) or a numeric title cannot break the join. The context is the
        # same for all lines of a poem, so it is built once per poem.
        context = ' '.join(str(row[col]) for col in ('poem name', 'age', 'type'))

        for line in content.split('\r\n'):
            if line.strip():
                records.append((bos_token + line, context, line + eos_token))

    return pd.DataFrame(records, columns=['text', 'context', 'target'])


#Defining torch dataset class for poems
class PoemDataset(Dataset):
    """Map-style dataset that serves the rows of a pandas DataFrame."""

    def __init__(self, df):
        """Keep a reference to ``df``; the frame is not copied."""
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df.index)

    def __getitem__(self, idx):
        """Return the row at integer position ``idx`` (positional, via ``iloc``)."""
        row = self.df.iloc[idx]
        return row

In [36]:
# Turn the poem table into line-level examples.
df = make_data_training(dataset)
num_lines = len(df)
idxs = list(range(num_lines))

# Sequential (unshuffled) split by position: first 10% -> test,
# next 10% -> validation, remaining 80% -> train.
_test_end = int(0.1*num_lines)
_val_end = int(0.2*num_lines)
test_idx = idxs[:_test_end]
val_idx = idxs[_test_end:_val_end]
train_idx = idxs[_val_end:]

# Materialise each split with a fresh 0..n-1 index.
train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)
test_df = df.iloc[test_idx].reset_index(drop=True)

# Keep only the columns the collate function reads, in a fixed order.
_columns = ['context', 'text', 'target']
train_data = train_df[_columns]
val_data   = val_df[_columns]
test_data  = test_df[_columns]

# Wrap every split in the torch Dataset adapter.
train_dataset = PoemDataset(train_data)
val_dataset   = PoemDataset(val_data)
test_dataset  = PoemDataset(test_data)

In [37]:
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import get_linear_schedule_with_warmup
from tokenizers.processors import BertProcessing

# Checkpoint shared by the tokenizer and the encoder so their vocabularies match.
bert_model_name = 'distilbert-base-uncased'

# Load the tokenizer and the pretrained encoder from the same checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(bert_model_name)
bert_model = DistilBertModel.from_pretrained(bert_model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [38]:
def transformer_collate_fn(batch, tokenizer):
    """Tokenize and pad a batch of examples for the DataLoader.

    Args:
        batch: Iterable of mapping-like examples; each must provide the keys
            ``'text'`` (model input) and ``'target'`` (expected output).
        tokenizer: Callable that takes a list of strings and returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` (one list per input
            string). The padding id is read from ``tokenizer.pad_token_id``
            when available, otherwise from ``tokenizer.get_vocab()['[PAD]']``.

    Returns:
        A tuple ``(sentences, targets, masks)`` of batch-first tensors:
        ``sentences`` and ``targets`` hold token ids right-padded with the
        padding id, and ``masks`` is the attention mask of ``sentences``
        right-padded with 0. ``sentences`` and ``targets`` are padded
        independently, so their widths may differ.
    """
    # Prefer the tokenizer's pad id attribute; building the full vocabulary
    # dict on every batch just to look up one token is wasteful.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = tokenizer.get_vocab()['[PAD]']

    # Tokenize all inputs and all targets with one call each rather than one
    # call per example.
    encoded_inputs = tokenizer([example['text'] for example in batch])
    encoded_targets = tokenizer([example['target'] for example in batch])

    sentences = [torch.tensor(ids) for ids in encoded_inputs['input_ids']]
    targets = [torch.tensor(ids) for ids in encoded_targets['input_ids']]
    masks = [torch.tensor(mask) for mask in encoded_inputs['attention_mask']]

    sentences = pad_sequence(sentences, batch_first=True, padding_value=pad_id)
    targets = pad_sequence(targets, batch_first=True, padding_value=pad_id)
    # Padded positions must not be attended to, hence 0 in the mask.
    masks = pad_sequence(masks, batch_first=True, padding_value=0)
    return sentences, targets, masks

In [None]:
class EratoModel(nn.Module):
    """Wrapper holding a poly-encoder, a BERT-style encoder and a decoder.

    NOTE: this is an unfinished template. ``forward`` currently runs only the
    BERT encoder and returns ``None``; ``poly_encoder`` and ``decoder`` are
    stored but not called yet.
    """

    def __init__(
        self,
        poly_encoder: nn.Module,
        bert_encoder: nn.Module,
        decoder: nn.Module,
        enc_hid_dim=768,  # default embedding size
        outputs=2,
        dropout=0.1,
    ):
        """Register the three sub-modules.

        ``enc_hid_dim``, ``outputs`` and ``dropout`` are accepted but not used
        by the current implementation.
        """
        super().__init__()

        # Registration order is kept stable (poly, bert, decoder) because it
        # determines the order of parameters and state-dict entries.
        self.poly_encoder = poly_encoder
        self.bert_encoder = bert_encoder
        self.decoder = decoder

    def forward(self, src, mask):
        """Encode ``src`` with the BERT encoder; the rest is still to be written.

        ``src`` and ``mask`` are passed positionally to ``bert_encoder``.
        Always returns ``None`` for now.
        """
        encoder_outputs = self.bert_encoder(src, mask)

        ### YOUR CODE HERE ###
        hidden_state = encoder_outputs[0]  # (bs, seq_len, dim)

        return None

# Model Training

In [39]:
# Training hyperparameters
BATCH_SIZE = 10
LR = 1e-5
WEIGHT_DECAY = 0
N_EPOCHS = 3
CLIP = 1.0

# One DataLoader per split (train_dataset, val_dataset, test_dataset).
# All three share the same collate function; only the training loader shuffles.
_collate = partial(transformer_collate_fn, tokenizer=tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=_collate)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=_collate)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=_collate)

In [91]:
# Smoke test on the first training batch only: decode the first input/target
# pair back to text and print the shape of the encoder's first output.
for batch in train_dataloader:
    sentences, targets, masks = batch

    # Keep special tokens and raw spacing so the decoded text mirrors the ids.
    _decode_options = dict(skip_special_tokens=False, clean_up_tokenization_spaces=False)
    s = tokenizer.decode(sentences[0], **_decode_options)
    t = tokenizer.decode(targets[0], **_decode_options)

    bert_output = bert_model(sentences, masks)
    print(bert_output[0].shape)

    break

torch.Size([10, 18, 768])
