# Load summarization data

In [1]:
import nltk

In [2]:
from datasets import load_dataset
# Load the "billsum" dataset. The displayed result below is a DatasetDict with
# train, test and ca_test splits, each exposing text, summary and title columns.
dataset = load_dataset("billsum")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Display the DatasetDict to inspect its splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

In [4]:
# Keep only the train and test splits; ca_test is not used in this notebook section.
train, test = dataset['train'], dataset['test']

In [5]:
# Check which type column access on the train split returns.
print(type(train['text']))

<class 'datasets.arrow_dataset.Column'>


In [6]:
# Length of the first training summary (still untokenized here, so presumably a
# character count of the raw string).
print(len(train['summary'][0]))

1561


In [7]:
# Marker tokens placed at the start and end of every tokenized sequence.
sos_token, eos_token = '<sos>', '<eos>'

In [8]:
def tokenizer(data, sos_token, eos_token):
    """Tokenize one example's text and summary and add boundary markers.

    Args:
        data: Mapping with 'text' and 'summary' entries to tokenize.
        sos_token: Token placed in front of each token list.
        eos_token: Token appended to each token list.

    Returns:
        Dict with 'text_tokens' and 'summary_tokens', each a list of the form
        [sos_token, <nltk word tokens...>, eos_token].
    """
    def wrap(raw):
        # Word-level tokenization followed by start/end markers.
        return [sos_token, *nltk.word_tokenize(raw), eos_token]

    return {
        'text_tokens': wrap(data['text']),
        'summary_tokens': wrap(data['summary']),
    }

In [9]:
# Extra keyword arguments forwarded to `tokenizer` for every example.
fn_kwargs = dict(sos_token=sos_token, eos_token=eos_token)

# Add 'text_tokens' / 'summary_tokens' columns to both splits.
train = train.map(tokenizer, fn_kwargs=fn_kwargs)
test = test.map(tokenizer, fn_kwargs=fn_kwargs)

In [10]:
# Number of tokens (including the <sos>/<eos> markers) in the second training text.
len(train['text_tokens'][1])

2954

In [11]:
def build_vocab(sentences):
    """Build a token -> index mapping from an iterable of token sequences.

    Indices 0 and 1 are reserved for '<unk>' and '<pad>'. Every other token
    receives the next free index in order of first appearance.

    Args:
        sentences: Iterable of token sequences.

    Returns:
        Dict mapping each distinct token to a unique integer index.
    """
    vocab = {'<unk>': 0, '<pad>': 1}
    for sentence in sentences:
        for word in sentence:
            # len(vocab) is always the next unused index; tokens already
            # present keep the index they were first given.
            vocab.setdefault(word, len(vocab))
    return vocab

In [12]:
def pad_seq(seq, pad, max_length):
    """Force a list to exactly `max_length` items.

    Longer sequences are cut off at `max_length`; shorter ones are extended
    on the right with copies of `pad`.

    Args:
        seq: List to truncate or pad.
        pad: Filler value appended when `seq` is too short.
        max_length: Target length.

    Returns:
        A new list of length `max_length`.
    """
    missing = max_length - len(seq)
    if missing < 0:
        return seq[:max_length]
    return seq + [pad] * missing

In [13]:
def convert_word2index(vocab, tokens):
    """Translate tokens into vocabulary indices.

    Args:
        vocab: Mapping from token to integer index; must contain '<unk>' if
            any token is missing from it.
        tokens: Iterable of tokens to look up.

    Returns:
        List of indices, using the '<unk>' index for tokens not in `vocab`.
    """
    return [
        vocab[token] if token in vocab else vocab['<unk>']
        for token in tokens
    ]

In [14]:
# Tokenized articles and their tokenized summaries from the training split.
texts, summaries = train['text_tokens'], train['summary_tokens']

In [15]:
# The vocabulary is built from the training article tokens only; any summary or
# test token that never appears in a training article is later encoded as '<unk>'.
vocab = build_vocab(texts)

In [16]:
# Pad/truncate every training article to 2000 tokens, then map tokens to indices.
# NOTE(review): this overwrites `texts` with its own output. Re-running this cell
# without first re-running the cell that assigns `texts` from train['text_tokens']
# would look up integers in `vocab` and turn every position into '<unk>'.
texts = [
    convert_word2index(vocab, pad_seq(article_tokens, '<pad>', 2000))
    for article_tokens in texts
]

In [17]:
# Confirm that the encoded texts are now a plain Python list.
print(type(texts))

<class 'list'>


In [18]:
# Pad/truncate every training summary to 1000 tokens, then map tokens to indices.
# NOTE(review): this overwrites `summaries` with its own output, so the cell is
# not safe to re-run without re-running the cell that assigns `summaries` from
# train['summary_tokens'] first.
summaries = [
    convert_word2index(vocab, pad_seq(summary_tokens, '<pad>', 1000))
    for summary_tokens in summaries
]

In [28]:
# Tokenized articles and summaries from the test split.
test_text, test_summary = test['text_tokens'], test['summary_tokens']

In [29]:
# Encode the test split with the same lengths used for training
# (2000 tokens per article, 1000 per summary) and the training vocabulary.
test_text_idxes = [
    convert_word2index(vocab, pad_seq(article_tokens, '<pad>', 2000))
    for article_tokens in test_text
]
test_summary_idxes = [
    convert_word2index(vocab, pad_seq(summary_tokens, '<pad>', 1000))
    for summary_tokens in test_summary
]

# DataLoader


In [37]:
from torch.utils.data import DataLoader,Dataset
import torch
import numpy as np
import random

In [31]:
class TextSummary(Dataset):
    """Dataset pairing index-encoded texts with index-encoded summaries.

    Item `i` is the pair (text_idxes[i], summaries_idxs[i]), each converted to
    a `torch.long` tensor on access.
    """

    def __init__(self, text_idxes, summaries_idxs):
        """Store the two parallel sequences of index lists."""
        self.text_idxes, self.summaries_idxs = text_idxes, summaries_idxs

    def __len__(self):
        """Number of examples, taken from the text sequence."""
        return len(self.text_idxes)

    def __getitem__(self, index):
        """Return the (text, summary) tensors for position `index`."""
        source = torch.tensor(self.text_idxes[index], dtype=torch.long)
        target = torch.tensor(self.summaries_idxs[index], dtype=torch.long)
        return source, target

    

# Wrap the encoded (text, summary) pairs of each split in a TextSummary dataset.
text_summary = TextSummary(texts, summaries)
test_text_summary = TextSummary(test_text_idxes, test_summary_idxes)

# Both loaders yield shuffled batches of 32 examples.
# NOTE(review): the test loader is shuffled as well — confirm that is intended.
train_dataloader = DataLoader(dataset=text_summary, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_text_summary, batch_size=32, shuffle=True)


# building the model

## encoder

In [32]:

class encoder(torch.nn.Module):
    """Embedding + LSTM encoder that exposes only the final recurrent state."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, dropout, num_layers):
        """Create the layers.

        Args:
            input_dim: Number of rows in the embedding table.
            embedding_dim: Width of each embedding vector (LSTM input size).
            hidden_dim: LSTM hidden size.
            dropout: Dropout probability, used on the embeddings and passed to
                the LSTM's `dropout` argument.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.lstm = torch.nn.LSTM(
            embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, X):
        """Encode `X` and return the LSTM's final (hidden, cell) state.

        The LSTM is created without `batch_first`, so `X` is presumably laid
        out as (sequence, batch) — TODO confirm against the training loop.
        """
        embedded = self.embedding(X)
        embedded = self.dropout(embedded)
        # Per-step outputs are discarded; only the final state is returned.
        _, (h_n, c_n) = self.lstm(embedded)
        return h_n, c_n


## decoder

In [35]:
class decoder(torch.nn.Module):
    """Single-step LSTM decoder: one token per batch element in, one score row out.

    Fixes relative to the previous version:
      * the embedding layer is stored as `self.embedding`, the name `forward`
        reads (it was stored as `self.embbedding`, so `forward` raised
        AttributeError);
      * the incoming (hidden, cell) state is actually fed to the LSTM instead
        of being ignored;
      * the output projection takes `hidden_dim` features, which is the width
        of the LSTM output, instead of `embedding_dim`.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, dropout, num_layers):
        """Create the layers.

        Args:
            output_dim: Size of the target vocabulary (embedding rows and
                number of prediction scores).
            embedding_dim: Width of each embedding vector (LSTM input size).
            hidden_dim: LSTM hidden size.
            dropout: Dropout probability, used on the embeddings and passed to
                the LSTM's `dropout` argument.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.output_dim = output_dim
        self.embedding = torch.nn.Embedding(output_dim, embedding_dim)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, dropout=dropout)
        # The LSTM emits hidden_dim features per step, so project from hidden_dim.
        self.fc1 = torch.nn.Linear(hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, X, prev_hidden, prev_cell):
        """Run one decoding step.

        Args:
            X: Token ids for the current step; assumed to be a 1-D tensor with
                one id per batch element — TODO confirm against callers.
            prev_hidden: LSTM hidden state to continue from.
            prev_cell: LSTM cell state to continue from.

        Returns:
            Tuple (hidden, cell, prediction): the updated LSTM state followed
            by the scores over `output_dim` entries for each batch element.
        """
        # Add a leading length-1 axis so the LSTM sees a one-step sequence.
        embedding_input = self.dropout(self.embedding(X.unsqueeze(0)))
        outputs, (hidden, cell) = self.lstm(embedding_input, (prev_hidden, prev_cell))
        # Drop the length-1 step axis before projecting to vocabulary scores.
        prediction = self.fc1(outputs.squeeze(0))
        return hidden, cell, prediction


# seq2seq

In [38]:
class seq2seq(torch.nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Fixes relative to the previous version:
      * the loop runs over `range(1, target_size)` (it called `len()` on an
        int, which raised TypeError);
      * `argmax` is spelled correctly (was `argemax`);
      * the decoder result is unpacked as (hidden, cell, output), which is the
        order the `decoder` class in this notebook returns.
    """

    def __init__(self, encoder, decoder, device):
        """Store the sub-modules and the device used for the output buffer.

        Args:
            encoder: Module whose call returns a (hidden, cell) pair.
            decoder: Module with an `output_dim` attribute whose call takes
                (input, hidden, cell) and returns (hidden, cell, prediction).
            device: Device on which the output tensor is allocated.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, X, target, teaching_force_ratio):
        """Decode `target.shape[0] - 1` steps starting from the encoder state.

        Args:
            X: Source batch passed unchanged to the encoder.
            target: Target token ids, indexed as target[step, batch_element].
            teaching_force_ratio: Probability, per step, of feeding the
                ground-truth token instead of the model's own top prediction.

        Returns:
            Tensor of shape (target.shape[0], target.shape[1], output_dim).
            Row 0 is left as zeros; row t holds the decoder scores of step t.
        """
        hidden, cell = self.encoder(X)
        target_size, batch_size = target.shape[0], target.shape[1]
        output_dim = self.decoder.output_dim
        outputs = torch.zeros(target_size, batch_size, output_dim, device=self.device)

        # The first decoder input is the first target row.
        decoder_input = target[0, :]
        for t in range(1, target_size):
            hidden, cell, output = self.decoder(decoder_input, hidden, cell)
            outputs[t] = output
            teaching_force = random.random() < teaching_force_ratio
            # Highest-scoring vocabulary index per batch element.
            top1 = output.argmax(1)
            decoder_input = target[t] if teaching_force else top1

        return outputs