# Load summarization data

In [2]:
import nltk

In [3]:
from datasets import load_dataset
# Load the "billsum" dataset through the Hugging Face `datasets` loader imported above.
dataset = load_dataset("billsum")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

In [5]:
# Pull out the two splits used for training and evaluation in the cells below.
train = dataset['train']
test = dataset['test']

In [6]:
print(type(train['text']))

<class 'datasets.arrow_dataset.Column'>


In [7]:
print(len(train['summary'][0]))

1561


In [8]:
# Marker tokens that `tokenizer` places at the start and end of every token list.
sos_token = '<sos>'
eos_token = '<eos>'

In [9]:
def tokenizer(data,sos_token,eos_token):
    """Tokenize one dataset row and wrap both token lists in marker tokens.

    Args:
        data: Mapping holding the raw strings under the 'text' and 'summary' keys.
        sos_token: Token placed at the front of each token list.
        eos_token: Token placed at the end of each token list.

    Returns:
        Dict with keys 'text_tokens' and 'summary_tokens', each a list of the form
        [sos_token, <nltk word tokens...>, eos_token].
    """
    def wrap(raw):
        # Same wrapping for source text and summary.
        return [sos_token] + nltk.word_tokenize(raw) + [eos_token]

    return {
        'text_tokens': wrap(data['text']),
        'summary_tokens': wrap(data['summary']),
    }

In [10]:
# Extra keyword arguments forwarded to `tokenizer` for every row.
fn_kwargs = dict(sos_token=sos_token, eos_token=eos_token)

# Add the 'text_tokens' / 'summary_tokens' columns to both splits.
train = train.map(tokenizer, fn_kwargs=fn_kwargs)
test = test.map(tokenizer, fn_kwargs=fn_kwargs)

In [11]:
len(train['text_tokens'][1])

2954

In [12]:
def build_vocab(sentences):
    """Assign an integer id to every distinct token, in first-seen order.

    Ids 0 and 1 are reserved for '<unk>' and '<pad>'; new tokens are numbered
    consecutively from 2.

    Args:
        sentences: Iterable of token sequences.

    Returns:
        Dict mapping token -> id.
    """
    vocab = {'<unk>': 0, '<pad>': 1}
    for sentence in sentences:
        for word in sentence:
            # Ids are handed out contiguously, so len(vocab) is always the next free id;
            # setdefault leaves tokens that are already present untouched.
            vocab.setdefault(word, len(vocab))
    return vocab

In [13]:
def pad_seq(seq,pad,max_length):
    """Force `seq` to exactly `max_length` items.

    Longer sequences are cut off at `max_length`; shorter ones are extended on the
    right with copies of `pad`.

    Args:
        seq: List of tokens.
        pad: Filler item appended to short sequences.
        max_length: Target length.

    Returns:
        A new list of length `max_length`.
    """
    shortfall = max_length - len(seq)
    if shortfall < 0:
        # Too long: keep only the leading max_length items.
        return seq[:max_length]
    return seq + [pad] * shortfall

In [14]:
def convert_word2index(vocab,tokens):
    """Translate tokens to their vocabulary ids.

    Args:
        vocab: Dict mapping token -> id; must contain '<unk>' if any token is unknown.
        tokens: Iterable of tokens.

    Returns:
        List of ids, with the '<unk>' id substituted for tokens missing from `vocab`.
    """
    # The '<unk>' lookup happens only for tokens that are actually missing.
    return [vocab[word] if word in vocab else vocab['<unk>'] for word in tokens]

In [15]:
# Work with only the first 1000 tokenized training examples (texts and their summaries).
texts = train['text_tokens'][:1000]
summaries = train['summary_tokens'][:1000]

In [16]:
# NOTE(review): the vocabulary is built from the source texts only, yet the same `vocab` is used
# below to encode the summaries; any summary token that never occurs in `texts` is therefore
# encoded as '<unk>'. Confirm this is intended.
vocab = build_vocab(texts)

In [17]:
# Pad/truncate each source text to 1000 tokens, then map tokens to vocabulary ids.
# This rebinds `texts` from token lists to id lists, so the cell is not safe to re-run on its own.
texts = [convert_word2index(vocab,pad_seq(tokens,'<pad>',1000)) for tokens in texts]

In [18]:
print(type(texts))

<class 'list'>


In [19]:
# Pad/truncate each summary to 500 tokens, then map tokens to vocabulary ids.
# This rebinds `summaries` from token lists to id lists, so the cell is not safe to re-run on its own.
summaries = [convert_word2index(vocab,pad_seq(tokens,'<pad>',500)) for tokens in summaries]

In [20]:
# Tokenized columns of the full test split (no subsetting, unlike the training data).
test_text = test['text_tokens']
test_summary = test['summary_tokens']

In [21]:
# Encode the test split with the training vocabulary, using the same lengths as the training
# data (1000 for texts, 500 for summaries); tokens missing from `vocab` become '<unk>'.
test_text_idxes = [convert_word2index(vocab,pad_seq(tokens,'<pad>',1000)) for tokens in test_text]
test_summary_idxes = [convert_word2index(vocab,pad_seq(tokens,'<pad>',500)) for tokens in test_summary]

# DataLoader


In [22]:
from torch.utils.data import DataLoader,Dataset
import torch
import numpy as np
import random
import tqdm

In [23]:
class TextSummary(Dataset):
    """Map-style dataset pairing index-encoded texts with index-encoded summaries.

    Args:
        text_idxes: Sequence of id lists, one per source text.
        summaries_idxs: Sequence of id lists, one per summary, aligned with `text_idxes`.
    """

    def __init__(self,text_idxes,summaries_idxs):
        self.text_idxes = text_idxes
        self.summaries_idxs = summaries_idxs

    def __len__(self):
        # The number of examples is taken from the source texts.
        return len(self.text_idxes)

    def __getitem__(self, index):
        """Return the (text, summary) pair at `index` as two int64 tensors."""
        source_ids = torch.tensor(self.text_idxes[index], dtype=torch.long)
        summary_ids = torch.tensor(self.summaries_idxs[index], dtype=torch.long)
        return source_ids, summary_ids

    

# Training pairs are reshuffled every epoch so batch composition differs between epochs.
text_summary = TextSummary(text_idxes=texts, summaries_idxs=summaries)
train_dataloader = DataLoader(text_summary, batch_size=16, shuffle=True)

# Evaluation data is NOT shuffled: shuffling brings no benefit at evaluation time, and a fixed
# batch composition keeps the per-batch-averaged validation loss comparable across epochs.
test_text_summary = TextSummary(text_idxes=test_text_idxes, summaries_idxs=test_summary_idxes)
test_dataloader = DataLoader(test_text_summary, batch_size=16, shuffle=False)


# building the model

## encoder

In [24]:

class Encoder(torch.nn.Module):
    """LSTM encoder: embeds token ids and returns the LSTM's final hidden and cell states.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        embedding_dim: Width of each token embedding.
        hidden_dim: LSTM hidden size.
        dropout: Dropout probability, used on the embeddings and passed to the LSTM.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self,input_dim,embedding_dim,hidden_dim,dropout,num_layers):
        super().__init__()
        self.embedding = torch.nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = torch.nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self,X):
        """Encode a batch of id sequences.

        Args:
            X: Tensor of token ids, [batch_size, seq_len].

        Returns:
            (hidden, cell), each [num_layers, batch_size, hidden_dim].
        """
        # [batch_size, seq_len] -> [batch_size, seq_len, embedding_dim]
        embedded = self.embedding(X)
        embedded = self.dropout(embedded)
        # Per-step outputs are discarded; only the final states are handed to the decoder.
        _, final_states = self.lstm(embedded)
        hidden, cell = final_states
        return hidden, cell


## decoder

In [25]:
class Decoder(torch.nn.Module):
    """Single-step LSTM decoder that turns one token per sequence into vocabulary scores.

    Args:
        output_dim: Size of the target vocabulary (embedding rows and output scores).
        embedding_dim: Width of each token embedding.
        hidden_dim: LSTM hidden size.
        dropout: Dropout probability, used on the embeddings and passed to the LSTM.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self,output_dim,embedding_dim,hidden_dim,dropout,num_layers):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = torch.nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
        self.lstm = torch.nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.fc1 = torch.nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self,X,prev_hidden,prev_cell):
        """Advance the decoder by one time step.

        Args:
            X: Current input token ids, [batch_size].
            prev_hidden: Previous hidden state passed to the LSTM.
            prev_cell: Previous cell state passed to the LSTM.

        Returns:
            (prediction, hidden, cell) where prediction is [batch_size, output_dim].
        """
        # Add a length-1 time axis: [batch_size] -> [batch_size, 1]
        step_input = X.unsqueeze(1)
        embedded = self.dropout(self.embedding(step_input))
        step_output, (hidden, cell) = self.lstm(embedded, (prev_hidden, prev_cell))
        # Drop the time axis again: [batch_size, 1, hidden_dim] -> [batch_size, hidden_dim]
        prediction = self.fc1(step_output.squeeze(1))
        return prediction, hidden, cell


# seq2seq

In [26]:
class Seq2Seq(torch.nn.Module):
    """Encoder-decoder wrapper that decodes one step at a time with optional teacher forcing.

    Args:
        encoder: Module returning (hidden, cell) for a batch of source ids.
        decoder: Module with an `output_dim` attribute, called as
            decoder(input_ids, hidden, cell) -> (prediction, hidden, cell).
        device: Device on which the output buffer is allocated.
    """

    def __init__(self,encoder,decoder,device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,X,target,teaching_force_ratio):
        """Produce per-step vocabulary scores for `target`.

        Args:
            X: Source ids, [batch_size, src_len].
            target: Target ids, [batch_size, target_len]; column 0 seeds the decoder.
            teaching_force_ratio: Probability, drawn independently per step, of feeding
                the ground-truth token instead of the decoder's own argmax.

        Returns:
            Tensor [target_len, batch_size, output_dim]; row 0 is never written and
            stays all zeros.
        """
        hidden, cell = self.encoder(X)

        batch_size = target.shape[0]
        target_size = target.shape[1]
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(target_size, batch_size, vocab_size).to(self.device)

        # The first decoder input is the first target column.
        decoder_input = target[:, 0]
        for t in range(1, target_size):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = output
            # One random draw per time step decides the next input for the whole batch.
            if random.random() < teaching_force_ratio:
                decoder_input = target[:, t]
            else:
                decoder_input = output.argmax(1)

        return outputs

# training model

In [27]:
# Encoder and decoder share one vocabulary, so input and output sizes are identical.
input_dim = len(vocab)
output_dim = len(vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
# hidden_dim and num_layers must match between encoder and decoder because the encoder's
# final (hidden, cell) states are fed directly into the decoder LSTM.
hidden_dim = 512
num_layers = 2
encoder_dropout = .5
decoder_dropout = .5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(input_dim=input_dim,embedding_dim = encoder_embedding_dim,hidden_dim=hidden_dim,dropout=encoder_dropout,num_layers=num_layers)
decoder = Decoder(output_dim=output_dim,embedding_dim=decoder_embedding_dim,hidden_dim=hidden_dim,dropout=decoder_dropout,num_layers=num_layers)
# Move the parameters to `device`: the train/eval functions send each batch there, so a model
# left on the CPU would fail with a device mismatch as soon as CUDA is available.
model = Seq2Seq(encoder=encoder,decoder=decoder,device=device).to(device)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "e:\NLP_exercises\text_summarization_encoder_decoder_arch\venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "e:\NLP_exercises\text_summarization_encoder_decoder_arch\venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "e:\NLP_exercises\text_summarization_encoder_decoder_arch\venv\Lib\site-packages\ipyk

In [28]:
def init_weights(m):
    """Overwrite every parameter reachable from `m` with samples from U(-0.08, 0.08).

    Args:
        m: A torch module; all of its parameters (including those of submodules)
            are re-initialised in place.
    """
    low, high = -0.08, 0.08
    for param in m.parameters():
        torch.nn.init.uniform_(param.data, low, high)

# NOTE(review): `Module.apply` calls the function on every submodule as well as on `model` itself,
# and `init_weights` iterates `named_parameters()` of whatever module it receives, so nested
# parameters are presumably re-initialised more than once. Harmless (same distribution) but
# redundant — confirm against the torch docs.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(33728, 256)
    (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(33728, 256)
    (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (fc1): Linear(in_features=512, out_features=33728, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [29]:
def count_parameters(model):
    """Return the total element count of all parameters that have requires_grad=True."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 41,927,616 trainable parameters


In [30]:
import torch
print(torch.__file__)
print(torch.__version__)


e:\NLP_exercises\text_summarization_encoder_decoder_arch\venv\Lib\site-packages\torch\__init__.py
2.3.0+cpu


In [31]:
import torch
# Adam with its default hyper-parameters.
optimizer = torch.optim.Adam(model.parameters())
# Exclude padded target positions from the loss. The id is looked up in the vocabulary rather
# than hard-coded as 1, so it stays correct if the vocabulary layout ever changes.
criterion = torch.nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

In [1]:
def train_fn(model,optimizer,criterian,teaching_force_ratio,data_loader,clip,device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Callable as model(X, y, teaching_force_ratio), returning scores laid out
            as [seq_length, batch_size, output_dim].
        optimizer: Optimizer over the model's parameters.
        criterian: Loss taking (scores [N, output_dim], targets [N]).
        teaching_force_ratio: Forwarded unchanged to the model.
        data_loader: Sized iterable of (X_batch, y_batch) pairs, with y_batch laid out
            as [batch_size, seq_length].
        clip: Maximum gradient norm applied before each optimizer step.
        device: Device that inputs and targets are moved to.

    Returns:
        Sum of the batch losses divided by len(data_loader).
    """
    epoch_loss = 0
    model.train()
    for X_batch,y_batch in data_loader:
        # Targets must live on the same device as the scores they are compared with.
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        output = model(X_batch,y_batch,teaching_force_ratio)
        # output [seq_length,batch_size,output_dim]

        # Drop step 0 (the start-of-sequence position) and flatten. The scores are
        # time-major, so the batch-major targets are transposed first; otherwise
        # row i of the scores would be compared with an unrelated target token.
        output = output[1:, :, :].reshape(-1, output.shape[-1])
        targets = y_batch[:, 1:].t().reshape(-1)

        loss = criterian(output,targets)
        loss.backward()
        # Clip BEFORE stepping: clipping after the update has no effect on it.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss/len(data_loader)

In [2]:
def evalute_fn(model,test_loader,criterian,device):
    """Evaluate the model without teacher forcing and return the mean per-batch loss.

    Args:
        model: Callable as model(X, y, teaching_force_ratio), returning scores laid out
            as [seq_length, batch_size, output_dim].
        test_loader: Sized iterable of (X_batch, y_batch) pairs, with y_batch laid out
            as [batch_size, seq_length].
        criterian: Loss taking (scores [N, output_dim], targets [N]).
        device: Device that inputs and targets are moved to.

    Returns:
        Sum of the batch losses divided by len(test_loader).
    """
    epoch_loss = 0
    model.eval()
    with torch.inference_mode():
        for X_batch,y_batch in test_loader:
            # Targets must live on the same device as the scores they are compared with.
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            # Teacher-forcing ratio 0: the model always consumes its own predictions.
            output = model(X_batch,y_batch,0)

            # Drop step 0 and flatten. The scores are time-major, so the batch-major
            # targets are transposed first to keep score rows and targets aligned.
            output = output[1:, :, :].reshape(-1, output.shape[-1])
            targets = y_batch[:, 1:].t().reshape(-1)
            loss = criterian(output,targets)
            epoch_loss += loss.item()

    # Average over the number of batches (dividing by the loader object is a TypeError).
    return epoch_loss/len(test_loader)

In [3]:
# Requires the imports cell (`tqdm`, `np`, `torch`) and all model/data cells to have been run
# in the current kernel session.
epochs = 10
teaching_force_ratio = .5
clip =1
best_valid_loss = float('inf')
for epoch in tqdm.tqdm(range(epochs)):
    train_loss = train_fn(model,optimizer,criterion,teaching_force_ratio,train_dataloader,clip,device)
    test_loss = evalute_fn(model,test_dataloader,criterion,device)
    # Checkpoint only when the validation loss improves, and remember the new best value
    # (previously the best value was never updated, so every epoch overwrote the checkpoint).
    if test_loss < best_valid_loss:
        best_valid_loss = test_loss
        torch.save(model.state_dict(), "text_summarization.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    # Report this epoch's validation loss, not the running best.
    print(f"\tValid Loss: {test_loss:7.3f} | Valid PPL: {np.exp(test_loss):7.3f}")

NameError: name 'tqdm' is not defined

In [None]:
def Index2Word(vocab):
    """Invert a token -> id mapping into an id -> token mapping.

    Args:
        vocab: Dict mapping token -> id.

    Returns:
        Dict mapping id -> token; if several tokens share an id, the one that comes
        last in `vocab` wins.
    """
    return {idx: word for word, idx in vocab.items()}

In [None]:
index2word = Index2Word(vocab)

In [None]:
index2word[2]

'<sos>'

In [None]:
def summarize_fn(model,sentence,vocab,max_length,index2word):
    """Greedily decode a summary for `sentence`.

    Args:
        model: Module exposing `encoder(ids) -> (hidden, cell)` and
            `decoder(ids, hidden, cell) -> (prediction, hidden, cell)`.
        sentence: Raw input text; tokenized with nltk and wrapped in the module-level
            `sos_token` / `eos_token`.
        vocab: Dict mapping token -> id; must contain the sos/eos tokens and '<unk>'.
        max_length: Maximum number of decoding steps.
        index2word: Dict mapping id -> token, used to render the predictions.

    Returns:
        List of predicted tokens. Decoding stops after `max_length` tokens or right
        after the end-of-sequence token, which is included in the result.
    """
    tokens = [sos_token] + nltk.word_tokenize(sentence) + [eos_token]
    indexes = convert_word2index(vocab,tokens)

    # Build the tensors on whatever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = model.training
    model.eval()  # disable dropout while generating
    outputs = []
    try:
        with torch.no_grad():
            # The encoder LSTM is batch_first, so add the batch axis in front: [1, seq_len].
            source = torch.tensor(indexes,dtype=torch.long,device=device).unsqueeze(0)
            hidden,cell = model.encoder(source)

            decoder_input = torch.tensor([vocab[sos_token]],dtype=torch.long,device=device)
            for _ in range(max_length):
                # Thread the recurrent state through every step.
                prediction,hidden,cell = model.decoder(decoder_input,hidden,cell)
                predicted_idx = prediction.argmax(-1).item()
                outputs.append(index2word[predicted_idx])
                if predicted_idx == vocab[eos_token]:
                    break
                decoder_input = torch.tensor([predicted_idx],dtype=torch.long,device=device)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    return outputs

