In [1]:
!pip3 install torch torchvision



In [2]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from spacy.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split
import spacy
import pandas as pd
import numpy as np
import os
import re
from nltk.corpus import stopwords 
import random
from tqdm import tqdm
import math



In [3]:
# Tokenizer using spacy
nlp = spacy.load("en_core_web_sm")
tokenizer = Tokenizer(nlp.vocab)

In [4]:
# Add data from files into dataframe for easier access
def create_dataframe(source_text_path,target_text_path):
    txt_files_source = [file for file in os.listdir(source_text_path) if file.endswith('.txt')]
    txt_files_target = [file for file in os.listdir(target_text_path) if file.endswith('.txt')]
    df = pd.DataFrame(columns=['headlines','text'])
    for source,target in zip(txt_files_source,txt_files_target):
        assert source==target
        source_file_path = os.path.join(source_text_path, source)
        target_file_path = os.path.join(target_text_path, target)
        # Read the content of the file
        with open(source_file_path,'r',encoding='latin-1') as file:
            source_text = file.read()
        with open(target_file_path,'r',encoding='latin-1') as file:
            target_text = file.read()
        df.loc[len(df.index)] = [source_text,target_text]
    return df

In [5]:
# Check accuracy function
def check_accuracy(output,labels):
    _ , predpos = output.max(1)
    num_samples=len(labels)
    num_correct=(predpos==labels).sum()
    return (num_correct/num_samples)*100

# Save checkpoint
def save_checkpoint(state,filename='weights.pth.tar'):
    print('Saving weights-->')
    torch.save(state,filename)

# Load checkpoint
def load_checkpoint(checkpoint,model,optim):
    print('Loading weights-->')
    model.load_state_dict(checkpoint['state_dict'])
    optim.load_state_dict(checkpoint['optimizer'])

In [6]:
df1 = create_dataframe("/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/business","/kaggle/input/bbc-news-summary/BBC News Summary/Summaries/business")
df2 = create_dataframe("/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/entertainment","/kaggle/input/bbc-news-summary/BBC News Summary/Summaries/entertainment")
df3 = create_dataframe("/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/politics","/kaggle/input/bbc-news-summary/BBC News Summary/Summaries/politics")
df4 = create_dataframe("/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/sport","/kaggle/input/bbc-news-summary/BBC News Summary/Summaries/sport")
df5 = create_dataframe("/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/tech","/kaggle/input/bbc-news-summary/BBC News Summary/Summaries/tech")

In [7]:
df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

In [8]:
# Split into train and test sets
df = df.rename(columns = {"headlines":"source_text","text":"summary_text"})
X,Y = df["source_text"],df["summary_text"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
train_df = pd.DataFrame({'source_text': X_train, 'summary_text': Y_train})
test_df = pd.DataFrame({'source_text': X_test, 'summary_text': Y_test})

In [9]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}


stop_words = set(stopwords.words('english'))

In [10]:
def text_cleaner(text):
    newString = text.lower()
    newString = newString.replace('"', "'")
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"','', newString)
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])    
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    tokens = [w for w in newString.split() if not w in stop_words]
    return " ".join(tokens)

In [11]:
# Tokenize and lowercase text using spacy
train_df['source_text'] = train_df['source_text'].apply(lambda x: [token.text.lower() for token in tokenizer(text_cleaner(x))])
train_df['summary_text'] = train_df['summary_text'].apply(lambda x: [token.text.lower() for token in tokenizer(text_cleaner(x))])

test_df['source_text'] = test_df['source_text'].apply(lambda x: [token.text.lower() for token in tokenizer(text_cleaner(x))])
test_df['summary_text'] = test_df['summary_text'].apply(lambda x: [token.text.lower() for token in tokenizer(text_cleaner(x))])

In [12]:
# Add START AND END tokens to summary
train_df['source_text'] = train_df['source_text'].apply(lambda x : ['_START_']+ x + ['_END_'])
train_df['summary_text'] = train_df['summary_text'].apply(lambda x : ['_START_']+ x + ['_END_'])

test_df['source_text'] = test_df['source_text'].apply(lambda x : ['_START_']+ x + ['_END_'])
test_df['summary_text'] = test_df['summary_text'].apply(lambda x : ['_START_']+ x + ['_END_'])

In [13]:
train_df.head()

Unnamed: 0,source_text,summary_text
1490,"[_START_, ferguson, fears, milan, cutting, edg...","[_START_, loss, could, worse, quality, bring, ..."
2001,"[_START_, ask, jeeves, joins, web, log, market...","[_START_, jim, lanzone, vice, president, searc..."
1572,"[_START_, safin, cool, wimbledon, newly, crown...","[_START_, expect, sampras, favourite, pressure..."
1840,"[_START_, mobiles, rack, years, use, mobile, p...","[_START_, cellnet, vodafone, mobile, phone, op..."
610,"[_START_, eminem, secret, gig, venue, revealed...","[_START_, fourth, album, rap, star, sale, two,..."


In [14]:
# Build vocabularies - each word has an index, note : words sorted in ascending order
all_tokens = train_df['source_text'].tolist() + train_df['summary_text'].tolist() + test_df['source_text'].tolist() + test_df['summary_text'].tolist()
source_vocab = {actual_word: idx for idx, (word_num, actual_word) in enumerate(sorted(enumerate(set(token for tokens in all_tokens for token in tokens)), key=lambda x: x[1]))}
target_vocab = {actual_word: idx for idx, (word_num, actual_word) in enumerate(sorted(enumerate(set(token for tokens in all_tokens for token in tokens)), key=lambda x: x[1]))}

In [15]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using",device)

Using cuda


In [16]:
temp = list(sorted(source_vocab.items()))
for word, idx in temp[-5:]:
    print(word,idx)

zuluaga 27632
zurich 27633
zutons 27634
zvonareva 27635
zvyagintsev 27636


In [17]:
# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, source_texts, target_summaries, source_vocab, target_vocab):
        self.source_texts = source_texts
        self.target_summaries = target_summaries
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        source_text = [self.source_vocab[word] for word in self.source_texts[idx]]
        target_summary = [self.target_vocab[word] for word in self.target_summaries[idx]]
        return torch.tensor(source_text), torch.tensor(target_summary)

In [18]:
# Create custom datasets
train_dataset = CustomDataset(train_df['source_text'].tolist(), train_df['summary_text'].tolist(),source_vocab, target_vocab)
test_dataset = CustomDataset(test_df['source_text'].tolist(), test_df['summary_text'].tolist(),source_vocab, target_vocab)

In [20]:
def get_max_seqlen():
    max_length = 0
    for index, row in train_df.iterrows():
        # Calculate the length of the current row
        row_length = len(row['source_text'])
        # Update the maximum length if the current row length is greater
        max_length = max(max_length, row_length)
    for index, row in test_df.iterrows():
        # Calculate the length of the current row
        row_length = len(row['source_text'])
        # Update the maximum length if the current row length is greater
        max_length = max(max_length, row_length)
    print("Max length in dataset ",max_length)
    return max_length

In [21]:
'''
Note : 
In PyTorch, the `collate_fn` parameter in the `DataLoader` can be either a function or an object of a class. Both approaches are valid, and the choice depends on your preference and the complexity of your collation logic.

1. Function as `collate_fn`:
def my_collate_fn(batch):
    # Your custom collation logic here
    return processed_batch
# Use the function with DataLoader
train_loader = DataLoader(dataset, batch_size=64, collate_fn=my_collate_fn)

2. Class as `collate_fn`:
class MyCollateClass:
    def __call__(self, batch):
        # Your custom collation logic here
        return processed_batch
# Instantiate the class and use it with DataLoader
my_collate_instance = MyCollateClass()
train_loader = DataLoader(dataset, batch_size=64, collate_fn=my_collate_instance)

Using a class allows you to maintain state between batches if needed, as the class instance retains its state between calls. This can be beneficial if your collation logic requires some persistent information.

The key point is that the `collate_fn` parameter should be a callable (a function or an object with a `__call__` method) that takes a list of batch data and returns the processed batch. The processing typically involves padding sequences, converting data types, or any other necessary steps to prepare the batch for the model.
'''

'\nNote : \nIn PyTorch, the `collate_fn` parameter in the `DataLoader` can be either a function or an object of a class. Both approaches are valid, and the choice depends on your preference and the complexity of your collation logic.\n\n1. Function as `collate_fn`:\ndef my_collate_fn(batch):\n    # Your custom collation logic here\n    return processed_batch\n# Use the function with DataLoader\ntrain_loader = DataLoader(dataset, batch_size=64, collate_fn=my_collate_fn)\n\n2. Class as `collate_fn`:\nclass MyCollateClass:\n    def __call__(self, batch):\n        # Your custom collation logic here\n        return processed_batch\n# Instantiate the class and use it with DataLoader\nmy_collate_instance = MyCollateClass()\ntrain_loader = DataLoader(dataset, batch_size=64, collate_fn=my_collate_instance)\n\nUsing a class allows you to maintain state between batches if needed, as the class instance retains its state between calls. This can be beneficial if your collation logic requires some pe

In [22]:
# Define collate function for DataLoader
def collate_fn(batch):
    sources, targets = zip(*batch)
    padded_sources = pad_sequence(sources, batch_first=True)
    padded_targets = pad_sequence(targets, batch_first=True)
    return padded_sources, padded_targets

In [23]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(MultiHeadAttention,self).__init__()
        assert embedding_dim % num_heads == 0, "embedding_dim must be divisible by num_heads"

        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.dim_perhead = embedding_dim // num_heads

        self.W_q = nn.Linear(embedding_dim, embedding_dim)
        self.W_k = nn.Linear(embedding_dim, embedding_dim)
        self.W_v = nn.Linear(embedding_dim, embedding_dim)
        self.W_o = nn.Linear(embedding_dim, embedding_dim)

    def scaled_dot_product_attention(self,Q,K,V,mask=None):
        # Q,K,V Shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        
        K = K.transpose(-2,-1) # K = K.permute(0,1,3,2) also works
        # K Shape(after permute) : [Batch_Size X Num_Heads X Dim Per Head X Seq_len]
        attn_scores = torch.matmul(Q,K) / math.sqrt(self.dim_perhead)
        # attn_scores Shape : [Batch_Size X Num_Heads X Seq_len X Seq_len]
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        # attn_probs Shape : [Batch_Size X Num_Heads X Seq_len X Seq_len]
        output = torch.matmul(attn_probs, V)
        # output Shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        return output

    def split_heads(self, x):
        # X shape : [Batch_Size X Seq_len X Embedding Dim]
        batch_size, seq_length, d_model = x.size()
        x = x.view(batch_size, seq_length,self.num_heads,self.dim_perhead)
        # X shape : [Batch_Size X Seq_len X Num_Heads X Dim Per Head]
        x = x.transpose(1,2)
        # X shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        return x

    def combine_heads(self, x):
        # x Shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        batch_size, _, seq_length, dim_perhead = x.size()
        x = x.transpose(1,2).contiguous()
        # x Shape : [Batch_Size X Seq_len X Num_Heads X Dim Per Head]
        x = x.view(batch_size, seq_length,self.embedding_dim)
        # x Shape : [Batch_Size X Seq_len X Embedding Dim]
        return x

    def forward(self, Q, K, V, mask=None):
        # Q,K,V Shape : [Batch_Size X Seq_len X Embedding Dim]
        Q = self.split_heads(self.W_q(Q)) 
        K = self.split_heads(self.W_k(K)) 
        V = self.split_heads(self.W_v(V)) 
        # Q,K,V Shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        # attn_output Shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        output = self.W_o(self.combine_heads(attn_output))
        # output Shape :  # x Shape : [Batch_Size X Seq_len X Embedding Dim]
        return output

In [24]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # shape does not change here
        return self.fc2(F.relu(self.fc1(x)))


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length,dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model,2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe',pe.unsqueeze(0))

    def forward(self, x):
        # shape does not change here, adding positional encoding information
        return x + self.pe[:, :x.size(1)]

In [25]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # x shape [Batch_Size X Seq_len X Embedding Dim]
        attn_output = self.self_attn(x, x, x,mask)
        # attn_output shape [Batch_Size X Seq_len X Embedding Dim]
        x = self.norm1(x + self.dropout(attn_output))
        # x shape [Batch_Size X Seq_len X Embedding Dim]
        ff_output = self.feed_forward(x)
        # ff_output shape [Batch_Size X Seq_len X Embedding Dim]
        x = self.norm2(x + self.dropout(ff_output))
        # x shape [Batch_Size X Seq_len X Embedding Dim]
        return x


class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x,tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x,enc_output,enc_output,src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [27]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length,device=device), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [28]:
src_vocab_size = len(source_vocab)
tgt_vocab_size = len(target_vocab)
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = get_max_seqlen()
dropout = 0.1
num_workers = 2
num_epochs = 10

model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)
print(model)

Max length in dataset  1986
Transformer(
  (encoder_embedding): Embedding(27637, 512)
  (decoder_embedding): Embedding(27637, 512)
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-5): 6 x DecoderL

In [29]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(trainable_params)

86616565


In [30]:
# Specify optimizer and loss function
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0)

In [31]:
# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn, num_workers=num_workers)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn, num_workers=num_workers)

In [32]:
source_dummy,target_dummy = next(iter(train_loader))

In [33]:
print(source_dummy.shape,target_dummy.shape)

torch.Size([4, 167]) torch.Size([4, 71])


In [34]:
print(source_dummy[1])

tensor([    1, 19619, 27499,  1048, 10077,  3514, 17903, 19619, 10575,  8715,
        24938,  6192, 26986,  4867, 27314,  5754,  5549,  4032, 27479, 17131,
         4974,  8441, 23436, 15044,  9673,  5522, 26129, 18915, 14542, 15038,
         1192, 19531, 17903, 27338, 11765,  1443, 10077, 24509, 21344, 27578,
        11865, 18199, 25639,  1498, 27299, 26986,   151, 27316, 27201, 16221,
         4937, 15038, 25249, 19619, 16326, 14881,  6200, 25503,  6121, 24509,
        16792,  3118, 24509, 15499, 11141, 27488, 21344, 24509, 27338, 25794,
        19619, 17252,  8441, 27338,  8875, 17903, 24509, 21344,  8397,  5754,
         5549,  4028, 20345,  1498,  2042, 12315, 23221,  1089,   209, 24509,
         2792, 15600, 26161,  1302, 25794, 19084, 19619, 27314,  5754,  5549,
         4028, 15858, 13934, 27479,  8441, 12461,    78, 10077, 24509,  3173,
         3226,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [35]:
print(torch.min(target_dummy),torch.max(target_dummy))

tensor(0) tensor(27578)


In [38]:
model.to(device)
source_dummy = source_dummy.to(device)
target_dummy = target_dummy.to(device)
print()




In [39]:
y_pred = model(source_dummy,target_dummy)
print(y_pred.shape,target_dummy.shape)

torch.Size([4, 71, 27637]) torch.Size([4, 71])


In [40]:
y_pred = y_pred.reshape(-1,len(target_vocab))
target_dummy = target_dummy.reshape(-1)
print(y_pred.shape,target_dummy.shape)

torch.Size([284, 27637]) torch.Size([284])


In [41]:
def train_loop(model,dataloader,loss_fun,optimizer,device):
    model.train()
    model.to(device)
    min_loss = None
    for epoch in range(num_epochs):
        losses = []
        accuracies = []
        loop = tqdm(enumerate(dataloader), total=len(dataloader), leave=True)
        for batch,(x,y) in loop:
            # put on cuda
            x = x.to(device)
            y = y.to(device)
    
            # forward pass
            y_pred = model(x,y)
            
            # calculate loss & accuracy
            loss = loss_fun(y_pred.reshape(-1,len(target_vocab)),y.reshape(-1))
            losses.append(loss.detach().item())
            
            accuracy = check_accuracy(y_pred.reshape(-1,len(target_vocab)),y.reshape(-1))
            accuracies.append(accuracy.detach().item())
            
            # zero out prior gradients
            optimizer.zero_grad()
            
            # backprop
            loss.backward()
            
            # update weights
            optimizer.step()
            scheduler.step()
            
            # Update TQDM progress bar
            loop.set_description(f"Epoch [{epoch}/{num_epochs}] ")
            loop.set_postfix(loss=loss.detach().item(), accuracy=accuracy.detach().item())

        moving_loss = sum(losses) / len(losses)
        moving_accuracy = sum(accuracies) / len(accuracies)
        checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
        # Save check point
        if min_loss == None:
            min_loss = moving_loss
            save_checkpoint(checkpoint)
        elif moving_loss < min_loss:
            min_loss = moving_loss
            save_checkpoint(checkpoint)
        print('Epoch {0} : Loss = {1} , Training Accuracy={2}'.format(epoch, moving_loss, moving_accuracy))

In [42]:
train_loop(model,train_loader,criterion,optimizer,device)/

Epoch [0/20] : 100%|██████████| 445/445 [00:46<00:00,  9.57it/s, accuracy=30.5, loss=5.09]


Saving weights-->
Epoch 0 : Loss = 6.908239778240075 , Training Accuracy=22.35531384620104


Epoch [1/20] : 100%|██████████| 445/445 [00:46<00:00,  9.66it/s, accuracy=52.6, loss=3.74]


Saving weights-->
Epoch 1 : Loss = 3.831281638949105 , Training Accuracy=45.11859307449855


Epoch [2/20] : 100%|██████████| 445/445 [00:45<00:00,  9.70it/s, accuracy=52.5, loss=2.21]


Saving weights-->
Epoch 2 : Loss = 2.6866609562648818 , Training Accuracy=52.25829454914907


Epoch [3/20] : 100%|██████████| 445/445 [00:45<00:00,  9.81it/s, accuracy=69.9, loss=2.17]


Saving weights-->
Epoch 3 : Loss = 2.0379160607798714 , Training Accuracy=57.111336024423665


Epoch [4/20] : 100%|██████████| 445/445 [00:46<00:00,  9.65it/s, accuracy=48.3, loss=1.7]  


Saving weights-->
Epoch 4 : Loss = 1.6177765642659048 , Training Accuracy=58.71403843740399


Epoch [5/20] : 100%|██████████| 445/445 [00:45<00:00,  9.68it/s, accuracy=57.7, loss=1.35] 


Saving weights-->
Epoch 5 : Loss = 1.3235397495580523 , Training Accuracy=60.27891213492061


Epoch [6/20] : 100%|██████████| 445/445 [00:45<00:00,  9.70it/s, accuracy=56.9, loss=0.614]


Saving weights-->
Epoch 6 : Loss = 1.1019645950767432 , Training Accuracy=62.13116863122147


Epoch [7/20] : 100%|██████████| 445/445 [00:46<00:00,  9.67it/s, accuracy=70.2, loss=0.748]


Saving weights-->
Epoch 7 : Loss = 0.9267714869440271 , Training Accuracy=62.74302706772022


Epoch [8/20] : 100%|██████████| 445/445 [00:46<00:00,  9.63it/s, accuracy=76.6, loss=1.05] 


Saving weights-->
Epoch 8 : Loss = 0.784919796331545 , Training Accuracy=63.98580661302202


Epoch [9/20] : 100%|██████████| 445/445 [00:46<00:00,  9.66it/s, accuracy=60.5, loss=1.27] 


Saving weights-->
Epoch 9 : Loss = 0.6648119542036164 , Training Accuracy=64.68886709749029


Epoch [10/20] : 100%|██████████| 445/445 [00:46<00:00,  9.66it/s, accuracy=59.7, loss=0.258]


Saving weights-->
Epoch 10 : Loss = 0.5669622163759189 , Training Accuracy=64.90695223433248


Epoch [11/20] : 100%|██████████| 445/445 [00:45<00:00,  9.76it/s, accuracy=69.3, loss=0.437]


Saving weights-->
Epoch 11 : Loss = 0.47678658459963424 , Training Accuracy=66.56710979590255


Epoch [12/20] : 100%|██████████| 445/445 [00:45<00:00,  9.73it/s, accuracy=57.4, loss=0.574]


Saving weights-->
Epoch 12 : Loss = 0.40671895399187386 , Training Accuracy=67.30556015700436


Epoch [13/20] : 100%|██████████| 445/445 [00:45<00:00,  9.76it/s, accuracy=62.5, loss=0.472] 


Saving weights-->
Epoch 13 : Loss = 0.3395778112699476 , Training Accuracy=67.07324917396802


Epoch [14/20] : 100%|██████████| 445/445 [00:45<00:00,  9.75it/s, accuracy=84.7, loss=0.265] 


Saving weights-->
Epoch 14 : Loss = 0.2860099634720703 , Training Accuracy=67.17069450549864


Epoch [15/20] : 100%|██████████| 445/445 [00:46<00:00,  9.67it/s, accuracy=80.6, loss=0.176] 


Saving weights-->
Epoch 15 : Loss = 0.23685560856809776 , Training Accuracy=66.76500736193711


Epoch [16/20] : 100%|██████████| 445/445 [00:45<00:00,  9.79it/s, accuracy=84.9, loss=0.254] 


Saving weights-->
Epoch 16 : Loss = 0.19521482404363288 , Training Accuracy=67.18848783085855


Epoch [17/20] : 100%|██████████| 445/445 [00:45<00:00,  9.75it/s, accuracy=69.1, loss=0.249] 


Saving weights-->
Epoch 17 : Loss = 0.16056065750674586 , Training Accuracy=67.22025282356176


Epoch [18/20] : 100%|██████████| 445/445 [00:45<00:00,  9.76it/s, accuracy=63.3, loss=0.1]   


Saving weights-->
Epoch 18 : Loss = 0.12928644109182477 , Training Accuracy=67.06444563276312


Epoch [19/20] : 100%|██████████| 445/445 [00:45<00:00,  9.68it/s, accuracy=86.1, loss=0.0905]


Saving weights-->
Epoch 19 : Loss = 0.10339200578713685 , Training Accuracy=67.06262954540468


In [47]:
def test_loop(model,dataloader,loss_fun,device):
    model.eval()
    model.to(device)
    losses = []
    samples,correct = 0,0
    loop = tqdm(enumerate(dataloader), total=len(dataloader), leave=True)
    with torch.no_grad():
        for batch,(x,y) in loop:
            # put on cuda
            x = x.to(device)
            y = y.to(device)

            # forward pass
            y_pred = model(x,y)
            
            # caclulate test loss
            loss = loss_fun(y_pred.reshape(-1,len(target_vocab)),y.reshape(-1))
            losses.append(loss.detach().item())

            # accuracy over entire dataset
            _,predpos=y_pred.reshape(-1,len(target_vocab)).max(1)
            samples+=len(y.reshape(-1))
            correct+=(predpos==y.reshape(-1)).sum().item()
            
            # Update TQDM progress bar
            loop.set_postfix(loss=loss.item())

    print("Final Test Accuracy = ",100 * (correct/samples))

In [48]:
test_loop(model,test_loader,criterion,device)

100%|██████████| 112/112 [00:03<00:00, 30.23it/s, loss=0.22] 

Final Test Accuracy =  60.11538962071808



