# Import all of the used libraries

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import os
import pickle

from torch.utils.tensorboard import SummaryWriter
from torchtext.data import Field, BucketIterator, TabularDataset
from Utlis import calculate_bleu, saveCheckpoint, loadCheckpoint

# Create a field and vocab to split and tokenize the articles

In [4]:
import random

# Set a random seed for reproducibility
SEED = 42
random.seed(SEED)

In [10]:
# Define the Fields for tokenization
articles = Field(tokenize="spacy", tokenizer_language="en_core_web_sm", init_token="<sos>", eos_token="<eos>")

# Define the fields dictionary for the TabularDataset
fields = {"article": ("src", articles), "highlights": ("trg", articles)}

# Path to your data file
data_path = "Datasets/cnn_dailymail/test.csv"

# Create the TabularDataset
dataset = TabularDataset(
    path=data_path,
    format="csv",  
    fields=fields
)


tinyDataset, _ = dataset.split(split_ratio=[0.2, 0.8])

# Split the dataset into training, validation, and test sets
trainData, validData, testData = tinyDataset.split(split_ratio=[0.7, 0.15, 0.15])

# Build the vocabulary for the Fields
articles.build_vocab(trainData, max_size=10000, min_freq=2)

# Print some statistics
print(f"Number of training examples: {len(trainData.examples)}")
print(f"Number of validation examples: {len(validData.examples)}")
print(f"Number of testing examples: {len(testData.examples)}")
print(f"Unique tokens in source vocabulary: {len(articles.vocab)}")

Number of training examples: 1609
Number of validation examples: 344
Number of testing examples: 345
Unique tokens in source vocabulary: 10004


In [11]:
   
articles = torch.load("Vocab/Field.pt")

In [12]:
exampleArticle = " ".join(vars(trainData.examples[0])['src'])
print(exampleArticle)

Forget Swansea , Stoke City and West Ham , the top seven would be seriously challenged by ' Team Pardew ' should this manager have had a full season in charge at one club . That is after Crystal Palace , managed by Alan Pardew , beat Manchester City 2 - 1 at Selhurst Park on Monday night , the English manager 's ninth win   in South London since leaving his role at Newcastle at the end of December . His former club have since plummeted in the Premier League while Palace now find themselves threatening the top half , and combining his records at both clubs this season would see ' Team Pardew ' sit in eighth place , just five points behind Southampton . Alan Pardew could be a candidate for manager of the year with his record at Newcastle and Crystal Palace . The Premier League table with Team Pardew in - as well as a team complied from pre / post Pardew results . Newcastle fans were calling for Pardew 's head for many years , seen as owner Mike Ashley 's puppet , and were sitting in 10th

# Create the model structure

In [13]:
class Transformer(nn.Module):
    def __init__(self, 
                 embeddingSize,
                 sourceVocabSize,
                 targetVocabSize,
                 sourcePadIndex,
                 numberHeads,
                 numberEncoderLayers,
                 numberDecoderLayers,
                 forwardExpansion,
                 dropout,
                 maxLength,
                 device
                ) -> None:
        super(Transformer, self).__init__()
        
        # Creating a map to turn words into vectors, similar to Word2Vec
        self.sourceWordEmbedding = nn.Embedding(sourceVocabSize, embeddingSize)
        
        # Creating a map to turn the position of the word into a vec
        self.sourcePositionEmbedding = nn.Embedding(maxLength, embeddingSize)
        
        # Same same, but for the target, (Need to double check to see if this needed as both the text are in english)
        self.targetWordEmbedding = nn.Embedding(targetVocabSize, embeddingSize)
        self.targetPositionEmbedding = nn.Embedding(maxLength, embeddingSize)
        
        # Set the device, GPU or CPU
        self.device = device
        
        self.transformer = nn.Transformer(
            d_model=embeddingSize,
            nhead=numberHeads,
            num_encoder_layers=numberEncoderLayers,
            num_decoder_layers=numberDecoderLayers,
            dim_feedforward=forwardExpansion,
            dropout=dropout
        )
        
        # Create a Linear and softmax function to turn word vectors into words
        self.fcOut = nn.Linear(embeddingSize, targetVocabSize)
        self.dropout = nn.Dropout(dropout)
        self.sourcePadIdx = sourcePadIndex
        
    def getSourceMask(self, src):
        # We are changing the shape of the mask from (srcLen, N) -> (N, srcLen)
        # it needed to be in this format for pytorch to use it :)
        sourceMask = src.transpose(0, 1) == self.sourcePadIdx
        
        return sourceMask
    
    def forward(self, source, target):
        # Handle 1-dimensional tensor (vector) case
        sourceSeqLength, N, = source.shape[0],source.shape[1]
        targetSeqLength, N, = target.shape[0],target.shape[1]
        
        # Creating the positions used for the position embeddings
        sourcePositions = (
            torch.arange(0, sourceSeqLength).unsqueeze(1).expand(sourceSeqLength, N)
            .to(self.device)
        )

        targetPositions = (
            torch.arange(0, targetSeqLength).unsqueeze(1).expand(targetSeqLength, N)
            .to(self.device)
        )
        
        # We are combining both the word embedding with the position of the words 
        embedSource = self.dropout(
            (self.sourceWordEmbedding(source) + self.sourcePositionEmbedding(sourcePositions))
        )
        
        embedTarget = self.dropout(
            (self.targetWordEmbedding(target) + self.targetPositionEmbedding(targetPositions))
        )
        
        # Now we are creating a mask that can be used on all the text
        sourcePaddingMask = self.getSourceMask(source)
        targetMask = self.transformer.generate_square_subsequent_mask(targetSeqLength).to(self.device)
        
        out = self.transformer(
            embedSource,
            embedTarget,
            src_key_padding_mask = sourcePaddingMask,
            tgt_mask = targetMask
        )
        
        out = self.fcOut(out)
        
        return out

# Get the article Highlights

In [14]:
def getHighlights(article, articles_field, model, device):
    # Tokenize the article
    tokenized_article = articles_field.tokenize(article)
    
    # Convert tokens to numerical indices
    numerical_article = [articles_field.vocab.stoi[token] for token in tokenized_article]
    
    # Convert to tensor and add batch dimension
    numerical_article = torch.LongTensor(numerical_article).unsqueeze(1).to(device)
    
    # Generate highlights
    model.eval()
    with torch.no_grad():
        # Assuming target is not used for generating highlights
        target = torch.zeros((numerical_article.shape[0], 1)).long().to(device)
        output = model(numerical_article, target)
    
    # Get the generated highlights as numerical indices
    generated_indices = output.argmax(dim=-1)
    
    # Convert indices to tokens
    generated_tokens = [articles_field.vocab.itos[idx] for idx in generated_indices.squeeze()]
    
    # Remove special tokens and join tokens into a single string
    # generated_highlights = ' '.join(token for token in generated_tokens if token not in ['<sos>', '<eos>', '<pad>'])
    # generated_highlights = ' '.join(token for token in generated_tokens if token not in ['<pad>'])
    
    return_tokens = ''
    for i in generated_tokens:
        if i == '<eos>':
            break
        if i != '<pad>':
            return_tokens += ' ' + i
        
    return return_tokens

# Set Training parameters    

In [16]:
# Create a Training phase
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loadTheModel = True
loadModelFolder = "P:/New Checkpoints"
saveModel = True

# Training hyperparameters
numberEpochs = 3000
learningRate = 3e-4
batchSize = (32, 32, 32)

# Model hyperparameters
sourceVocabSize, targetVocabSize = len(articles.vocab), len(articles.vocab)
embeddingSize = 512
numberHeads = 8
numberEncoderLayers = 3
numberDecoderLayers = 3
dropout = 0.10
maxLength = 10000
forwardExpansion = 4
sourcePadIndex = articles.vocab.stoi['<pad>']

# Create Model and optimizer 

In [17]:
# Tensorboard to be fancy
writer = SummaryWriter('runs/lossPlot')
step = 0

trainIterator, validIterator, testIterator = BucketIterator.splits(
    (trainData, validData, testData),
    batch_sizes=batchSize,
    sort_within_batch = True, # Note the reason that you are sorting it is because when you are in batches it wont have to calculate extra padding size
    sort_key = lambda x: len(x.src),
    device = device
)

model = Transformer(
    embeddingSize=embeddingSize,
    sourceVocabSize=sourceVocabSize,
    targetVocabSize=targetVocabSize,
    sourcePadIndex=sourcePadIndex,
    numberHeads=numberHeads,
    numberEncoderLayers=numberEncoderLayers,
    numberDecoderLayers=numberDecoderLayers,
    forwardExpansion=forwardExpansion,
    dropout=dropout,
    maxLength=maxLength,
    device=device
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learningRate)


padIndex = articles.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index = padIndex)




# Remove useless warnings

In [18]:
import warnings

# Suppress the warning
warnings.filterwarnings("ignore", message="1Torch was not compiled with flash attention.", category=UserWarning)

# Training

In [20]:
def loadModel(model, optimizer, filepath):
    # epochs = sorted([int(x[5:][:-4]) for x in os.listdir(filepath)])
    epochs = sorted([int(x[5:-4]) for x in os.listdir(filepath) if x.endswith(".pth")])
    print(epochs)
    
    start = epochs[-1]
    
    loadCheckpoint(model, optimizer, f'{filepath}/point{start}.pth')
    
    return start + 1, model, optimizer

In [21]:
# Train
import os, datetime
from tqdm import tqdm
import sys

time = str(datetime.datetime.now()).replace(":", " ")


if loadTheModel:
    start, model, optimizer = loadModel(model, optimizer, loadModelFolder)
    fileDir = loadModelFolder
else:
    start = 0
    fileDir = f'Checkpoints/{time}'
    os.makedirs(f'Checkpoints/{time}')
# sys.exit()
    
exampleArticle = "Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee. 'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for space in the overhead lockers, crashing elbows and seat back kicking? Tests conducted by the FAA use planes with a 31 inch pitch, a standard which on some airlines has decreased . Many economy seats on United Airlines have 30 inches of room, while some airlines offer as little as 28 inches . Cynthia Corbertt, a human factors researcher with the Federal Aviation Administration, that it conducts tests on how quickly passengers can leave a plane. But these tests are conducted using planes with 31 inches between each row of seats, a standard which on some airlines has decreased, reported the Detroit News. The distance between two seats from one point on a seat to the same point on the seat behind it is known as the pitch. While most airlines stick to a pitch of 31 inches or above, some fall below this. While United Airlines has 30 inches of space, Gulf Air economy seats have between 29 and 32 inches, Air Asia offers 29 inches and Spirit Airlines offers just 28 inches. British Airways has a seat pitch of 31 inches, while easyJet has 29 inches, Thomson's short haul seat pitch is 28 inches, and Virgin Atlantic's is 30-31."
example_idx = 0
exampleArticle = " ".join(vars(trainData.examples[example_idx])['src'])
print(exampleArticle)
exampleHighlight = " ".join(vars(trainData.examples[example_idx])['trg'])
print(exampleHighlight)
# "Experts question if  packed out planes are putting passengers at risk .
# "U.S consumer advisory group says minimum space must be stipulated ."
# "Safety tests conducted on planes with more leg room than airlines offer ."
for epoch in range(start, numberEpochs):
    
    # Train the model
    model.train()
    
    for batchIndex, batch in tqdm(enumerate(trainIterator), total=len(trainIterator), desc=f"[Epoch {epoch} / {numberEpochs}]"):
        inputData = batch.src.to(device)
        target = batch.trg.to(device)
        
        # Shift the target tensor by one time step
        target_input = target[:-1, :]
        
        # forward prop
        output = model(inputData, target_input)
        
        output = output.reshape(-1, output.shape[2])
        target_output = target[1:, :].reshape(-1)
        optimizer.zero_grad()
        
        loss = criterion(output, target_output)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        
        optimizer.step()
        
        writer.add_scalar("Training Loss", loss, global_step=step)
        step += 1
        
    # Save the model info
    if saveModel:
        checkpoint = {
            "stateDict" : model.state_dict(),
            "optimizer" : optimizer.state_dict()
        }
        
        # save the checkpoint
        saveCheckpoint(model, optimizer, f'{fileDir}/point{epoch}.pth')
        
        if epoch % 50 == 0:
            for i in range(epoch-49,epoch):
                file_path = f'{fileDir}/point{i}.pth'
                if os.path.exists(file_path):
                    os.remove(file_path)
        elif epoch % 50 != 1:
            file_path = f'{fileDir}/point{epoch-1}.pth'
            if os.path.exists(file_path):
                os.remove(file_path)
            
        
    # Give the test output a show
    model.eval()
    try:
        testHighlights = getHighlights(exampleArticle, articles, model, device)

        print(f"    Key Dot Points: \n      {testHighlights}")
    except:
        print("    Cant Load Highlights")
        

[456]
    Checkpoint loaded from 'P:/New Checkpoints/point456.pth'
Forget Swansea , Stoke City and West Ham , the top seven would be seriously challenged by ' Team Pardew ' should this manager have had a full season in charge at one club . That is after Crystal Palace , managed by Alan Pardew , beat Manchester City 2 - 1 at Selhurst Park on Monday night , the English manager 's ninth win   in South London since leaving his role at Newcastle at the end of December . His former club have since plummeted in the Premier League while Palace now find themselves threatening the top half , and combining his records at both clubs this season would see ' Team Pardew ' sit in eighth place , just five points behind Southampton . Alan Pardew could be a candidate for manager of the year with his record at Newcastle and Crystal Palace . The Premier League table with Team Pardew in - as well as a team complied from pre / post Pardew results . Newcastle fans were calling for Pardew 's head for many yea

[Epoch 457 / 3000]: 100%|██████████| 51/51 [01:56<00:00,  2.29s/it]


    Checkpoint saved to 'P:/New Checkpoints/point457.pth'
    Key Dot Points: 
       11th 11th what three three Tyne three left may career three . . . . reduce . killing reduce . what Alan . . paint killing and 1 place what three black and 1 black what reduce and left black and left left left but . place place may side chance and side black place left reduce 3 game and left through left Tyne ceremony nine but place team ' Anfield left regardless United 5 and black may Lord team . Premier cities and high left cities cities denied players place . denied denied cities between Tyne ' cities may ' judge Boyd high cities formed ' Tyne post high statement three arrest vehicles statement ceremony lack ceremony cities cities ceremony high left cities anyone Ferguson nine statement cities ceremony cities black Pennsylvania high Congress statement nine high high left cities Spain high Tyne ceremony when high Tyne Tyne 22 place left high high through formed high high high 2 statement matches Tyne

[Epoch 458 / 3000]:  10%|▉         | 5/51 [00:47<07:20,  9.58s/it]


KeyboardInterrupt: 

# Evaluating the model

In [None]:
score = calculate_bleu(testData, model, articles, highlights, device)
print(f"The BLEU score is: {score}")