# Import all of the used libraries

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy

from torch.utils.tensorboard import SummaryWriter
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
from Utlis import calculate_bleu, saveCheckpoint, loadCheckpoint
from torchtext.data import Field, TabularDataset

# Create a field and vocab to split and tokenize the articles

In [4]:
# Define the Fields for tokenization
articles = Field(tokenize="spacy", tokenizer_language="en_core_web_sm", init_token="<sos>", eos_token="<eos>")
highlights = Field(tokenize="spacy", tokenizer_language="en_core_web_sm", init_token="<sos>", eos_token="<eos>")

# Define the fields dictionary for the TabularDataset
fields = {"article": ("src", articles), "highlights": ("trg", highlights)}

# Path to your data file
data_path = "Datasets/cnn_dailymail/train.csv"

# Create the TabularDataset
dataset = TabularDataset(
    path=data_path,
    format="csv",  
    fields=fields
)

# Split the dataset into training, validation, and test sets
trainData, validData, testData = dataset.split(split_ratio=[0.7, 0.15, 0.15])

# Build the vocabulary for the Fields
articles.build_vocab(trainData, max_size=10000, min_freq=2)
highlights.build_vocab(trainData, max_size=10000, min_freq=2)

# Print some statistics
print(f"Number of training examples: {len(trainData.examples)}")
print(f"Number of validation examples: {len(validData.examples)}")
print(f"Number of testing examples: {len(testData.examples)}")
print(f"Unique tokens in source vocabulary: {len(articles.vocab)}")
print(f"Unique tokens in target vocabulary: {len(highlights.vocab)}")

Number of training examples: 8043
Number of validation examples: 1723
Number of testing examples: 1724
Unique tokens in source vocabulary: 10004
Unique tokens in target vocabulary: 10004


# Create the model structure

In [5]:
class Transformer(nn.Module):
    def __init__(self, 
                 embeddingSize,
                 sourceVocabSize,
                 targetVocabSize,
                 sourcePadIndex,
                 numberHeads,
                 numberEncoderLayers,
                 numberDecoderLayers,
                 forwardExpansion,
                 dropout,
                 maxLength,
                 device
                ) -> None:
        super(Transformer, self).__init__()
        
        # Creating a map to turn words into vectors, similar to Word2Vec
        self.sourceWordEmbedding = nn.Embedding(sourceVocabSize, embeddingSize)
        
        # Creating a map to turn the position of the word into a vec
        self.sourcePositionEmbedding = nn.Embedding(maxLength, embeddingSize)
        
        # Same same, but for the target, (Need to double check to see if this needed as both the text are in english)
        self.targetWordEmbedding = nn.Embedding(targetVocabSize, embeddingSize)
        self.targetPositionEmbedding = nn.Embedding(maxLength, embeddingSize)
        
        # Set the device, GPU or CPU
        self.device = device
        
        self.transformer = nn.Transformer(
            d_model=embeddingSize,
            nhead=numberHeads,
            num_encoder_layers=numberEncoderLayers,
            num_decoder_layers=numberDecoderLayers,
            dim_feedforward=forwardExpansion,
            dropout=dropout
        )
        
        # Create a Linear and softmax function to turn word vectors into words
        self.fcOut = nn.Linear(embeddingSize, targetVocabSize)
        self.dropout = nn.Dropout(dropout)
        self.sourcePadIdx = sourcePadIndex
        
    def getSourceMask(self, src):
        # We are changing the shape of the mask from (srcLen, N) -> (N, srcLen)
        # it needed to be in this format for pytorch to use it :)
        sourceMask = src.transpose(0, 1) == self.sourcePadIdx
        
        return sourceMask
    
    def forward(self, source, target):
        # Handle 1-dimensional tensor (vector) case
        sourceSeqLength, N, = source.shape[0],source.shape[1]
        targetSeqLength, N, = target.shape[0],target.shape[1]
        
        # Creating the positions used for the position embeddings
        sourcePositions = (
            torch.arange(0, sourceSeqLength).unsqueeze(1).expand(sourceSeqLength, N)
            .to(self.device)
        )

        targetPositions = (
            torch.arange(0, targetSeqLength).unsqueeze(1).expand(targetSeqLength, N)
            .to(self.device)
        )
        
        # We are combining both the word embedding with the position of the words 
        embedSource = self.dropout(
            (self.sourceWordEmbedding(source) + self.sourcePositionEmbedding(sourcePositions))
        )
        
        embedTarget = self.dropout(
            (self.targetWordEmbedding(target) + self.targetPositionEmbedding(targetPositions))
        )
        
        # Now we are creating a mask that can be used on all the text
        sourcePaddingMask = self.getSourceMask(source)
        targetMask = self.transformer.generate_square_subsequent_mask(targetSeqLength).to(self.device)
        
        out = self.transformer(
            embedSource,
            embedTarget,
            src_key_padding_mask = sourcePaddingMask,
            tgt_mask = targetMask
        )
        
        out = self.fcOut(out)
        
        return out

# Get the article Highlights

In [6]:
def getHighlights(article, articles_field, highlights_field, model, device):
    # Tokenize the article
    tokenized_article = articles_field.tokenize(article)
    
    # Convert tokens to numerical indices
    numerical_article = [articles_field.vocab.stoi[token] for token in tokenized_article]
    
    # Convert to tensor and add batch dimension
    numerical_article = torch.LongTensor(numerical_article).unsqueeze(1).to(device)
    
    # Generate highlights
    model.eval()
    with torch.no_grad():
        # Assuming target is not used for generating highlights
        target = torch.zeros((numerical_article.shape[0], 1)).long().to(device)
        output = model(numerical_article, target)
    
    # Get the generated highlights as numerical indices
    generated_indices = output.argmax(dim=-1)
    
    # Convert indices to tokens
    generated_tokens = [highlights_field.vocab.itos[idx] for idx in generated_indices.squeeze()]
    
    # Remove special tokens and join tokens into a single string
    generated_highlights = ' '.join(token for token in generated_tokens if token not in ['<sos>', '<eos>', '<pad>'])
    
    return generated_highlights

# Set Training parameters    

In [7]:
# Create a Training phase
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loadModel = False
saveModel = True

# Training hyperparameters
numberEpochs = 100
learningRate = 3e-4
batchSize = (32, 32, 32)

# Model hyperparameters
sourceVocabSize, targetVocabSize = len(articles.vocab), len(highlights.vocab)
embeddingSize = 512
numberHeads = 8
numberEncoderLayers = 3
numberDecoderLayers = 3
dropout = 0.10
maxLength = 10000
forwardExpansion = 4
sourcePadIndex = articles.vocab.stoi['<pad>']

# Create Model and optimizer 

In [8]:
# Tensorboard to be fancy
writer = SummaryWriter('runs/lossPlot')
step = 0

trainIterator, validIterator, testIterator = BucketIterator.splits(
    (trainData, validData, testData),
    batch_sizes=batchSize,
    sort_within_batch = True, # Note the reason that you are sorting it is because when you are in batches it wont have to calculate extra padding size
    sort_key = lambda x: len(x.src),
    device = device
)

model = Transformer(
    embeddingSize=embeddingSize,
    sourceVocabSize=sourceVocabSize,
    targetVocabSize=targetVocabSize,
    sourcePadIndex=sourcePadIndex,
    numberHeads=numberHeads,
    numberEncoderLayers=numberEncoderLayers,
    numberDecoderLayers=numberDecoderLayers,
    forwardExpansion=forwardExpansion,
    dropout=dropout,
    maxLength=maxLength,
    device=device
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learningRate)


padIndex = articles.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index = padIndex)




# Remove useless warnings

In [None]:
import warnings

# Suppress the warning
warnings.filterwarnings("ignore", message="1Torch was not compiled with flash attention.", category=UserWarning)

# Training

In [9]:
import os, datetime
from tqdm import tqdm

time = str(datetime.datetime.now()).replace(":", " ")
os.makedirs(f'Checkpoints/{time}')

if loadModel:
    model = loadCheckpoint(model, optimizer, 'transformer_checkpoint.pth')
    
exampleArticle = "Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee. 'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for space in the overhead lockers, crashing elbows and seat back kicking? Tests conducted by the FAA use planes with a 31 inch pitch, a standard which on some airlines has decreased . Many economy seats on United Airlines have 30 inches of room, while some airlines offer as little as 28 inches . Cynthia Corbertt, a human factors researcher with the Federal Aviation Administration, that it conducts tests on how quickly passengers can leave a plane. But these tests are conducted using planes with 31 inches between each row of seats, a standard which on some airlines has decreased, reported the Detroit News. The distance between two seats from one point on a seat to the same point on the seat behind it is known as the pitch. While most airlines stick to a pitch of 31 inches or above, some fall below this. While United Airlines has 30 inches of space, Gulf Air economy seats have between 29 and 32 inches, Air Asia offers 29 inches and Spirit Airlines offers just 28 inches. British Airways has a seat pitch of 31 inches, while easyJet has 29 inches, Thomson's short haul seat pitch is 28 inches, and Virgin Atlantic's is 30-31."

for epoch in range(numberEpochs):
    
    # Train the model
    model.train()
    
    for batchIndex, batch in tqdm(enumerate(trainIterator), total=len(trainIterator), desc=f"[Epoch {epoch} / {numberEpochs}]"):
        inputData = batch.src.to(device)
        target = batch.trg.to(device)
        
        # Shift the target tensor by one time step
        target_input = target[:-1, :]
        
        # forward prop
        output = model(inputData, target_input)
        
        output = output.reshape(-1, output.shape[2])
        target_output = target[1:, :].reshape(-1)
        optimizer.zero_grad()
        
        loss = criterion(output, target_output)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        
        optimizer.step()
        
        writer.add_scalar("Training Loss", loss, global_step=step)
        step += 1
        
    # Save the model info
    if saveModel:
        checkpoint = {
            "stateDict" : model.state_dict(),
            "optimizer" : optimizer.state_dict()
        }
        
        # save the checkpoint
        saveCheckpoint(model, optimizer, f'Checkpoints/{time}/point{epoch}.pth')
        
    # Give the test output a show
    model.eval()
    testHighlights = getHighlights(exampleArticle, articles, highlights, model, device)
    
    print(f"    Key Dot Points: \n      {testHighlights}")
        

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
[Epoch 0 / 100]: 100%|██████████| 252/252 [05:17<00:00,  1.26s/it]


    Checkpoint saved to 'Checkpoints/2024-05-05 14 10 54.090010/point0.pth'
    Key Dot Points: 
      <unk> <unk> , <unk> <unk> to <unk> <unk> to . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .


[Epoch 1 / 100]:   8%|▊         | 21/252 [00:35<06:30,  1.69s/it]


KeyboardInterrupt: 

# Evaluating the model

In [None]:
score = calculate_bleu(testData, model, articles, highlights, device)
print(f"The BLEU score is: {score}")