# In [2]:
import torch  # PyTorch is a popular deep learning framework.
import torch.nn as nn  # Provides a set of building blocks for defining neural networks.
import torch.optim as optim  # Implements various optimization algorithms for training models.
import torch.nn.functional as F  # Contains functional interface for operations such as activation functions.
import math  # Provides mathematical functions and constants.
from torch.utils.data import DataLoader  # Helps with loading and batching data during training.
from torch.utils.tensorboard import SummaryWriter  # Allows logging of training information for visualization in TensorBoard.
from sklearn.model_selection import train_test_split  # Splits data into training and validation/test sets.
import sentencepiece as spm  # A library for tokenization and subword encoding.
import os  # Provides functions for interacting with the operating system.
import numpy as np  # A powerful library for numerical computing.
from torch.nn.utils.rnn import pad_sequence  # Helps with padding sequences to the same length in a batch.
from torch.optim.lr_scheduler import ReduceLROnPlateau  # Implements a learning rate scheduler that adjusts the learning rate based on validation loss.
import pandas as pd  # A library for data manipulation and analysis.
import csv  # Provides functionality for reading and writing CSV files.


# Global Paths
SP_MODEL_PATH = "spm_model.model"  # SentencePiece model file (loaded if present, otherwise trained)
MODEL_PATH = "model_weights.pth"  # Checkpoint holding the model's state_dict
DATA_FILE = "data.xlsx"  # Excel workbook read with pandas; the 'HTML' and 'Subject' columns are used
PLAIN_TEXT_FILE = "data.txt"  # Plain-text dump of the 'HTML' column, used as the tokenizer training corpus

# Global Parameters
RANDOM_SEED = 42  # Seed passed to train_test_split so the data splits are reproducible

D_MODEL = 4096  # Embedding width / hidden size of every transformer layer.
N_HEAD = 128  # Number of attention heads per layer.
NUM_LAYERS = 96  # Number of layers in the encoder stack and in the decoder stack.
DIM_FEEDFORWARD = 16384  # Hidden size of the position-wise feed-forward sub-layers.
DROPOUT = 0.1  # Dropout probability.
MAX_LEN = 5000  # Number of positions pre-computed by the positional encoding.
# NOTE(review): the tokenizer in this file is trained with vocab_size=15000, which does
# not match the two vocabulary sizes below — confirm which value is intended.
INPUT_DIM = 50257  # Number of unique tokens in the input language.
OUTPUT_DIM = 50257  # Number of unique tokens in the output language.
BATCH_SIZE = 8  # Number of samples per DataLoader batch.
SRC_SEQ_LENGTH = 1024  # Intended maximum length of an input sequence.
TGT_SEQ_LENGTH = 128  # Intended maximum length of a generated output sequence.

# Export the 'HTML' column of the workbook as the tokenizer's training corpus.
df = pd.read_excel(DATA_FILE)  # Read the Excel file into a pandas DataFrame
html_data = df['HTML'].tolist()  # Extract the 'HTML' column as a list

# SentencePiece consumes the corpus as raw text. Writing it through csv.writer
# would wrap documents in quotes and double every embedded '"', so the
# tokenizer would be trained on text that differs from what is encoded later.
# Missing cells (NaN) are skipped because they carry no text.
with open(PLAIN_TEXT_FILE, 'w', encoding='utf-8') as file:
    file.writelines(f"{html}\n" for html in html_data if isinstance(html, str))

# Train the tokenizer only when no model file exists yet. The trainer itself
# writes "<model_prefix>.model", so no separate save step is needed.
if not os.path.isfile(SP_MODEL_PATH):
    sp_model_prefix = os.path.splitext(SP_MODEL_PATH)[0]  # keep the prefix in sync with SP_MODEL_PATH
    spm.SentencePieceTrainer.Train(
        f"--input={PLAIN_TEXT_FILE} --model_prefix={sp_model_prefix} --vocab_size=15000"
    )

# Load the (existing or freshly trained) SentencePiece model.
sp = spm.SentencePieceProcessor()
sp.Load(SP_MODEL_PATH)

# Class for positional encoding in the transformer model
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    The encoding table is stored in the buffer ``pe`` with shape
    ``(max_len, 1, d_model)`` and is sliced by the size of the input's first
    dimension, so inputs are expected sequence-first:
    ``(seq_len, batch, d_model)``.

    Args:
        d_model: Width of the embeddings (even or odd).
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions to pre-compute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)  # even feature indices
        # An odd d_model has one cosine column fewer than sine columns, so trim
        # the angle table instead of failing with a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd feature indices

        # Insert a singleton batch axis so the table broadcasts over the batch.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(0)`` positions, after dropout."""
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# Function for inference with beam search
def inference(model, src, beam_width=5, max_length=100, start_token=1, end_token=1):
    """Generate an output token sequence for ``src`` using beam search.

    The model's sub-modules are driven sequence-first, i.e. with tensors
    shaped ``(seq_len, batch, dim)``, which is the default layout of
    ``nn.TransformerEncoder`` / ``nn.TransformerDecoder``.

    Args:
        model: Module exposing ``embedding``, ``pos_encoder``,
            ``transformer_encoder``, ``transformer_decoder`` and ``decoder``.
        src: 1-D tensor of source token ids. A leading batch dimension of
            size 1 is also accepted.
        beam_width: Maximum number of hypotheses kept after every step.
        max_length: Maximum number of tokens to generate.
        start_token: Id that seeds every hypothesis (default 1, as before).
        end_token: Id that terminates a hypothesis (default 1, as before).
            NOTE(review): SentencePiece's default ids are ``<s>``=1 and
            ``</s>``=2 — confirm which id the training targets end with.

    Returns:
        list[int]: Token ids of the highest-scoring hypothesis without the
        start token, cut after the first ``end_token`` if one was generated.

    Raises:
        ValueError: If ``beam_width`` is smaller than 1 or ``src`` is not a
            single sequence.
    """
    if beam_width < 1:
        raise ValueError("beam_width must be at least 1")
    if src.dim() == 2 and src.size(0) == 1:
        src = src.squeeze(0)  # tolerate callers that already added a batch dimension
    if src.dim() != 1:
        raise ValueError("src must be a single sequence of token ids")

    model.eval()  # Set the model to evaluation mode
    run_device = next(model.parameters()).device  # run wherever the model lives
    scale = math.sqrt(model.embedding.embedding_dim)  # same scaling the model applies to embeddings

    with torch.no_grad():
        # Encode the source once; the memory is broadcast to all hypotheses below.
        src = src.to(run_device).unsqueeze(1)  # (src_len, 1)
        memory = model.transformer_encoder(model.pos_encoder(model.embedding(src) * scale))

        hyps = torch.full((1, 1), start_token, dtype=torch.long, device=run_device)  # (num_hyps, cur_len)
        hyp_scores = torch.zeros(1, device=run_device)  # cumulative log-probability per hypothesis
        finished = torch.zeros(1, dtype=torch.bool, device=run_device)

        for _ in range(max_length):
            num_hyps, cur_len = hyps.shape
            tgt_encoding = model.pos_encoder(model.embedding(hyps.t()) * scale)  # (cur_len, num_hyps, dim)
            # Causal mask: position i may only attend to positions <= i.
            causal_mask = torch.triu(
                torch.full((cur_len, cur_len), float("-inf"), device=run_device), diagonal=1
            )
            decoded = model.transformer_decoder(
                tgt_encoding, memory.expand(-1, num_hyps, -1), tgt_mask=causal_mask
            )
            log_probs = F.log_softmax(model.decoder(decoded[-1]), dim=-1)  # (num_hyps, vocab)
            vocab_size = log_probs.size(-1)

            # A finished hypothesis may only be extended by end_token at zero
            # cost, which keeps it in the beam with its score frozen.
            log_probs[finished] = float("-inf")
            log_probs[finished, end_token] = 0.0

            # Rank every (hypothesis, next token) pair by cumulative score.
            totals = (hyp_scores.unsqueeze(1) + log_probs).view(-1)
            hyp_scores, flat_idx = totals.topk(min(beam_width, totals.numel()))
            parents = torch.div(flat_idx, vocab_size, rounding_mode="floor")  # source hypothesis
            tokens = flat_idx % vocab_size  # token appended to it

            hyps = torch.cat((hyps[parents], tokens.unsqueeze(1)), dim=1)
            finished = finished[parents] | (tokens == end_token)
            if finished.all():  # every surviving hypothesis has ended
                break

        best_sequence = hyps[hyp_scores.argmax()].tolist()[1:]  # drop the start token

    # Finished hypotheses are padded with end_token while others keep growing.
    if end_token in best_sequence:
        best_sequence = best_sequence[: best_sequence.index(end_token) + 1]
    return best_sequence

# Class definition for the main ThinkTAI model
class ThinkTAI(nn.Module):
    """Encoder-decoder transformer mapping source token ids to target-token logits.

    A single embedding table is shared by the source and target sequences.
    The transformer layers use PyTorch's default sequence-first layout;
    ``forward`` converts from and to the batch-first layout its callers use.

    Args:
        input_dim: Number of rows of the embedding table (vocabulary size).
        output_dim: Number of output classes of the final projection.
        pretrained_weights: Optional path to a saved ``state_dict``. When
            given it is loaded instead of applying the random initialization.
    """

    def __init__(self, input_dim, output_dim, pretrained_weights=None):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, D_MODEL)  # Embedding layer for input tokens
        self.pos_encoder = PositionalEncoding(D_MODEL, DROPOUT, max_len=MAX_LEN)  # Positional encoding layer

        encoder_layers = nn.TransformerEncoderLayer(D_MODEL, N_HEAD, DIM_FEEDFORWARD, DROPOUT)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, NUM_LAYERS)  # Transformer encoder

        decoder_layers = nn.TransformerDecoderLayer(D_MODEL, N_HEAD, DIM_FEEDFORWARD, DROPOUT)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layers, NUM_LAYERS)  # Transformer decoder

        self.decoder = nn.Linear(D_MODEL, output_dim)  # Projection from hidden states to vocabulary logits

        self.init_weights(pretrained_weights)  # Initialize weights of the model

    def init_weights(self, pretrained_weights=None):
        """Load ``pretrained_weights`` if given, otherwise initialize embedding and projection uniformly."""
        if pretrained_weights is not None:
            # map_location lets a checkpoint saved on a GPU be restored on a
            # CPU-only machine; the caller moves the model to its device later.
            self.load_state_dict(torch.load(pretrained_weights, map_location="cpu"))
        else:
            initrange = 0.1
            self.embedding.weight.data.uniform_(-initrange, initrange)  # Initialize embedding weights uniformly
            self.decoder.weight.data.uniform_(-initrange, initrange)  # Initialize projection weights uniformly

    def forward(self, src, tgt):
        """Compute next-token logits with teacher forcing.

        Args:
            src: Source token ids, shape ``(batch, src_len)``.
            tgt: Target token ids fed to the decoder, shape ``(batch, tgt_len)``.

        Returns:
            Contiguous tensor of logits, shape ``(batch, tgt_len, output_dim)``.
        """
        # The layers were built with the default batch_first=False, so switch
        # to (seq_len, batch) before embedding.
        src = src.transpose(0, 1)
        tgt = tgt.transpose(0, 1)

        src = self.pos_encoder(self.embedding(src) * math.sqrt(D_MODEL))  # Embed + position-encode the source
        memory = self.transformer_encoder(src)  # Encode the source sequence

        tgt = self.pos_encoder(self.embedding(tgt) * math.sqrt(D_MODEL))  # Embed + position-encode the target
        # Causal mask: position i may only attend to positions <= i, otherwise
        # the decoder could read the very tokens it is trained to predict.
        tgt_len = tgt.size(0)
        causal_mask = torch.triu(
            torch.full((tgt_len, tgt_len), float("-inf"), device=tgt.device), diagonal=1
        )
        output = self.transformer_decoder(tgt, memory, tgt_mask=causal_mask)  # Decode the target sequence
        output = self.decoder(output)  # Project to vocabulary logits

        # Back to batch-first; contiguous so callers can reshape with .view().
        return output.transpose(0, 1).contiguous()

# Function to preprocess input and target data
def preprocess_data(input_data, target_data):
    """Tokenize two parallel lists of sentences and convert them to id tensors.

    Args:
        input_data: Iterable of source sentences.
        target_data: Iterable of target sentences.

    Returns:
        tuple[list[torch.Tensor], list[torch.Tensor]]: One 1-D ``torch.long``
        tensor of token ids per sentence, in the original order.
    """
    # dtype is forced to long because torch.tensor([]) would otherwise yield a
    # float tensor for a sentence with no tokens, which nn.Embedding rejects.
    input_data = [torch.tensor(tokenize_sentence(sentence), dtype=torch.long) for sentence in input_data]
    target_data = [torch.tensor(tokenize_sentence(sentence), dtype=torch.long) for sentence in target_data]

    return input_data, target_data

# Function to tokenize a sentence using SentencePiece tokenizer
def tokenize_sentence(sentence):
    """Encode ``sentence`` into token ids with the module-level SentencePiece processor ``sp``."""
    token_ids = sp.EncodeAsIds(sentence)
    return token_ids

# Function to pad a sequence to a target length with a padding token
def pad_sequence_to_length(sequence, target_length, padding_token):
    """Right-pad ``sequence`` with ``padding_token`` until it has ``target_length`` items.

    A sequence that is already at least ``target_length`` long is returned
    unchanged (it is never truncated).
    """
    missing = target_length - len(sequence)
    if missing <= 0:
        # Nothing to add: hand back the caller's object as-is.
        return sequence

    return sequence + [padding_token] * missing

# Function to collate a batch of sequences
def collate_fn(batch, padding_value=0):
    """Collate ``(src, tgt)`` pairs into two equally wide, right-padded id tensors.

    Source and target sequences are padded to one shared length: the longest
    sequence found in the batch on either side.

    Args:
        batch: Non-empty list of ``(src, tgt)`` pairs, each a 1-D sequence of
            token ids (tensor or list of ints).
        padding_value: Integer id used to fill the padded positions. A string
            such as ' ' cannot be stored in an id tensor.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: ``(src_padded, tgt_padded)``, both
        ``torch.long`` with shape ``(len(batch), max_len)``.
    """
    src_sequences = [torch.as_tensor(src, dtype=torch.long) for src, _ in batch]
    tgt_sequences = [torch.as_tensor(tgt, dtype=torch.long) for _, tgt in batch]

    # Padding both sides in a single call gives them the same width.
    padded = pad_sequence(src_sequences + tgt_sequences, batch_first=True, padding_value=padding_value)

    batch_size = len(src_sequences)
    src_padded = padded[:batch_size]
    tgt_padded = padded[batch_size:]
    return src_padded, tgt_padded

# Load and preprocess the dataset
df = pd.read_excel(DATA_FILE)  # Read the Excel file into a pandas DataFrame
df = df.dropna(subset=['HTML', 'Subject'])  # Rows missing either text cannot be tokenized
input_data = df['HTML'].astype(str).tolist()  # Source texts
target_data = df['Subject'].astype(str).tolist()  # Target texts

# preprocess_data tokenizes AND converts to tensors, so the splits below
# already hold id tensors and must not be tokenized a second time.
input_data, target_data = preprocess_data(input_data, target_data)

# Split the dataset into train (80%), validation (10%) and test (10%) sets
input_train, input_val_test, target_train, target_val_test = train_test_split(
    input_data, target_data, test_size=0.2, random_state=RANDOM_SEED)
input_val, input_test, target_val, target_test = train_test_split(
    input_val_test, target_val_test, test_size=0.5, random_state=RANDOM_SEED)

train_dataset = list(zip(input_train, target_train))  # Create a list of train data samples
val_dataset = list(zip(input_val, target_val))  # Create a list of validation data samples
test_dataset = list(zip(input_test, target_test))  # Create a list of test data samples

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

torch.manual_seed(RANDOM_SEED)  # Make weight initialization and batch shuffling reproducible

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU if available, else CPU
vocab_size = len(sp)  # The model's vocabulary is the tokenizer's vocabulary
model = ThinkTAI(vocab_size, vocab_size,
                 pretrained_weights=MODEL_PATH if os.path.isfile(MODEL_PATH) else None)  # Resume from a checkpoint if one exists
model.to(device)  # Move the model to the device

optimizer = optim.Adam(model.parameters())  # Initialize the optimizer
lr_scheduler = ReduceLROnPlateau(optimizer, patience=3)  # Lower the learning rate when validation loss plateaus

writer = SummaryWriter(log_dir="logs")  # Initialize the SummaryWriter for TensorBoard logging

NUM_EPOCHS = 10  # Number of training epochs
best_val_loss = float('inf')  # Initialize the best validation loss
early_stop_counter = 0  # Counter for early stopping
early_stop_patience = 5  # Patience for early stopping


def _batch_loss(src, tgt):
    """Return the teacher-forced cross-entropy loss of ``model`` on one batch."""
    src = src.to(device)
    tgt = tgt.to(device)
    # The decoder reads tgt[:, :-1] and is scored against the sequence shifted by one.
    output = model(src, tgt[:, :-1])
    return F.cross_entropy(output.reshape(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))


def _evaluate(loader):
    """Return the mean per-batch loss of ``model`` over ``loader`` without tracking gradients."""
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    with torch.no_grad():
        for src, tgt in loader:
            total_loss += _batch_loss(src, tgt).item()
    return total_loss / len(loader)


for epoch in range(NUM_EPOCHS):
    model.train()  # Set the model to training mode
    train_loss = 0.0  # Initialize the train loss
    for src, tgt in train_loader:
        optimizer.zero_grad()  # Zero the gradients
        loss = _batch_loss(src, tgt)  # Forward pass and loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update the parameters

        train_loss += loss.item()  # Accumulate the train loss

    train_loss /= len(train_loader)  # Compute the average train loss
    val_loss = _evaluate(val_loader)  # Compute the average validation loss

    writer.add_scalar("Loss/Train", train_loss, epoch)  # Write the train loss to TensorBoard
    writer.add_scalar("Loss/Validation", val_loss, epoch)  # Write the validation loss to TensorBoard

    lr_scheduler.step(val_loss)  # Update the learning rate based on the validation loss

    print(f"Epoch: {epoch + 1}/{NUM_EPOCHS} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss  # Update the best validation loss
        early_stop_counter = 0  # Reset the early stop counter
        torch.save(model.state_dict(), MODEL_PATH)  # Save the best model weights so far
    else:
        early_stop_counter += 1  # Increment the early stop counter
        if early_stop_counter >= early_stop_patience:
            break  # Early stopping: no improvement for `early_stop_patience` epochs

# Restore the best checkpoint, if one was written, before the final evaluation.
if os.path.isfile(MODEL_PATH):
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))

test_loss = _evaluate(test_loader)  # Compute the average test loss

print(f"Test Loss: {test_loss:.3f}")  # Print the test loss

# Example usage
input_sequence = "Hello, how are you?"  # Example input sequence
input_tokens = tokenize_sentence(input_sequence)  # Tokenize the input sequence
# inference() adds the batch dimension itself, so pass the ids as a 1-D tensor.
input_tokens = torch.LongTensor(input_tokens).to(device)

output_tokens = inference(model, input_tokens)  # Perform inference using the model
output_sequence = sp.DecodeIds(output_tokens)  # Decode the output tokens using SentencePiece tokenizer
print(output_sequence)  # Print the predicted output sequence

writer.close()  # Close the SummaryWriter