# In [2]:
import math
import os
import csv
import numpy as np
import pandas as pd
import sentencepiece as spm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.model_selection import train_test_split

# Global Paths
# SentencePiece model file loaded by the tokenizer setup below.
SP_MODEL_PATH = "spm_model.model"
# Checkpoint holding the model's state_dict (written by torch.save, read by torch.load).
MODEL_PATH = "model_weights.pth"
# Excel workbook providing the 'HTML' (input) and 'Subject' (target) columns.
DATA_FILE = "data.xlsx"
# Text dump of the 'HTML' column, passed to the SentencePiece trainer as --input.
PLAIN_TEXT_FILE = "data.txt"

# Global Parameters
# Passed as random_state to train_test_split so the splits are reproducible.
RANDOM_SEED = 42

D_MODEL = 4096  # Width of the token embeddings and of every transformer layer.
N_HEAD = 128  # Number of attention heads (D_MODEL / N_HEAD = 32 features per head).
NUM_LAYERS = 96  # Number of layers in the encoder stack and, separately, in the decoder stack.
DIM_FEEDFORWARD = 16384  # Hidden width of the feed-forward sub-layer inside each transformer layer.
DROPOUT = 0.1  # Dropout probability used by the positional encoding and the transformer layers.
MAX_LEN = 5000  # Number of positions precomputed by the positional encoding.
# NOTE(review): ThinkTAI is instantiated with len(sp) rather than INPUT_DIM/OUTPUT_DIM,
# and the tokenizer is trained with vocab_size=15000 - confirm these two values are still wanted.
INPUT_DIM = 50257  # Nominal number of unique tokens in the input language.
OUTPUT_DIM = 50257  # Nominal number of unique tokens in the output language.
BATCH_SIZE = 8  # Number of samples per DataLoader batch.
SRC_SEQ_LENGTH = 1024  # Intended maximum length (in tokens) of an input sequence.
TGT_SEQ_LENGTH = 128  # Intended maximum length (in tokens) of an output sequence.

# Export the 'HTML' column to a plain-text corpus for SentencePiece training.
df = pd.read_excel(DATA_FILE)
# Empty cells are read as NaN; drop them and coerce the remaining values to
# str so that only real text ends up in the corpus.
html_data = df['HTML'].dropna().astype(str).tolist()

# Write the raw text, one record after another. A csv.writer is deliberately
# not used here: it would wrap each record in quotes and double every '"'
# inside the HTML, and the tokenizer would then learn those CSV escape
# artifacts instead of the real markup.
with open(PLAIN_TEXT_FILE, 'w', encoding='utf-8') as file:
    file.writelines(f"{html}\n" for html in html_data)

# Initialize SentencePiece tokenizer
if not os.path.isfile(SP_MODEL_PATH):
    # Train SentencePiece tokenizer on your dataset. The trainer writes
    # "<model_prefix>.model" (and ".vocab") itself, so the prefix is derived
    # from SP_MODEL_PATH and no separate save step is needed afterwards.
    model_prefix = os.path.splitext(SP_MODEL_PATH)[0]
    spm.SentencePieceTrainer.Train(
        f"--input={PLAIN_TEXT_FILE} --model_prefix={model_prefix} --vocab_size=15000"
    )

# Whether the model file already existed or was just trained, load it once.
sp = spm.SentencePieceProcessor()
sp.Load(SP_MODEL_PATH)


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    The table is indexed by the FIRST dimension of the input, so ``forward``
    expects sequence-first tensors of shape ``(seq_len, batch, d_model)``.

    Args:
        d_model: Embedding width. Both even and odd values are supported.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Number of positions to precompute; inputs longer than this
            cannot be encoded.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one cosine column fewer than sine
        # columns, so only the first d_model // 2 frequencies are used.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Shape (max_len, 1, d_model): the singleton axis broadcasts over the
        # batch. Registered as a buffer so it follows .to(device) and is
        # stored in the state_dict without being a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions, after dropout."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


# Inference loop with beam search
def inference(model, src, beam_width=5, max_length=100, bos_id=1, eos_id=2):
    """Decode one source sequence with beam search and return the best hypothesis.

    The model's sub-modules (``embedding``, ``pos_encoder``,
    ``transformer_encoder``, ``transformer_decoder``, ``decoder``) are called
    directly. The transformer stacks and the positional encoding operate on
    sequence-first tensors, so embeddings are transposed to
    ``(seq_len, batch, d_model)`` before they are passed on.

    Args:
        model: Model exposing the sub-modules listed above.
        src: Source token ids, either 1-D ``(src_len,)`` or already batched
            as ``(1, src_len)``.
        beam_width: Number of hypotheses kept alive per step.
        max_length: Maximum number of tokens to generate.
        bos_id: Token id that starts every hypothesis.
        eos_id: Token id that ends a hypothesis; must be a valid vocabulary
            index. NOTE(review): the defaults follow SentencePiece's default
            ids (bos=1, eos=2) - pass the tokenizer's actual ids if it is
            configured differently.

    Returns:
        list[int]: Generated ids without the start token, cut after the first
        ``eos_id`` (the end token itself is kept).
    """
    model.eval()
    # Run wherever the model lives instead of depending on a global device.
    run_device = next(model.parameters()).device
    scale = math.sqrt(model.embedding.embedding_dim)
    src = src.reshape(1, -1).to(run_device)  # accept (L,) and (1, L) alike
    neg_inf = float('-inf')

    with torch.no_grad():
        # Encode the source once and share it across all beams.
        src_encoding = model.pos_encoder(model.embedding(src).transpose(0, 1) * scale)
        memory = model.transformer_encoder(src_encoding).repeat(1, beam_width, 1)

        tgt = torch.full((beam_width, 1), bos_id, dtype=torch.long, device=run_device)
        # Every beam starts out identical, so only beam 0 is "alive"; the
        # others get -inf so the first step does not select duplicates.
        scores_beam = torch.full((beam_width,), neg_inf, device=run_device)
        scores_beam[0] = 0.0
        eos_flags = torch.zeros(beam_width, dtype=torch.bool, device=run_device)

        for _ in range(max_length):
            tgt_encoding = model.pos_encoder(model.embedding(tgt).transpose(0, 1) * scale)
            output = model.transformer_decoder(tgt_encoding, memory)

            # Log-probabilities of the next token for each beam: (beam, vocab).
            log_probs = F.log_softmax(model.decoder(output[-1]), dim=-1)
            vocab_size = log_probs.size(-1)

            # A finished beam may only be extended with EOS, at no cost, so
            # its score stays frozen while other beams keep growing.
            eos_only = torch.full_like(log_probs, neg_inf)
            eos_only[:, eos_id] = 0.0
            log_probs = torch.where(eos_flags.unsqueeze(1), eos_only, log_probs)

            # Pick the best continuations over all (beam, token) pairs.
            candidate_scores = (scores_beam.unsqueeze(1) + log_probs).view(-1)
            scores_beam, flat_indices = candidate_scores.topk(beam_width)
            parents = torch.div(flat_indices, vocab_size, rounding_mode='floor')
            tokens = flat_indices % vocab_size

            # Each survivor inherits the history of the beam it extends.
            tgt = torch.cat((tgt[parents], tokens.unsqueeze(1)), dim=1)
            eos_flags = eos_flags[parents] | (tokens == eos_id)
            if eos_flags.all():  # Break if all beam search paths have reached end-of-sequence
                break

        best_sequence = tgt[scores_beam.argmax()].tolist()[1:]  # Remove start token

    # Drop the EOS filler appended while waiting for the other beams.
    if eos_id in best_sequence:
        best_sequence = best_sequence[:best_sequence.index(eos_id) + 1]
    return best_sequence


class ThinkTAI(nn.Module):
    """Encoder-decoder transformer that maps source token ids to target-token logits.

    One embedding table (sized by ``input_dim``) is shared by the source and
    the target side, so target ids must also be valid indices into it.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        output_dim: Number of logits produced per target position.
        pretrained_weights: Optional path to a ``state_dict`` checkpoint; when
            given, it is loaded instead of applying the random initialization.
    """

    def __init__(self, input_dim, output_dim, pretrained_weights=None):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, D_MODEL)
        self.pos_encoder = PositionalEncoding(D_MODEL, DROPOUT, max_len=MAX_LEN)

        # The layers are built without batch_first, so the encoder/decoder
        # stacks consume (seq_len, batch, d_model) tensors.
        encoder_layers = nn.TransformerEncoderLayer(D_MODEL, N_HEAD, DIM_FEEDFORWARD, DROPOUT)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, NUM_LAYERS)

        decoder_layers = nn.TransformerDecoderLayer(D_MODEL, N_HEAD, DIM_FEEDFORWARD, DROPOUT)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layers, NUM_LAYERS)

        self.decoder = nn.Linear(D_MODEL, output_dim)

        self.init_weights(pretrained_weights)

    def init_weights(self, pretrained_weights=None):
        """Load a checkpoint if a path is given, otherwise initialize uniformly.

        Args:
            pretrained_weights: Path to a ``state_dict`` file, or ``None``.
        """
        if pretrained_weights is not None:
            # map_location="cpu" lets a checkpoint saved on a GPU be restored
            # on a CPU-only machine; the caller moves the model afterwards.
            self.load_state_dict(torch.load(pretrained_weights, map_location="cpu"))
        else:
            initrange = 0.1
            self.embedding.weight.data.uniform_(-initrange, initrange)
            self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, tgt):
        """Compute next-token logits under teacher forcing.

        Args:
            src: Source token ids, batch-first ``(batch, src_len)``.
            tgt: Target token ids fed to the decoder, batch-first
                ``(batch, tgt_len)``.

        Returns:
            Logits of shape ``(batch, tgt_len, output_dim)``.
        """
        scale = math.sqrt(D_MODEL)

        # Batches arrive batch-first, but the positional encoding and the
        # transformer stacks index dimension 0 as the sequence position, so
        # switch to (seq_len, batch, d_model) right after the embedding.
        src = self.pos_encoder(self.embedding(src).transpose(0, 1) * scale)
        memory = self.transformer_encoder(src)

        tgt = self.pos_encoder(self.embedding(tgt).transpose(0, 1) * scale)

        # Causal mask: position i may only attend to positions <= i. Without
        # it the decoder can read the very token it is asked to predict.
        tgt_len = tgt.size(0)
        causal_mask = torch.triu(
            torch.full((tgt_len, tgt_len), float('-inf'), dtype=tgt.dtype, device=tgt.device),
            diagonal=1,
        )
        output = self.transformer_decoder(tgt, memory, tgt_mask=causal_mask)

        # Back to batch-first; contiguous so callers can .view() the result.
        output = self.decoder(output).transpose(0, 1).contiguous()
        return output


def preprocess_data(input_data, target_data):
    """Tokenize two parallel collections of sentences into tensors of token ids.

    Args:
        input_data: Iterable of source sentences.
        target_data: Iterable of target sentences.

    Returns:
        A pair ``(inputs, targets)``; each is a list holding one tensor of
        token ids per sentence, in the original order.
    """
    def _encode_all(sentences):
        # One pass per collection: tokenize, then wrap the ids in a tensor.
        return [torch.tensor(tokenize_sentence(text)) for text in sentences]

    encoded_inputs = _encode_all(input_data)
    encoded_targets = _encode_all(target_data)
    return encoded_inputs, encoded_targets


def tokenize_sentence(sentence):
    """Encode ``sentence`` with the module-level SentencePiece processor ``sp``.

    Returns whatever ``sp.EncodeAsIds`` produces for the sentence.
    """
    token_ids = sp.EncodeAsIds(sentence)
    return token_ids


def pad_sequence_to_length(sequence, target_length, padding_token):
    """Right-pad ``sequence`` with ``padding_token`` up to ``target_length``.

    Sequences that already have ``target_length`` or more elements are
    returned unchanged (nothing is truncated).

    Args:
        sequence: A list, or a 1-D tensor, of tokens.
        target_length: Minimum length of the result.
        padding_token: Value appended to fill the sequence up. For tensor
            input it must be a number compatible with the tensor's dtype.

    Returns:
        The padded sequence, of the same kind as the input (list or tensor).
    """
    pad_length = target_length - len(sequence)
    if pad_length <= 0:
        return sequence

    if torch.is_tensor(sequence):
        # "tensor + list" is element-wise addition (or an error), not
        # concatenation, so tensors need an explicit cat with a filler that
        # shares the input's dtype and device.
        filler = sequence.new_full((pad_length,), padding_token)
        return torch.cat((sequence, filler))

    # Pad the sequence to the target length
    return sequence + [padding_token] * pad_length


def collate_fn(batch, padding_value=0):
    """Collate ``(src, tgt)`` pairs into two right-padded, batch-first tensors.

    Sources and targets are padded independently, each to the longest
    sequence of its own kind in the batch, so short targets are not blown up
    to the length of long sources.

    Args:
        batch: Non-empty sequence of ``(src, tgt)`` pairs; each element is a
            1-D tensor or a list of token ids.
        padding_value: Integer id used to fill the padded positions. It has to
            be a valid embedding index; 0 is ``pad_sequence``'s own default.

    Returns:
        ``(src_padded, tgt_padded)`` as int64 tensors of shape
        ``(batch, max_src_len)`` and ``(batch, max_tgt_len)``.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    src_sequences, tgt_sequences = zip(*batch)

    def _pad(sequences):
        # as_tensor accepts lists as well as tensors and does not copy (or
        # warn about) inputs that are already int64 tensors.
        tensors = [torch.as_tensor(seq, dtype=torch.long) for seq in sequences]
        return pad_sequence(tensors, batch_first=True, padding_value=padding_value)

    return _pad(src_sequences), _pad(tgt_sequences)


# Load and preprocess your dataset
df = pd.read_excel(DATA_FILE)
# Empty cells are read as NaN, which the tokenizer cannot encode, so keep
# only rows where both columns are present and coerce the values to str.
rows = df.dropna(subset=['HTML', 'Subject'])
input_data = rows['HTML'].astype(str).tolist()
target_data = rows['Subject'].astype(str).tolist()

# preprocess_data() tokenizes every sentence AND wraps the ids in a tensor.
# This is the only tokenization pass: its output must not be fed through
# tokenize_sentence() again, because the tokenizer expects text, not tensors.
input_data, target_data = preprocess_data(input_data, target_data)

# Clip every sequence to the configured maximum lengths so that no sample
# exceeds what the model is set up to handle.
input_data = [tokens[:SRC_SEQ_LENGTH] for tokens in input_data]
target_data = [tokens[:TGT_SEQ_LENGTH] for tokens in target_data]

# Split the dataset into train, validation, and test sets (80% / 10% / 10%)
input_train, input_val_test, target_train, target_val_test = train_test_split(
    input_data, target_data, test_size=0.2, random_state=RANDOM_SEED)
input_val, input_test, target_val, target_test = train_test_split(
    input_val_test, target_val_test, test_size=0.5, random_state=RANDOM_SEED)

# Each dataset is a list of (source_tensor, target_tensor) pairs.
train_dataset = list(zip(input_train, target_train))
val_dataset = list(zip(input_val, target_val))
test_dataset = list(zip(input_test, target_test))

# Create data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Build the model: resume from the checkpoint when one exists on disk,
# otherwise start from freshly initialized weights.
vocab_size = len(sp)
if os.path.isfile(MODEL_PATH):
    model = ThinkTAI(vocab_size, vocab_size, pretrained_weights=MODEL_PATH)
else:
    model = ThinkTAI(vocab_size, vocab_size)

# Prefer the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Adam with its default hyper-parameters; the scheduler is stepped with the
# validation loss and uses a patience of 3.
optimizer = optim.Adam(model.parameters())
lr_scheduler = ReduceLROnPlateau(optimizer, patience=3)

# TensorBoard event files are written below "logs".
writer = SummaryWriter(log_dir="logs")

# Training loop
# Runs for at most NUM_EPOCHS epochs. After every epoch the model is validated;
# the weights with the lowest validation loss so far are written to MODEL_PATH,
# and training stops once early_stop_patience consecutive epochs bring no
# improvement.
NUM_EPOCHS = 10
best_val_loss = float('inf')  # lowest validation loss seen so far
early_stop_counter = 0  # consecutive epochs without a new best validation loss
early_stop_patience = 5

for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0.0
    for src, tgt in train_loader:
        src = src.to(device)
        tgt = tgt.to(device)

        optimizer.zero_grad()
        # Teacher forcing: the decoder receives the target without its last
        # token and is scored against the target shifted left by one.
        output = model(src, tgt[:, :-1])
        # Flatten to (positions, vocab) vs. (positions,) for cross_entropy.
        # NOTE(review): no ignore_index is passed, so padded positions count
        # towards the loss - confirm this is intended.
        loss = F.cross_entropy(output.view(-1, output.shape[-1]), tgt[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Mean of the per-batch losses.
    train_loss /= len(train_loader)

    # Validation loop
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for src, tgt in val_loader:
            src = src.to(device)
            tgt = tgt.to(device)

            output = model(src, tgt[:, :-1])
            loss = F.cross_entropy(output.view(-1, output.shape[-1]), tgt[:, 1:].contiguous().view(-1))
            val_loss += loss.item()

        val_loss /= len(val_loader)

        writer.add_scalar("Loss/Train", train_loss, epoch)
        writer.add_scalar("Loss/Validation", val_loss, epoch)

        # Checkpoint on every new best; otherwise count towards early stopping.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), MODEL_PATH)
        else:
            early_stop_counter += 1

        # Leaves the epoch loop immediately: the scheduler step and the
        # summary print below are skipped for this final epoch.
        if early_stop_counter >= early_stop_patience:
            break

    lr_scheduler.step(val_loss)  # Update learning rate based on validation loss

    print(f"Epoch: {epoch + 1}/{NUM_EPOCHS} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}")

# Test loop
# Score the checkpoint stored at MODEL_PATH on the held-out test set.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint that was written on a GPU can also be restored on a CPU-only
# machine.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()
test_loss = 0.0
with torch.no_grad():
    for src, tgt in test_loader:
        src = src.to(device)
        tgt = tgt.to(device)

        # Same teacher-forced objective as in training: decoder input is the
        # target without its last token, the label is the target shifted by one.
        output = model(src, tgt[:, :-1])
        loss = F.cross_entropy(output.view(-1, output.shape[-1]), tgt[:, 1:].contiguous().view(-1))
        test_loss += loss.item()

# Mean of the per-batch losses.
test_loss /= len(test_loader)

print(f"Test Loss: {test_loss:.3f}")

# Example usage
input_sequence = "Hello, how are you?"
input_tokens = tokenize_sentence(input_sequence)
# inference() adds the batch dimension itself, so it is given a plain 1-D
# tensor of token ids; adding the dimension here as well would produce a
# 3-D tensor.
input_tokens = torch.LongTensor(input_tokens).to(device)

output_tokens = inference(model, input_tokens)

# Turn the generated ids back into text.
output_sequence = sp.DecodeIds(output_tokens)
print(output_sequence)

# Close the SummaryWriter
writer.close()