In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
import nltk
import os
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pandas as pd
import csv
import requests
from bs4 import BeautifulSoup
from collections import Counter

MODEL_PATH = r"E:/GitHub/ThinkTAI/ThinkTAI/Model/model_weights.pth"
DATA_FILE = r"E:/GitHub/ThinkTAI/ThinkTAI/Data/data.xlsx"
CLEANED_DATA_FILE = r"E:/GitHub/ThinkTAI/ThinkTAI/Data/cleaned_data.csv"

RANDOM_SEED = 42

D_MODEL = 128
N_HEAD = 4
NUM_LAYERS = 32
DIM_FEEDFORWARD = 512
DROPOUT = 0.1
MAX_LEN = 5000
BATCH_SIZE = 7

class TrainingSet:
    def __init__(self, url, subject, html):
        self.url = url
        self.subject = subject
        self.html = html

def load_data_from_excel(file_path):
    training_data = []
    df = pd.read_excel(file_path)
    for _, row in df.iterrows():
        url = row['URL']
        subject = row['Subject']
        html = ""
        training_data.append(TrainingSet(url, subject, html))
    return training_data

def retrieve_html_content(data):
    try:
        response = requests.get(data.url)
        if response.ok and response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            if soup.find():
                data.html = response.content.decode('utf-8')
            else:
                data.html = ""
        else:
            data.html = ""
    except requests.RequestException:
        data.html = ""

def remove_empty_html(data_list):
    return [data for data in data_list if data.html]

def save_cleaned_data_to_csv(file_path, data_list):
    cleaned_data = {'URL': [data.url for data in data_list],
                    'Subject': [data.subject for data in data_list],
                    'HTML': [data.html for data in data_list]}
    cleaned_df = pd.DataFrame(cleaned_data)
    cleaned_df.to_csv(file_path, index=False)

def create_plain_text_file(data_list, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        writer = csv.writer(file)
        for data in data_list:
            if data.html:
                writer.writerow([data.html])

def load_data_from_csv(file_path):
    cleaned_df = pd.read_csv(file_path)
    input_data = cleaned_df['HTML'].tolist()
    target_data = cleaned_df['Subject'].tolist()
    return input_data, target_data

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

def preprocess_data(input_data, target_data):
    tokenized_input_data = []
    for sentence in input_data:
        tokenized_input_data.append(tokenize_sentence(sentence))

    tokenized_target_data = []
    for sentence in target_data:
        tokenized_target_data.append(tokenize_sentence(sentence))
    
    return tokenized_input_data, tokenized_target_data

def tokenize_sentence(sentence):
    tokens = nltk.word_tokenize(sentence)
    return tokens

def pad_sequence_to_length(sequence, target_length, padding_token):
    if len(sequence) < target_length:
        pad_length = target_length - len(sequence)
        sequence = sequence + [padding_token] * pad_length

    return sequence

def collate_fn(batch):
    src_sequences = []
    tgt_sequences = []
    for src, tgt in batch:
        src_sequences.append(src)
        tgt_sequences.append(tgt)

    max_len = max(len(seq) for seq in src_sequences + tgt_sequences)
    src_padded = pad_sequence([torch.tensor(pad_sequence_to_length(seq, max_len, ' ')) for seq in src_sequences],
                              batch_first=True)
    tgt_padded = pad_sequence([torch.tensor(pad_sequence_to_length(seq, max_len, ' ')) for seq in tgt_sequences],
                              batch_first=True)
    return src_padded, tgt_padded

def inference(model, src, beam_width=5, max_length=100):
    model.eval()  # Set the model to evaluation mode
    src = src.unsqueeze(0).to(device)  # Add a batch dimension and move to the device
    src = src.repeat(beam_width, 1)  # Repeat the source sequence for beam search

    with torch.no_grad():
        src_encoding = model.embedding(src) * math.sqrt(D_MODEL)  # Embed the source sequence
        src_encoding = model.pos_encoder(src_encoding)  # Apply positional encoding to the source sequence
        memory = model.transformer_encoder(src_encoding)  # Encode the source sequence

        tgt = torch.ones(beam_width, 1).long().to(device)  # Initialize target sequence with start token
        tgt_lengths = torch.ones(beam_width).long().to(device)  # Initialize target sequence lengths
        eos_flags = torch.zeros(beam_width).byte().to(device)  # Flags to track if beam search paths have reached end-of-sequence

        scores_beam = torch.zeros(beam_width).to(device)  # Initialize scores_beam tensor

        for _ in range(max_length):
            tgt_encoding = model.embedding(tgt) * math.sqrt(D_MODEL)  # Embed the target sequence
            tgt_encoding = model.pos_encoder(tgt_encoding)  # Apply positional encoding to the target sequence
            output = model.transformer_decoder(tgt_encoding, memory)  # Decode the target sequence

            output = model.decoder(output[:, -1, :])  # Get logits for the last token
            output = F.log_softmax(output, dim=-1)  # Apply log softmax to convert logits to probabilities

            output = output.view(beam_width, -1, OUTPUT_DIM)  # Reshape logits for beam search

            if _ == 0:
                scores, candidates = output.topk(beam_width, dim=-1)  # Get top-k scores and candidates
            else:
                scores, candidates = output.topk(beam_width, dim=-1)  # Get top-k scores and candidates
                scores = scores + scores_beam.unsqueeze(2)  # Add scores of previous beam search paths

            scores = scores.view(beam_width, -1)  # Reshape scores for beam search
            candidates = candidates.view(beam_width, -1)  # Reshape candidates for beam search

            if _ == 0:
                scores_flat = scores.squeeze()  # Flatten scores for beam search
            else:
                scores_flat = scores.view(-1)  # Flatten scores for beam search

            scores_beam, indices_beam = scores_flat.topk(beam_width, dim=-1)  # Get top-k scores and indices

            tgt_candidates = candidates.view(-1)  # Flatten candidates for beam search
            tgt_candidates_beam = tgt_candidates[indices_beam]  # Select candidates for beam search

            tgt = torch.cat((tgt, tgt_candidates_beam.unsqueeze(1)), dim=1)  # Append selected candidates to target sequence

            eos_flags = eos_flags | (tgt_candidates_beam == 1)  # Check if any of the selected candidates is the end token
            if eos_flags.all():  # Break if all beam search paths have reached end-of-sequence
                break

            tgt_lengths = tgt_lengths + (~eos_flags).long()  # Update target sequence lengths

        best_sequence_index = scores_beam.argmax().item()  # Find the index of the best sequence
        best_sequence = tgt[best_sequence_index].tolist()  # Convert the best sequence to a list

    return best_sequence[1:]  # Remove the start token from the best sequence

# Class definition for the main ThinkTAI model
class ThinkTAI(nn.Module):
    def __init__(self, input_dim, output_dim, pretrained_weights=None):
        super(ThinkTAI, self).__init__()

        self.embedding = nn.Embedding(input_dim, D_MODEL)  # Embedding layer for input tokens
        self.pos_encoder = PositionalEncoding(D_MODEL, DROPOUT, max_len=MAX_LEN)  # Positional encoding layer

        encoder_layers = nn.TransformerEncoderLayer(D_MODEL, N_HEAD, DIM_FEEDFORWARD, DROPOUT)  # Encoder layers for the transformer
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, NUM_LAYERS)  # Transformer encoder

        decoder_layers = nn.TransformerDecoderLayer(D_MODEL, N_HEAD, DIM_FEEDFORWARD, DROPOUT)  # Decoder layers for the transformer
        self.transformer_decoder = nn.TransformerDecoder(decoder_layers, NUM_LAYERS)  # Transformer decoder

        self.decoder = nn.Linear(D_MODEL, output_dim)  # Linear layer for output prediction

        self.init_weights(pretrained_weights)  # Initialize weights of the model

    def init_weights(self, pretrained_weights=None):
        if pretrained_weights is not None:
            self.load_state_dict(torch.load(pretrained_weights))  # Load pretrained weights if available
        else:
            initrange = 0.1
            self.embedding.weight.data.uniform_(-initrange, initrange)  # Initialize embedding weights uniformly
            self.decoder.weight.data.uniform_(-initrange, initrange)  # Initialize decoder weights uniformly

    def forward(self, src, tgt):
        src = self.embedding(src) * math.sqrt(D_MODEL)  # Embed the source sequence
        src = self.pos_encoder(src)  # Apply positional encoding to the source sequence
        memory = self.transformer_encoder(src)  # Encode the source sequence
        tgt = self.embedding(tgt) * math.sqrt(D_MODEL)  # Embed the target sequence
        tgt = self.pos_encoder(tgt)  # Apply positional encoding to the target sequence
        output = self.transformer_decoder(tgt, memory)  # Decode the target sequence
        output = self.decoder(output)  # Predict the output
        return output

# Load data from Excel file
training_data = load_data_from_excel(DATA_FILE)

# Retrieve HTML content from URLs
for data in training_data:
    retrieve_html_content(data)

# Remove objects with empty or null HTML
training_data = remove_empty_html(training_data)

# Save cleaned data to CSV file
save_cleaned_data_to_csv(CLEANED_DATA_FILE, training_data)

nltk.download('punkt')  # Download the required NLTK resource

# Load cleaned CSV file into input and target data
input_data, target_data = load_data_from_csv(CLEANED_DATA_FILE)

input_data, target_data = preprocess_data(input_data, target_data)

input_train, input_val_test, target_train, target_val_test = train_test_split(input_data, target_data, test_size=0.2, random_state=RANDOM_SEED)
input_val, input_test, target_val, target_test = train_test_split(input_val_test, target_val_test, test_size=0.5, random_state=RANDOM_SEED)

# Build the vocabulary
vocab_counter = Counter()
for seq in input_train + target_train:
    vocab_counter.update(seq)

# Create a mapping from tokens to integers
vocab = [token for token, count in vocab_counter.items()]

# Create a mapping from integers to tokens
int2token = {i: token for i, token in enumerate(vocab)}

# Create a mapping from tokens to integers
token2int = {token: i for i, token in int2token.items()}

# Tokenize and convert to integers
input_train = [[token2int[token] for token in seq] for seq in input_train]
target_train = [[token2int[token] for token in seq] for seq in target_train]
input_val = [[token2int[token] for token in seq] for seq in input_val]
target_val = [[token2int[token] for token in seq] for seq in target_val]
input_test = [[token2int[token] for token in seq] for seq in input_test]
target_test = [[token2int[token] for token in seq] for seq in target_test]

# Convert to tensors
input_train = [torch.LongTensor(seq) for seq in input_train]
target_train = [torch.LongTensor(seq) for seq in target_train]
input_val = [torch.LongTensor(seq) for seq in input_val]
target_val = [torch.LongTensor(seq) for seq in target_val]
input_test = [torch.LongTensor(seq) for seq in input_test]
target_test = [torch.LongTensor(seq) for seq in target_test]

train_dataset = list(zip(input_train, target_train))
val_dataset = list(zip(input_val, target_val))
test_dataset = list(zip(input_test, target_test))

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

OUTPUT_DIM = len(target_train)  # Set OUTPUT_DIM based on the size of the target vocabulary

model = ThinkTAI(len(input_train), OUTPUT_DIM, pretrained_weights=MODEL_PATH) if os.path.isfile(MODEL_PATH) else ThinkTAI(len(input_train), OUTPUT_DIM)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.Adam(model.parameters())
lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)  # Adjust the parameters as needed

writer = SummaryWriter(log_dir="logs")

NUM_EPOCHS = 1
best_val_loss = float('inf')
early_stop_counter = 0
early_stop_patience = 5

for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0.0
    for src, tgt in train_loader:
        src = src.to(device)
        tgt = tgt.to(device)

        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        print("Output shape:", output.shape)
        print("Target shape:", tgt[:, 1:].contiguous().shape)
        loss = F.cross_entropy(output.view(-1, output.shape[-1]), tgt[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for src, tgt in val_loader:
            src = src.to(device)
            tgt = tgt.to(device)

            output = model(src, tgt[:, :-1])
            loss = F.cross_entropy(output.view(-1, output.shape[-1]), tgt[:, 1:].contiguous().view(-1))
            val_loss += loss.item()

        val_loss /= len(val_loader)

        writer.add_scalar("Loss/Train", train_loss, epoch)
        writer.add_scalar("Loss/Validation", val_loss, epoch)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), MODEL_PATH)
        else:
            early_stop_counter += 1

        if early_stop_counter >= early_stop_patience:
            break

    lr_scheduler.step(val_loss)

    print(f"Epoch: {epoch + 1}/{NUM_EPOCHS} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}")

model.load_state_dict(torch.load(MODEL_PATH))
model.eval()
test_loss = 0.0
with torch.no_grad():
    for src, tgt in test_loader:
        src = src.to(device)
        tgt = tgt.to(device)

        output = model(src, tgt[:, :-1])
        loss = F.cross_entropy(output.view(-1, output.shape[-1]), tgt[:, 1:].contiguous().view(-1))
        test_loss += loss.item()

test_loss /= len(test_loader)

print(f"Test Loss: {test_loss:.3f}")

input_sequence = "Hello, how are you?"
input_tokens = tokenize_sentence(input_sequence)
input_tokens = torch.LongTensor(input_tokens).unsqueeze(0).to(device)

output_tokens = inference(model, input_tokens)

output_sequence = " ".join(output_tokens)  # Convert the output tokens to a string
print(output_sequence)

writer.close()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jjbor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyError: '19XFL1H76PE010333'