In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.metrics import bleu_score
from sklearn.model_selection import train_test_split
import spacy
import random
import numpy as np
import pandas as pd
import os
import heapq
from torch.nn import functional as F
import argparse
import math

DATA_FILE = r"E:/GitHub/ThinkTAI/ThinkTAI/Data/data.txt"
MODEL_PATH = r"E:/GitHub/ThinkTAI/ThinkTAI/Model"
BATCH_SIZE = 4

# Replace argparse section with this
class Args:
    def __init__(self, model_path, data_file):
        self.model_path = model_path
        self.data_file = data_file

args = Args(MODEL_PATH, DATA_FILE)

# Set seed for reproducibility
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Loading the spacy tokenizer
spacy_en = spacy.load('en_core_web_sm')
tokenizer = get_tokenizer('spacy', language='en')

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

def tokenize_and_convert_to_indices(text, vocab):
    tokens = tokenize_en(text)
    return [vocab[tok] for tok in tokens]

def yield_tokens(data_iter, tokenize_func):
    for item in data_iter:
        print(item)
        text, _ = item
        yield tokenize_func(text)


def collate_fn(batch, vocab):
    src_batch, trg_batch = [], []
    for src_item, trg_item in batch:
        src_batch.append(torch.tensor(src_item, dtype=torch.long))
        trg_batch.append(torch.tensor(trg_item, dtype=torch.long))
    src_batch = pad_sequence(src_batch, padding_value=vocab['<pad>'])
    trg_batch = pad_sequence(trg_batch, padding_value=vocab['<pad>'])
    return src_batch, trg_batch

def build_vocab(file_path, tokenizer):
    with open(file_path, 'r') as f:
        data = f.read().splitlines()
    vocab = build_vocab_from_iterator(yield_tokens(data))
    # ensure <pad> token is in the vocab
    if '<pad>' not in vocab:
        vocab.append('<pad>')
    return vocab

def read_data(file_path):
    with open(file_path, 'r') as f:
        data = f.read().splitlines()
    dataset = [(item.split('|')[0], item.split('|')[1]) for item in data]
    return dataset

def split_data(data, train_size=0.7, val_size=0.15, test_size=0.15):
    assert train_size + val_size + test_size == 1.0, "The sum of the split ratios should be 1.0"

    train_data, rest_data = train_test_split(data, test_size=1-train_size, random_state=SEED)
    val_data, test_data = train_test_split(rest_data, test_size=test_size/(val_size + test_size), random_state=SEED)
    return train_data, val_data, test_data

def process_data(data, vocab):
    dataset = [(tokenize_and_convert_to_indices(src, vocab), tokenize_and_convert_to_indices(trg, vocab)) for src, trg in data]
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=lambda x: collate_fn(x, vocab))
    return dataloader

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

# Define your model here
class TransformerModel(nn.Module):
    def __init__(self, input_dim, output_dim, d_model, nhead, num_layers, dim_feedforward, dropout, pad_idx):
        super(TransformerModel, self).__init__()

        self.embedding = nn.Embedding(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)  # Added positional encoding
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        self.fc_out = nn.Linear(d_model, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.pad_idx = pad_idx

    def forward(self, src, trg):
        src_mask = self.transformer.generate_square_subsequent_mask(src.size(0)).to(src.device)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(0)).to(trg.device) & \
            (trg.unsqueeze(-1).permute(1, 0, 2) != self.pad_idx).type_as(src_mask)
        src = self.embedding(src)
        src = self.pos_encoder(src)  # Added positional encoding
        trg = self.embedding(trg)
        trg = self.pos_encoder(trg)  # Added positional encoding
        src = src.permute(1, 0, 2)
        trg = trg.permute(1, 0, 2)
        output = self.transformer(src, trg, src_mask=src_mask, tgt_mask=trg_mask)
        output = output.permute(1, 0, 2)
        return self.fc_out(self.dropout(output))

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src, trg = batch
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        # Assumes that the first token is a start-of-sequence token that should be ignored by the loss
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src, trg = batch
            output = model(src, trg)  # turn off teacher forcing
            output_dim = output.shape[-1]
            # Assumes that the first token is a start-of-sequence token that should be ignored by the loss
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def main():
    train_data, val_data, test_data = split_data(read_data(DATA_FILE))

    # Now, use these lists in place of reading files
    SRC_VOCAB = build_vocab_from_iterator(yield_tokens(train_data, tokenize_en))
    TRG_VOCAB = build_vocab_from_iterator(yield_tokens(val_data, tokenize_en))

    PAD_IDX = SRC_VOCAB['<pad>']  # Assuming '<pad>' is your padding token

    train_iterator = DataLoader(train_data, batch_size=BATCH_SIZE, collate_fn=lambda x: collate_fn(x, SRC_VOCAB))
    valid_iterator = DataLoader(val_data, batch_size=BATCH_SIZE, collate_fn=lambda x: collate_fn(x, TRG_VOCAB))
    test_iterator = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=lambda x: collate_fn(x, TRG_VOCAB))

    # Parameters for the model
    INPUT_DIM = len(SRC_VOCAB)
    OUTPUT_DIM = len(TRG_VOCAB)
    D_MODEL = 256
    NHEAD = 8
    NUM_LAYERS = 3
    DIM_FEEDFORWARD = 512
    DROPOUT = 0.1
    CLIP = 1.0

    # Instantiate the model
    model = TransformerModel(INPUT_DIM, OUTPUT_DIM, D_MODEL, NHEAD, NUM_LAYERS, DIM_FEEDFORWARD, DROPOUT, PAD_IDX)

    # Use appropriate device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Load model if it exists
    if os.path.exists(args.model_path):
        model.load_state_dict(torch.load(args.model_path))
        model = model.to(device)

    # Define optimizer and loss
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

    # Training loop
    N_EPOCHS = 10
    best_valid_loss = float('inf')
    for epoch in range(N_EPOCHS):
        train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
        valid_loss = evaluate(model, valid_iterator, criterion)

        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), args.model_path)

    # Evaluate the model on the test set
    test_loss = evaluate(model, test_iterator, criterion)
    print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

if __name__ == '__main__':
    main()




('Discover the joy of driving a brand-new 2023 Kia Sorento at Sunrise Kia. Available in the LX trim level, this Sorento is set to redefine your driving experience.', 'NEW 2023 Kia Sorento LX')
("At Elite Autos, experience the elegance of a pre-owned 2017 BMW 5 Series. Crafted in 2017, this 5 Series comes with a 530i trim level, a true testament to BMW's luxury and performance.", 'USED 2017 BMW 5 Series 530i')
('At Sunshine Honda, be the first to own a new 2023 Honda Pilot. This Pilot is a masterpiece, crafted in the EX-L trim level, promising to make every ride an adventure.', 'NEW 2023 Honda Pilot EX-L')
("At Silverline Autos, experience the grandeur of 2015 with a pre-owned Lexus NX. This NX, in the 200t trim level, is a symbol of 2015's luxury and design.", 'USED 2015 Lexus NX 200t')
('At Vintage Autos, get a taste of 2019 with a pre-owned Toyota Highlander. This Highlander comes in the LE trim level and represents the best of what 2019 had to offer.', 'USED 2019 Toyota Highlander L

PermissionError: [Errno 13] Permission denied: 'E:/GitHub/ThinkTAI/ThinkTAI/Model'