In [None]:
import math
from pathlib import Path

import IPython
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

# ---------------------------------------------------------------------------
# Notebook configuration: data/model locations, hyperparameters, file listing.
# ---------------------------------------------------------------------------

# Folder that holds the training corpus as individual .txt files.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
folder_path = Path("C:/Users/jjbor/Documents/Machine Learning/Training_Data")

# Accumulator for the training examples; filled by the file-loading cell.
examples = []

# Root folder for the saved tokenizer; the model weights live in its
# "ThinkTAI" sub-folder.
model_path = Path("C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/Model")

# Training hyperparameters.
batch_size = 4
num_epochs = 1
learning_rate = 1e-5

# Paths of all .txt files in the data folder. glob() makes no promise about
# ordering, and the loading cell skips files by position (`data_files[offset:]`),
# so the list is sorted to make that offset refer to the same files on every run.
data_files = sorted(folder_path.glob("*.txt"))

In [None]:
# Read the selected training files and store their lines in `examples`.
#
# load_all_files: True  -> read every file from `offset` onwards.
#                 False -> read at most `max_files` files from `offset` onwards.
load_all_files, max_files = False, 10000

# Number of leading entries of `data_files` to skip.
# NOTE(review): the entries of `offsets` look like per-run iteration counts that
# are multiplied by `batch_size` -- confirm that this matches how files map to
# training batches.
offsets = [351, 50]
offset = sum(offsets) * batch_size  # Change this value to the desired offset

# Both modes are the same slice; only the upper bound differs.
stop = None if load_all_files else offset + max(max_files, 0)
selected_files = data_files[offset:stop]

# Empty the list first so that re-running this cell does not append the same
# lines a second time.
examples.clear()
for data_file in selected_files:
    # An explicit encoding keeps the decoded text independent of the platform
    # default (cp1252 on Windows).
    # NOTE(review): assumes the corpus is UTF-8 encoded -- confirm.
    with open(data_file, "r", encoding="utf-8") as file:
        examples.extend(file.readlines())


In [None]:
# Fixed sequence length handed to TextDataset, which forwards it to the
# tokenizer as `max_length` for padding and truncation. The value is a
# constant; it is not derived from the loaded examples.
max_length = 1024
print(max_length)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text example per item.

    Each item is padded or truncated to exactly ``max_length`` tokens, so every
    item has the same shape and can be stacked by the default DataLoader collate
    function.
    """

    def __init__(self, examples, tokenizer, max_length):
        """Store the inputs; tokenization happens lazily in ``__getitem__``.

        Args:
            examples: Sequence of text strings, one training example each.
            tokenizer: Object exposing ``encode_plus(text, ...)`` whose result
                has ``input_ids`` and ``attention_mask`` tensor attributes.
            max_length: Number of tokens every item is padded/truncated to.
        """
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            A tuple ``(input_ids, attention_mask)`` with the leading batch
            dimension added by ``return_tensors="pt"`` removed.
        """
        text = self.examples[idx]

        # Text longer than max_length is truncated; shorter text is padded.
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence dimension when it has length 1 and hand the
        # DataLoader a 0-d tensor.
        input_ids = inputs.input_ids.squeeze(0)
        attention_mask = inputs.attention_mask.squeeze(0)

        return input_ids, attention_mask

In [None]:
# Start from the stock "gpt2" configuration and override the architecture
# fields used when a fresh model is built from this config.
architecture_overrides = {
    "n_layer": 48,   # number of transformer layers
    "n_embd": 1024,  # embedding width
    "n_head": 16,    # attention heads; chosen to match the embedding width
}

gpt2_config = GPT2Config.from_pretrained("gpt2")
for field_name, field_value in architecture_overrides.items():
    setattr(gpt2_config, field_name, field_value)

In [None]:
# Load the saved ThinkTAI model and tokenizer when a checkpoint folder exists;
# otherwise build a new model from `gpt2_config` (freshly initialised weights,
# not the pretrained GPT-2 weights), pair it with the stock "gpt2" tokenizer and
# save both so that the next run can load them.
checkpoint_dir = model_path / "ThinkTAI"
is_new_model = not checkpoint_dir.is_dir()

if is_new_model:
    model = GPT2LMHeadModel(gpt2_config)
    # `padding` is an argument of the tokenizer call, not of from_pretrained(),
    # so it is no longer passed here.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
else:
    model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    print("Successfully loaded the existing model.")

# The dataset pads with padding="max_length", which needs a pad token. Reuse
# the end-of-sequence token when none is configured. This is applied on both
# paths so that a checkpoint saved without a pad token still works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

if is_new_model:
    print("Failed to load the existing model, falling back to the GPT2 model.")

    # Save the new model
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(model_path)
    print("Successfully saved the new model.")

In [None]:
# Run on CUDA when PyTorch reports an available GPU, otherwise on the CPU, and
# move the model's parameters to that device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# AdamW over all model parameters, plus a cross-entropy criterion object.
trainable_parameters = model.parameters()
optimizer = torch.optim.AdamW(trainable_parameters, lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Batches per epoch: the example count divided by the batch size, rounded up so
# that a final partial batch is counted.
total_examples = len(examples)
batches_per_epoch = total_examples / batch_size
num_iterations = math.ceil(batches_per_epoch)

# Wrap the examples in the tokenizing dataset and serve them in shuffled batches.
dataset = TextDataset(examples, tokenizer, max_length)
data_loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Training loop
#
# Runs `num_epochs` passes over `data_loader`. A failing batch is counted and
# reported, and training continues with the next batch. The checkpoint is
# written every CHECKPOINT_EVERY iterations and at the end of each epoch
# instead of after every single batch.

# Iterations between two periodic checkpoints.
CHECKPOINT_EVERY = 100

failed_iterations = 0
successful_iterations = 0


def _save_checkpoint():
    """Write the current model and tokenizer files under `model_path`."""
    model.save_pretrained(model_path / "ThinkTAI")
    tokenizer.save_pretrained(model_path)
    print("Model checkpoint saved.")


# from_pretrained() returns the model in evaluation mode; switch to training
# mode so that dropout is active while fine-tuning.
model.train()

for epoch in range(num_epochs):
    for iteration, (input_ids, attention_mask) in enumerate(data_loader):
        # The output area is cleared on every iteration, so the epoch header is
        # re-printed together with the per-iteration status.
        IPython.display.clear_output(wait=True)
        print(f"Epoch {epoch+1}/{num_epochs}")
        print("-" * 10)
        print("Iteration: " + str(iteration))

        try:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            # Padding positions must not contribute to the loss: the model
            # ignores label -100, while the attention mask alone does not
            # exclude them from the loss.
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # A batch without any supervised target (for example only
            # one-token lines) gives a NaN loss; stepping on it would corrupt
            # the weights, so it is treated as a failed iteration.
            if not torch.isfinite(loss):
                raise ValueError("non-finite loss; the batch was skipped")

            loss.backward()
            optimizer.step()

            successful_iterations += 1
            print(f"Iteration {iteration+1}/{num_iterations}, Loss: {loss.item()}")
            print("Successfully trained the model.")

            if (iteration + 1) % CHECKPOINT_EVERY == 0:
                _save_checkpoint()
                print("Successfully saved the trained model.")
        except Exception as exc:
            # `Exception` (not a bare except) lets an interrupt stop the loop.
            failed_iterations += 1
            print("Failed to train the model.")
            print("Exception: " + type(exc).__name__ + ": " + str(exc))

        print("Successful iterations: " + str(successful_iterations))
        print("Failed iterations: " + str(failed_iterations))

    # Save the model checkpoint after each epoch
    try:
        _save_checkpoint()
        print("Successfully saved the trained model.")
    except Exception as exc:
        print("Failed to save the model checkpoint.")
        print("Exception: " + type(exc).__name__ + ": " + str(exc))

# Leave the model in evaluation mode so that later inference runs without dropout.
model.eval()

In [None]:
# Interactive user input loop
#
# Reads a prompt, generates a continuation with the model and prints it.
# Entering "q" (any case, surrounding whitespace ignored) ends the loop.

# Generation should run without dropout, whatever mode the model was left in.
model.eval()

while True:
    user_input = input("Enter your question (or 'q' to quit): ")
    IPython.display.clear_output(wait=True)

    command = user_input.strip()
    if command.lower() == "q":
        break
    if not command:
        # An empty prompt encodes to zero tokens, which generate() cannot
        # continue from; ask again instead of crashing.
        print("Please enter a non-empty question.")
        continue

    encoded_input = tokenizer.encode_plus(user_input, return_tensors="pt")
    input_ids = encoded_input.input_ids.to(device)
    attention_mask = encoded_input.attention_mask.to(device)
    with torch.no_grad():
        # max_length is passed unchanged to generate() as its length limit.
        output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=100, num_return_sequences=1)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    print("Response:", response)
