In [1]:
import os
import glob
import numpy as np
import IPython
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Specify the folder path containing the text files
folder_path = "C:/Users/jjbor/Documents/Projects/LM/Training_Data/"

# Get the paths of all .txt files in the folder
data_files = glob.glob(os.path.join(folder_path, "*.txt"))

In [3]:
# Initialize an empty list to store the loaded examples
examples = []

# Load examples from each file
for data_file in data_files:
    with open(data_file, "r") as file:
        examples.extend(file.readlines())

In [4]:
# Determine the maximum length among all examples
max_length = max(len(example) for example in examples)
print(max_length)

4171


In [5]:
# # Pad or truncate examples to the maximum length
# for i in range(len(examples)):
#     example = examples[i]
#     print(len(example))
#     if len(example) < max_length:
#         # Pad the example with spaces at the end
#         examples[i] = example.rstrip() + " " * (max_length - len(example))
#     elif len(example) > max_length:
#         # Truncate the example to the maximum length
#         examples[i] = example[:max_length]
#     print(len(examples[i]))

In [6]:
# Define the TextDataset class
class TextDataset(Dataset):
    def __init__(self, examples, tokenizer):
        self.examples = examples
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.examples["input_ids"])

    def __getitem__(self, idx):
        input_ids = self.examples["input_ids"][idx]
        attention_mask = self.examples["attention_mask"][idx]
        return input_ids, attention_mask

In [7]:
model_path = "C:/Users/jjbor/Documents/Projects/LM/Model"

try:
    # Try to load the saved model
    model = GPT2LMHeadModel.from_pretrained(model_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_path, padding=True)
    tokenizer.pad_token = tokenizer.eos_token
    print("Successfully loaded the existing model.")
except:
    # Fallback to the GPT2 model if loading fails
    print("Failed to load the existing model. Fallback to the GPT2 model.")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2", padding=True)
    tokenizer.pad_token = tokenizer.eos_token

Successfully loaded the existing model.


In [8]:
# Pad or truncate examples to the maximum length using the tokenizer's padding method
padded_examples = tokenizer(examples, padding=True, truncation=True, max_length=max_length)

In [9]:
batch_size = 1
num_epochs = 1
learning_rate = 1e-5

In [10]:
dataset = TextDataset(padded_examples, tokenizer)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [11]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()



In [12]:
for epoch in range(num_epochs):
    for input_ids, attention_mask in data_loader:
        optimizer.zero_grad()
        # Convert input_ids and attention_mask to tensors
        input_ids = torch.tensor(input_ids).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        attention_mask = torch.tensor(attention_mask).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        labels = input_ids.clone()
        labels[labels == tokenizer.pad_token_id] = -100
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

IndexError: index out of range in self

In [None]:
# Save the trained model
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Load the saved 
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

In [None]:
while True:
    user_input = input("Enter your question (or 'q' to quit): ")
    IPython.display.clear_output(wait=True)
    if user_input.lower() == "q":
        break
    encoded_input = tokenizer.encode_plus(user_input, return_tensors="pt")
    input_ids = encoded_input.input_ids.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    attention_mask = encoded_input.attention_mask.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=100, num_return_sequences=1)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    print("Response:", response)