In [1]:
import math
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import torch
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from torch.utils.data import Dataset, random_split
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

# Release any cached GPU memory left over from a previous run in this kernel.
torch.cuda.empty_cache()

2024-09-03 22:33:51.326758: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-03 22:33:51.371686: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Reproducibility: seed every random number generator the notebook relies on.
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Args:
        seed: Integer handed to `random.seed`, `np.random.seed`,
            `torch.manual_seed` and `torch.cuda.manual_seed_all`.
    """
    # Ask cuDNN for deterministic behaviour and switch benchmark mode off.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

    # The seeding calls are independent of each other, so apply them in one pass.
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)


# Any integer works here; change it to get a different (but repeatable) run.
set_seed(42)

In [3]:
class TextDataset(Dataset):
    """Fixed-length token blocks cut from a single UTF-8 text file.

    The file is read in full, encoded with `tokenizer.encode`, and the
    resulting token list is sliced into consecutive, non-overlapping chunks of
    `block_size` tokens. Tokens left over after the last complete chunk are
    discarded.

    Args:
        file_path: Path of the text file to load.
        tokenizer: Object exposing `encode(text)` that returns a sequence of
            token ids.
        block_size: Number of tokens in every example.
    """

    def __init__(self, file_path, tokenizer, block_size):
        with open(file_path, 'r', encoding='utf-8') as handle:
            corpus = handle.read()
        token_ids = tokenizer.encode(corpus)

        # Walk over the start offset of every complete block; a trailing
        # partial block never gets a start offset and is therefore skipped.
        blocks = []
        for start in range(0, len(token_ids) - block_size + 1, block_size):
            blocks.append(token_ids[start:start + block_size])

        self.examples = blocks
        print(f"Loaded {len(self.examples)} examples.")

    def __len__(self):
        """Return the number of complete blocks."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return block `i` as a 1-D `torch.long` tensor."""
        block = self.examples[i]
        return torch.tensor(block, dtype=torch.long)

In [4]:
class CustomTrainer(Trainer):
    """`Trainer` subclass that keeps an in-memory history of losses.

    Attributes:
        train_loss: List with one Python float per `training_step` call.
        val_loss: List with one entry per evaluation loop that reported an
            `eval_loss` metric.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.train_loss = []
        self.val_loss = []

    def training_step(self, model, inputs, *args, **kwargs):
        """Run the parent training step and record the loss it returns.

        Any additional positional/keyword arguments are forwarded untouched so
        this override keeps working with Trainer releases whose
        `training_step` receives more than `(model, inputs)` (for example
        `num_items_in_batch`); a fixed two-argument signature would raise
        `TypeError` there.

        Returns:
            The loss tensor produced by `Trainer.training_step`.
        """
        loss = super().training_step(model, inputs, *args, **kwargs)
        self.train_loss.append(loss.item())
        return loss

    def evaluation_loop(self, *args, **kwargs):
        """Run the parent evaluation loop and record `eval_loss` if present.

        The loop is also used with other metric prefixes (and can run without
        labels), in which case no `eval_loss` key exists; those runs are passed
        through without touching `val_loss` instead of raising `KeyError`.

        Returns:
            The unmodified output object of `Trainer.evaluation_loop`.
        """
        output = super().evaluation_loop(*args, **kwargs)
        eval_loss = output.metrics.get('eval_loss')
        if eval_loss is not None:
            self.val_loss.append(eval_loss)
        return output

In [5]:
class SaveEpochCallback(TrainerCallback):
    """Trainer callback that snapshots the model and tokenizer every N epochs.

    Args:
        save_epochs: Interval, in epochs, between two snapshots.
        output_dir: Directory under which `checkpoint-epoch-<n>` folders are
            created.
        tokenizer: Tokenizer saved next to the model in every snapshot.
    """

    def __init__(self, save_epochs, output_dir, tokenizer):
        self.save_epochs = save_epochs
        self.output_dir = output_dir
        self.tokenizer = tokenizer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Save a snapshot when the epoch that just finished is a multiple of `save_epochs`."""
        # `state.epoch` is a float. When it has not reached the next whole
        # number at epoch end (presumably possible when the last
        # gradient-accumulation window of the epoch is incomplete -- confirm
        # against the installed Trainer), int() would truncate it to the
        # previous epoch, shifting every checkpoint label and writing a bogus
        # "checkpoint-epoch-0". Rounding up yields the 1-based number of the
        # epoch that just ended.
        epoch = math.ceil(state.epoch)
        print(f"Callback triggered for epoch {epoch}")

        # `epoch > 0` guards against 0 % n == 0 producing an epoch-0 snapshot.
        if epoch > 0 and epoch % self.save_epochs == 0:
            checkpoint_dir = os.path.join(self.output_dir, f"checkpoint-epoch-{epoch}")
            print(f"Attempting to save checkpoint for epoch {epoch} to {checkpoint_dir}")
            if 'model' in kwargs:
                kwargs['model'].save_pretrained(checkpoint_dir)
                self.tokenizer.save_pretrained(checkpoint_dir)
                print(f"Saved checkpoint for epoch {epoch} to {checkpoint_dir}")
            else:
                print("Model not found in kwargs, unable to save checkpoint")

In [6]:
def generate_text(model, tokenizer, max_length=100):
    """Sample one unconditional text continuation seeded with the BOS token.

    Args:
        model: Object exposing `.device` and `.generate(input_ids, **kwargs)`.
        tokenizer: Tokenizer providing `bos_token_id`, `eos_token_id` and
            `decode`.
        max_length: Value forwarded as `max_length` to `model.generate`.

    Returns:
        The decoded first returned sequence with special tokens skipped and
        surrounding whitespace stripped.
    """
    target_device = model.device
    start_ids = torch.tensor([[tokenizer.bos_token_id]]).to(target_device)
    mask = torch.ones_like(start_ids).to(target_device)

    # Sampling configuration: top-k / nucleus sampling with a mild temperature
    # and a ban on repeated bigrams; EOS doubles as the padding token.
    sampling_options = dict(
        attention_mask=mask,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.75,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    with torch.no_grad():
        sequences = model.generate(start_ids, **sampling_options)

    decoded = tokenizer.decode(sequences[0], skip_special_tokens=True)
    return decoded.strip()

def generate_text_with_prompt(model, tokenizer, prompt, max_length=200):
    """Sample one continuation of `prompt`.

    Args:
        model: Object exposing `.device` and `.generate(input_ids, **kwargs)`.
        tokenizer: Tokenizer providing `encode`, `decode`, `bos_token_id` and
            `eos_token_id`.
        prompt: Text the generation is conditioned on.
        max_length: Budget added on top of the prompt's token count; the sum is
            what `model.generate` receives as `max_length`.

    Returns:
        The decoded first returned sequence with special tokens skipped and
        surrounding whitespace stripped.
    """
    target_device = model.device
    prompt_ids = tokenizer.encode(prompt, return_tensors='pt').to(target_device)
    mask = torch.ones_like(prompt_ids).to(target_device)

    # The length limit is extended by the prompt's own token count.
    total_length = max_length + prompt_ids.shape[1]

    sampling_options = dict(
        attention_mask=mask,
        max_length=total_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.75,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    with torch.no_grad():
        sequences = model.generate(prompt_ids, **sampling_options)

    decoded = tokenizer.decode(sequences[0], skip_special_tokens=True)
    return decoded.strip()

In [7]:
# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the pretrained GPT-2 checkpoint and its tokenizer.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# No pad token is configured up to this point, so EOS is reused for padding
# on both the tokenizer and the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

model.to(device)

# LoRA adapter configuration for causal language modelling.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_attn", "c_proj"],
    # GPT-2 implements these projections as Conv1D layers, which store their
    # weight as (fan_in, fan_out); the PEFT documentation asks for this flag
    # to be True in that case. Setting it explicitly avoids relying on PEFT's
    # warn-and-override fallback.
    fan_in_fan_out=True,
)

# Wrap the base model with the LoRA adapter described by `peft_config`.
model = get_peft_model(model, peft_config)

Using device: cuda




In [None]:
# Cut the processed speech corpus into fixed-length token blocks.
file_path = "trump_speeches_combined_processed.txt"
block_size = 128  # tokens per example
full_dataset = TextDataset(file_path, tokenizer, block_size)

# 90/10 train/validation split; the validation part absorbs the rounding remainder.
num_examples = len(full_dataset)
train_size = int(0.9 * num_examples)
val_size = num_examples - train_size
split_lengths = [train_size, val_size]
train_dataset, val_dataset = random_split(full_dataset, split_lengths)

In [None]:
# Training setup
total_epochs = 1000  # number of epochs handed to the Trainer
save_epochs = 10     # snapshot interval (in epochs) used by SaveEpochCallback

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=total_epochs,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 4 samples x 8 accumulation steps = 32 samples per optimizer update (per device).
    gradient_accumulation_steps=8,
    learning_rate=0.001,
    warmup_steps=100,
    logging_dir='./logs',
    logging_steps=100,
    # NOTE(review): newer Transformers releases name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="no",  # We'll handle saving with our custom callback
    # fp16 mixed precision needs an accelerator; TrainingArguments rejects it
    # on a CPU-only host. Enable it only when CUDA is present so the notebook's
    # CPU fallback stays usable.
    fp16=torch.cuda.is_available(),
    optim="adamw_torch",
    load_best_model_at_end=False,  # We're not using the default saving strategy
)

# mlm=False selects the collator's non-masked (causal) language-modelling mode.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Pass the tokenizer to the callback so each snapshot contains it as well.
save_callback = SaveEpochCallback(save_epochs, training_args.output_dir, tokenizer)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    callbacks=[save_callback],
)

In [None]:
# Train the model.
# Runs the fine-tuning loop configured on `trainer`; the two prints only mark
# the start and the end of the run in the cell output.
print("Starting training...")
trainer.train()
print("Training completed")

In [None]:
# Manually save the final state.
# TrainingArguments was created with save_strategy="no", so the Trainer itself
# writes nothing; this stores the end-of-training model and the tokenizer in
# "<output_dir>/final-checkpoint".
final_checkpoint_dir = os.path.join(training_args.output_dir, "final-checkpoint")
trainer.save_model(final_checkpoint_dir)
tokenizer.save_pretrained(final_checkpoint_dir)
print(f"Saved final checkpoint to {final_checkpoint_dir}")

In [None]:
prompt = "What are your thoughts on Kamala Harris running as president against you, do you think she'll win?"

# Replay every periodic snapshot to see how generations evolve during training.
for epoch in range(save_epochs, total_epochs + 1, save_epochs):
    # Same location the epoch callback writes to.
    checkpoint_dir = os.path.join(training_args.output_dir, f"checkpoint-epoch-{epoch}")
    if not os.path.exists(checkpoint_dir):
        continue

    # The snapshots were saved from the PEFT-wrapped model, so they are
    # expected to hold LoRA adapter weights rather than a full GPT-2
    # checkpoint. Rebuild the base model first and then load the trained
    # adapter on top of it; calling get_peft_model() here would instead attach
    # a freshly initialised (untrained) adapter.
    base_model = GPT2LMHeadModel.from_pretrained(model_name)
    loaded_model = PeftModel.from_pretrained(base_model, checkpoint_dir)
    loaded_model.to(device)
    loaded_model.eval()  # disable dropout for generation

    print(f"\nGenerated text after epoch {epoch}:")
    print(generate_text(loaded_model, tokenizer))

    print(f"\nGenerated text with prompt after epoch {epoch}:")
    print(f"Prompt: {prompt}")
    print(f"Response: {generate_text_with_prompt(loaded_model, tokenizer, prompt)}")

    # Drop the snapshot before loading the next one so models do not pile up
    # in GPU memory.
    del loaded_model, base_model
    torch.cuda.empty_cache()

# Calculate final perplexity on validation set (perplexity = exp(mean loss)).
val_loss = trainer.evaluate()['eval_loss']
val_perplexity = math.exp(val_loss)
print(f"Final Validation Perplexity: {val_perplexity:.2f}")

# Generate text with final model
model.eval()  # make sure dropout is off for sampling
print("\nFinal generated text:")
print(generate_text(model, tokenizer))

print("\nFinal generated text with prompt:")
print(f"Prompt: {prompt}")
print(f"Response: {generate_text_with_prompt(model, tokenizer, prompt)}")

In [None]:
# Export the trained model and the tokenizer to a separate directory.
# NOTE(review): `model` was wrapped with get_peft_model earlier, so this
# presumably stores only the LoRA adapter rather than merged GPT-2 weights --
# confirm before sharing the directory as a standalone model.
final_output_dir = "./fine_tuned_gpt2_final"
trainer.save_model(final_output_dir)
tokenizer.save_pretrained(final_output_dir)
print(f"Final model saved to {final_output_dir}")