In [25]:
import torch
torch.cuda.empty_cache()
from torch.utils.data import Dataset, random_split
from transformers import (
    GPT2LMHeadModel, 
    GPT2Tokenizer, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling,
    TrainerCallback
)
from peft import get_peft_model, LoraConfig, TaskType
import matplotlib.pyplot as plt
import math
import os
import random
import numpy as np

In [26]:
# Set seeds for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set the seed
set_seed(42)  # You can change this to any integer value

In [27]:
class TextDataset(Dataset):
    def __init__(self, file_path, tokenizer, block_size):
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        tokenized_text = tokenizer.encode(text)
        self.examples = [tokenized_text[i:i + block_size] for i in range(0, len(tokenized_text) - block_size + 1, block_size)]
        print(f"Loaded {len(self.examples)} examples.")

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)

In [28]:
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.train_loss = []
        self.val_loss = []

    def training_step(self, model, inputs):
        loss = super().training_step(model, inputs)
        self.train_loss.append(loss.item())
        return loss
    
    def evaluation_loop(self, *args, **kwargs):
        output = super().evaluation_loop(*args, **kwargs)
        self.val_loss.append(output.metrics['eval_loss'])
        return output

In [29]:
class SaveEpochCallback(TrainerCallback):
    def __init__(self, save_epochs, output_dir, tokenizer):
        self.save_epochs = save_epochs
        self.output_dir = output_dir
        self.tokenizer = tokenizer

    def on_epoch_end(self, args, state, control, **kwargs):
        epoch = int(state.epoch)
        print(f"Callback triggered for epoch {epoch}")
        if epoch % self.save_epochs == 0:
            checkpoint_dir = os.path.join(self.output_dir, f"checkpoint-epoch-{epoch}")
            print(f"Attempting to save checkpoint for epoch {epoch} to {checkpoint_dir}")
            if 'model' in kwargs:
                kwargs['model'].save_pretrained(checkpoint_dir)
                self.tokenizer.save_pretrained(checkpoint_dir)
                print(f"Saved checkpoint for epoch {epoch} to {checkpoint_dir}")
            else:
                print("Model not found in kwargs, unable to save checkpoint")

In [30]:
def generate_text(model, tokenizer, max_length=100):
    input_ids = torch.tensor([[tokenizer.bos_token_id]]).to(model.device)
    attention_mask = torch.ones_like(input_ids).to(model.device)
    
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.75,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    return tokenizer.decode(output[0], skip_special_tokens=True).strip()

def generate_text_with_prompt(model, tokenizer, prompt, max_length=200):
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
    attention_mask = torch.ones_like(input_ids).to(model.device)
    
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length + len(input_ids[0]),
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.75,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    return tokenizer.decode(output[0], skip_special_tokens=True).strip()

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

model.to(device)

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_attn", "c_proj"]
)

model = get_peft_model(model, peft_config)

Using device: cuda




In [32]:
file_path = "trump_speeches_combined_processed.txt"
block_size = 128
full_dataset = TextDataset(file_path, tokenizer, block_size)

train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

Token indices sequence length is longer than the specified maximum sequence length for this model (1022810 > 1024). Running this sequence through the model will result in indexing errors


Loaded 7990 examples.


In [33]:
# Training setup
total_epochs = 1000
save_epochs = 10

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=total_epochs,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=0.001,
    warmup_steps=100,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="no",  # We'll handle saving with our custom callback
    fp16=True,
    optim="adamw_torch",
    load_best_model_at_end=False,  # We're not using the default saving strategy
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Pass the tokenizer to the callback
save_callback = SaveEpochCallback(save_epochs, training_args.output_dir, tokenizer)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    callbacks=[save_callback],
)



In [None]:
# Train the model
print("Starting training...")
trainer.train()
print("Training completed")

In [38]:
# Manually save the final state if needed
final_checkpoint_dir = os.path.join(training_args.output_dir, "final-checkpoint")
trainer.save_model(final_checkpoint_dir)
tokenizer.save_pretrained(final_checkpoint_dir)
print(f"Saved final checkpoint to {final_checkpoint_dir}")

Saved final checkpoint to ./results/final-checkpoint


In [43]:
prompt = "What are your thoughts on free healthcare for all Americans?"
interval = 100
for epoch in range(interval, total_epochs + 1, interval):
    checkpoint_dir = f"./results/checkpoint-epoch-{epoch}"
    if os.path.exists(checkpoint_dir):
        loaded_model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)
        loaded_model = get_peft_model(loaded_model, peft_config)
        loaded_model.to(device)
        
        print(f"\nGenerated text after epoch {epoch}:")
        print(generate_text(loaded_model, tokenizer))

        print(f"\nGenerated text with prompt after epoch {epoch}:")
        print(f"Prompt: {prompt}")
        print(f"Response: {generate_text_with_prompt(loaded_model, tokenizer, prompt)}")

# Calculate final perplexity on validation set
val_loss = trainer.evaluate()['eval_loss']
val_perplexity = math.exp(val_loss)
print(f"Final Validation Perplexity: {val_perplexity:.2f}")

# Generate text with final model
print("\nFinal generated text:")
print(generate_text(model, tokenizer))

print("\nFinal generated text with prompt:")
print(f"Prompt: {prompt}")
print(f"Response: {generate_text_with_prompt(model, tokenizer, prompt)}")


Generated text after epoch 100:
, the latest on the rise of anti-immigration sentiment. (Peter Stevenson/The Washington Post)

In a statement released Monday, Trump's campaign announced that it had hired an outside lawyer to assist the campaign and that its senior staff was "working on legal issues," with the final deadline to hire an attorney by July 14.
- 'It's not the government that's paying attention to the president's tweets,' says former GOP strategist
, former Trump campaign adviser, and Trump loyal

Generated text with prompt after epoch 100:
Prompt: What are your thoughts on free healthcare for all Americans?
Response: What are your thoughts on free healthcare for all Americans? Let us know in the comments below!

Images via Flickr, Shutterstock, and Creative Commons.

Generated text after epoch 200:
Bruno Massaro is an American singer and songwriter, and he is one of the most famous and influential artists of his generation. His solo work has made him one the biggest and mo

In [40]:
final_output_dir = "./fine_tuned_gpt2_final"
trainer.save_model(final_output_dir)
tokenizer.save_pretrained(final_output_dir)
print(f"Final model saved to {final_output_dir}")

Final model saved to ./fine_tuned_gpt2_final
