In [None]:
# Step 1: Import Libraries
import math
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import torch
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import wandb
# Authenticate with Weights & Biases before any training starts; the
# TrainingArguments further down set report_to="wandb".
# NOTE(review): presumably this prompts for an API key when none is configured — confirm
# before relying on unattended "Run All".
wandb.login()


In [None]:
# Step 2: Load and Prepare Data
def load_and_clean_text(filepath, column="Text"):
    """Read a CSV file and return one cleaned, lower-cased string per row.

    Each cell of ``column`` is processed in this order: missing values become
    ``""``, surrounding whitespace is stripped, every run of digits is
    removed, runs of whitespace collapse to a single space, and the result is
    lower-cased.

    Args:
        filepath: Location of the CSV, passed straight to ``pandas.read_csv``.
        column: Name of the column holding the raw text. Defaults to
            ``"Text"``, the column the notebook originally hard-coded.

    Returns:
        list[str]: The cleaned strings, one per CSV row, in file order. Rows
        with no text yield ``""``.

    Raises:
        KeyError: If ``column`` is not present in the CSV header.
    """
    df = pd.read_csv(filepath)
    if column not in df.columns:
        # Same exception type as a bare df[column] lookup, but with an
        # actionable message listing what the file actually contains.
        raise KeyError(
            f"Column {column!r} not found in {filepath!r}; "
            f"available columns: {list(df.columns)}"
        )

    def clean_text(text):
        text = str(text).strip()
        text = re.sub(r'\d+', '', text)              # Remove digits
        text = re.sub(r'\s+', ' ', text).strip()     # Normalize whitespace (digit removal can leave gaps)
        return text.lower()                          # Lowercase for consistency

    return df[column].fillna("").apply(clean_text).tolist()


In [None]:
# Load your CSV file (upload manually in Colab or mount Google Drive).
# The upload cell lives further down the notebook, so on a fresh runtime the
# file may not exist yet when this cell runs. Fetch it here when it is missing
# so that "Restart & Run All" works from top to bottom.
CSV_PATH = "alchtexts2_.csv"

if not os.path.exists(CSV_PATH):
    try:
        from google.colab import files
    except ImportError as exc:
        # Not running in Colab: there is no upload widget to fall back on.
        raise FileNotFoundError(
            f"{CSV_PATH!r} not found; place it in the working directory "
            "or mount the drive that contains it before running this cell."
        ) from exc
    files.upload()

text = load_and_clean_text(CSV_PATH)

In [None]:
# Colab-only: imports the google.colab file helper and asks the user to upload files.
# NOTE(review): this cell comes after the cell that reads "alchtexts2_.csv"; run it
# first (or move it above that cell) when the CSV is not yet in the working directory.
# `uploaded` is not referenced by any later cell visible in this notebook.
from google.colab import files
uploaded = files.upload()


In [None]:
# Save to .txt for dataset loading, one cleaned row per line.
# The encoding is pinned to UTF-8: without it open() uses the platform locale,
# which can raise UnicodeEncodeError (or write non-UTF-8 bytes) for the
# non-ASCII characters that scraped or historical text often contains.
with open("alchemy_cleaned.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in text)

In [None]:
# Step 3: Load Tokenizer and Model
# Both objects come from the same pretrained checkpoint name, so keep it in one place.
MODEL_NAME = "gpt2-medium"

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

In [None]:
# Step 4: Create Dataset for Fine-Tuning
def load_dataset_for_training(file_path, tokenizer, block_size=128):
    """Build the training dataset and its batch collator for one text file.

    Args:
        file_path: Path of the plain-text corpus handed to ``TextDataset``.
        tokenizer: Tokenizer shared by the dataset and the collator.
        block_size: Forwarded unchanged to ``TextDataset`` (default 128).

    Returns:
        tuple: ``(dataset, data_collator)`` — the ``TextDataset`` built from
        ``file_path`` and a ``DataCollatorForLanguageModeling`` created with
        ``mlm=False``.
    """
    # NOTE(review): TextDataset appears to be deprecated in recent transformers
    # releases — confirm against the installed version before upgrading.
    collator_options = {"tokenizer": tokenizer, "mlm": False}
    corpus_blocks = TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)
    return corpus_blocks, DataCollatorForLanguageModeling(**collator_options)

In [None]:
# Build the training dataset and collator from the cleaned corpus file,
# leaving block_size at the helper's default.
dataset, data_collator = load_dataset_for_training(
    file_path="alchemy_cleaned.txt",
    tokenizer=tokenizer,
)

In [None]:
# Step 5: Training Setup
# TrainingArguments is already imported in the first cell, so it is not
# re-imported here (keeps all imports in one place).
training_args = TrainingArguments(
    # Where checkpoints are written; existing contents may be overwritten.
    output_dir="./gpt2-alchemy",
    overwrite_output_dir=True,
    # Optimisation schedule.
    num_train_epochs=5,
    per_device_train_batch_size=1,
    # Checkpointing: save every 500 steps, keep only the most recent one.
    save_steps=500,
    save_total_limit=1,
    # Logging / reporting.
    logging_steps=100,
    prediction_loss_only=True,
    report_to="wandb",  # Enables wandb logging
    run_name="gpt2-alchemy-run1",  # Optional: useful to track runs
)


In [None]:
# Assemble the Trainer from the pieces prepared in the previous cells.
# No eval_dataset is supplied here.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
# Step 6: Train the Model
# Runs fine-tuning with the arguments configured above (5 epochs, per-device
# batch size 1); checkpoints are written under ./gpt2-alchemy every 500 steps
# and logs are reported to wandb every 100 steps.
trainer.train()

In [None]:
# Step 6.5: Evaluate the Model
# The Trainer was constructed without an eval_dataset, so a bare
# trainer.evaluate() has nothing to evaluate on and raises. Pass the dataset
# explicitly. No held-out split exists in this notebook, so the number below
# is perplexity on the *training* data and is an optimistic estimate.
eval_results = trainer.evaluate(eval_dataset=dataset)
print("Perplexity (training set):", math.exp(eval_results["eval_loss"]))


In [None]:
# Step 7: Save Fine-Tuned Model
# Write the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./gpt2-alchemy")