<a href="https://colab.research.google.com/github/Jashkine/Finetune_llm/blob/main/finetune_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade fsspec datasets

Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# ✅ Pretrained GPT-2 checkpoint: tokenizer first, then the causal-LM weights
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# GPT-2 has no padding token of its own, so end-of-sequence doubles as padding
tokenizer.pad_token = tokenizer.eos_token
# Keep the embedding matrix sized to the tokenizer's vocabulary
model.resize_token_embeddings(len(tokenizer))




In [None]:
# ✅ Public corpus used for fine-tuning: the raw WikiText-2 configuration
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

def tokenize_function(examples):
    """Tokenize a batch of texts and build causal-LM labels.

    Args:
        examples: Batch dict from `datasets.map(batched=True)`; its "text"
            entry is the list of strings to encode.

    Returns:
        The tokenizer output (fixed length 128, padded/truncated) with an
        added "labels" entry: a copy of "input_ids" in which every padded
        position is replaced by -100.
    """
    result = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    # The pad token is the EOS token, so padded positions cannot be told apart
    # by id. Use the attention mask instead and set them to -100, the label
    # value the loss ignores; otherwise loss and perplexity are dominated by
    # trivially predictable padding.
    result["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(result["input_ids"], result["attention_mask"])
    ]
    return result

# Blank rows would tokenize to pure padding, i.e. sequences with no target
# tokens at all once padding is masked; drop them before tokenizing.
non_empty_dataset = dataset.filter(lambda example: example["text"].strip() != "")

tokenized_dataset = non_empty_dataset.map(tokenize_function, batched=True, remove_columns=["text"])


In [None]:
# ✅ Define training arguments
import torch

training_args = TrainingArguments(
    output_dir="./gpt2-wikitext-results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    # Name the logging backend explicitly: the choice is made when the
    # arguments/Trainer are built, so it must not depend on an environment
    # variable set in a later cell. Requires the tensorboard package.
    report_to="tensorboard",
    # Mixed precision only when a CUDA GPU is present; fall back to full
    # precision on CPU-only runtimes instead of failing.
    fp16=torch.cuda.is_available(),
)

# ✅ Trainer wired to the model, the training arguments, and the training split
train_split = tokenized_dataset["train"]
trainer = Trainer(model=model, args=training_args, train_dataset=train_split)


In [None]:
import os

# Switch off the Weights & Biases integration via its environment variable.
# NOTE(review): this presumably has to run before TrainingArguments/Trainer are
# constructed to have any effect — confirm the cell order.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:

# ✅ Start training
# Runs the fine-tuning loop configured on `trainer`. As the last expression in
# the cell, the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Single directory that receives both the fine-tuned model and the tokenizer
save_directory = "./gpt2-finetuned"

# Model weights and config first ...
trainer.save_model(output_dir=save_directory)
# ... then the tokenizer files, so the directory can be reloaded on its own
tokenizer.save_pretrained(save_directory=save_directory)


In [None]:
# Score the fine-tuned model on the validation split and show the raw metrics
validation_split = tokenized_dataset["validation"]
results = trainer.evaluate(eval_dataset=validation_split)
print(results)

In [None]:
import math

eval_loss = results["eval_loss"]
# Perplexity is exp(loss). math.exp raises OverflowError for very large losses
# (e.g. a diverged run), so report infinity instead of crashing the cell.
try:
    perplexity = math.exp(eval_loss)
except OverflowError:
    perplexity = float("inf")
print(f"Perplexity: {perplexity:.2f}")


In [None]:
import math

# Evaluate on the test split and convert its loss to perplexity
test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
# math.exp raises OverflowError for very large losses; report infinity instead
try:
    test_perplexity = math.exp(test_results["eval_loss"])
except OverflowError:
    test_perplexity = float("inf")
print(f"Test Perplexity: {test_perplexity:.2f}")


In [None]:
import matplotlib.pyplot as plt
import json

# Path to your training log file (adjust if different)
# NOTE: placeholder only — nothing below reads this path.
log_path = "./logs/events.out.tfevents..."

# The Trainer keeps every logged record in memory; read the curves from there
log_history = trainer.state.log_history

# Training records carry a 'loss' key, evaluation records an 'eval_loss' key.
# Each value is paired with the step stored in the SAME record, so evaluation
# points land at the step where they were measured rather than being lined up
# against the first few training steps.
steps = [x['step'] for x in log_history if 'loss' in x]
train_losses = [x['loss'] for x in log_history if 'loss' in x]
eval_steps = [x['step'] for x in log_history if 'eval_loss' in x]
eval_losses = [x['eval_loss'] for x in log_history if 'eval_loss' in x]

plt.plot(steps, train_losses, label="Train Loss")
if eval_losses:
    # Markers rather than a line: several evaluations may share one step
    # (every record with an 'eval_loss' key is included here).
    plt.plot(eval_steps, eval_losses, "o", label="Eval Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.show()


In [None]:
from transformers import pipeline

# Assuming 'model' and 'tokenizer' are your in-memory fine-tuned objects

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "Once upon a time"
# Asking for several sequences is only valid with sampling (or beam search);
# request sampling explicitly instead of relying on config defaults.
outputs = generator(prompt, max_length=50, num_return_sequences=3, do_sample=True)

for i, output in enumerate(outputs):
    print(f"Generated #{i+1}: {output['generated_text']}\n")


In [None]:
# Import everything this cell uses so it also runs in a freshly restarted
# session, where the earlier import cell has not been executed.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load the model and tokenizer (if you restarted your session)
model_name_or_path = "./gpt2-finetuned"  # or wherever you saved your fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "Once upon a time"
# Asking for several sequences is only valid with sampling (or beam search);
# request sampling explicitly instead of relying on config defaults.
outputs = generator(prompt, max_length=50, num_return_sequences=3, do_sample=True)

for i, output in enumerate(outputs):
    print(f"Generated #{i+1}: {output['generated_text']}\n")


In [None]:
# Load the TensorBoard notebook extension and open it on ./logs, the directory
# passed as logging_dir to TrainingArguments.
%load_ext tensorboard
%tensorboard --logdir ./logs
