**Using GPT-2 for Model Training**

In [None]:
!pip install openpyxl
!pip install transformers
!pip install datasets
!pip install peft
!pip install torch torchvision torchaudio
!pip install torch_xla[tpu] -f https://storage.googleapis.com/tpu-pytorch/wheels/colab.html

Setting the dataset format to JSONL (one JSON object per line)

In [None]:
# file: preprocess_dataset.py
import pandas as pd
import re
import json
from tqdm import tqdm
# Read the source spreadsheet ("Sheet1") into a DataFrame; every later step in
# this cell iterates over `df`.
file_path = "/content/dataset_FYP.xlsx"
df = pd.read_excel(io=file_path, sheet_name="Sheet1")

# Clean helper
def clean_text(text: str) -> str:
    """Return a cell value as a trimmed string with quotes and braces removed.

    Args:
        text: Raw cell value. Missing values (anything ``pd.isna`` reports as
            NA) are accepted, and non-string values are converted with ``str``.

    Returns:
        An empty string for missing values; otherwise the value with every
        ``"``, ``{`` and ``}`` character deleted and surrounding whitespace
        stripped.
    """
    if not pd.isna(text):
        without_markup = re.sub(r'["{}]', '', str(text))
        return without_markup.strip()
    return ""

# Build input-output pairs: one {"input", "output"} record per spreadsheet row.
pairs = []
# df.iterrows() is a generator with no length, so tqdm needs an explicit total
# to show a real progress bar instead of a bare counter.
for _, row in tqdm(df.iterrows(), total=len(df)):
    question = clean_text(row["Question statment"])
    start = clean_text(row["Start State"])
    final = clean_text(row["Final States"])
    transitions = clean_text(row["Transitions"])

    # Normalize transitions into list format: "(a),(b)" -> "[a], [b]".
    # The separator is inserted directly instead of going through a temporary
    # "|" placeholder, so a literal "|" already present in the transition text
    # is left untouched rather than being rewritten to ", ".
    transitions = transitions.replace("),", "), ")
    transitions = transitions.replace("(", "[").replace(")", "]")

    input_text = f"INPUT: {question}"
    output_text = f"OUTPUT: START={start}; FINAL={final}; TRANSITIONS={transitions}"

    pairs.append({"input": input_text, "output": output_text})

# Write one JSON object per line (the layout read back later via
# load_dataset("json", ...)).
jsonl_path = "dataset_gpt2.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in pairs)

# Optional plain-text copy: input line, output line, then a blank line.
txt_path = "dataset_gpt2.txt"
with open(txt_path, "w", encoding="utf-8") as txt_file:
    txt_file.writelines(
        record["input"] + "\n" + record["output"] + "\n\n" for record in pairs
    )

print(f"Saved {len(pairs)} samples to {jsonl_path} and {txt_path}")


In [None]:
# file: finetune_gpt2.py
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# 1. Load the JSONL file, then carve the loaded "train" split into an 80/20
#    train/test split with a fixed seed so the partition is repeatable.
raw_dataset = load_dataset("json", data_files="/content/dataset_gpt2.jsonl")
dataset = raw_dataset["train"].train_test_split(test_size=0.2, seed=42)

# 2. Load the GPT-2 tokenizer and reuse its end-of-sequence token for padding,
#    since GPT-2 has no pad token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# 3. Preprocess function
def preprocess(example):
    """Tokenize one dataset record into a fixed-length training example.

    The JSONL written by the preprocessing cell already stores the fields as
    ``"INPUT: ..."`` and ``"OUTPUT: ..."``. Prefixing them again produced
    ``"INPUT: INPUT: ... OUTPUT: OUTPUT: ..."``, so each tag is now added only
    when the stored text does not already start with it.

    Args:
        example: A single record with string fields ``"input"`` and
            ``"output"``.

    Returns:
        The tokenizer output for the combined text, truncated/padded to 256
        tokens, with a ``"labels"`` entry that is a copy of ``"input_ids"``.
    """
    def _tagged(value, tag):
        # Leave already-tagged text alone; tag bare text.
        value = str(value).strip()
        return value if value.startswith(tag) else f"{tag} {value}"

    text = f"{_tagged(example['input'], 'INPUT:')} {_tagged(example['output'], 'OUTPUT:')}"
    tokens = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        # NOTE(review): anything past 256 tokens is silently cut off — confirm
        # the longest question + answer fits, or the target gets truncated.
        max_length=256,
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# Run `preprocess` over every split; batched=False hands it one record per call.
tokenized_dataset = dataset.map(function=preprocess, batched=False)

# 4. Load the pretrained GPT-2 language-model weights to fine-tune
model = GPT2LMHeadModel.from_pretrained("gpt2")

# 5. Data collator configured for causal (non-masked) language modelling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 6. Accuracy metric (exact match)
def preprocess_logits_for_metrics(logits, labels):
    """Reduce each evaluation batch's logits to predicted token ids.

    The Trainer caches whatever this returns for every evaluation batch and
    hands the concatenation to ``compute_metrics``. Keeping only the argmax
    ids avoids holding full-vocabulary logits for the whole evaluation set in
    memory.

    Args:
        logits: Model output for one batch (a tensor, or a tuple whose first
            element is the logits tensor).
        labels: Labels for the batch (unused).

    Returns:
        Tensor of predicted token ids with the vocabulary axis removed.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics(eval_pred):
    """Compute next-token exact-match and token accuracy.

    Fixes two problems in the previous version:
      * A causal LM's output at position ``i`` predicts the token at
        position ``i + 1``, so predictions must be compared with the labels
        shifted by one position.
      * Positions labelled ``-100`` are ignored by the loss; they are now
        excluded from scoring instead of being rewritten to the pad id (which
        required the model to predict padding to count as a match).

    Args:
        eval_pred: ``(predictions, labels)``. ``predictions`` may be token ids
            of the same shape as ``labels`` or raw logits with one extra
            trailing vocabulary axis.

    Returns:
        ``{"exact_match_acc": ..., "token_acc": ...}`` — the fraction of
        sequences whose scored tokens are all correct, and the fraction of
        scored tokens that are correct.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    if predictions.ndim == labels.ndim + 1:
        # Raw logits were passed through: collapse the vocabulary axis here.
        predictions = predictions.argmax(axis=-1)

    # Align "prediction made at position i" with "token at position i + 1".
    predicted_next = predictions[:, :-1]
    target_next = labels[:, 1:]
    scored = target_next != -100

    token_hits = predicted_next == target_next
    sequence_hits = (token_hits | ~scored).all(axis=1)

    # NOTE(review): labels are produced by the data collator; if they cover the
    # prompt as well as the answer, exact match also requires reproducing the
    # question text — confirm whether prompt tokens should be masked out.
    exact = float(sequence_hits.mean()) if sequence_hits.size else 0.0
    token_acc = float(token_hits[scored].mean()) if scored.any() else 0.0

    return {"exact_match_acc": exact, "token_acc": token_acc}

# 7. Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-fsm",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,  # keep per-batch eval memory in line with training
    save_strategy="epoch",
    save_total_limit=2,  # keep only the newest checkpoints instead of one per epoch
    logging_strategy="steps",
    logging_steps=50,
    eval_strategy="epoch",  # Evaluate each epoch
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
)

# 8. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Argmax each batch before it is cached so evaluation does not accumulate
    # full-vocabulary logits for the whole evaluation set.
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# 9. Train
trainer.train()

# 10. Save model + tokenizer (previously commented out, so the message below
#     was printed without the final model being written to this directory)
trainer.save_model("./gpt2-finetuned-fsm")
tokenizer.save_pretrained("./gpt2-finetuned-fsm")

print("✅ Training complete. Model + tokenizer saved at ./gpt2-finetuned-fsm")


**TODO:** Having memory issues; need to consult someone about how to solve this.

In [None]:
# Reload the JSONL file without splitting it. This rebinds `dataset`, so the
# name no longer refers to any earlier train/test split.
dataset = load_dataset(
    "json",
    data_files="/content/dataset_gpt2.jsonl",
)

In [None]:
# Display the feature descriptor of the "input" column in the train split.
train_features = dataset["train"].features
train_features["input"]