**Using T5 without dependencies — let's see how well this works**

In [None]:
# file: preprocess_t5_dataset_cleaned.py
import pandas as pd
import json
import re

# 1. Read the raw FSM dataset from the Excel workbook into a DataFrame
excel_path = "/content/dataset_FYP.xlsx"
df = pd.read_excel(excel_path)

# 2. Clean generic text
def clean_text(text):
    """Return a cleaned, ASCII-only, single-spaced copy of ``text``.

    Quotes and round/square/curly brackets are removed, every run of
    whitespace (spaces, tabs, newlines) collapses to one space, non-ASCII
    characters are dropped, and leading/trailing whitespace is stripped.
    Spaces between words are preserved so natural-language questions stay
    readable.

    Args:
        text: Cell value to clean. Any non-``str`` value (``None``, NaN,
            numbers) is treated as missing.

    Returns:
        The cleaned string; ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Remove quotes and brackets only. The previous pattern also contained a
    # literal space, which glued all the words of a sentence together.
    text = re.sub(r"[\"'\[\]\(\)\{\}]", "", text)
    # Collapse whitespace first so Unicode spaces become a plain " " before
    # the non-ASCII pass would otherwise delete them outright.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\x00-\x7F]+", "", text)
    # Dropping characters can leave two spaces side by side; squeeze again.
    text = re.sub(r" {2,}", " ", text)
    return text.strip()

# 3. Normalize FSM transitions from "{(q0,0,q0),...}" → "q0,0->q0; q0,1->q1; ..."
def normalize_transitions(transitions: str) -> str:
    """Rewrite a flat transition list as ``"src,symbol->dest; ..."``.

    Surrounding quotes/braces and all parentheses are removed, the remainder
    is split on commas, empty fields are discarded, and the surviving fields
    are consumed three at a time as ``(source, symbol, destination)``.
    A trailing group with fewer than three fields is dropped.

    Args:
        transitions: Text such as ``"{(q0,0,q0),(q0,1,q1)}"``.

    Returns:
        The normalized transitions joined by ``"; "``, or ``""`` when the
        input is not a string or holds no complete triple.
    """
    if not isinstance(transitions, str):
        return ""

    flat = transitions.strip().strip("\"'{}")
    for paren in "()":
        flat = flat.replace(paren, "")

    fields = [field for field in (piece.strip() for piece in flat.split(",")) if field]

    # zip() stops at the shortest slice, so an incomplete final group is ignored.
    triples = zip(fields[0::3], fields[1::3], fields[2::3])
    return "; ".join(f"{src},{symbol}->{dest}" for src, symbol, dest in triples)


# 4. Clean every required column; transitions are cleaned and then normalized
required_columns = ["Question statment", "Start State", "Final States", "Transitions"]
for column in required_columns[:3]:
    df[column] = df[column].apply(clean_text)
df["Transitions"] = df["Transitions"].apply(clean_text).apply(normalize_transitions)

# Discard rows where any required field is missing or blank after cleaning
df = df.dropna(subset=required_columns)
non_blank = None
for column in required_columns:
    column_ok = df[column].str.strip() != ""
    non_blank = column_ok if non_blank is None else non_blank & column_ok
df = df[non_blank]

# 5. Build T5 input/output pairs: the question is the prompt, the FSM
#    components are serialized into a single target string
pairs = [
    {
        "input_text": f"fsm description: {question}",
        "target_text": f"start: {start}; final: {final}; transitions: {transitions}",
    }
    for question, start, final, transitions in zip(
        df["Question statment"],
        df["Start State"],
        df["Final States"],
        df["Transitions"],
    )
]

# 6. Save as JSONL (one JSON object per line, HuggingFace compatible)
output_path = "/content/dataset_t5.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for p in pairs:
        f.write(json.dumps(p, ensure_ascii=False) + "\n")


In [None]:
#!pip install -q transformers datasets accelerate peft evaluate rouge_score


import os
import numpy as np
import torch
from datasets import load_dataset
import evaluate
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from peft import get_peft_model, LoraConfig, TaskType

# 2) Config: paths, base checkpoint and token budgets for prompts/targets
dataset_path = "/content/dataset_t5.jsonl"
output_dir = "./t5-finetuned-fsm"
model_name = "t5-small"
max_input_length = max_target_length = 128

# 3) Load the JSONL file and hold out 20% for evaluation (fixed seed)
ds = load_dataset("json", data_files=dataset_path, split="train")
ds = ds.train_test_split(test_size=0.2, seed=42)
print(f"Train size: {len(ds['train'])}, Eval size: {len(ds['test'])}")

# 4) Tokenizer & model; fall back to EOS as the pad token if none is defined
tokenizer = T5Tokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = T5ForConditionalGeneration.from_pretrained(model_name)

# 5) Preprocessing with safety checks - FIXED
def preprocess_batch(batch):
    """Tokenize a batch of ``input_text``/``target_text`` pairs for training.

    Prompts and targets are truncated and padded to ``max_input_length`` and
    ``max_target_length`` respectively. In the label sequences, pad positions
    are replaced by ``-100`` and any id at or above ``len(tokenizer)`` is
    replaced by the tokenizer's unknown-token id.

    Args:
        batch: Mapping with list-valued ``"input_text"`` and ``"target_text"``.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
    """
    def encode(texts, limit):
        return tokenizer(texts, max_length=limit, truncation=True, padding="max_length")

    model_inputs = encode(batch["input_text"], max_input_length)
    target_ids = encode(batch["target_text"], max_target_length)["input_ids"]

    pad_id = tokenizer.pad_token_id
    unk_id = tokenizer.unk_token_id
    vocab_size = len(tokenizer)

    def to_label(token_id):
        # Padding becomes the -100 sentinel; ids outside the vocabulary fall back to <unk>.
        if token_id == pad_id:
            return -100
        return unk_id if token_id >= vocab_size else token_id

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": [[to_label(token_id) for token_id in seq] for seq in target_ids],
    }

tokenized = ds.map(preprocess_batch, batched=True, remove_columns=ds["train"].column_names)

# 6) Sanity-check the training split: no token id may reach len(tokenizer)
print("Verifying token IDs...")
vocab_size = len(tokenizer)
train_split = tokenized["train"]
max_input_id = max(map(max, train_split["input_ids"]))
max_label_id = max(
    max(label_seq)
    for label_seq in train_split["labels"]
    if any(tok != -100 for tok in label_seq)  # skip sequences that are all padding
)

print(f"Vocabulary size: {vocab_size}")
print(f"Max input token ID: {max_input_id}")
print(f"Max label token ID: {max_label_id}")

ids_out_of_range = max(max_input_id, max_label_id) >= vocab_size
if not ids_out_of_range:
    print("✅ All token IDs are within valid range")
else:
    print("⚠️  WARNING: Some token IDs exceed vocabulary size!")

    # Emergency fix: clamp every id to the last valid vocabulary index,
    # leaving the -100 label sentinel untouched.
    def fix_token_ids(example):
        top_id = vocab_size - 1
        example["input_ids"] = [min(token_id, top_id) for token_id in example["input_ids"]]
        example["labels"] = [
            -100 if token_id == -100 else min(token_id, top_id)
            for token_id in example["labels"]
        ]
        return example

    tokenized = tokenized.map(fix_token_ids)
    print("✅ Applied emergency token ID fix")

# 7) Data collator; label padding uses -100, the same sentinel that
#    preprocess_batch writes into padded label positions
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
)

# 8) Gradient checkpointing (with the generation cache switched off), then
#    wrap the base model with LoRA adapters
model.gradient_checkpointing_enable()
model.config.use_cache = False

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.SEQ_2_SEQ_LM,
)
model = get_peft_model(model=model, peft_config=peft_config)

# 9) Metrics: ROUGE scorer used by compute_metrics
rouge = evaluate.load("rouge")

def normalize_text(s):
    """Lower-case ``s`` and squeeze all whitespace runs to single spaces.

    Non-string input yields ``""``.
    """
    if not isinstance(s, str):
        return ""
    words = s.lower().strip().split()
    return " ".join(words)

def postprocess_texts(preds, labels):
    """Decode prediction and label id arrays into normalized strings.

    Every ``-100`` entry in both arrays is swapped for the tokenizer's pad id
    before decoding. Labels carry ``-100`` at padded positions, and generated
    predictions can be padded with the same sentinel when batches of
    different lengths are gathered (see the Hugging Face seq2seq example
    scripts). Passing ``-100`` to the SentencePiece-backed tokenizer fails
    with "piece id is out of range", which is the error reported for
    ``trainer.evaluate()`` in this notebook.

    Args:
        preds: Array-like of predicted token ids, one row per example.
        labels: Array-like of reference token ids, one row per example.

    Returns:
        ``(pred_texts, label_texts)``: two lists of strings, decoded with
        special tokens skipped and passed through ``normalize_text``.
    """
    pad_id = tokenizer.pad_token_id

    # Apply the same sentinel replacement to both arrays. The previous version
    # only cleaned the labels, so padded predictions crashed the decoder.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    pred_texts = [normalize_text(p) for p in pred_texts]
    label_texts = [normalize_text(l) for l in label_texts]
    return pred_texts, label_texts

def compute_metrics(eval_pred):
    """Compute ROUGE-1, ROUGE-L and exact-match for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be a
            tuple (its first element is used) and may be 3-D, in which case
            the argmax over the last axis is taken to obtain token ids.

    Returns:
        Dict with float values under ``"rouge1"``, ``"rougeL"`` and
        ``"exact_match"``.
    """
    predictions, references = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)

    pred_texts, ref_texts = postprocess_texts(predictions, references)

    scores = rouge.compute(predictions=pred_texts, references=ref_texts, use_stemmer=True)
    matches = [pred == ref for pred, ref in zip(pred_texts, ref_texts)]

    return {
        "rouge1": float(scores["rouge1"]),
        "rougeL": float(scores["rougeL"]),
        "exact_match": float(np.mean(matches)),
    }

# 10) Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # Optimisation
    num_train_epochs=6,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=torch.cuda.is_available(),
    # Evaluation is not scheduled during training; generation is used
    # whenever evaluate()/predict() is called explicitly.
    eval_strategy="no",
    predict_with_generate=True,
    generation_max_length=max_target_length,
    # Checkpointing and logging
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    report_to="none",
    # The dataset is already reduced to model columns by preprocess_batch.
    remove_unused_columns=False,
)

# 11) Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# The save / quick-inference snippet below is disabled by keeping it inside a
# string literal. NOTE(review): no trainer.train() call is visible in this
# notebook before it — confirm training is run before re-enabling.
"""
# 13) Save model and adapters
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
model.save_pretrained(output_dir + "/peft_adapters")

print("✅ Training finished and saved to", output_dir)

# 14) Quick inference test
def generate_fsm(prompt: str, max_new_tokens: int = 128):
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=max_input_length).to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=max_target_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

example_prompt = "fsm description: odd number of 1's"
print("EXAMPLE INFERENCE:", generate_fsm(example_prompt))
"""

In [None]:
# Run evaluation on the held-out split and print the resulting metrics.
# NOTE (translated from the author's original comment): this call fails with
# "piece id is out of range" during evaluation; the author did not understand
# the cause or how to fix it.
metrics = trainer.evaluate()
print(metrics)