# Fine-tune GPT-2 for Health Q&A

This notebook is a segmented conversion of `re.py` into runnable steps.

## Notes
- You need a CSV whose first two columns hold the **question** and the **answer**; if they are named differently, they are renamed to `question` and `answer`.
- Fine-tuning GPT-2 can be slow on CPU; a GPU is strongly recommended.
- Hugging Face authentication is required only for pushing the model to the Hub.

## 1) Install dependencies

In [None]:
!pip -q install transformers datasets torch pandas huggingface_hub

## 2) (Optional) Log in to Hugging Face
Run this if you plan to push the fine-tuned model to the Hub.

In [None]:
from huggingface_hub import login

# Optional step: authentication is only needed when the fine-tuned model is
# pushed to the Hub later in this notebook.
# This will prompt you for a token in the notebook output area
# Create a token at: https://huggingface.co/settings/tokens
login()

## 3) Load dataset from CSV
Update `DATA_PATH` to your local CSV file path.

In [None]:
import pandas as pd

# TODO: set your dataset path
DATA_PATH = r"Final data set for llm based health assitance (1).csv"

df = pd.read_csv(DATA_PATH)

# Fail early with a clear message instead of an opaque error further down.
if df.shape[1] < 2:
    raise ValueError(
        f"Expected at least two columns (question, answer) in {DATA_PATH!r}, "
        f"found {df.shape[1]}."
    )

# Narrow the frame to exactly the two columns used for training.
# Prefer selecting by name so extra or reordered columns cannot be mislabeled;
# otherwise fall back to treating the first two columns as question/answer.
if {"question", "answer"} <= set(df.columns):
    df = df[["question", "answer"]].copy()
else:
    df = df.iloc[:, :2].copy()
    df.columns = ["question", "answer"]

# Drop incomplete pairs only AFTER narrowing the columns: a missing value in
# an unrelated column must not discard an otherwise valid question/answer row.
df = df.dropna(subset=["question", "answer"])

df.head()

## 4) Tokenizer + prompt formatting

In [None]:
from transformers import AutoTokenizer

MODEL_NAME = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# GPT-2 has no pad token by default, so the end-of-sequence token is reused
# for padding.
tokenizer.pad_token = tokenizer.eos_token

# Build the two text columns used for training: the prompt is the question
# wrapped in a "Question: ...\nAnswer: " template, the response is the answer.
question_text = df["question"].astype(str)
answer_text = df["answer"].astype(str)

df["prompt"] = "Question: " + question_text + "\nAnswer: "
df["response"] = answer_text

df[["prompt", "response"]].head()

## 5) PyTorch dataset

In [None]:
import torch
from torch.utils.data import Dataset

class GPTQADataset(Dataset):
    """Map-style dataset that tokenizes ``prompt + response`` rows for causal-LM training.

    Every item is a dict with three 1-D tensors of length ``max_length``:
    ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        dataframe: DataFrame with string columns ``prompt`` and ``response``.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returning a mapping with
            ``input_ids`` and ``attention_mask`` of shape ``(1, max_length)``.
        max_length: Fixed sequence length; shorter texts are padded, longer
            texts are truncated.
    """

    # Label value skipped by the loss (the default ``ignore_index`` of
    # ``torch.nn.CrossEntropyLoss``, also honored by Hugging Face LM heads).
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_length: int = 512):
        # Re-index to 0..n-1 so the integer positions requested by a
        # DataLoader are valid ``.loc`` labels even for a sliced frame.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of question/answer rows."""
        return len(self.data)

    def __getitem__(self, index: int) -> dict:
        """Tokenize row ``index`` and return its training tensors."""
        prompt = self.data.loc[index, "prompt"]
        response = self.data.loc[index, "response"]
        full_text = prompt + response

        # Terminate the example with an explicit end-of-sequence token so the
        # model is trained to stop after the answer. This matters because
        # padding positions no longer contribute to the loss (see below).
        eos_token = getattr(self.tokenizer, "eos_token", None)
        if eos_token and not full_text.endswith(eos_token):
            full_text += eos_token

        inputs = self.tokenizer(
            full_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer returns a batch of one; drop the batch dimension.
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        # Causal LM training: labels mirror input_ids, except that padding
        # positions are masked out. Without the mask, a short text padded to
        # ``max_length`` would have its loss dominated by pad-token targets.
        labels = input_ids.masked_fill(attention_mask == 0, self.IGNORE_INDEX)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Smoke test: build a dataset from the first two rows, fetch its first item,
# and display the shape of every tensor it returns.
sanity_dataset = GPTQADataset(df.head(2), tokenizer)
sample = sanity_dataset[0]
{field: tensor.shape for field, tensor in sample.items()}

## 6) Train/validation split

In [None]:
TRAIN_FRACTION = 0.8  # share of rows used for training; the rest is validation
SPLIT_SEED = 42       # fixed seed so the split is reproducible across runs

# Shuffle before splitting. A plain head/tail split assumes the CSV rows are
# in random order; if the file is grouped (e.g. by topic), the validation set
# would cover only the last group(s) and give a misleading evaluation loss.
shuffled_df = df.sample(frac=1.0, random_state=SPLIT_SEED)

train_size = int(TRAIN_FRACTION * len(shuffled_df))

# Both splits must be non-empty: training and per-epoch evaluation each need
# at least one row.
if train_size == 0 or train_size == len(shuffled_df):
    raise ValueError(
        f"Dataset has {len(shuffled_df)} usable row(s); too few to create "
        "non-empty train and validation splits."
    )

train_df = shuffled_df.iloc[:train_size].copy()
val_df = shuffled_df.iloc[train_size:].copy()

train_dataset = GPTQADataset(train_df, tokenizer)
val_dataset = GPTQADataset(val_df, tokenizer)

len(train_dataset), len(val_dataset)

## 7) Fine-tune GPT-2

In [None]:
import inspect

from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# The install step does not pin a transformers version, and two keyword names
# used here changed across releases:
#   * TrainingArguments: `evaluation_strategy` was renamed to `eval_strategy`
#     and the old name was later removed (passing it raises TypeError).
#   * Trainer: `tokenizer` was superseded by `processing_class`.
# Pick whichever name the installed version accepts so the cell runs on both
# older and newer releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in _training_arg_names
    else "evaluation_strategy"
)

_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class"
    if "processing_class" in _trainer_arg_names
    else "tokenizer"
)

training_args = TrainingArguments(
    output_dir="./gpt_results",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",       # disable external experiment trackers
    push_to_hub=False,      # uploading is a separate, optional step
    # Evaluate once per epoch, matching the checkpoint cadence above.
    **{_eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    **{_tokenizer_key: tokenizer},
)

trainer.train()

## 8) Save the fine-tuned model locally

In [None]:
MODEL_OUTPUT_DIR = "./fine_tuned_gpt"

# Persist the model first and then the tokenizer into the same directory, so
# that both can later be reloaded from MODEL_OUTPUT_DIR.
for artifact in (model, tokenizer):
    artifact.save_pretrained(MODEL_OUTPUT_DIR)

print(f"Model saved to: {MODEL_OUTPUT_DIR}")

## 9) (Optional) Upload model to Hugging Face Hub
Make sure you ran the login step first.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

HF_REPO = "Ganesh19128734/fine-tuned-gpt-project"  # TODO: change to your username/repo

# Reload the model and tokenizer from the local save directory (in that
# order), so what is uploaded is what was written to disk.
upload_model, upload_tokenizer = (
    loader.from_pretrained(MODEL_OUTPUT_DIR)
    for loader in (AutoModelForCausalLM, AutoTokenizer)
)

# Push the model first, then the tokenizer, to the same Hub repository.
for artifact in (upload_model, upload_tokenizer):
    artifact.push_to_hub(HF_REPO)

print(f"Model uploaded to: https://huggingface.co/{HF_REPO}")

## 10) Test the fine-tuned model
You can test either your local saved model (`MODEL_OUTPUT_DIR`) or your HF repo id.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_answer(question: str, model_path: str = MODEL_OUTPUT_DIR) -> str:
    """Generate an answer to ``question`` with a fine-tuned causal LM.

    The tokenizer and model are loaded from ``model_path`` on every call, so
    the function always reflects what is currently stored there. Generation
    uses sampling (``do_sample=True``), so repeated calls may return
    different answers.

    Args:
        question: The user question; it is inserted into the same
            ``"Question: ...\\nAnswer:"`` template used for training.
        model_path: Location passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        The newly generated text only (the prompt is not included), with
        special tokens removed and surrounding whitespace stripped.
    """
    local_tokenizer = AutoTokenizer.from_pretrained(model_path)
    local_model = AutoModelForCausalLM.from_pretrained(model_path)
    # The tokenizer call below requests padding, which requires a pad token.
    local_tokenizer.pad_token = local_tokenizer.eos_token

    prompt = f"Question: {question}\nAnswer:"
    inputs = local_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    prompt_length = inputs["input_ids"].shape[1]

    local_model.eval()
    with torch.no_grad():
        output = local_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=128,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=local_tokenizer.eos_token_id,
        )

    # The generated sequence starts with the prompt tokens; slicing them off
    # by position isolates the answer exactly. Text-based splitting on
    # "Answer:" or "?" is deliberately avoided: it would cut answers (or
    # mis-split questions) that themselves contain those strings.
    answer_tokens = output[0][prompt_length:]
    return local_tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()

# Ask one example question and show it next to the generated answer.
sample_question = "What are the potential side effects of ibuprofen?"
response = get_answer(sample_question)

for label, text in (("Question:", sample_question), ("Answer:", response)):
    print(label, text)