In [None]:
import json
from datasets import Dataset, DatasetDict
# from sklearn.model_selection import train_test_split # Removed sklearn import

# === Load RAFT JSONL ===
# One JSON object per line. Blank lines are skipped instead of being passed to
# json.loads, which raises on empty input.
with open("/content/drive/MyDrive/final_corvit_raft_dataset_enhanced.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# === Convert to HuggingFace Dataset ===
full_dataset = Dataset.from_list(data)

# === Split into train (~80%), val (~10%), test (~10%) using datasets library ===
# First split off the 10% test set; the remaining 90% is train + validation.
train_val_test_split = full_dataset.train_test_split(test_size=0.1, seed=42)
train_val = train_val_test_split['train']
test = train_val_test_split['test']

# Then carve a validation set out of the remainder. Passing an absolute row
# count (the size of the test split) makes validation exactly as large as test,
# instead of relying on the rounded 1/9 ~= 0.1111 fraction.
train_val_split = train_val.train_test_split(test_size=len(test), seed=42)
train = train_val_split['train']
val = train_val_split['test']


# === Create dataset dict and save ===
raft_dataset = DatasetDict({
    "train": train,
    "validation": val,
    "test": test
})

raft_dataset.save_to_disk("/content/corvit_raft_splits1")
print("✅ Split and saved to /content/corvit_raft_splits1")

Saving the dataset (0/1 shards):   0%|          | 0/3435 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/430 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/430 [00:00<?, ? examples/s]

✅ Split and saved to /content/corvit_raft_splits


In [None]:
# Check every split for examples whose answer is missing or blank.
# A None answer is reported like an empty one instead of crashing on .strip().
for split in raft_dataset.values():
    for example in split:
        answer = example["cot_answer"]
        if answer is None or not answer.strip():
            print("❌ Empty answer found:", example)


In [None]:
from transformers import AutoTokenizer

# === 1. Load your saved dataset ===
from datasets import load_from_disk
dataset = load_from_disk("file:///content/corvit_raft_splits1")

# === 2. Set model & tokenizer ===
from transformers import AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
# The pad token is deliberately NOT aliased to the EOS token here.
# With pad_token == eos_token, the label-masking step in preprocess_function
# (which replaces every pad_token_id with -100) also masks the genuine
# end-of-sequence token. The label dump printed later in this notebook shows
# exactly that: the ids run from the last text token straight into -100.
# The T5 tokenizer already defines its own dedicated padding token, so no
# override is needed.


# === 3. Define preprocessing function ===
# NOTE(review): preprocess_function does not read these two constants; it uses
# its own lengths (512 for inputs, 128 for targets). Confirm which target
# length is intended.
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 256

def preprocess_function(examples, max_input_length=512, max_target_length=128):
    """Tokenize a batch of RAFT examples into encoder inputs and decoder labels.

    Args:
        examples: Batch dict with "question", "context" and "cot_answer"
            columns (lists of equal length), as passed by
            ``Dataset.map(..., batched=True)``.
        max_input_length: Length the prompts are truncated/padded to.
            Defaults to the previously hard-coded 512.
        max_target_length: Length the answers are truncated/padded to.
            Defaults to the previously hard-coded 128.

    Returns:
        The tokenizer output for the prompts, with an added "labels" entry in
        which every padding position is replaced by -100.
    """
    inputs = [f"question: {q}  context: {ctx}" for q, ctx in zip(examples["question"], examples["context"])]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["cot_answer"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # Decide what is padding from the attention mask, not by comparing ids with
    # pad_token_id. If the pad token has been aliased to the EOS token, an id
    # comparison would also mask the real end-of-sequence token of every label.
    model_inputs["labels"] = [
        [(token_id if keep else -100) for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(labels["input_ids"], labels["attention_mask"])
    ]
    return model_inputs


# === 4. Apply preprocessing ===
from datasets import DatasetDict

# Tokenize the three splits with the same function and collect the results
# directly into one dataset dictionary.
tokenized_dataset = DatasetDict({
    split_name: dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ("train", "validation", "test")
})

# Per-split handles used by the following cells.
tokenized_train = tokenized_dataset["train"]
tokenized_val = tokenized_dataset["validation"]
tokenized_test = tokenized_dataset["test"]

# Persist all tokenized splits together.
tokenized_dataset.save_to_disk("/content/corvit_raft_tokenized1")

print("✅ Preprocessing and tokenization complete.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/3435 [00:00<?, ? examples/s]



Map:   0%|          | 0/430 [00:00<?, ? examples/s]



Map:   0%|          | 0/430 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3435 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/430 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/430 [00:00<?, ? examples/s]

✅ Preprocessing and tokenization complete.


In [None]:
# Sanity check: show the label ids of the first training example.
first_train_example = tokenized_train[0]
print(first_train_example["labels"])


[3, 30345, 346, 122, 77, 834, 8270, 15, 30345, 1593, 10, 363, 4064, 33, 2303, 16, 8, 7833, 16574, 447, 7, 503, 44, 2487, 5566, 5479, 58, 71, 10, 37, 7833, 16574, 447, 7, 503, 3792, 20407, 7, 4431, 7, 6, 1437, 2267, 6, 20407, 7, 6020, 6, 11, 3236, 20407, 7, 408, 11, 12001, 1195, 3, 30345, 989, 834, 8270, 15, 30345, 4063, 6, 100, 3606, 7, 24, 37, 7833, 16574, 447, 7, 503, 3792, 20407, 7, 4431, 7, 6, 1437, 2267, 6, 20407, 7, 6020, 6, 11, 3236, 20407, 7, 408, 11, 12001, 1195, 5, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score
import numpy as np

# === Load tokenized dataset ===
from datasets import load_from_disk

# Reload the tokenized splits written by the preprocessing cell.
tokenized_dataset = load_from_disk("file:///content/corvit_raft_tokenized1")
tokenized_train, tokenized_val, tokenized_test = (
    tokenized_dataset[split_name] for split_name in ("train", "validation", "test")
)


# === Load model and tokenizer ===
# Both come from the same checkpoint name.
model_name = "google/flan-t5-base"  # or "flan-t5-large" if you have enough VRAM
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# === Data Collator ===
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# %pip installs into the environment of the running kernel. Versions are
# pinned to the ones this notebook was last run with, so a re-run reproduces
# the same metric code.
%pip install rouge_score==0.1.2 evaluate==0.4.5

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=aa55c1505c8c542f0d89187cc6e383801e2ee82c60ce767609be324db2280674
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
# === Metric Function ===
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE-1/2/L between argmax-decoded predictions and the labels.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` is either
            the logits array or a tuple whose first element is the logits;
            ``labels`` uses -100 for positions to ignore.

    Returns:
        Dict with "rouge1", "rouge2" and "rougeL" scores.

    NOTE(review): the predictions are an argmax over logits, not the output of
    ``generate()``. If those logits come from a label-conditioned forward pass,
    the scores are optimistic compared with free generation - confirm.
    """
    predictions, labels = eval_pred
    # Some models return a tuple of outputs; the logits are its first element.
    # A bare logits array must not be indexed, or only the first example remains.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.argmax(predictions, axis=-1)

    # Positions whose label is -100 are padding. Blank them out in the
    # predictions too, so tokens emitted past the end of the reference are not
    # scored.
    ignored = labels == -100
    predictions = np.where(ignored, tokenizer.pad_token_id, predictions)
    labels = np.where(ignored, tokenizer.pad_token_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One scoring pass for all three variants instead of three separate passes.
    # Stemming matches the test-set evaluation at the end of this notebook, so
    # validation and test numbers are comparable.
    rouge_scores = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        rouge_types=["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )
    return {
        "rouge1": rouge_scores["rouge1"],
        "rouge2": rouge_scores["rouge2"],
        "rougeL": rouge_scores["rougeL"],
    }
# === Training Arguments ===
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    # --- where checkpoints go, and how many are kept ---
    output_dir="/content/drive/MyDrive/raft_model_chatbot",
    save_steps=200,
    save_total_limit=2,
    # --- batching: 2 examples per device, accumulated over 2 steps ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    # --- optimisation schedule ---
    num_train_epochs=2,
    learning_rate=1e-5,  # kept low for RAFT fine-tuning
    lr_scheduler_type="cosine",
    max_grad_norm=1.0,  # gradient clipping
    fp16=False,
    # --- evaluation / logging cadence ---
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    report_to="none",
    # --- best-checkpoint selection ---
    # These two settings only choose and reload the best checkpoint; they do
    # not stop training early (no early-stopping callback is configured here).
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
)

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Build the Trainer and run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `processing_class` is the replacement for the deprecated `tokenizer`
    # argument of Trainer.
    processing_class=tokenizer,
    # Reuse the collator created when the model was loaded instead of building
    # an identical second one.
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

print("✅ Training completed.")

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
50,1.2168,0.422549,0.703031,0.660777,0.699719
100,0.4016,0.13037,0.907925,0.87979,0.907392
150,0.1611,0.070971,0.957177,0.935623,0.956606
200,0.1059,0.043496,0.968519,0.947868,0.968486
250,0.0744,0.03642,0.969473,0.949183,0.969473
300,0.061,0.03376,0.969777,0.949371,0.969777
350,0.0557,0.032659,0.969857,0.949597,0.969857
400,0.0549,0.033625,0.966504,0.946086,0.966504
450,0.0489,0.031581,0.969911,0.949607,0.969911
500,0.0495,0.031349,0.969879,0.949616,0.969879


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


✅ Training completed.


In [None]:
# Write the model weights and the tokenizer files to the same Drive folder.
output_path = "/content/drive/MyDrive/raft_model_chatbot"

for artifact in (model, tokenizer):
    artifact.save_pretrained(output_path)

print("✅ Final model saved successfully!")

✅ Final model saved successfully!


In [None]:
# List the contents of the Drive folder the model was saved to.
!ls /content/drive/MyDrive/raft_model_chatbot


checkpoint-600	generation_config.json	 spiece.model
checkpoint-860	model.safetensors	 tokenizer_config.json
config.json	special_tokens_map.json  tokenizer.json


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np
import evaluate

# === Load dataset and model ===
# The preprocessing cell saves the tokenized splits to
# /content/corvit_raft_tokenized1. The un-suffixed path is never written by
# this notebook, so loading it would fail or evaluate on a leftover dataset.
tokenized_dataset = load_from_disk("file:///content/corvit_raft_tokenized1")
tokenized_test = tokenized_dataset["test"]

# Load the fine-tuned model and its tokenizer from the saved model directory.
model_path = "/content/drive/MyDrive/raft_model_chatbot"  # <-- adjust if needed
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# === Define ROUGE metric ===
rouge = evaluate.load("rouge")

# === Compute metrics ===
def compute_metrics(eval_pred):
    """Compute stemmed ROUGE-1/2/L between argmax-decoded predictions and labels.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` is either
            the logits array or a tuple whose first element is the logits;
            ``labels`` uses -100 for positions to ignore.

    Returns:
        Dict with "rouge1", "rouge2" and "rougeL" scores.

    NOTE(review): the predictions are an argmax over logits, not the output of
    ``generate()``. If those logits come from a label-conditioned forward pass,
    the scores are optimistic compared with free generation - confirm.
    """
    predictions, labels = eval_pred
    # Some models return a tuple of outputs; the logits are its first element.
    # A bare logits array must not be indexed, or only the first example remains.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = np.argmax(predictions, axis=-1)

    # Positions whose label is -100 are padding. Blank them out in the
    # predictions too, so tokens emitted past the end of the reference are not
    # scored.
    ignored = labels == -100
    preds = np.where(ignored, tokenizer.pad_token_id, preds)
    labels = np.where(ignored, tokenizer.pad_token_id, labels)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {
        "rouge1": rouge_scores["rouge1"],
        "rouge2": rouge_scores["rouge2"],
        "rougeL": rouge_scores["rougeL"]
    }

# === Evaluation-only arguments (no training is run with these) ===
eval_args = TrainingArguments(
    output_dir="/content/raft_eval_logs",
    # run mode: evaluate only
    do_train=False,
    do_eval=True,
    # data loading
    per_device_eval_batch_size=16,  # larger batch, chosen for an A100
    dataloader_num_workers=2,
    remove_unused_columns=True,
    # keep predictions (not just the loss) so compute_metrics receives them
    prediction_loss_only=False,
    report_to="none",
)

# === Create Trainer for Evaluation ===
trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=tokenized_test,
    # `processing_class` is the replacement for the deprecated `tokenizer`
    # argument of Trainer.
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model),
    compute_metrics=compute_metrics,
)

# === Run Evaluation ===
# Evaluate on the test split, then print each reported metric to four decimals.
results = trainer.evaluate()
print("\n📊 Test Evaluation Results:")
for metric_name in results:
    print(f"{metric_name}: {results[metric_name]:.4f}")

  trainer = Trainer(



📊 Test Evaluation Results:
eval_loss: 0.0313
eval_model_preparation_time: 0.0061
eval_rouge1: 0.9693
eval_rouge2: 0.9483
eval_rougeL: 0.9693
eval_runtime: 12.9559
eval_samples_per_second: 33.1890
eval_steps_per_second: 2.0840
