# 0. Mount Google Drive and change to the project notebooks folder

In [None]:
# Colab-only setup: expose Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Change directory to the project notebooks folder and install dependencies.
# The relative paths used later in this notebook (../src, ../results, ../models)
# are resolved against this working directory.
%cd /content/drive/MyDrive/pj/notebooks
%pip install -r ../requirements.txt

# 1. Import necessary packages and load raw dataset

In [None]:
import os, torch, time, sys, seaborn as sns, numpy as np, evaluate, pandas as pd, matplotlib.pyplot as plt, warnings
from datasets import load_dataset, DatasetDict, concatenate_datasets
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from sklearn.metrics import f1_score
os.sys.path.append('../src')
from config import *
from utils import build_default_training_args, build_default_lora_config, train_and_evaluate_unified, few_shot_train_and_evaluate_unified, set_precision_for_gpu, run_epoch_control_unified, load_and_tokenize_dataset
#from data import get_robust_train_dataset
from noisy_data import create_robust_dataset

In [None]:
# 2. Basic configurations
# Notebook-wide plot theme, W&B opt-out and blanket warning suppression.
sns.set_theme(style="whitegrid")
os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings("ignore")

# Use CUDA when torch can see a GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

# The helper returns a 3-tuple; only the middle element is kept. It is the
# precision label interpolated into the experiment names further down.
_, precision_str, _ = set_precision_for_gpu()

In [None]:
# Raw (un-tokenized) GLUE splits for TASK; reused below for the exploration
# plots and passed to the few-shot helper.
raw_dataset = load_dataset("glue", TASK) # → DatasetDict
# Project helper; its result is unpacked as (tokenizer, train split, eval split).
tokenizer, train_dataset, eval_dataset = load_and_tokenize_dataset(TASK)
print("\n--- Datasets loaded and tokenized successfully. ---")
print(f"Train dataset size: {len(train_dataset)}") # type: ignore
print(f"Eval dataset size: {len(eval_dataset)}\n") # type: ignore
# Show one raw example next to its counterpart from the tokenized split.
print(raw_dataset['train'][0]) # type: ignore
print(train_dataset[0]) # type: ignore

# 2. Dataset Exploration and Visualization

In [None]:
# Label balance of the raw training split.
label_frame = pd.DataFrame(raw_dataset['train'])
sns.countplot(data=label_frame, x='label')
plt.title('Class Distribution in SST-2 Dataset')
plt.show()

# Histogram of whitespace-delimited word counts per training sentence.
sentence_lengths = list(
    map(lambda text: len(text.split()), raw_dataset['train']['sentence'])
)
plt.hist(sentence_lengths, bins=30, edgecolor='k')
plt.title('Sentence Length Distribution in SST-2 Dataset')
plt.xlabel('Length of Sentence')
plt.ylabel('Number of Sentences')
plt.show()

# 3. Full Finetune vs LoRA vs QLoRA

In [None]:
# Train and evaluate the same task under five fine-tuning regimes.
# Keys are display names; values are the config dicts handed to
# train_and_evaluate_unified. `results` is read again by the noisy-data cell.
experiments = {
    "Full Finetune (Default)": {"method": "full", "lr": FULL_FINETUNE_LR, "seed": SEED},
    f"Full Finetune ({precision_str})": {"method": "full_16bit", "lr": FULL_FINETUNE_LR, "seed": SEED},
    f"LoRA ({precision_str})": {"method": "lora_16bit", "lr": LEARNING_RATE, "seed": SEED},
    "LoRA (8-bit)": {"method": "lora_8bit", "lr": LEARNING_RATE, "seed": SEED},
    "QLoRA (4-bit)": {"method": "qlora_4bit", "lr": LEARNING_RATE, "seed": SEED},
}

# Create the CSV folder up front so the final to_csv cannot fail on a missing
# directory after every training run has already finished.
results_csv_path = "../results/csvs/results_of_initialize.csv"
os.makedirs(os.path.dirname(results_csv_path), exist_ok=True)

results = []
for name, config in experiments.items():
    print(f"\n=== Experiment: {name} ===")
    train_and_evaluate_unified(name, config, results, train_dataset, eval_dataset, bootstrap=False, delete_checkpoints=False)

print("\n--- 5. Comparative Results Summary ---")
df = pd.DataFrame(results)
df.to_csv(results_csv_path, index=False)

print(df)

# 4. Exploration of learning rate and seeds (bootstrap is applied to obtain confidence scores)

In [None]:
# 4.1 Learning-rate experiments
ls_lr = [1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3]

results_lr = []
print("\n--- 6. Additional Experiments: Varying Learning Rate ---")
for lr in ls_lr:
    # One LoRA run per candidate learning rate, all with the default SEED.
    lr_run_name = f"LoRA ({precision_str}) - LR {lr}"
    print(lr_run_name)
    lr_run_config = dict(method="lora_16bit", lr=lr, seed=SEED)
    train_and_evaluate_unified(
        lr_run_name,
        lr_run_config,
        results_lr,
        train_dataset,
        eval_dataset,
    )

# The row with the highest 'Accuracy' supplies BEST_LR for the later cells.
top_lr_run = max(results_lr, key=lambda run: run['Accuracy'])
BEST_LR = top_lr_run['Learning Rate']
print(f"\nBest Learning Rate from previous experiments: {BEST_LR}\n")
df = pd.DataFrame(results_lr)
df.to_csv("../results/csvs/results_lr.csv", index=False)
print(df)


In [None]:
# Seed experiments: repeat the LoRA run at BEST_LR under several seeds,
# passing bootstrap=True to the training helper.
ls_seed = [42, 100, 2025, 6657]
results_seed = []
for seed in ls_seed:
    seed_run_name = f"LoRA ({precision_str}) - Seed {seed}"
    print(seed_run_name)
    seed_run_config = dict(method="lora_16bit", lr=BEST_LR, seed=seed)
    train_and_evaluate_unified(
        seed_run_name,
        seed_run_config,
        results_seed,
        train_dataset,
        eval_dataset,
        bootstrap=True,
    )

# The row with the highest 'Accuracy' supplies BEST_SEED for the later cells.
top_seed_run = max(results_seed, key=lambda run: run['Accuracy'])
BEST_SEED = top_seed_run['Seed']
print(f"\nBest Seed from previous experiments: {BEST_SEED}\n")
df = pd.DataFrame(results_seed)
df.to_csv("../results/csvs/results_seed.csv", index=False)
print(df)

# 5. Toxic Datasets

In [None]:
# 4.3 toxic datasets
# Train LoRA (16-bit) on a noise-augmented copy of the training split and
# report it next to the LoRA (16-bit) run from the method-comparison cell.
print("\n--- Creating Noisy Dataset ---")
def preprocess_function(examples):
    """Tokenize the batch's 'sentence' field, truncating to MAX_LENGTH."""
    return tokenizer(examples['sentence'], truncation=True, max_length=MAX_LENGTH)
raw_train_dataset = load_dataset("glue", TASK)["train"]
train_robust = create_robust_dataset(raw_train_dataset, augment_factor=2, seed=BEST_SEED)
tokenized_train_robust = train_robust.map(preprocess_function, batched=True)

base_robust_config = {
    "method": "lora_16bit",
    "lr": BEST_LR,
    "seed": BEST_SEED
}
results_robust=[]
train_and_evaluate_unified('lora16bit_robust', base_robust_config, results_robust, tokenized_train_robust, eval_dataset, bootstrap=True)

# Look the baseline up by experiment name rather than by a fixed position:
# position 3 in `experiments` is "LoRA (8-bit)", not the 16-bit LoRA method
# that the robust run uses.
# Assumes `results` holds one row per entry of `experiments`, in insertion
# order — TODO confirm against train_and_evaluate_unified.
# NOTE(review): that baseline ran with LEARNING_RATE / SEED and bootstrap=False,
# whereas the robust run uses BEST_LR / BEST_SEED and bootstrap=True.
baseline_name = f"LoRA ({precision_str})"
baseline_result = results[list(experiments).index(baseline_name)]
print(f"Results of model trained on noisy dataset - Accuracy: {results_robust[0]['Accuracy']}, F1: {results_robust[0]['F1 Score']}")
print(f"Results of model trained on original dataset - Accuracy: {baseline_result['Accuracy']}, F1: {baseline_result['F1 Score']}")

# 5. Zero-shot vs. few-shot vs. full fine-tuning

In [None]:
# 5. Zero-shot and Few-shot Experiments
# LoRA runs over a grid of training-set sizes (k) and seeds; k=0 is the
# zero-shot point and the last size is the full tokenized training split.
few_shot_results_lora, few_shot_results_ft = [], []
few_shot_sizes = [0, 10, 50, 100, 200, 500, 1000, 2000, 5000, 10000, len(train_dataset)]
seeds = [42, 6657, 2025]
config_few_shot_lora = dict(method="lora_16bit", lr=BEST_LR, batch_size=16)

print("\n--- 5. Few-Shot Experiments ---")
for k in few_shot_sizes:
    # Subsets of at most 500 examples train with batch size 4, larger ones with 16.
    config_few_shot_lora["batch_size"] = 4 if k <= 500 else 16
    for seed in seeds:
        # The same config dict is updated in place and reused for every run.
        config_few_shot_lora["shot_size"] = k
        name = f"Few-Shot (k={k}) - LoRA ({precision_str}) - Seed {seed}"
        print(f"\n=== Experiment: {name} ===")
        few_shot_train_and_evaluate_unified(
            name,
            config_few_shot_lora,
            seed,
            few_shot_results_lora,
            raw_dataset,
        )

df_few_shot = pd.DataFrame(few_shot_results_lora)
df_few_shot.to_csv("../results/csvs/few_shot_results_lora.csv", index=False)
print(df_few_shot)


In [None]:
# Full fine-tuning counterpart of the few-shot grid; reuses few_shot_sizes and
# seeds from the LoRA few-shot cell.
# NOTE(review): the run label embeds precision_str while the method is "full"
# (labelled "Full Finetune (Default)" in the method-comparison cell) — confirm
# which one is intended.
few_shot_results_ft = []
config_few_shot_ft = dict(method="full", lr=FULL_FINETUNE_LR, batch_size=16)
for k in few_shot_sizes:
    # Subsets of at most 500 examples train with batch size 4, larger ones with 16.
    config_few_shot_ft["batch_size"] = 4 if k <= 500 else 16
    for seed in seeds:
        # The same config dict is updated in place and reused for every run.
        config_few_shot_ft["shot_size"] = k
        name = f"Few-Shot (k={k}) - Full Finetune ({precision_str}) - Seed {seed}"
        print(f"\n=== Experiment: {name} ===")
        few_shot_train_and_evaluate_unified(
            name,
            config_few_shot_ft,
            seed,
            few_shot_results_ft,
            raw_dataset,
        )

df_few_shot = pd.DataFrame(few_shot_results_ft)
df_few_shot.to_csv("../results/csvs/few_shot_results_ft.csv", index=False)
print(df_few_shot)

# Combined table: LoRA rows first, then the full fine-tuning rows.
few_shot_results = [*few_shot_results_lora, *few_shot_results_ft]

df_few_shot = pd.DataFrame(few_shot_results)
df_few_shot.to_csv("../results/csvs/few_shot_results.csv", index=False)
print(df_few_shot)

# 6. Impact of r, alpha, and target modules in LoRA (16-bit)

In [None]:
### LoRA Sweep
# r sweep
# NOTE(review): update_lora_config is not referenced anywhere in this cell —
# confirm the import (and the utils3 module) is still needed.
from utils3 import update_lora_config
print("\n--- 7. LoRA Hyperparameter Sweep ---")
rs = [1,4,8,16,32,64]
# Default LoRA config, kept for reference; the sweep builds its own config per r.
base_lora_config = build_default_lora_config()
sweep_results_r=[]
for r in rs:
    print(f"\n--- Sweeping R={r} ---")
    # A fresh config per rank. (An earlier `lora_config = base_lora_config`
    # assignment was dropped: it was overwritten before ever being read.)
    lora_config = build_default_lora_config(r=r)
    # NOTE(review): this sweep seeds with BEST_SEED while the alpha and
    # target-module sweeps use SEED — confirm the difference is intended.
    train_and_evaluate_unified(
        name=f"LoRA Sweep - R={r}",
        config={
            "method": "lora_16bit",
            "lr": BEST_LR,
            "seed": BEST_SEED,
        },
        lora_config=lora_config,
        results_list=sweep_results_r,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )
    # Tag the newest result row with the rank so BEST_R can be read back below.
    sweep_results_r[-1]['r'] = r
BEST_R = max(sweep_results_r, key=lambda x: x['Accuracy'])['r']
print(f"\nBest R from sweep experiments: {BEST_R}\n")
df_sweep_r = pd.DataFrame(sweep_results_r)
df_sweep_r.to_csv("../results/csvs/sweep_results_r.csv", index=False)
print(df_sweep_r)


In [None]:
# alpha sweep
# Holds r at BEST_R and varies lora_alpha; each run uses BEST_LR and SEED.
alphas = [8, 16, 32, 64, 128, 256]
sweep_results_alpha = []
for alpha in alphas:
    print(f"\n--- Sweeping Alpha={alpha} ---")
    lora_config = build_default_lora_config(r=BEST_R, lora_alpha=alpha)
    alpha_run_config = dict(method="lora_16bit", lr=BEST_LR, seed=SEED)
    train_and_evaluate_unified(
        name=f"LoRA Sweep - Alpha={alpha}",
        config=alpha_run_config,
        lora_config=lora_config,
        results_list=sweep_results_alpha,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    # Overwrite/record the swept value on the newest result row.
    sweep_results_alpha[-1]['Alpha'] = alpha

for alpha_row in sweep_results_alpha:
    print(f"Alpha: {alpha_row['Alpha']}, Accuracy: {alpha_row['Accuracy']}")

# The row with the highest 'Accuracy' supplies BEST_ALPHA for the next sweep.
top_alpha_row = max(sweep_results_alpha, key=lambda row: row['Accuracy'])
BEST_ALPHA = top_alpha_row['Alpha']
print(f"\nBest Alpha from sweep experiments: {BEST_ALPHA}\n")
df_sweep_alpha = pd.DataFrame(sweep_results_alpha)
df_sweep_alpha.to_csv("../results/csvs/sweep_results_alpha.csv", index=False)
print(df_sweep_alpha)

In [None]:
# target modules sweep
# Holds r and alpha at their best values and varies which modules get adapters.
target_modules_options = [
    ["query_key_value", "dense"],
    ["query_key_value"],
    ["dense"],
    ["dense", "dense_h_to_4h", "dense_4h_to_h"]
]
sweep_results_tm=[]
for target_modules in target_modules_options:
    print(f"\n--- Sweeping Target Modules={target_modules} ---")
    lora_config = build_default_lora_config(r=BEST_R, lora_alpha=BEST_ALPHA, target_modules=target_modules)
    train_and_evaluate_unified(
        name=f"LoRA Sweep - Target Modules={target_modules}",
        config={
            "method": "lora_16bit",
            "lr": BEST_LR,
            "seed": SEED,
        },
        results_list=sweep_results_tm,
        lora_config=lora_config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )
    # The r and Alpha sweeps tag their newest row with the swept value; do the
    # same here so the 'Target Modules' lookup below cannot raise KeyError
    # after all runs have finished. setdefault keeps whatever the training
    # helper may already have stored under that key.
    sweep_results_tm[-1].setdefault('Target Modules', target_modules)
BEST_TM = max(sweep_results_tm, key=lambda x: x['Accuracy'])['Target Modules']
print(f"\nBest Target Modules from sweep experiments: {BEST_TM}\n")
df_sweep_tm = pd.DataFrame(sweep_results_tm)
df_sweep_tm.to_csv("../results/csvs/sweep_results_target_modules.csv", index=False)
print(df_sweep_tm)

# 7. Data Centric Experiment

In [None]:
from data_centric import get_centric_dataset, compute_sample_losses, compute_metrics

In [None]:
# Train a Base Student Model


In [None]:
# Base Student Model
N_centric = 1000
print(NUM_LABELS)

# Config for the base-student run; the call that would consume it is
# commented out below.
# NOTE(review): BEST_LR was selected by the LoRA learning-rate sweep but the
# method here is "full" — confirm this is intended.
base_student_config = dict(
    method="full",
    lr=BEST_LR,
    seed=BEST_SEED,
    shot_size=N_centric,
)
_ = []
# train_and_evaluate_unified("base_student_model",base_student_config, _, train_dataset, eval_dataset, delete_checkpoints=False)

# Teacher model and tokenizer, both loaded from the same saved directory.
teacher_dir = '../models/Full Finetune (FP32)/'
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_dir, num_labels=NUM_LABELS)
teacher_model = teacher_model.to(device)
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_dir)
raw_datasets = load_dataset("glue", "sst2")


def _teacher_tokenize(batch):
    """Tokenize the batch's 'sentence' column with the teacher tokenizer, truncated to MAX_LENGTH."""
    return teacher_tokenizer(batch["sentence"], truncation=True, max_length=MAX_LENGTH)


def _prepare_split(split):
    """Tokenize a raw split, drop 'sentence' and 'idx', and rename 'label' to 'labels'."""
    tokenized = split.map(
        _teacher_tokenize,
        batched=True,
        remove_columns=["sentence", "idx"],
    )
    return tokenized.rename_column("label", "labels")


full_train = _prepare_split(raw_datasets["train"])
val_data = _prepare_split(raw_datasets["validation"])

# Entries are ordered ascending by their element at index 1
# (presumably the per-sample loss — TODO confirm against compute_sample_losses).
losses = compute_sample_losses(teacher_model, teacher_tokenizer, full_train)
sorted_losses = sorted(losses, key=lambda entry: entry[1])



In [None]:
SIZE = 1000


def _print_subset_summary(subset):
    """Print the subset's length, then how many of its 'labels' equal 0 and 1."""
    print(len(subset))
    for cls in (0, 1):
        print(f"Class {cls}: {sum(l == cls for l in subset['labels'])}")


# Arguments shared by both subset constructions.
_subset_common = dict(sorted_losses=sorted_losses, dataset=full_train, total_size=SIZE)

# Loss-aware subset: non-zero easiest/hardest ranges and proportions.
subset_centric = get_centric_dataset(
    **_subset_common,
    easiest_range=0.1,
    easiest_proportion=0.025,
    hardest_range=0.1,
    hardest_proportion=0.025,
)
_print_subset_summary(subset_centric)

# Comparison subset: every easiest/hardest setting is zeroed.
subset_random = get_centric_dataset(
    **_subset_common,
    easiest_range=0,
    easiest_proportion=0,
    hardest_range=0,
    hardest_proportion=0,
)
_print_subset_summary(subset_random)

In [None]:
# Continuation training of the base student on the data-centric subset vs. the
# random subset, followed by a CSV export and two comparison bar charts.
results_data_centric = []
# BASE_STUDENT_MODEL_PATH = f"base_student_model_fewshot_{N_centric}_results/checkpoint-315/" # Path to the base student model trained with N_centric samples
BASE_STUDENT_MODEL_PATH='student_model'
CONTINUATION_OUTPUT_ROOT = "./data_centric_continuation_results"


def _build_continuation_args(run_tag):
    """Return TrainingArguments for one continuation run.

    Each run writes to its own sub-directory of CONTINUATION_OUTPUT_ROOT. With
    save_total_limit=1 and best-checkpoint loading, two runs sharing a single
    output_dir could overwrite or rotate away each other's checkpoints.

    Args:
        run_tag: Short directory-safe label for the run (e.g. "centric").
    """
    return TrainingArguments(
        output_dir=os.path.join(CONTINUATION_OUTPUT_ROOT, run_tag),
        per_device_train_batch_size=FEW_SHOT_BATCH_SIZE,
        num_train_epochs=FEW_SHOT_EPOCHS,
        learning_rate=BEST_LR,
        seed=BEST_SEED,
        eval_strategy="epoch",
        logging_dir="./logs",
        logging_strategy="epoch",
        save_strategy="best",
        save_total_limit=1,
        report_to="none", # Disable wandb for now
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        # fp16 mixed precision needs a CUDA device; fall back to full precision on CPU.
        fp16=torch.cuda.is_available(),
    )


def _run_continuation(method_label, run_tag, train_subset):
    """Continue training a fresh copy of the base student and evaluate it on val_data.

    Appends one row (Method, Accuracy, F1 Score, Shot Size) to
    results_data_centric.

    Args:
        method_label: Name printed in the log line and stored in the "Method" column.
        run_tag: Sub-directory name for this run's checkpoints.
        train_subset: Training dataset for the continuation run.

    Returns:
        (trainer, eval_metrics): the Trainer and the dict returned by trainer.evaluate().
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_STUDENT_MODEL_PATH,
        num_labels=NUM_LABELS
    )
    # NOTE(review): the subsets were tokenized with teacher_tokenizer while
    # padding uses `tokenizer` — confirm both share the same vocabulary/pad token.
    trainer = Trainer(
        model=model,
        args=_build_continuation_args(run_tag),
        train_dataset=train_subset,
        eval_dataset=val_data, # Evaluate on the original validation set
        # `processing_class` is the current name of the deprecated `tokenizer` argument.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    trainer.train()
    eval_metrics = trainer.evaluate()
    print(f"{method_label} Evaluation Results: {eval_metrics}")
    results_data_centric.append({
        "Method": method_label,
        "Accuracy": round(eval_metrics['eval_accuracy'], 4),
        "F1 Score": round(eval_metrics['eval_f1'], 4),
        "Shot Size": len(train_subset)
    })
    return trainer, eval_metrics


# 1. Continuation training on the data-centric subset
print("\n--- Continuation Training on Data-Centric Subset ---")
trainer_centric, eval_results_centric = _run_continuation(
    "Data Centric (Centric DS)", "centric", subset_centric
)

# 2. Continuation training on the random subset
print("\n--- Continuation Training on Random Subset ---")
trainer_random, eval_results_random = _run_continuation(
    "Data Centric (Random DS)", "random", subset_random
)

# 3. Save results to CSV
df_data_centric = pd.DataFrame(results_data_centric)
output_csv_path = "../results/csvs/data_centric_results.csv"
# Create directory if it doesn't exist
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
df_data_centric.to_csv(output_csv_path, index=False)
print("\n--- Data Centric Experiment Results ---")
print(df_data_centric)

# 4. Visualize results: one bar chart per metric, both on a 0-1 y-axis
for metric in ("Accuracy", "F1 Score"):
    plt.figure(figsize=(10, 6))
    sns.barplot(x="Method", y=metric, data=df_data_centric, palette="viridis")
    plt.title(f"{metric} Comparison: Data Centric vs Random Subsets")
    plt.ylabel(metric)
    plt.ylim(0, 1)
    plt.show()

# 8. Overfit Experiment

In [None]:
# Epoch-control run: LoRA (16-bit) at BEST_LR with the default SEED, with
# max_epochs=20 passed to the helper.
print("\n--- 8. Overfit Experiment ---")
overfit_results = []
base_config = dict(method="lora_16bit", lr=BEST_LR, seed=SEED)
overfit_results = run_epoch_control_unified(
    config=base_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    max_epochs=20,
)

# Tabulate whatever the helper returned and persist it.
df_overfit = pd.DataFrame(overfit_results)
df_overfit.to_csv("../results/csvs/overfit_results.csv", index=False)
print(df_overfit)

# 9. Freeze Experiments

In [None]:
# 9. Freeze Layer Experiment
# Full fine-tuning repeated with a different 'freeze_layers' value per run,
# each with epochs=5.
# NOTE(review): BEST_LR was selected by the LoRA learning-rate sweep but the
# method here is "full" — confirm this is intended.
print("\n--- 9. Freeze Layer Experiment ---")
freeze = [0, 1, 4, 8, 11]
freeze_results = []
base_config = dict(method="full", lr=BEST_LR, seed=BEST_SEED, freeze_layers=None)
for num_layers in freeze:
    print(f"\n--- Freezing first {num_layers} layers ---")
    # The same config dict is updated in place and passed to every run.
    base_config.update(freeze_layers=num_layers)
    train_and_evaluate_unified(
        name=f"Full - Freeze {num_layers} Layers",
        config=base_config,
        results_list=freeze_results,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        epochs=5,
    )

df_freeze = pd.DataFrame(freeze_results)
df_freeze.to_csv("../results/csvs/freeze_layer_results.csv", index=False)
print(df_freeze)

# 10. Transfer Learning

In [None]:
# Run ../src/transfer.py as a shell command from this cell (-u: unbuffered Python
# output). The flags passed are --fp16, --num_workers 4, --batch_size 32 and
# --lora_r 16; see transfer.py for what each one controls.
!python -u "../src/transfer.py"   --fp16   --num_workers 4   --batch_size 32 --lora_r 16

In [None]:
# Same transfer.py invocation as the previous cell, but with --lora_r 8 instead of 16.
!python -u "../src/transfer.py"   --fp16   --num_workers 4   --batch_size 32 --lora_r 8