In [1]:
# =================================================================
# Project: Adapting from Sentence-level to Document-level Sentiment Analysis
# Phase 1: Student Model Fine-tuning on SST2 and Evaluation
# =================================================================

# Import libraries
import torch
import time
import os
import random
import json
import numpy as np
from tqdm import tqdm

# =================================================================
# ENVIRONMENT SETUP
# =================================================================

# Set Hugging Face cache directory to a larger, persistent volume in VESSL
# to prevent "No space left on device" errors.
cache_dir = "/output/huggingface_cache"
os.environ['HF_HOME'] = cache_dir
os.environ['HF_DATASETS_CACHE'] = os.path.join(cache_dir, "datasets")
os.environ['TRANSFORMERS_CACHE'] = os.path.join(cache_dir, "models")

# Set the Weights & Biases project name
os.environ['WANDB_PROJECT'] = "prompt-adaptation-sent-analysis"

# Ensure the cache directory exists
os.makedirs(cache_dir, exist_ok=True)
print(f"✅ [SETUP] Hugging Face cache directory set to: {cache_dir}")
print(f"✅ [SETUP] WandB project set to: {os.environ['WANDB_PROJECT']}")


device = 'cuda' if torch.cuda.is_available() else 'cpu'

try:
    from unsloth import FastLanguageModel, is_bfloat16_supported
    from trl import SFTTrainer
    from peft import PeftModel
    from datasets import load_dataset
    from transformers import (
        AutoTokenizer,
        TrainingArguments,
        TextStreamer,
    )
    print("✅ [CHECKPOINT] Imports successful")
except ImportError as e:
    print(f"❌ ImportError: {e}")
    raise

# --- Basic Setup ---
print("CUDA available:", torch.cuda.is_available())
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"- Device: {torch.cuda.get_device_name(0)}")

# Seed for reproducibility
def set_seed(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(42)
print("✅ [CHECKPOINT] Seed set")

✅ [SETUP] Hugging Face cache directory set to: /output/huggingface_cache
✅ [SETUP] WandB project set to: prompt-adaptation-sent-analysis
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.




🦥 Unsloth Zoo will now patch everything to make training faster!
✅ [CHECKPOINT] Imports successful
CUDA available: True
Using device: cuda
- Device: NVIDIA GeForce RTX 3090
✅ [CHECKPOINT] Seed set


In [2]:
# --- Load Model and Tokenizer ---
print("Loading base model - this may take a moment...")
model_load_start = time.time()

max_seq_length = 2048
dtype = None
load_in_4bit = True

try:
    # Reverted to the originally intended model as the cache issue is resolved.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-1B-unsloth-bnb-4bit",
        max_seq_length=max_seq_length,
        load_in_4bit=load_in_4bit,
        dtype=dtype,
    )
    print(f"✅ [CHECKPOINT] Model loaded in {time.time() - model_load_start:.2f}s")
except Exception as e:
    print(f"❌ Failed to load model: {e}")
    raise

Loading base model - this may take a moment...
==((====))==  Unsloth 2025.6.2: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.684 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ [CHECKPOINT] Model loaded in 8.15s


In [3]:
# --- Configure PEFT (LoRA) ---
peft_start = time.time()
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    use_rslora=True,
    use_gradient_checkpointing=True
)
print(f"✅ [CHECKPOINT] PEFT model configured in {time.time() - peft_start:.2f}s")
model.print_trainable_parameters()

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.6.2 patched 16 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


✅ [CHECKPOINT] PEFT model configured in 2.35s
trainable params: 6,815,744 || all params: 1,242,630,144 || trainable%: 0.5485


In [None]:
# --- Load and Process the SST2 Dataset ---
dataset_start = time.time()
print("Loading and processing SST2 dataset...")

sentiment_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Classify the sentiment of the following movie review sentence as either "Positive" or "Negative".

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token
label_map = {0: "Negative", 1: "Positive"}

def formatting_prompts_func(examples):
    sentences = examples["sentence"]
    labels = examples["label"]
    texts = []
    for sentence, label_id in zip(sentences, labels):
        output = label_map.get(label_id, "Unknown")
        texts.append(sentiment_prompt.format(sentence, output) + EOS_TOKEN)
    return {"text": texts}

try:
    dataset = load_dataset("stanfordnlp/sst2", split="train")
    dataset = dataset.map(formatting_prompts_func, batched=True,)
    print(f"✅ [CHECKPOINT] Dataset loaded and processed in {time.time() - dataset_start:.2f}s with {len(dataset)} examples.")
    print("\nSample from formatted dataset:\n" + dataset[0]["text"])
except Exception as e:
    print(f"❌ Failed to load/process dataset: {e}")
    raise

Loading and processing SST2 dataset...


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

✅ [CHECKPOINT] Dataset loaded and processed in 5.28s with 67349 examples.

Sample from formatted dataset:
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Classify the sentiment of the following movie review sentence as either "Positive" or "Negative".

### Input:
hide new secretions from the parental units 

### Response:
Negative<|end_of_text|>


In [7]:
# --- Configure the SFT Trainer ---
# script_dir = os.path.dirname(os.path.abspath(__file__)) # for .py file
script_dir = "./"
output_dir = os.path.join(script_dir, "train_outputs", "sst2_finetune")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=12,
    packing=True,
    args=TrainingArguments(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        num_train_epochs=3.0,
        learning_rate=1e-5,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=50,
        logging_first_step=True,
        optim="adamw_8bit",
        weight_decay=0.02,
        lr_scheduler_type="cosine",
        seed=42,
        output_dir=output_dir,
        report_to="wandb",
        save_strategy="steps",
        save_steps=1000,
        gradient_checkpointing=True,
        max_grad_norm=0.3,
        dataloader_num_workers=8,
        dataloader_pin_memory=True,
    ),
)
print(f"✅ [CHECKPOINT] SFTTrainer configured. Output will be saved to: {output_dir}")

Unsloth: Tokenizing ["text"]:   0%|          | 0/67349 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


✅ [CHECKPOINT] SFTTrainer configured. Output will be saved to: ./train_outputs/sst2_finetune


In [8]:
# --- Train the Model ---
print("\nStarting model training...")
train_start = time.time()
try:
    trainer_stats = trainer.train()
    print(f"✅ [CHECKPOINT] Training completed in {time.time() - train_start:.2f}s")
    print(f"Training stats:\n{trainer_stats}")
except Exception as e:
    print(f"❌ Training failed: {e}")
    raise


Starting model training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 67,349 | Num Epochs = 3 | Total steps = 6,315
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 6,815,744/1,000,000,000 (0.68% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpmw1113[0m ([33mpmw1113-yonsei-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.6522
50,2.9839
100,1.1142
150,0.9085
200,0.9113
250,0.8836
300,0.8782
350,0.8754
400,0.8789
450,0.8545


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

✅ [CHECKPOINT] Training completed in 4022.75s
Training stats:
TrainOutput(global_step=6315, training_loss=0.7241084020286916, metrics={'train_runtime': 4021.5317, 'train_samples_per_second': 50.241, 'train_steps_per_second': 1.57, 'total_flos': 9.14812188683305e+16, 'train_loss': 0.7241084020286916})


In [None]:
# --- Save Final Adapter ---
print("\nSaving final LoRA adapter...")
save_start = time.time()
# The final adapter will be saved inside the `output_dir`
final_adapter_path = os.path.join(output_dir, "final_adapter")
try:
    model.save_pretrained(final_adapter_path)
    print(f"✅ [CHECKPOINT] LoRA adapter saved to {final_adapter_path} in {time.time() - save_start:.2f}s")
except Exception as e:
    print(f"❌ Failed to save model: {e}")
    raise



Saving final LoRA adapter...
✅ [CHECKPOINT] LoRA adapter saved to ./train_outputs/sst2_finetune/final_adapter in 0.50s


In [None]:
# =================================================================
# Phase 2: Systematic Evaluation on Validation Set
# =================================================================

print("\nStarting systematic evaluation on validation samples...")

try:
    # Ensure the model is in evaluation mode
    model.eval()

    # Load validation dataset
    val_dataset = load_dataset("stanfordnlp/sst2", split="validation")
    
    num_test_samples = len(val_dataset)
    evaluation_logs = []
    correct_predictions = 0

    # Loop through validation samples and evaluate
    for i in tqdm(range(num_test_samples), desc="Evaluating Samples"):
        sample = val_dataset[i]
        test_sentence = sample["sentence"]
        true_label_id = sample["label"]
        true_label_str = label_map.get(true_label_id)

        # Format the prompt for inference (response part is empty)
        prompt = sentiment_prompt.format(test_sentence, "")
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Generate the model's response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=5, # Generate a few tokens to capture "Positive" or "Negative"
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id
            )
        
        # Decode and clean the prediction
        prediction_text = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True).strip()
        
        # Check if the prediction contains the correct label string
        predicted_label_str = "Unknown"
        if "Positive" in prediction_text:
            predicted_label_str = "Positive"
        elif "Negative" in prediction_text:
            predicted_label_str = "Negative"

        # Check if prediction was successful
        is_correct = (predicted_label_str == true_label_str)
        if is_correct:
            correct_predictions += 1

        # Log the result
        evaluation_logs.append({
            "sample_index": i,
            "input_sentence": test_sentence,
            "prediction_text": prediction_text,
            "prediction": predicted_label_str,
            "ground_truth": true_label_str,
            "is_correct": is_correct,
            "raw_output": prediction_text
        })

    # Calculate final metrics
    accuracy = (correct_predictions / num_test_samples) * 100

    summary_metrics = {
        "total_samples_tested": num_test_samples,
        "correct_predictions": correct_predictions,
        "accuracy": f"{accuracy:.2f}%"
    }

    # Combine summary and individual logs into one JSON object
    final_results = {
        "summary_metrics": summary_metrics,
        "individual_results": evaluation_logs
    }
    
    # Save results to a JSON file
    results_path = os.path.join(output_dir, "evaluation_results.json")
    with open(results_path, 'w', encoding='utf-8') as f:
        json.dump(final_results, f, indent=4, ensure_ascii=False)

    print(f"\n✅ [CHECKPOINT] Evaluation finished. Results saved to {results_path}")
    print("--- Summary ---")
    print(f"Accuracy: {accuracy:.2f}% ({correct_predictions}/{num_test_samples})")

except Exception as e:
    print(f"❌ Evaluation failed: {e}")
    raise

print("\n--- Script Finished ---")



BrokenPipeError: [Errno 32] Broken pipe


Starting systematic evaluation on validation samples...


Evaluating Samples: 100%|██████████| 872/872 [00:58<00:00, 14.87it/s]


✅ [CHECKPOINT] Evaluation finished. Results saved to ./train_outputs/sst2_finetune/evaluation_results.json
--- Summary ---
Accuracy: 95.87% (836/872)

--- Script Finished ---





BrokenPipeError: [Errno 32] Broken pipe