In [1]:
# =================================================================
# Project: Adapting from Sentence-level to Document-level Sentiment Analysis
# Phase 4: Final Evaluation and Inference Demonstration
# =================================================================

# Import libraries
import torch
import time
import os
import random
import json
import numpy as np
from tqdm import tqdm
import re

# =================================================================
# ENVIRONMENT SETUP
# =================================================================

# Point every Hugging Face cache at one directory on the larger, persistent
# volume. The variables are exported here, before the Hugging Face libraries
# are imported further down in this cell.
cache_dir = "/output/huggingface_cache"
os.environ.update({
    'HF_HOME': cache_dir,
    'HF_DATASETS_CACHE': os.path.join(cache_dir, "datasets"),
    # NOTE(review): recent transformers releases reportedly deprecate this
    # variable in favour of HF_HOME — confirm before upgrading.
    'TRANSFORMERS_CACHE': os.path.join(cache_dir, "models"),
    # Prevent tokenizer parallelism issues.
    "TOKENIZERS_PARALLELISM": "false",
})

# Create the cache directory if it is missing (no error when it already exists).
os.makedirs(cache_dir, exist_ok=True)
print(f"✅ [SETUP] Hugging Face cache directory set to: {cache_dir}")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

try:
    from unsloth import FastLanguageModel
    from peft import PeftModel
    from datasets import load_dataset
    from transformers import AutoTokenizer
    print("✅ [CHECKPOINT] Imports successful")
except ImportError as e:
    print(f"❌ ImportError: {e}")
    raise

# --- Basic Setup ---
# Report whether CUDA is usable, which device string was selected, and the
# name of GPU 0 when one is present.
_cuda_ready = torch.cuda.is_available()
print("CUDA available:", _cuda_ready)
print(f"Using device: {device}")
if _cuda_ready:
    _gpu_name = torch.cuda.get_device_name(0)
    print(f"- Device: {_gpu_name}")

def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    Args:
        seed: Seed handed to every generator. Defaults to 42.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # The CUDA generators are only seeded when a GPU is actually available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed once for the whole notebook run.
set_seed(42)
print("✅ [CHECKPOINT] Seed set")

✅ [SETUP] Hugging Face cache directory set to: /output/huggingface_cache
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.




🦥 Unsloth Zoo will now patch everything to make training faster!
✅ [CHECKPOINT] Imports successful
CUDA available: True
Using device: cuda
- Device: NVIDIA GeForce RTX 3090
✅ [CHECKPOINT] Seed set


In [2]:
# =================================================================
# LOAD MODELS, DATA, AND BEST INSTRUCTION
# =================================================================

# --- Configuration ---
# Base checkpoint name handed to FastLanguageModel.from_pretrained in this cell.
STUDENT_MODEL_NAME = "unsloth/Llama-3.2-1B-unsloth-bnb-4bit"
# Location of the adapter attached to the base model via model.load_adapter.
ADAPTER_PATH = "./train_outputs/sst2_finetune/final_adapter" 
# JSON file read in this cell for the "best_instruction", "best_iteration" and
# "best_accuracy" keys.
BEST_RESULT_PATH = "./optimization_results/best_result.json"
# NOTE(review): not referenced by any cell visible here — confirm whether a
# later cell writes results to this directory.
OUTPUT_DIR = "./optimization_results"
# Used both as max_seq_length when loading the model and as the tokenizer's
# truncation length during evaluation.
STUDENT_MAX_SEQ_LENGTH = 8192

# --- Load Student Model (Fine-tuned on SST2) ---
# Load the 4-bit base checkpoint, attach the adapter stored at ADAPTER_PATH and
# switch the model to evaluation mode. Any failure is reported and re-raised so
# the notebook stops at this cell.
print(f"Loading Student Model: {STUDENT_MODEL_NAME} with adapter from {ADAPTER_PATH}")
try:
    _student_load_kwargs = dict(
        model_name=STUDENT_MODEL_NAME,
        max_seq_length=STUDENT_MAX_SEQ_LENGTH,
        load_in_4bit=True,
    )
    model, tokenizer = FastLanguageModel.from_pretrained(**_student_load_kwargs)
    model.load_adapter(ADAPTER_PATH)
    model.eval()  # evaluation mode
    print("✅ [CHECKPOINT] Student model and adapter loaded successfully.")
except Exception as load_error:
    print(f"❌ Failed to load student model: {load_error}")
    raise
    
# --- Load Best Instruction ---
# Read the optimized instruction (plus the iteration and accuracy recorded with
# it, which are only displayed) from the JSON file at BEST_RESULT_PATH. Any
# failure is reported and re-raised so the notebook stops at this cell.
print(f"Loading best instruction from {BEST_RESULT_PATH}")
try:
    # JSON is UTF-8 by specification; an explicit encoding keeps non-ASCII
    # characters in the instruction intact regardless of the platform locale.
    with open(BEST_RESULT_PATH, 'r', encoding='utf-8') as f:
        best_result_data = json.load(f)
    best_instruction = best_result_data["best_instruction"]
    # Fail fast: an empty or non-string instruction would otherwise only show
    # up after a full evaluation pass over the test set.
    if not isinstance(best_instruction, str) or not best_instruction.strip():
        raise ValueError(
            f"'best_instruction' in {BEST_RESULT_PATH} must be a non-empty string, "
            f"got {best_instruction!r}"
        )
    print("✅ [CHECKPOINT] Best instruction loaded successfully.")
    print(f"    - Best Iteration: {best_result_data['best_iteration']}")
    print(f"    - Best Accuracy: {best_result_data['best_accuracy']}")
    print(f"    - Best Instruction: \"{best_instruction}\"")
except Exception as e:
    print(f"❌ Failed to load best instruction file: {e}")
    raise

# --- Load IMDb Dataset ---
# Only the "test" split is requested. A failure is reported and re-raised so
# the notebook stops at this cell.
print("Loading IMDb test dataset...")
try:
    imdb_dataset = load_dataset("imdb", split="test")
    _num_examples = len(imdb_dataset)
    print(f"✅ [CHECKPOINT] IMDb dataset loaded with {_num_examples} examples.")
except Exception as load_error:
    print(f"❌ Failed to load IMDb dataset: {load_error}")
    raise

Loading Student Model: unsloth/Llama-3.2-1B-unsloth-bnb-4bit with adapter from ./train_outputs/sst2_finetune/final_adapter
==((====))==  Unsloth 2025.6.2: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.684 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ [CHECKPOINT] Student model and adapter loaded successfully.
Loading best instruction from ./optimization_results/best_result.json
✅ [CHECKPOINT] Best instruction loaded successfully.
    - Best Iteration: 4
    - Best Accuracy: 80.60%
    - Best Instruction: "Classify the overall sentiment of the following movie review as either "Positive" or "Negative". To make this classification, consider the entire text,

In [3]:
# =================================================================
# EVALUATION LOGIC
# =================================================================

# Maps the dataset's integer labels to the label strings that the parsed model
# prediction is compared against.
label_map = {0: "Negative", 1: "Positive"}
# Prompt skeleton with three placeholders: {instruction}, {review} and
# {response}. run_full_evaluation fills {response} with an empty string, so the
# prompt handed to the model ends right after the "### Response:" header.
BASE_PROMPT_TEMPLATE = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{review}

### Response:
{response}"""

def run_full_evaluation(instruction_text, dataset_name):
    """Evaluate the student model on every example of ``imdb_dataset``.

    For each example the prompt is built from ``BASE_PROMPT_TEMPLATE``, up to
    five new tokens are generated, and the decoded continuation is mapped to
    "Positive", "Negative" or "Unknown". "Unknown" never equals a true label,
    so such predictions count as errors.

    Relies on the notebook globals ``imdb_dataset``, ``model``, ``tokenizer``,
    ``label_map``, ``device``, ``BASE_PROMPT_TEMPLATE`` and
    ``STUDENT_MAX_SEQ_LENGTH``.

    Args:
        instruction_text: Instruction inserted verbatim into the prompt. It may
            contain literal ``{`` / ``}`` characters.
        dataset_name: Human-readable name of this run; used only in the
            progress bar and the printed report.

    Returns:
        Accuracy as a percentage (0-100). Returns 0.0 for an empty dataset.
    """
    print(f"\n--- Starting Full Evaluation with '{dataset_name}' Instruction ---")

    correct_predictions = 0
    num_samples = len(imdb_dataset)

    for i in tqdm(range(num_samples), desc=f"Evaluating ({dataset_name})"):
        sample = imdb_dataset[i]
        true_label_str = label_map.get(sample["label"])

        # Fill every placeholder in a single pass. Formatting in two passes
        # (instruction first, review later) re-parses the instruction as part
        # of a format string, so any "{" or "}" inside it raises on the second
        # pass.
        prompt = BASE_PROMPT_TEMPLATE.format(
            instruction=instruction_text,
            review=sample["text"],
            response="",
        )
        # NOTE(review): truncation presumably trims from the right, which would
        # drop the "### Response:" header for prompts longer than max_length —
        # confirm the tokenizer's truncation_side if such inputs can occur.
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=STUDENT_MAX_SEQ_LENGTH,
        ).to(device)
        prompt_length = len(inputs.input_ids[0])

        # NOTE(review): no decoding strategy is passed, so generate() follows
        # the model's generation config — confirm it is greedy if run-to-run
        # reproducibility of the accuracy matters.
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=5, pad_token_id=tokenizer.eos_token_id)

        # Decode only the newly generated tokens, not the echoed prompt.
        prediction_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        # Matching rule deliberately unchanged (case-sensitive, "Positive"
        # checked first) so accuracies stay comparable with earlier runs.
        if "Positive" in prediction_text:
            predicted_label = "Positive"
        elif "Negative" in prediction_text:
            predicted_label = "Negative"
        else:
            predicted_label = "Unknown"

        if predicted_label == true_label_str:
            correct_predictions += 1

    # Guard against an empty dataset instead of dividing by zero.
    final_accuracy = (correct_predictions / num_samples) * 100 if num_samples else 0.0

    print(f"\n--- {dataset_name.upper()} PERFORMANCE METRICS ---")
    print(f"Instruction Used: \"{instruction_text}\"")
    print(f"Total Samples Evaluated: {num_samples}")
    print(f"Correct Predictions: {correct_predictions}")
    print(f"Final Accuracy: {final_accuracy:.2f}%")
    print("---------------------------------" + "-"*len(dataset_name))
    return final_accuracy

# Baseline: the plain, un-optimized instruction.
initial_instruction = 'Classify the sentiment of the following movie review as either "Positive" or "Negative".'
initial_accuracy = run_full_evaluation(initial_instruction, "Initial")

# Optimized: the instruction loaded from the best-result file.
best_accuracy = run_full_evaluation(best_instruction, "Best Optimized")

# --- Final Comparison ---
# Side-by-side report of both accuracies and their difference in percentage points.
print(
    "\n=================================",
    "=== FINAL PERFORMANCE SUMMARY ===",
    "=================================",
    f"Initial Baseline Accuracy: {initial_accuracy:.2f}%",
    f"Optimized Best Accuracy:   {best_accuracy:.2f}%",
    "---------------------------------",
    f"Total Improvement:         {best_accuracy - initial_accuracy:+.2f}%p",
    "=================================",
    sep="\n",
)


--- Starting Full Evaluation with 'Initial' Instruction ---


Evaluating (Initial): 100%|██████████| 25000/25000 [41:18<00:00, 10.09it/s] 



--- INITIAL PERFORMANCE METRICS ---
Instruction Used: "Classify the sentiment of the following movie review as either "Positive" or "Negative"."
Total Samples Evaluated: 25000
Correct Predictions: 14933
Final Accuracy: 59.73%
----------------------------------------

--- Starting Full Evaluation with 'Best Optimized' Instruction ---


Evaluating (Best Optimized): 100%|██████████| 25000/25000 [31:08<00:00, 13.38it/s]


--- BEST OPTIMIZED PERFORMANCE METRICS ---
Instruction Used: "Classify the overall sentiment of the following movie review as either "Positive" or "Negative". To make this classification, consider the entire text, not just a single sentence. If the review expresses a general attitude that is overwhelmingly positive (e.g., praising the movie, enjoying the experience), output "Positive". If the review expresses a general attitude that is overwhelmingly negative (e.g., criticizing the movie, disliking the experience), output "Negative". When dealing with mixed reviews that mention both positive and negative aspects, look for the overall tone of the review. If the review's overall tone is more positive than negative, output "Positive". If the review's overall tone is more negative than positive, output "Negative". If the review is neutral, meaning it doesn't express a clear positive or negative attitude, output "Neutral"."
Total Samples Evaluated: 25000
Correct Predictions: 19911
Final Ac


