# Phase 0: Environment Setup & Verification

## 0.2: Mount Google Drive

In [1]:
# Mount the user's Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import drive
drive.mount('/content/drive')

# Create project directory in Drive
import os
project_path = '/content/drive/MyDrive/NLP_Project'
os.makedirs(project_path, exist_ok=True)
os.makedirs(f'{project_path}/data', exist_ok=True)
os.makedirs(f'{project_path}/models', exist_ok=True)
os.makedirs(f'{project_path}/results', exist_ok=True)
os.makedirs(f'{project_path}/checkpoints', exist_ok=True)

print(f"✓ Project directory created at: {project_path}")
print(f"\nDirectory structure:")
!ls -la /content/drive/MyDrive/NLP_Project/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Project directory created at: /content/drive/MyDrive/NLP_Project

Directory structure:
total 16
drwx------ 2 root root 4096 Dec 10 00:42 checkpoints
drwx------ 2 root root 4096 Nov 19 01:28 data
drwx------ 2 root root 4096 Dec 10 00:13 models
drwx------ 2 root root 4096 Nov 19 01:28 results


## 0.3: Install Required Libraries

In [3]:
print("Installing required packages...")
!pip install -q peft accelerate bitsandbytes
!pip install -q sentence-transformers faiss-cpu
!pip install -q rouge-score bert-score
!pip install -q datasets
!pip install -U bitsandbytes accelerate

print("\n" + "="*50)
print("VERIFYING INSTALLATIONS")
print("="*50)

# Verify installations
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import peft
import sentence_transformers
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

print("✓ All core libraries imported successfully!")
print(f"\nLibrary versions:")
print(f"  PyTorch: {torch.__version__}")
print(f"  Transformers: {transformers.__version__}")
print(f"  PEFT: {peft.__version__}")
print(f"  Sentence Transformers: {sentence_transformers.__version__}")

Installing required packages...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone

VERIFYING INSTALLATIONS
✓ All core libraries imported successfully!

Library versions:
  PyTorch: 2.9.0+cu126
  Transformers: 4.57.3
  PEFT: 0.18.0
  Sentence Transformers: 5.2.0


# Hybrid System

In [5]:
import torch
import pandas as pd
import time
import pickle
import faiss
from sentence_transformers import SentenceTransformer
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# Section banner for the cell that defines and instantiates the pipeline.
print("="*60)
print("4.1: End-to-End Pipeline System")
print("="*60)

# Build complete pipeline class
class HybridChatbot:
    """Answer a customer query by retrieval or by LLM generation.

    A pickled classifier (fed TF-IDF features) labels each query. Label 0
    ("deterministic") is answered with the nearest stored response from a
    FAISS index; any other label is answered by a Phi-2 model with a LoRA
    adapter applied.
    """

    def __init__(self, project_path='/content/drive/MyDrive/NLP_Project'):
        """Load the classifier, the retrieval index and the fine-tuned LLM.

        Args:
            project_path: Folder that contains ``models/classifier``,
                ``models/retrieval`` and ``checkpoints/phi2_lora/final_model``.
                The default is the Drive location used by this notebook.
        """
        print("\n1. Loading all components...")

        classifier_dir = f'{project_path}/models/classifier'
        retrieval_dir = f'{project_path}/models/retrieval'
        adapter_dir = f'{project_path}/checkpoints/phi2_lora/final_model'

        # Classifier
        # NOTE(review): pickle.load can execute arbitrary code stored in the
        # file; only load artifacts that this project produced itself.
        print("   - Loading classifier...")
        with open(f'{classifier_dir}/logistic_regression.pkl', 'rb') as f:
            self.classifier = pickle.load(f)
        with open(f'{classifier_dir}/tfidf_vectorizer.pkl', 'rb') as f:
            self.tfidf = pickle.load(f)

        # Retrieval system: query encoder, vector index, and the table whose
        # rows the index positions refer to.
        print("   - Loading retrieval system...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.retrieval_index = faiss.read_index(f'{retrieval_dir}/faiss_index.bin')
        self.retrieval_data = pd.read_csv(f'{retrieval_dir}/deterministic_qa_pairs.csv')

        # LLM: base weights first, then the LoRA adapter on top.
        # NOTE(review): recent transformers releases warn that `torch_dtype`
        # is deprecated in favour of `dtype`; the old name is kept so older
        # releases keep working.
        print("   - Loading fine-tuned LLM...")
        base_model = AutoModelForCausalLM.from_pretrained(
            "microsoft/phi-2",
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16
        )
        self.llm_model = PeftModel.from_pretrained(base_model, adapter_dir)
        self.llm_tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

        print("   ✓ All components loaded!\n")

    def classify_query(self, query):
        """Returns 0 for deterministic, 1 for indeterministic"""
        query_tfidf = self.tfidf.transform([query])
        return self.classifier.predict(query_tfidf)[0]

    def retrieve_response(self, query, k=1):
        """Semantic search for deterministic queries.

        Args:
            query: Customer question as plain text.
            k: Number of neighbours requested from the index. Only the best
                hit is returned, whatever the value of ``k``.

        Returns:
            Tuple ``(response, distance)`` for the closest stored entry.
        """
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True)
        # The index is queried with float32 vectors.
        distances, indices = self.retrieval_index.search(query_embedding.astype('float32'), k)
        best_row = self.retrieval_data.iloc[indices[0][0]]
        return best_row['response'], distances[0][0]

    def generate_response(self, query, max_tokens=150):
        """LLM generation for indeterministic queries.

        Args:
            query: Customer question as plain text.
            max_tokens: Upper bound on newly generated tokens.

        Returns:
            The assistant's reply to ``query`` with surrounding whitespace
            removed.
        """
        prompt = f"Customer: {query}\nAssistant:"
        inputs = self.llm_tokenizer(prompt, return_tensors="pt").to(self.llm_model.device)

        with torch.no_grad():
            outputs = self.llm_model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=self.llm_tokenizer.eos_token_id
            )

        # The generated sequence starts with the prompt tokens. Decode only
        # what follows them, so an "Assistant:" inside the query or inside the
        # generated text can no longer shift where the reply starts.
        prompt_len = inputs["input_ids"].shape[1]
        completion = self.llm_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
        # The model may keep going and invent a follow-up customer turn; the
        # answer to this query ends where that turn begins.
        return completion.split("\nCustomer:")[0].strip()

    def respond(self, query):
        """Main pipeline: classify → route → respond.

        Returns:
            Dict with keys ``query``, ``route`` ("RETRIEVAL" or
            "LLM_GENERATION"), ``response``, ``latency_ms`` and ``confidence``
            (``None`` on the LLM route).
        """
        # perf_counter is a monotonic high-resolution clock, which is what an
        # elapsed-time measurement needs.
        start_time = time.perf_counter()

        # Step 1: Classify
        prediction = self.classify_query(query)

        # Step 2: Get response
        if prediction == 0:  # Deterministic
            route = "RETRIEVAL"
            response, distance = self.retrieve_response(query)
            # Convert distance to confidence.
            # NOTE(review): assumes the index reports a non-negative distance
            # where smaller means closer — confirm the index metric.
            confidence = 1.0 / (1.0 + distance)
        else:  # Indeterministic
            route = "LLM_GENERATION"
            response = self.generate_response(query)
            confidence = None

        latency = (time.perf_counter() - start_time) * 1000

        return {
            'query': query,
            'route': route,
            'response': response,
            'latency_ms': latency,
            'confidence': confidence
        }

# Initialize chatbot
# Construction loads the classifier, the retrieval index and the LLM up front.
chatbot = HybridChatbot()

4.1: End-to-End Pipeline System

1. Loading all components...
   - Loading classifier...
   - Loading retrieval system...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

   - Loading fine-tuned LLM...


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

   ✓ All components loaded!



# Latency Comparisons

In [7]:
print("\n" + "="*60)
print("LATENCY COMPARISONS (EXPANDED)")
print("="*60)

df_test = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/test_dataset.csv')

# Sample by natural distribution: 39.8% det, 60.2% indet
n_total = 500
n_det = int(0.398 * n_total)   # 199
n_indet = n_total - n_det       # 301

# A fixed random_state keeps the benchmark sample identical across runs.
det_sample = df_test[df_test['label'] == 0].sample(n_det, random_state=42)
indet_sample = df_test[df_test['label'] == 1].sample(n_indet, random_state=42)
test_sample = pd.concat([det_sample, indet_sample]).reset_index(drop=True)

print(f"Selected {len(test_sample)} test queries (natural distribution):")
print(f"  Deterministic: {len(det_sample)} ({len(det_sample)/n_total*100:.1f}%)")
print(f"  Indeterministic: {len(indet_sample)} ({len(indet_sample)/n_total*100:.1f}%)")

# Track latencies
hybrid_latencies = []
finetuned_latencies = []
retrieval_only_latencies = []

comparison_results = []

print("\n" + "-"*60)
print("TESTING QUERIES (this will take a while...)")
print("-"*60)

# Every query is answered three ways so the three latency series are paired.
for i, (idx, row) in enumerate(test_sample.iterrows()):
    query = row['instruction']
    true_label = row['label']
    category = row['category']

    # 1. HYBRID SYSTEM (respond() reports its own end-to-end latency)
    hybrid_result = chatbot.respond(query)
    hybrid_latencies.append(hybrid_result['latency_ms'])

    # 2. FINE-TUNED LLM (all queries through fine-tuned model)
    # perf_counter is the monotonic, high-resolution clock meant for
    # measuring elapsed time.
    start_time = time.perf_counter()
    prompt = f"Customer: {query}\nAssistant:"
    inputs = chatbot.llm_tokenizer(prompt, return_tensors="pt").to(chatbot.llm_model.device)

    with torch.no_grad():
        outputs = chatbot.llm_model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=chatbot.llm_tokenizer.eos_token_id
        )

    # Decoding stays inside the timed span: it is part of producing a reply.
    finetuned_response = chatbot.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
    finetuned_latency = (time.perf_counter() - start_time) * 1000
    finetuned_latencies.append(finetuned_latency)

    # 3. RETRIEVAL-ONLY
    start_time = time.perf_counter()
    retrieval_response, distance = chatbot.retrieve_response(query)
    retrieval_latency = (time.perf_counter() - start_time) * 1000
    retrieval_only_latencies.append(retrieval_latency)

    comparison_results.append({
        'query': query,
        'category': category,
        'true_label': true_label,
        'hybrid_latency_ms': hybrid_result['latency_ms'],
        'hybrid_route': hybrid_result['route'],
        'finetuned_latency_ms': finetuned_latency,
        'retrieval_latency_ms': retrieval_latency
    })

    # Progress update every 50 queries, reported once the query has actually
    # been processed by all three systems.
    if (i + 1) % 50 == 0:
        print(f"Progress: {i + 1}/{len(test_sample)} queries completed")

print(f"\nCompleted all {len(test_sample)} queries")

# Statistics with confidence intervals
from scipy import stats
import numpy as np

def compute_ci(data, confidence=0.95):
    """Mean of ``data`` with a two-sided Student-t confidence interval.

    Args:
        data: Sequence of numeric observations.
        confidence: Coverage of the interval (0.95 gives a 95% interval).

    Returns:
        Tuple ``(mean, lower, upper)``; the bounds are symmetric around the mean.
    """
    center = np.mean(data)
    dof = len(data) - 1
    # Two-sided interval: split the remaining probability between both tails.
    upper_quantile = (1 + confidence) / 2
    half_width = stats.sem(data) * stats.t.ppf(upper_quantile, dof)
    return center, center - half_width, center + half_width

print("\n" + "="*60)
print(f"LATENCY COMPARISON SUMMARY (n={n_total})")
print("="*60)

# Mean and 95% confidence bounds for each of the three systems.
hybrid_mean, hybrid_lo, hybrid_hi = compute_ci(hybrid_latencies)
finetuned_mean, finetuned_lo, finetuned_hi = compute_ci(finetuned_latencies)
retrieval_mean, retrieval_lo, retrieval_hi = compute_ci(retrieval_only_latencies)

print("\nAverage Latency (95% CI):")
print(f"  Hybrid System:    {hybrid_mean:>8.0f} ms ({hybrid_lo:.0f}, {hybrid_hi:.0f})")
print(f"  Fine-tuned LLM:   {finetuned_mean:>8.0f} ms ({finetuned_lo:.0f}, {finetuned_hi:.0f})")
print(f"  Retrieval-Only:   {retrieval_mean:>8.0f} ms ({retrieval_lo:.0f}, {retrieval_hi:.0f})")

# Speedup
speedup = finetuned_mean / hybrid_mean
print(f"\nSpeedup: Hybrid is {speedup:.2f}x faster than pure LLM")

# Breakdown by route
print("\n" + "="*60)
print("HYBRID SYSTEM BREAKDOWN BY ROUTE")
print("="*60)

results_df = pd.DataFrame(comparison_results)

retrieval_routed = results_df[results_df['hybrid_route'] == 'RETRIEVAL']
llm_routed = results_df[results_df['hybrid_route'] == 'LLM_GENERATION']

# Defined up front so the summary below never touches an unbound name when a
# route received no queries.
ret_mean = None
llm_mean = None

print(f"\nRetrieval path: {len(retrieval_routed)}/{n_total} ({len(retrieval_routed)/n_total*100:.1f}%)")
if len(retrieval_routed) > 0:
    ret_mean, ret_lo, ret_hi = compute_ci(retrieval_routed['hybrid_latency_ms'])
    print(f"  Avg latency: {ret_mean:.0f} ms ({ret_lo:.0f}, {ret_hi:.0f})")

print(f"\nLLM path: {len(llm_routed)}/{n_total} ({len(llm_routed)/n_total*100:.1f}%)")
if len(llm_routed) > 0:
    llm_mean, llm_lo, llm_hi = compute_ci(llm_routed['hybrid_latency_ms'])
    print(f"  Avg latency: {llm_mean:.0f} ms ({llm_lo:.0f}, {llm_hi:.0f})")

# Save results
import os
import json

output_path = '/content/drive/MyDrive/NLP_Project/results/'
os.makedirs(output_path, exist_ok=True)

# output_path already ends with a slash; os.path.join avoids the doubled
# separator that plain string concatenation put into the saved/printed paths.
csv_path = os.path.join(output_path, 'latency_comparison_500.csv')
json_path = os.path.join(output_path, 'latency_summary_500.json')

results_df.to_csv(csv_path, index=False)

# Save summary (numeric results are converted to built-in floats for JSON).
summary_stats = {
    'sample_size': n_total,
    'distribution': {
        'deterministic': n_det,
        'indeterministic': n_indet
    },
    'latency_ms': {
        'hybrid': {'mean': float(hybrid_mean), 'ci_low': float(hybrid_lo), 'ci_high': float(hybrid_hi)},
        'finetuned_llm': {'mean': float(finetuned_mean), 'ci_low': float(finetuned_lo), 'ci_high': float(finetuned_hi)},
        'retrieval_only': {'mean': float(retrieval_mean), 'ci_low': float(retrieval_lo), 'ci_high': float(retrieval_hi)}
    },
    'speedup_vs_llm': float(speedup),
    'hybrid_routing': {
        'retrieval_count': len(retrieval_routed),
        'llm_count': len(llm_routed),
        'retrieval_avg_ms': float(ret_mean) if ret_mean is not None else None,
        'llm_avg_ms': float(llm_mean) if llm_mean is not None else None
    }
}

with open(json_path, 'w') as f:
    json.dump(summary_stats, f, indent=2)

print("\n✓ Results saved to:")
print(f"   - {csv_path}")
print(f"   - {json_path}")

print("\n" + "="*60)
print("✓ LATENCY COMPARISON COMPLETE")
print("="*60)


LATENCY COMPARISONS (EXPANDED)
Selected 500 test queries (natural distribution):
  Deterministic: 199 (39.8%)
  Indeterministic: 301 (60.2%)

------------------------------------------------------------
TESTING QUERIES (this will take a while...)
------------------------------------------------------------
Progress: 50/500 queries completed
Progress: 100/500 queries completed
Progress: 150/500 queries completed
Progress: 200/500 queries completed
Progress: 250/500 queries completed
Progress: 300/500 queries completed
Progress: 350/500 queries completed
Progress: 400/500 queries completed
Progress: 450/500 queries completed
Progress: 500/500 queries completed

Completed all 500 queries

LATENCY COMPARISON SUMMARY (n=500)

Average Latency (95% CI):
  Hybrid System:        5092 ms (4723, 5460)
  Fine-tuned LLM:       8276 ms (8185, 8366)
  Retrieval-Only:          8 ms (8, 8)

Speedup: Hybrid is 1.63x faster than pure LLM

HYBRID SYSTEM BREAKDOWN BY ROUTE

Retrieval path: 199/500 (39.8%)

# Baseline Response Quality Comparisons

In [None]:
print("\n" + "="*60)
print("BASELINE RESPONSE QUALITY COMPARISONS")
print("="*60)

# Load non-fine-tuned Phi-2 model for baseline
# NOTE(review): the chatbot built earlier already loaded Phi-2 (with a LoRA
# adapter on top); this loads a second, adapter-free copy — confirm the
# accelerator has room for both.
print("\nLoading non-fine-tuned Phi-2 model...")
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

base_model_name = "microsoft/phi-2"
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto"
)
# Reuse the end-of-sequence token as the padding token.
base_tokenizer.pad_token = base_tokenizer.eos_token
print("✓ Base Phi-2 model loaded")

# Build retrieval system from indeterministic training data
print("\nBuilding retrieval system from indeterministic training queries...")
print("Loading MASTER TRAIN dataset...")
df_train = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/train_dataset.csv')

# Filter for Indeterministic (LLM) rows from TRAIN only
# (label == 1; the index is reset so row positions run from 0 again).
df_indet_train = df_train[df_train['label'] == 1].reset_index(drop=True)
print(f"Indeterministic training samples: {len(df_indet_train)}")

# Build retrieval embeddings for indeterministic queries
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

retrieval_model = SentenceTransformer('all-MiniLM-L6-v2')
# Two parallel lists: position i of one belongs to position i of the other.
indet_instructions = df_indet_train['instruction'].tolist()
indet_responses = df_indet_train['response'].tolist()

# Every training instruction is embedded once; lookups later compare a query
# embedding against this collection.
print("Encoding indeterministic training queries...")
indet_embeddings = retrieval_model.encode(indet_instructions, show_progress_bar=True)
print(f"✓ Retrieval system built with {len(indet_embeddings)} indeterministic examples")

# Function to retrieve from indeterministic database
def retrieve_from_indet(query, top_k=1):
    """Look up the stored response whose training query is closest to ``query``.

    Closeness is the cosine distance between the embedding of ``query`` and
    the pre-computed ``indet_embeddings``. ``top_k`` is accepted but not used:
    only the single nearest match is returned.

    Returns:
        Tuple ``(response, distance)`` for the nearest training example.
    """
    encoded_query = retrieval_model.encode([query])
    # cosine_distances yields one row per query; there is exactly one query.
    all_distances = cosine_distances(encoded_query, indet_embeddings)[0]
    nearest = np.argmin(all_distances)
    best_response = indet_responses[nearest]
    return best_response, all_distances[nearest]

# Sample diverse test queries from specific categories
print("\nSampling test queries for qualitative comparison...")
df_test = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/test_dataset.csv')

# Select queries only from ACCOUNT, ORDER, FEEDBACK categories
selected_queries = []
categories_to_sample = ['ACCOUNT', 'ORDER', 'FEEDBACK']

for category in categories_to_sample:
    # Get 2 indeterministic queries from each category
    cat_queries = df_test[(df_test['category'] == category) & (df_test['label'] == 1)].sample(2, random_state=42)
    selected_queries.extend(cat_queries.to_dict('records'))

print(f"Selected {len(selected_queries)} indeterministic test queries from {categories_to_sample}")


def _generate_reply(model, tokenizer, query, max_new_tokens=150):
    """Sample one assistant reply to ``query`` from ``model``.

    Uses the same prompt template and sampling settings for every model so
    the systems are compared on equal terms.

    Returns:
        The reply text with surrounding whitespace removed.
    """
    prompt = f"Customer: {query}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # The generated sequence starts with the prompt tokens; decode only what
    # follows. Splitting the full text on "Assistant:" and keeping the last
    # piece would return a later, invented turn whenever the model continues
    # the dialogue on its own.
    prompt_len = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    # Stop at the first invented follow-up customer turn, if any.
    return completion.split("\nCustomer:")[0].strip()


def _print_section(title):
    """Print ``title`` framed by two 80-character dashed rules."""
    print(f"\n{'-'*80}")
    print(title)
    print(f"{'-'*80}")


# Store all responses
comparison_results = []

print("\n" + "="*80)
print("GENERATING RESPONSES FROM ALL SYSTEMS")
print("="*80)

for i, query_data in enumerate(selected_queries, 1):
    query = query_data['instruction']
    true_label = query_data['label']
    category = query_data['category']
    reference_response = query_data['response']

    print(f"\n{'='*80}")
    print(f"QUERY {i}/{len(selected_queries)}")
    print(f"{'='*80}")
    print(f"\nQuery: {query}")
    print(f"Category: {category} | True Label: {true_label}")

    # Each system runs in its own try block on purpose: one failing system
    # is recorded as an "ERROR: ..." response instead of aborting the whole
    # comparison.

    # SYSTEM 1: HYBRID (classifier → retrieval/fine-tuned LLM)
    _print_section("HYBRID SYSTEM")
    try:
        hybrid_result = chatbot.respond(query)
        hybrid_response = hybrid_result['response']
        hybrid_route = hybrid_result['route']
        print(f"Route: {hybrid_route}")
        print(f"Response:\n{hybrid_response}")
    except Exception as e:
        hybrid_response = f"ERROR: {str(e)}"
        hybrid_route = "ERROR"
        print(f"ERROR: {str(e)}")

    # SYSTEM 2: FINE-TUNED LLM (zero-shot, no routing)
    _print_section("FINE-TUNED LLM (Zero-Shot, No Routing)")
    try:
        finetuned_response = _generate_reply(chatbot.llm_model, chatbot.llm_tokenizer, query)
        print(f"Response:\n{finetuned_response}")
    except Exception as e:
        finetuned_response = f"ERROR: {str(e)}"
        print(f"ERROR: {str(e)}")

    # SYSTEM 3: BASE PHI-2 (non-fine-tuned)
    _print_section("BASE PHI-2 (Non-Fine-Tuned)")
    try:
        base_response = _generate_reply(base_model, base_tokenizer, query)
        print(f"Response:\n{base_response}")
    except Exception as e:
        base_response = f"ERROR: {str(e)}"
        print(f"ERROR: {str(e)}")

    # SYSTEM 4: RETRIEVAL FROM INDETERMINISTIC TRAINING DATA
    _print_section("RETRIEVAL-ONLY (Trained on Indeterministic Queries)")
    try:
        indet_retrieval_response, distance = retrieve_from_indet(query)
        print(f"Distance: {distance:.3f}")
        print(f"Response:\n{indet_retrieval_response}")
    except Exception as e:
        indet_retrieval_response = f"ERROR: {str(e)}"
        print(f"ERROR: {str(e)}")

    # REFERENCE
    _print_section("REFERENCE (Ground Truth)")
    print(f"Response:\n{reference_response}")

    print(f"\n{'='*80}\n")

    # Store results
    comparison_results.append({
        'query': query,
        'category': category,
        'true_label': true_label,
        'reference_response': reference_response,
        'hybrid_response': hybrid_response,
        'hybrid_route': hybrid_route,
        'finetuned_response': finetuned_response,
        'base_phi2_response': base_response,
        'indet_retrieval_response': indet_retrieval_response
    })

# Save results
print("\n" + "="*60)
print("SAVING RESULTS")
print("="*60)

import os
import json

results_df = pd.DataFrame(comparison_results)
output_path = '/content/drive/MyDrive/NLP_Project/results/'
os.makedirs(output_path, exist_ok=True)

# output_path already ends with a slash; os.path.join avoids the doubled
# separator that plain string concatenation put into the saved/printed paths.
comparison_csv_path = os.path.join(output_path, 'baseline_comparison_indet.csv')
summary_json_path = os.path.join(output_path, 'baseline_summary_indet.json')

results_df.to_csv(comparison_csv_path, index=False)
print(f"✓ Results saved to {comparison_csv_path}")

# Summary statistics
print("\n" + "="*60)
print("SUMMARY STATISTICS")
print("="*60)

n_results = len(comparison_results)
# Denominator for the ratios below; falls back to 1 so an empty result list
# reports zeros instead of raising ZeroDivisionError.
denominator = n_results or 1

# Count routing in hybrid
hybrid_retrieval_count = sum(1 for r in comparison_results if r['hybrid_route'] == 'RETRIEVAL')
hybrid_llm_count = sum(1 for r in comparison_results if r['hybrid_route'] == 'LLM_GENERATION')

print("\nHybrid System Routing:")
print(f"  Retrieval: {hybrid_retrieval_count}/{n_results} ({hybrid_retrieval_count/denominator*100:.1f}%)")
print(f"  LLM Generation: {hybrid_llm_count}/{n_results} ({hybrid_llm_count/denominator*100:.1f}%)")


def _avg_word_count(key):
    """Mean number of whitespace-separated words stored under ``key``.

    A value that is not a string (for example a missing CSV cell read as
    NaN) counts as zero words instead of raising.
    """
    total_words = sum(
        len(r[key].split()) if isinstance(r[key], str) else 0
        for r in comparison_results
    )
    return total_words / denominator


# Average response lengths
avg_hybrid_len = _avg_word_count('hybrid_response')
avg_finetuned_len = _avg_word_count('finetuned_response')
avg_base_len = _avg_word_count('base_phi2_response')
avg_indet_retrieval_len = _avg_word_count('indet_retrieval_response')
avg_reference_len = _avg_word_count('reference_response')

print("\nAverage Response Length (words):")
print(f"  Hybrid:                          {avg_hybrid_len:.1f}")
print(f"  Fine-Tuned LLM:                  {avg_finetuned_len:.1f}")
print(f"  Base Phi-2:                      {avg_base_len:.1f}")
print(f"  Retrieval (Indet Training):      {avg_indet_retrieval_len:.1f}")
print(f"  Reference:                       {avg_reference_len:.1f}")

# Save summary
summary = {
    "timestamp": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
    "num_queries_tested": n_results,
    "categories": categories_to_sample,
    "retrieval_source": "indeterministic_training_data",
    "indet_training_samples": len(df_indet_train),
    "hybrid_routing": {
        "retrieval": hybrid_retrieval_count,
        "llm": hybrid_llm_count
    },
    "avg_response_lengths": {
        "hybrid": float(avg_hybrid_len),
        "finetuned": float(avg_finetuned_len),
        "base_phi2": float(avg_base_len),
        "indet_retrieval": float(avg_indet_retrieval_len),
        "reference": float(avg_reference_len)
    }
}

with open(summary_json_path, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\n✓ Summary saved to {summary_json_path}")

print("\n" + "="*60)
print("✓ BASELINE COMPARISON COMPLETE")
print("="*60)


BASELINE RESPONSE QUALITY COMPARISONS

Loading non-fine-tuned Phi-2 model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✓ Base Phi-2 model loaded

Building retrieval system from indeterministic training queries...
Loading MASTER TRAIN dataset...
Indeterministic training samples: 9577
Encoding indeterministic training queries...


Batches:   0%|          | 0/300 [00:00<?, ?it/s]

✓ Retrieval system built with 9577 indeterministic examples

Sampling test queries for qualitative comparison...
Selected 6 indeterministic test queries from ['ACCOUNT', 'ORDER', 'FEEDBACK']

GENERATING RESPONSES FROM ALL SYSTEMS

QUERY 1/6

Query: what do i need to do to recover my user account pin code
Category: ACCOUNT | True Label: 1

--------------------------------------------------------------------------------
HYBRID SYSTEM
--------------------------------------------------------------------------------
Route: LLM_GENERATION
Response:
I'll take care of it! I understand the importance of recovering your user account PIN code. Let's work together to ensure a smooth recovery process:

1. Begin by visiting the "{{Login Page URL}}" of our platform.
2. Look for the option that says "{{Forgot PIN}}" and click on it.
3. You will be prompted to provide the email address associated with your user account.
4. Once you've entered your email, keep an eye on your inbox. You should receive an

LIMITATIONS OF EACH APPROACH

Hybrid System:
- Depends on classifier accuracy - misrouting leads to poor responses
- Retrieval path fails on queries not well-represented in training data
- LLM path slower than pure retrieval (5-6 seconds vs <10ms)
- Complexity: requires maintaining 3 separate components (classifier, retrieval index, fine-tuned LLM)
- Placeholder templates ({{Order Number}}) require post-processing to fill with actual values
- Classifier may struggle with edge cases that blend deterministic and indeterministic characteristics
- System performance degrades if any component fails

Zero-Shot LLM (Base Phi-2, No Fine-Tuning):
- Generic responses lacking domain-specific templates and tone
- No knowledge of company-specific policies, business hours, or procedures
- Slower for all queries - no fast retrieval path for simple factual questions
- May hallucinate facts about policies, return windows, or contact information
- Cannot leverage template-based responses for deterministic queries
- Computational cost: all queries require full LLM inference (5-6 seconds each)
- Inconsistent response quality - may be off-topic for straightforward factual questions
- Lacks empathetic customer service tone present in fine-tuned model

Retrieval-Only (Force All Queries Through Semantic Search):
- Cannot handle novel or complex queries requiring contextual reasoning
- Fails on queries phrased differently from training examples
- No personalization or adaptation to user's specific situation
- Poor performance on edge cases not represented in training set
- Cannot combine information from multiple sources
- Limited to template responses - lacks conversational flexibility
- May retrieve semantically similar but contextually wrong responses
- No ability to handle multi-step reasoning or troubleshooting
- Fails completely on indeterministic queries requiring empathy or problem-solving

# Test 2