# Phase 0: Environment Setup & Verification

## 0.2: Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import drive
drive.mount('/content/drive')

# Create project directory in Drive
import os
project_path = '/content/drive/MyDrive/NLP_Project'
os.makedirs(project_path, exist_ok=True)
os.makedirs(f'{project_path}/data', exist_ok=True)
os.makedirs(f'{project_path}/models', exist_ok=True)
os.makedirs(f'{project_path}/results', exist_ok=True)
os.makedirs(f'{project_path}/checkpoints', exist_ok=True)

print(f"✓ Project directory created at: {project_path}")
print(f"\nDirectory structure:")
!ls -la /content/drive/MyDrive/NLP_Project/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Project directory created at: /content/drive/MyDrive/NLP_Project

Directory structure:
total 16
drwx------ 2 root root 4096 Dec 10 00:42 checkpoints
drwx------ 2 root root 4096 Nov 19 01:28 data
drwx------ 2 root root 4096 Dec 10 00:13 models
drwx------ 2 root root 4096 Nov 19 01:28 results


## 0.3: Install Required Libraries

In [3]:
print("Installing required packages...")
!pip install -q peft accelerate bitsandbytes
!pip install -q sentence-transformers faiss-cpu
!pip install -q rouge-score bert-score
!pip install -q datasets
!pip install -U bitsandbytes accelerate

print("\n" + "="*50)
print("VERIFYING INSTALLATIONS")
print("="*50)

# Verify installations
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import peft
import sentence_transformers
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

print("✓ All core libraries imported successfully!")
print(f"\nLibrary versions:")
print(f"  PyTorch: {torch.__version__}")
print(f"  Transformers: {transformers.__version__}")
print(f"  PEFT: {peft.__version__}")
print(f"  Sentence Transformers: {sentence_transformers.__version__}")

Installing required packages...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone

VERIFYING INSTALLATIONS
✓ All core libraries imported successfully!

Library versions:
  PyTorch: 2.9.0+cu126
  Transformers: 4.57.3
  PEFT: 0.18.0
  Sentence Transformers: 5.2.0


# Phase 4: End-to-End Pipeline Integration

## 4.1: End-to-End Pipeline System

In [None]:
import torch
import pandas as pd
import time
import pickle
import faiss
from sentence_transformers import SentenceTransformer
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

print("="*60)
print("4.1: End-to-End Pipeline System")
print("="*60)

# Build complete pipeline class
class HybridChatbot:
    def __init__(self):
        print("\n1. Loading all components...")

        # Classifier
        print("   - Loading classifier...")
        with open('/content/drive/MyDrive/NLP_Project/models/classifier/logistic_regression.pkl', 'rb') as f:
            self.classifier = pickle.load(f)
        with open('/content/drive/MyDrive/NLP_Project/models/classifier/tfidf_vectorizer.pkl', 'rb') as f:
            self.tfidf = pickle.load(f)

        # Retrieval system
        print("   - Loading retrieval system...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.retrieval_index = faiss.read_index('/content/drive/MyDrive/NLP_Project/models/retrieval/faiss_index.bin')
        self.retrieval_data = pd.read_csv('/content/drive/MyDrive/NLP_Project/models/retrieval/deterministic_qa_pairs.csv')

        # LLM
        print("   - Loading fine-tuned LLM...")
        base_model = AutoModelForCausalLM.from_pretrained(
            "microsoft/phi-2",
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16
        )
        self.llm_model = PeftModel.from_pretrained(
            base_model,
            "/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora/final_model"
        )
        self.llm_tokenizer = AutoTokenizer.from_pretrained(
            "/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora/final_model"
        )

        print("   ✓ All components loaded!\n")

    def classify_query(self, query):
        """Returns 0 for deterministic, 1 for indeterministic"""
        query_tfidf = self.tfidf.transform([query])
        return self.classifier.predict(query_tfidf)[0]

    def retrieve_response(self, query, k=1):
        """Semantic search for deterministic queries"""
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True)
        distances, indices = self.retrieval_index.search(query_embedding.astype('float32'), k)
        return self.retrieval_data.iloc[indices[0][0]]['response'], distances[0][0]

    def generate_response(self, query, max_tokens=150):
        """LLM generation for indeterministic queries"""
        prompt = f"Customer: {query}\nAssistant:"
        inputs = self.llm_tokenizer(prompt, return_tensors="pt").to(self.llm_model.device)

        with torch.no_grad():
            outputs = self.llm_model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=self.llm_tokenizer.eos_token_id
            )

        response = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.split("Assistant:")[-1].strip()

    def respond(self, query):
        """Main pipeline: classify → route → respond"""
        start_time = time.time()

        # Step 1: Classify
        prediction = self.classify_query(query)
        route = "RETRIEVAL" if prediction == 0 else "LLM_GENERATION"

        # Step 2: Get response
        if prediction == 0:  # Deterministic
            response, distance = self.retrieve_response(query)
            confidence = 1.0 / (1.0 + distance)  # Convert distance to confidence
        else:  # Indeterministic
            response = self.generate_response(query)
            confidence = None

        latency = (time.time() - start_time) * 1000

        return {
            'query': query,
            'route': route,
            'response': response,
            'latency_ms': latency,
            'confidence': confidence
        }

# Initialize chatbot
chatbot = HybridChatbot()

# Test it
print(chatbot.respond("I'm unhappy with the order I received"))

4.1: End-to-End Pipeline System

1. Loading all components...
   - Loading classifier...
   - Loading retrieval system...
   - Loading fine-tuned LLM...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

   ✓ All components loaded!

{'query': "I'm unhappy with the order I received", 'route': 'LLM_GENERATION', 'response': "I'm sorry to hear that you're unhappy with the order you received. I understand how frustrating this can be, and I apologize for any inconvenience caused. To better assist you, could you please provide me with some specific details about the issues you encountered with the order? This will help me investigate the matter thoroughly and find a suitable solution for you. Your feedback is highly valuable to us, and we appreciate your patience as we work towards resolving this matter. How can I assist you further? Is there anything else I can do to help? Remember, your satisfaction is our top priority, and we're committed to ensuring your order meets your expectations. Thank you for bringing this to our attention, and we'll do our best to make things right.", 'latency_ms': 13497.104406356812, 'confidence': None}


## 4.2: End-to-End Pipeline Testing



In [None]:
print("\n" + "="*60)
print("INTEGRATION TESTING")
print("="*60)

# Load test dataset
print("\nLoading test dataset...")
df_test = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/test_dataset.csv')
print(f"Loaded {len(df_test)} test examples")

# Sample diverse queries from test set
print("\nSampling diverse queries for integration testing...")

# Sample strategy: Get examples from each category + label combination
sample_queries = []

# Get unique categories
categories = df_test['category'].unique()

# For each category, try to get both deterministic and indeterministic examples
for category in sorted(categories):
    cat_df = df_test[df_test['category'] == category]

    # Try to get 1 deterministic (label=0)
    det_samples = cat_df[cat_df['label'] == 0]
    if len(det_samples) > 0:
        sample = det_samples.sample(1, random_state=42).iloc[0]
        sample_queries.append(sample.to_dict())

    # Try to get 1 indeterministic (label=1)
    indet_samples = cat_df[cat_df['label'] == 1]
    if len(indet_samples) > 0:
        sample = indet_samples.sample(1, random_state=42).iloc[0]
        sample_queries.append(sample.to_dict())

# Convert to DataFrame
df_test_queries = pd.DataFrame(sample_queries)

print(f"Selected {len(df_test_queries)} test queries spanning {df_test_queries['category'].nunique()} categories")
print(f"  Deterministic (label=0): {len(df_test_queries[df_test_queries['label'] == 0])}")
print(f"  Indeterministic (label=1): {len(df_test_queries[df_test_queries['label'] == 1])}")

# Track results
results = []
routing_errors = 0
crashes = 0
retrieval_latencies = []
llm_latencies = []

print("\n" + "-"*60)
print("INDIVIDUAL TEST RESULTS")
print("-"*60)

for i, row in df_test_queries.iterrows():
    query = row['instruction']
    true_label = row['label']
    expected_route = "RETRIEVAL" if true_label == 0 else "LLM_GENERATION"
    category = row['category']

    print(f"\nTest {i+1}/{len(df_test_queries)}:")
    print(f"  Query: {query}")
    print(f"  True Label: {true_label} ({expected_route})")
    print(f"  Category: {category}")

    try:
        # Run query through pipeline
        result = chatbot.respond(query)

        # Check routing correctness
        actual_route = result['route']
        routing_correct = (actual_route == expected_route)

        if not routing_correct:
            routing_errors += 1
            print(f"  ⚠️  ROUTING ERROR: Expected {expected_route}, got {actual_route}")
        else:
            print(f"  ✓ Correct routing: {actual_route}")

        # Track latency by route
        if actual_route == "RETRIEVAL":
            retrieval_latencies.append(result['latency_ms'])
        else:
            llm_latencies.append(result['latency_ms'])

        # Display results
        print(f"  Latency: {result['latency_ms']:.0f} ms")
        if result['confidence'] is not None:
            print(f"  Confidence: {result['confidence']:.3f}")
        print(f"  Response preview: {result['response'][:100]}...")

        # Store result
        results.append({
            'query': query,
            'category': category,
            'true_label': true_label,
            'expected_route': expected_route,
            'actual_route': actual_route,
            'routing_correct': routing_correct,
            'latency_ms': result['latency_ms'],
            'confidence': result['confidence'],
            'response': result['response'],
            'crashed': False
        })

    except Exception as e:
        crashes += 1
        print(f"  ❌ CRASH: {str(e)}")
        import traceback
        traceback.print_exc()

        results.append({
            'query': query,
            'category': category,
            'true_label': true_label,
            'expected_route': expected_route,
            'actual_route': None,
            'routing_correct': False,
            'latency_ms': None,
            'confidence': None,
            'response': None,
            'crashed': True,
            'error': str(e)
        })

# Summary statistics
print("\n" + "="*60)
print("INTEGRATION TEST SUMMARY")
print("="*60)

total_tests = len(df_test_queries)
successful_tests = total_tests - crashes
routing_accuracy = ((total_tests - routing_errors - crashes) / total_tests) * 100

print(f"\nTest Coverage:")
print(f"  Total queries tested: {total_tests}")
print(f"  Successful completions: {successful_tests} ({successful_tests/total_tests*100:.1f}%)")
print(f"  Crashes: {crashes} ({crashes/total_tests*100:.1f}%)")

print(f"\nRouting Accuracy:")
print(f"  Correct routing: {total_tests - routing_errors - crashes}/{total_tests} ({routing_accuracy:.1f}%)")
print(f"  Routing errors: {routing_errors}")

print(f"\nLatency Analysis:")
if retrieval_latencies:
    print(f"  Retrieval path:")
    print(f"    Count: {len(retrieval_latencies)}")
    print(f"    Average: {sum(retrieval_latencies)/len(retrieval_latencies):.0f} ms")
    print(f"    Min: {min(retrieval_latencies):.0f} ms")
    print(f"    Max: {max(retrieval_latencies):.0f} ms")

if llm_latencies:
    print(f"  LLM generation path:")
    print(f"    Count: {len(llm_latencies)}")
    print(f"    Average: {sum(llm_latencies)/len(llm_latencies):.0f} ms")
    print(f"    Min: {min(llm_latencies):.0f} ms")
    print(f"    Max: {max(llm_latencies):.0f} ms")

if retrieval_latencies and llm_latencies:
    avg_overall = (sum(retrieval_latencies) + sum(llm_latencies)) / (len(retrieval_latencies) + len(llm_latencies))
    print(f"  Overall average: {avg_overall:.0f} ms")

# Category coverage
print(f"\nCategory Coverage:")
categories_tested = set([r['category'] for r in results])
print(f"  Unique categories tested: {len(categories_tested)}")
print(f"  Categories: {', '.join(sorted(categories_tested))}")

# Routing breakdown
retrieval_count = sum(1 for r in results if r['actual_route'] == 'RETRIEVAL' and not r['crashed'])
llm_count = sum(1 for r in results if r['actual_route'] == 'LLM_GENERATION' and not r['crashed'])
print(f"\nRouting Distribution:")
print(f"  Routed to Retrieval: {retrieval_count} ({retrieval_count/successful_tests*100:.1f}%)")
print(f"  Routed to LLM: {llm_count} ({llm_count/successful_tests*100:.1f}%)")

# Pass/Fail determination
print("\n" + "="*60)
if crashes == 0 and routing_accuracy >= 90:
    print("✅ INTEGRATION TEST PASSED")
    print(f"   - No crashes detected")
    print(f"   - Routing accuracy: {routing_accuracy:.1f}%")
    print(f"   - All query types processed successfully")
elif crashes == 0:
    print("⚠️  INTEGRATION TEST PARTIAL PASS")
    print(f"   - No crashes detected")
    print(f"   - Routing accuracy below 90%: {routing_accuracy:.1f}%")
else:
    print("❌ INTEGRATION TEST FAILED")
    print(f"   - {crashes} crashes detected")
    print(f"   - Routing accuracy: {routing_accuracy:.1f}%")
print("="*60)

# Save results
results_df = pd.DataFrame(results)
output_path = '/content/drive/MyDrive/NLP_Project/results/'
import os
os.makedirs(output_path, exist_ok=True)

results_df.to_csv(f'{output_path}/integration_test_results.csv', index=False)
print(f"\n✓ Results saved to {output_path}/integration_test_results.csv")

# Display sample results
print("\n" + "="*60)
print("SAMPLE RESPONSES")
print("="*60)

# Show 2 retrieval examples
print("\n--- Retrieval Path Examples ---")
retrieval_results = [r for r in results if r['actual_route'] == 'RETRIEVAL' and not r['crashed']]
for r in retrieval_results[:2]:
    print(f"\nQuery: {r['query']}")
    print(f"Category: {r['category']} | Route: {r['actual_route']} | Latency: {r['latency_ms']:.0f} ms")
    if r['confidence']:
        print(f"Confidence: {r['confidence']:.3f}")
    print(f"Response: {r['response'][:200]}...")

# Show 2 LLM examples
print("\n--- LLM Generation Path Examples ---")
llm_results = [r for r in results if r['actual_route'] == 'LLM_GENERATION' and not r['crashed']]
for r in llm_results[:2]:
    print(f"\nQuery: {r['query']}")
    print(f"Category: {r['category']} | Route: {r['actual_route']} | Latency: {r['latency_ms']:.0f} ms")
    print(f"Response: {r['response'][:200]}...")

print("\n" + "="*60)
print("✓ INTEGRATION TESTING COMPLETE")
print("="*60)


INTEGRATION TESTING

Loading test dataset...
Loaded 3978 test examples

Sampling diverse queries for integration testing...
Selected 8 test queries spanning 8 categories
  Deterministic (label=0): 5
  Indeterministic (label=1): 3

------------------------------------------------------------
INDIVIDUAL TEST RESULTS
------------------------------------------------------------

Test 1/8:
  Query: what do i need to do to recover my user account pin code
  True Label: 1 (LLM_GENERATION)
  Category: ACCOUNT
  ✓ Correct routing: LLM_GENERATION
  Latency: 9615 ms
  Response preview: I'm happy to help! I understand that you need assistance in recovering your user account pin code. D...

Test 2/8:
  Query: where could I see the early termination penalties?
  True Label: 0 (RETRIEVAL)
  Category: CANCEL
  ✓ Correct routing: RETRIEVAL
  Latency: 11 ms
  Confidence: 0.944
  Response preview: Assuredly! If you want to find information about the early termination penalty, the best place to lo...

Te

# Phase 5: Metrics Evaluation

## 5.1: Retrieval System

In [8]:
import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

print("\n" + "="*60)
print("RETRIEVAL EVALUATION METRICS")
print("="*60)

# 1. Load test dataset
print("\nLoading test dataset...")
df_test = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/test_dataset.csv')

# 2. Filter for deterministic test queries only
df_det_test = df_test[df_test['label'] == 0].reset_index(drop=True)
print(f"Loaded {len(df_det_test)} deterministic test queries")

# 3. Load sentence transformer model (same as used for training index)
print("\nLoading sentence transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# 4. Encode test queries
print("\nEncoding test queries...")
test_embeddings = model.encode(
    df_det_test['instruction'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True
)

# 5. Load FAISS index and training data
import faiss
retrieval_path = '/content/drive/MyDrive/NLP_Project/models/retrieval/'

print("\nLoading FAISS index...")
index = faiss.read_index(f'{retrieval_path}/faiss_index.bin')

print("Loading training retrieval data...")
df_det_train = pd.read_csv(f'{retrieval_path}/deterministic_qa_pairs.csv')

print(f"   ✓ Index loaded with {index.ntotal:,} vectors")
print(f"   ✓ Training data: {len(df_det_train):,} examples")

# 6. Evaluate at multiple k values
print("\n" + "="*60)
print("RETRIEVAL ACCURACY AT MULTIPLE K VALUES")
print("="*60)

k_values = [1, 3, 5, 10]
results = {}

for k in k_values:
    print(f"\nEvaluating Top-{k} retrieval...")

    # Search for top-k matches
    distances, indices = index.search(test_embeddings.astype('float32'), k)

    # Calculate metrics
    category_matches = 0
    exact_matches = 0

    for i, test_row in df_det_test.iterrows():
        test_category = test_row['category']
        test_query = test_row['instruction'].lower().strip()

        # Get top-k retrieved examples
        retrieved_indices = indices[i][:k]
        retrieved_categories = df_det_train.iloc[retrieved_indices]['category'].values
        retrieved_queries = df_det_train.iloc[retrieved_indices]['instruction'].values

        # Check category match (if correct category in top-k)
        if test_category in retrieved_categories:
            category_matches += 1

        # Check exact match (if exact query text in top-k)
        for retrieved_query in retrieved_queries:
            if test_query == retrieved_query.lower().strip():
                exact_matches += 1
                break

    # Calculate percentages
    category_accuracy = (category_matches / len(df_det_test)) * 100
    exact_match_rate = (exact_matches / len(df_det_test)) * 100

    results[k] = {
        'category_accuracy': category_accuracy,
        'exact_match': exact_match_rate
    }

    print(f"   Top-{k} Results:")
    print(f"      Category Accuracy:  {category_accuracy:.2f}%")
    print(f"      Exact Match:        {exact_match_rate:.2f}%")

# 7. Print summary table
print("\n" + "="*60)
print("RETRIEVAL ACCURACY SUMMARY")
print("="*60)
print(f"{'K':<5} {'Category Accuracy':<20} {'Exact Match':<15}")
print("-" * 60)
for k in k_values:
    print(f"{k:<5}  {results[k]['category_accuracy']:>17.2f}%  {results[k]['exact_match']:>12.2f}%")

# 8. COMPUTE DISTANCE STATISTICS (Top-1)
print("\n" + "="*60)
print("DISTANCE STATISTICS (TOP-1 MATCHES)")
print("="*60)

# Get Top-1 distances
distances_top1, indices_top1 = index.search(test_embeddings.astype('float32'), 1)
top1_distances = distances_top1[:, 0]

# Compute statistics
distance_stats = {
    'mean': float(np.mean(top1_distances)),
    'median': float(np.median(top1_distances)),
    'min': float(np.min(top1_distances)),
    'max': float(np.max(top1_distances)),
    'std': float(np.std(top1_distances))
}

print(f"\nDistance Distribution:")
print(f"   Mean:     {distance_stats['mean']:.4f}")
print(f"   Median:   {distance_stats['median']:.4f}")
print(f"   Min:      {distance_stats['min']:.4f}")
print(f"   Max:      {distance_stats['max']:.4f}")
print(f"   Std Dev:  {distance_stats['std']:.4f}")

# Distance distribution buckets
print(f"\nDistance Distribution Breakdown:")
low = np.sum(top1_distances < 0.1)
medium = np.sum((top1_distances >= 0.1) & (top1_distances < 0.5))
high = np.sum(top1_distances >= 0.5)

print(f"   Distances < 0.1:     {low:>4} ({low/len(top1_distances)*100:>5.1f}%) - Excellent match")
print(f"   Distances 0.1-0.5:   {medium:>4} ({medium/len(top1_distances)*100:>5.1f}%) - Good match")
print(f"   Distances > 0.5:     {high:>4} ({high/len(top1_distances)*100:>5.1f}%) - Weak match")

print("\n" + "="*60)
print("✓ RETRIEVAL EVALUATION COMPLETE")
print("="*60)

# Save results
results_summary = {
    'k_values': k_values,
    'accuracies': results,
    'distance_stats': distance_stats
}

import json
output_path = '/content/drive/MyDrive/NLP_Project/results/'
os.makedirs(output_path, exist_ok=True)

with open(f'{output_path}/retrieval_evaluation.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print(f"\n✓ Results saved to {output_path}/retrieval_evaluation.json")


RETRIEVAL EVALUATION METRICS

Loading test dataset...
Loaded 1584 deterministic test queries

Loading sentence transformer model...

Encoding test queries...


Batches:   0%|          | 0/50 [00:00<?, ?it/s]


Loading FAISS index...
Loading training retrieval data...
   ✓ Index loaded with 6,333 vectors
   ✓ Training data: 6,333 examples

RETRIEVAL ACCURACY AT MULTIPLE K VALUES

Evaluating Top-1 retrieval...
   Top-1 Results:
      Category Accuracy:  99.94%
      Exact Match:        4.73%

Evaluating Top-3 retrieval...
   Top-3 Results:
      Category Accuracy:  99.94%
      Exact Match:        4.73%

Evaluating Top-5 retrieval...
   Top-5 Results:
      Category Accuracy:  99.94%
      Exact Match:        4.73%

Evaluating Top-10 retrieval...
   Top-10 Results:
      Category Accuracy:  99.94%
      Exact Match:        4.73%

RETRIEVAL ACCURACY SUMMARY
K     Category Accuracy    Exact Match    
------------------------------------------------------------
1                  99.94%          4.73%
3                  99.94%          4.73%
5                  99.94%          4.73%
10                 99.94%          4.73%

DISTANCE STATISTICS (TOP-1 MATCHES)

Distance Distribution:
   Mean:     

In [10]:
# ============================================================
# CONFIDENCE INTERVALS FOR RETRIEVAL METRICS
# ============================================================
from statsmodels.stats.proportion import proportion_confint

print("\n" + "="*60)
print("CONFIDENCE INTERVALS FOR RETRIEVAL METRICS")
print("="*60)

def compute_ci(n_correct, n_total):
    """Compute 95% Wilson score confidence interval."""
    accuracy = n_correct / n_total
    ci_low, ci_high = proportion_confint(
        n_correct,
        n_total,
        alpha=0.05,
        method='wilson'
    )
    return accuracy, ci_low, ci_high

n_test = len(df_det_test)

# 1. Category Accuracy CI (from your k=1 results)
print("\n1. CATEGORY ACCURACY:")
n_category_correct = category_matches  # From your evaluation loop
acc, ci_low, ci_high = compute_ci(n_category_correct, n_test)
print(f"   n_correct: {n_category_correct}/{n_test}")
print(f"   {acc*100:.2f}% [{ci_low*100:.2f}%, {ci_high*100:.2f}%]")

# 2. Exact Match Rate CI
print("\n2. EXACT MATCH RATE:")
n_exact_matches = exact_matches  # From your evaluation loop
exact_rate, ci_low, ci_high = compute_ci(n_exact_matches, n_test)
print(f"   n_exact: {n_exact_matches}/{n_test}")
print(f"   {exact_rate*100:.2f}% [{ci_low*100:.2f}%, {ci_high*100:.2f}%]")

# 3. Distance Distribution CIs
print("\n3. DISTANCE DISTRIBUTION:")

# Excellent (< 0.1)
n_excellent = low  # From your distance distribution code
acc, ci_low, ci_high = compute_ci(n_excellent, n_test)
print(f"   Excellent (<0.1):  {n_excellent}/{n_test}")
print(f"                      {acc*100:.1f}% [{ci_low*100:.1f}%, {ci_high*100:.1f}%]")

# Good (0.1-0.5)
n_good = medium  # From your distance distribution code
acc, ci_low, ci_high = compute_ci(n_good, n_test)
print(f"   Good (0.1-0.5):    {n_good}/{n_test}")
print(f"                      {acc*100:.1f}% [{ci_low*100:.1f}%, {ci_high*100:.1f}%]")

# Weak (> 0.5)
n_weak = high  # From your distance distribution code
acc, ci_low, ci_high = compute_ci(n_weak, n_test)
print(f"   Weak (>0.5):       {n_weak}/{n_test}")
print(f"                      {acc*100:.1f}% [{ci_low*100:.1f}%, {ci_high*100:.1f}%]")

print("\n" + "="*60)
print("✓ CONFIDENCE INTERVALS COMPUTED FROM ACTUAL DATA")
print("="*60)


CONFIDENCE INTERVALS FOR RETRIEVAL METRICS

1. CATEGORY ACCURACY:
   n_correct: 1583/1584
   99.94% [99.64%, 99.99%]

2. EXACT MATCH RATE:
   n_exact: 75/1584
   4.73% [3.79%, 5.89%]

3. DISTANCE DISTRIBUTION:
   Excellent (<0.1):  1084/1584
                      68.4% [66.1%, 70.7%]
   Good (0.1-0.5):    456/1584
                      28.8% [26.6%, 31.1%]
   Weak (>0.5):       44/1584
                      2.8% [2.1%, 3.7%]

✓ CONFIDENCE INTERVALS COMPUTED FROM ACTUAL DATA


## 5.2: Binary Classifier

In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import pickle

print("\n" + "="*60)
print("PER-CATEGORY CLASSIFIER ANALYSIS")
print("="*60)

# 1. Load test dataset
print("\nLoading test dataset...")
df_test = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/test_dataset.csv')
print(f"Loaded {len(df_test)} test examples")

# 2. Load trained classifier and vectorizer
classifier_path = '/content/drive/MyDrive/NLP_Project/models/classifier'

print("\nLoading trained classifier...")
with open(f'{classifier_path}/logistic_regression.pkl', 'rb') as f:
    classifier = pickle.load(f)

with open(f'{classifier_path}/tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

print("   ✓ Classifier and vectorizer loaded")

# 3. Make predictions with confidence scores
print("\nMaking predictions on test set...")
X_test = vectorizer.transform(df_test['instruction'])
y_test = df_test['label']

# Predictions
y_pred = classifier.predict(X_test)

# Confidence scores (probabilities)
y_proba = classifier.predict_proba(X_test)

# Get confidence for predicted class
confidence_scores = np.max(y_proba, axis=1)

# Add predictions and confidence to dataframe
df_test['predicted_label'] = y_pred
df_test['confidence'] = confidence_scores
df_test['correct'] = (y_pred == y_test).astype(int)

print("   ✓ Predictions complete")

# 4. PART 1: Per-Category Accuracy Breakdown
print("\n" + "="*60)
print("CLASSIFIER ACCURACY BY CATEGORY")
print("="*60)

categories = df_test['category'].unique()
category_results = []

for category in sorted(categories):
    # Filter for this category
    cat_df = df_test[df_test['category'] == category]

    # Calculate metrics
    total = len(cat_df)
    correct = cat_df['correct'].sum()
    accuracy = (correct / total) * 100

    category_results.append({
        'category': category,
        'total': total,
        'correct': correct,
        'accuracy': accuracy
    })

    print(f"\n{category}:")
    print(f"   Total examples: {total}")
    print(f"   Correct: {correct}")
    print(f"   Accuracy: {accuracy:.2f}%")

# Create summary dataframe
df_category_results = pd.DataFrame(category_results)
df_category_results = df_category_results.sort_values('accuracy', ascending=False)

print("\n" + "-"*60)
print("SUMMARY TABLE")
print("-"*60)
print(f"{'Category':<20} {'Total':<10} {'Correct':<10} {'Accuracy':<10}")
print("-"*60)
for _, row in df_category_results.iterrows():
    print(f"{row['category']:<20} {row['total']:<10} {row['correct']:<10} {row['accuracy']:>8.2f}%")

# 5. PART 2: Confusion Matrix
print("\n" + "="*60)
print("CONFUSION MATRIX")
print("="*60)

cm = confusion_matrix(y_test, y_pred)
print(f"\n{'':20} {'Predicted Det':>15} {'Predicted Indet':>15}")
print(f"{'True Det':20} {cm[0][0]:>15} {cm[0][1]:>15}")
print(f"{'True Indet':20} {cm[1][0]:>15} {cm[1][1]:>15}")

# 6. PART 3: Confidence Distribution Analysis
print("\n" + "="*60)
print("PREDICTION CONFIDENCE DISTRIBUTION")
print("="*60)

# Overall confidence statistics
print("\nOverall Confidence Statistics:")
print(f"   Mean:     {df_test['confidence'].mean():.4f}")
print(f"   Median:   {df_test['confidence'].median():.4f}")
print(f"   Min:      {df_test['confidence'].min():.4f}")
print(f"   Max:      {df_test['confidence'].max():.4f}")
print(f"   Std Dev:  {df_test['confidence'].std():.4f}")

# Confidence by correctness
print("\nConfidence by Prediction Correctness:")
correct_conf = df_test[df_test['correct'] == 1]['confidence']
incorrect_conf = df_test[df_test['correct'] == 0]['confidence']

print(f"   Correct predictions - Mean confidence:   {correct_conf.mean():.4f}")
if len(incorrect_conf) > 0:
    print(f"   Incorrect predictions - Mean confidence: {incorrect_conf.mean():.4f}")
else:
    print(f"   Incorrect predictions - Mean confidence: N/A (no errors)")

# Confidence distribution buckets
print("\nConfidence Distribution:")
very_high = np.sum(df_test['confidence'] >= 0.9)
high = np.sum((df_test['confidence'] >= 0.7) & (df_test['confidence'] < 0.9))
medium = np.sum((df_test['confidence'] >= 0.5) & (df_test['confidence'] < 0.7))
low = np.sum(df_test['confidence'] < 0.5)

total = len(df_test)
print(f"   Very High (≥0.9): {very_high:>5} ({very_high/total*100:>5.1f}%)")
print(f"   High (0.7-0.9):   {high:>5} ({high/total*100:>5.1f}%)")
print(f"   Medium (0.5-0.7): {medium:>5} ({medium/total*100:>5.1f}%)")
print(f"   Low (<0.5):       {low:>5} ({low/total*100:>5.1f}%)")

# 7. PART 4: High-Confidence vs Low-Confidence Accuracy
print("\n" + "="*60)
print("HIGH-CONFIDENCE VS LOW-CONFIDENCE ACCURACY")
print("="*60)

# Define thresholds
high_confidence_threshold = 0.9
low_confidence_threshold = 0.7

# Split data
high_conf_df = df_test[df_test['confidence'] > high_confidence_threshold]
medium_conf_df = df_test[(df_test['confidence'] >= low_confidence_threshold) &
                          (df_test['confidence'] <= high_confidence_threshold)]
low_conf_df = df_test[df_test['confidence'] < low_confidence_threshold]

# Calculate accuracies
high_conf_acc = high_conf_df['correct'].mean() * 100 if len(high_conf_df) > 0 else 0
medium_conf_acc = medium_conf_df['correct'].mean() * 100 if len(medium_conf_df) > 0 else 0
low_conf_acc = low_conf_df['correct'].mean() * 100 if len(low_conf_df) > 0 else 0

print(f"\nHigh Confidence (>{high_confidence_threshold}):")
print(f"   Examples: {len(high_conf_df)}")
print(f"   Accuracy: {high_conf_acc:.2f}%")
print(f"   Errors: {len(high_conf_df) - high_conf_df['correct'].sum()}")

print(f"\nMedium Confidence ({low_confidence_threshold}-{high_confidence_threshold}):")
print(f"   Examples: {len(medium_conf_df)}")
print(f"   Accuracy: {medium_conf_acc:.2f}%")
print(f"   Errors: {len(medium_conf_df) - medium_conf_df['correct'].sum()}")

print(f"\nLow Confidence (<{low_confidence_threshold}):")
print(f"   Examples: {len(low_conf_df)}")
print(f"   Accuracy: {low_conf_acc:.2f}%")
print(f"   Errors: {len(low_conf_df) - low_conf_df['correct'].sum()}")

print("\n" + "="*60)
print("✓ CLASSIFIER ANALYSIS COMPLETE")
print("="*60)


PER-CATEGORY CLASSIFIER ANALYSIS

Loading test dataset...
Loaded 3978 test examples

Loading trained classifier...
   ✓ Classifier and vectorizer loaded

Making predictions on test set...
   ✓ Predictions complete

CLASSIFIER ACCURACY BY CATEGORY

ACCOUNT:
   Total examples: 1197
   Correct: 1197
   Accuracy: 100.00%

CANCEL:
   Total examples: 190
   Correct: 190
   Accuracy: 100.00%

CONTACT:
   Total examples: 400
   Correct: 400
   Accuracy: 100.00%

FEEDBACK:
   Total examples: 399
   Correct: 399
   Accuracy: 100.00%

INVOICE:
   Total examples: 400
   Correct: 400
   Accuracy: 100.00%

ORDER:
   Total examples: 798
   Correct: 797
   Accuracy: 99.87%

SHIPPING:
   Total examples: 394
   Correct: 393
   Accuracy: 99.75%

SUBSCRIPTION:
   Total examples: 200
   Correct: 200
   Accuracy: 100.00%

------------------------------------------------------------
SUMMARY TABLE
------------------------------------------------------------
Category             Total      Correct    Accuracy

In [4]:
# ============================================================
# CONFIDENCE INTERVALS
# ============================================================
from statsmodels.stats.proportion import proportion_confint

print("\n" + "="*60)
print("CONFIDENCE INTERVALS (95%)")
print("="*60)

def compute_ci(n_correct, n_total):
    """Compute 95% Wilson score confidence interval."""
    accuracy = n_correct / n_total
    ci_low, ci_high = proportion_confint(
        n_correct,
        n_total,
        alpha=0.05,
        method='wilson'
    )
    return accuracy, ci_low, ci_high

print(f"\n{'Category':<20} {'Accuracy':>10} {'95% CI':>28}")
print("-"*60)

for _, row in df_category_results.iterrows():
    category = row['category']
    n_correct = int(row['correct'])
    n_total = int(row['total'])

    acc, ci_low, ci_high = compute_ci(n_correct, n_total)

    print(f"{category:<20} {acc*100:>9.2f}% [{ci_low*100:>6.2f}%, {ci_high*100:>6.2f}%]")

# Overall
overall_correct = df_test['correct'].sum()
overall_total = len(df_test)
acc, ci_low, ci_high = compute_ci(overall_correct, overall_total)

print("-"*60)
print(f"{'OVERALL':<20} {acc*100:>9.2f}% [{ci_low*100:>6.2f}%, {ci_high*100:>6.2f}%]")


CONFIDENCE INTERVALS (95%)

Category               Accuracy                       95% CI
------------------------------------------------------------
ACCOUNT                 100.00% [ 99.68%, 100.00%]
CANCEL                  100.00% [ 98.02%, 100.00%]
CONTACT                 100.00% [ 99.05%, 100.00%]
FEEDBACK                100.00% [ 99.05%, 100.00%]
INVOICE                 100.00% [ 99.05%, 100.00%]
SUBSCRIPTION            100.00% [ 98.12%, 100.00%]
ORDER                    99.87% [ 99.29%,  99.98%]
SHIPPING                 99.75% [ 98.58%,  99.96%]
------------------------------------------------------------
OVERALL                  99.95% [ 99.82%,  99.99%]


## 5.3: LLM

In [2]:
import pandas as pd
import numpy as np
import torch
import json
import os
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
import time

print("="*60)
print("EVALUATING FINE-TUNED PHI-2 ON TEST SET")
print("="*60)

# Configuration
checkpoint_path = "/content/drive/MyDrive/NLP_Project/checkpoints/phi2_lora/final_model"
output_path = '/content/drive/MyDrive/NLP_Project/results/'
progress_file = f'{output_path}/evaluation_progress1.json'
results_file = f'{output_path}/llm_generation_samples1.csv'
save_frequency = 20  # Save every 20 examples

os.makedirs(output_path, exist_ok=True)

# Load test dataset
print("\n1. Loading test dataset...")
df_test = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/test_dataset.csv')

# Filter for indeterministic queries only (label=1)
df_indet_test = df_test[df_test['label'] == 1].reset_index(drop=True)

# All queries for evaluation
df_eval = df_indet_test.reset_index(drop=True)
print(f"   ✓ Loaded {len(df_eval)} indeterministic test queries for evaluation")

# Load fine-tuned model
print("\n2. Loading fine-tuned Phi-2 model...")
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    device_map={"": 0},  # Force all to GPU 0
    trust_remote_code=True,
    torch_dtype=torch.float16
)

model = PeftModel.from_pretrained(base_model, checkpoint_path)
model = model.to("cuda")  # Explicitly move to GPU
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

print("   ✓ Model loaded successfully")
print(f"   GPU memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

# Check for existing progress
print("\n3. Checking for existing progress...")
start_idx = 0
results = []

if os.path.exists(progress_file):
    with open(progress_file, 'r') as f:
        progress = json.load(f)
    start_idx = progress['last_completed_idx'] + 1

    # Load existing results
    if os.path.exists(results_file):
        results_df = pd.read_csv(results_file)
        results = results_df.to_dict('records')

    print(f"   ✓ Found existing progress: resuming from index {start_idx}")
    print(f"   Already processed: {len(results)} examples")
else:
    print("   No existing progress found - starting from beginning")

# Generate responses
print(f"\n4. Generating responses for {len(df_eval) - start_idx} remaining queries...")
print(f"   Saving every {save_frequency} examples")
print("="*60)

for i in range(start_idx, len(df_eval)):
    query = df_eval.iloc[i]['instruction']
    reference = df_eval.iloc[i]['response']
    category = df_eval.iloc[i]['category']

    print(f"\n[{i+1}/{len(df_eval)}] Processing: {query[:60]}...")

    try:
        # Generate response
        prompt = f"Customer: {query}\nAssistant:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        start_time = time.time()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id
            )

        generation_time = time.time() - start_time

        generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_response = generated_response.split("Assistant:")[-1].strip()

        # Store result
        results.append({
            'query': query,
            'category': category,
            'reference_response': reference,
            'generated_response': generated_response,
            'generation_time_sec': generation_time
        })

        print(f"   ✓ Generated ({generation_time:.2f}s)")

    except Exception as e:
        print(f"   ✗ Error: {str(e)}")
        results.append({
            'query': query,
            'category': category,
            'reference_response': reference,
            'generated_response': f"ERROR: {str(e)}",
            'generation_time_sec': None
        })

    # Save progress periodically
    if (i + 1) % save_frequency == 0 or i == len(df_eval) - 1:
        print(f"\n   💾 Saving progress at index {i}...")

        # Save results
        results_df = pd.DataFrame(results)
        results_df.to_csv(results_file, index=False)

        # Save progress tracker
        with open(progress_file, 'w') as f:
            json.dump({
                'last_completed_idx': i,
                'total_examples': len(df_eval),
                'timestamp': pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
            }, f, indent=2)

        print(f"   ✓ Saved {len(results)} results")

print("\n" + "="*60)
print("✓ GENERATION COMPLETE")
print("="*60)

EVALUATING FINE-TUNED PHI-2 ON TEST SET

1. Loading test dataset...
   ✓ Loaded 2394 indeterministic test queries for evaluation

2. Loading fine-tuned Phi-2 model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

   ✓ Model loaded successfully
   GPU memory: 5.61 GB

3. Checking for existing progress...
   ✓ Found existing progress: resuming from index 2394
   Already processed: 2394 examples

4. Generating responses for 0 remaining queries...
   Saving every 20 examples

✓ GENERATION COMPLETE


In [3]:
import pandas as pd
import numpy as np
import json
import os
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score

print("="*60)
print("COMPUTING METRICS FROM SAVED RESULTS")
print("="*60)

# Load saved results
output_path = '/content/drive/MyDrive/NLP_Project/results/'
results_file = f'{output_path}/llm_generation_samples1.csv'

print("\nLoading saved generation results...")
results_df = pd.read_csv(results_file)
results = results_df.to_dict('records')

print(f"   ✓ Loaded {len(results)} generated responses")

# Filter out errors
valid_results = [r for r in results if not str(r['generated_response']).startswith('ERROR')]
print(f"   Valid generations: {len(valid_results)}/{len(results)}")

if len(valid_results) == 0:
    print("❌ No valid generations to evaluate!")
else:
    # ROUGE scores
    print("\n1. Computing ROUGE scores...")
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    for r in valid_results:
        scores = rouge_scorer_obj.score(r['reference_response'], r['generated_response'])
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    print(f"   ✓ ROUGE computed for {len(valid_results)} examples")

    # BLEU scores
    print("\n2. Computing BLEU scores...")
    bleu1_scores = []
    bleu2_scores = []
    bleu3_scores = []
    bleu4_scores = []
    smoothie = SmoothingFunction().method4

    for r in valid_results:
        reference_tokens = [r['reference_response'].split()]
        generated_tokens = r['generated_response'].split()

        bleu1_scores.append(sentence_bleu(reference_tokens, generated_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothie))
        bleu2_scores.append(sentence_bleu(reference_tokens, generated_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie))
        bleu3_scores.append(sentence_bleu(reference_tokens, generated_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie))
        bleu4_scores.append(sentence_bleu(reference_tokens, generated_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie))

    print(f"   ✓ BLEU computed for {len(valid_results)} examples")

    # BERTScore
    print("\n3. Computing BERTScore (this may take a few minutes)...")
    references = [r['reference_response'] for r in valid_results]
    candidates = [r['generated_response'] for r in valid_results]

    P, R, F1 = bert_score(candidates, references, lang='en', verbose=False)

    print(f"   ✓ BERTScore computed for {len(valid_results)} examples")

    # Response length analysis
    gen_lengths = [len(r['generated_response'].split()) for r in valid_results]
    ref_lengths = [len(r['reference_response'].split()) for r in valid_results]

    # Print results
    print("\n" + "="*60)
    print("EVALUATION METRICS")
    print("="*60)

    print("\nROUGE Scores (F1):")
    print(f"  ROUGE-1: {np.mean(rouge1_scores):.4f}")
    print(f"  ROUGE-2: {np.mean(rouge2_scores):.4f}")
    print(f"  ROUGE-L: {np.mean(rougeL_scores):.4f}")

    print("\nBLEU Scores:")
    print(f"  BLEU-1: {np.mean(bleu1_scores):.4f}")
    print(f"  BLEU-2: {np.mean(bleu2_scores):.4f}")
    print(f"  BLEU-3: {np.mean(bleu3_scores):.4f}")
    print(f"  BLEU-4: {np.mean(bleu4_scores):.4f}")

    print("\nBERTScore:")
    print(f"  Precision: {P.mean():.4f}")
    print(f"  Recall:    {R.mean():.4f}")
    print(f"  F1:        {F1.mean():.4f} (primary metric)")

    print("\nResponse Length:")
    print(f"  Generated: Mean {np.mean(gen_lengths):.1f} words, Median {np.median(gen_lengths):.1f}")
    print(f"  Reference: Mean {np.mean(ref_lengths):.1f} words, Median {np.median(ref_lengths):.1f}")

    # Save metrics
    metrics = {
        'num_examples': len(valid_results),
        'rouge_scores': {
            'rouge1_f1': float(np.mean(rouge1_scores)),
            'rouge2_f1': float(np.mean(rouge2_scores)),
            'rougeL_f1': float(np.mean(rougeL_scores))
        },
        'bleu_scores': {
            'bleu1': float(np.mean(bleu1_scores)),
            'bleu2': float(np.mean(bleu2_scores)),
            'bleu3': float(np.mean(bleu3_scores)),
            'bleu4': float(np.mean(bleu4_scores))
        },
        'bertscore': {
            'precision': float(P.mean()),
            'recall': float(R.mean()),
            'f1': float(F1.mean())
        },
        'response_length': {
            'generated_mean': float(np.mean(gen_lengths)),
            'generated_median': float(np.median(gen_lengths)),
            'reference_mean': float(np.mean(ref_lengths)),
            'reference_median': float(np.median(ref_lengths))
        }
    }

    with open(f'{output_path}/llm_evaluation_metrics1.json', 'w') as f:
        json.dump(metrics, f, indent=2)

    print(f"\n✓ Metrics saved to {output_path}/llm_evaluation_metrics1.json")

# Clean up progress file
if os.path.exists(f'{output_path}/evaluation_progress1.json'):
    os.remove(f'{output_path}/evaluation_progress1.json')
    print(f"✓ Cleaned up progress tracker")

print("\n" + "="*60)
print("✓ EVALUATION COMPLETE")
print("="*60)

COMPUTING METRICS FROM SAVED RESULTS

Loading saved generation results...
   ✓ Loaded 2394 generated responses
   Valid generations: 2394/2394

1. Computing ROUGE scores...
   ✓ ROUGE computed for 2394 examples

2. Computing BLEU scores...
   ✓ BLEU computed for 2394 examples

3. Computing BERTScore (this may take a few minutes)...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   ✓ BERTScore computed for 2394 examples

EVALUATION METRICS

ROUGE Scores (F1):
  ROUGE-1: 0.5691
  ROUGE-2: 0.3090
  ROUGE-L: 0.3971

BLEU Scores:
  BLEU-1: 0.4219
  BLEU-2: 0.3041
  BLEU-3: 0.2402
  BLEU-4: 0.1906

BERTScore:
  Precision: 0.9036
  Recall:    0.9199
  F1:        0.9115 (primary metric)

Response Length:
  Generated: Mean 133.8 words, Median 137.0
  Reference: Mean 109.1 words, Median 94.0

✓ Metrics saved to /content/drive/MyDrive/NLP_Project/results//llm_evaluation_metrics1.json
✓ Cleaned up progress tracker

✓ EVALUATION COMPLETE


In [4]:
# ============================================================
# COMPUTE BOOTSTRAP CONFIDENCE INTERVALS
# ============================================================
from scipy import stats

print("\n" + "="*60)
print("COMPUTING 95% CONFIDENCE INTERVALS (Bootstrap)")
print("="*60)

def bootstrap_ci(scores, n_bootstrap=10000):
    """Compute bootstrap confidence interval for mean."""
    bootstrap_means = []
    n = len(scores)

    for _ in range(n_bootstrap):
        sample_indices = np.random.choice(n, size=n, replace=True)
        sample = [scores[i] for i in sample_indices]
        bootstrap_means.append(np.mean(sample))

    ci_low = np.percentile(bootstrap_means, 2.5)
    ci_high = np.percentile(bootstrap_means, 97.5)
    mean_val = np.mean(scores)

    return mean_val, ci_low, ci_high

print("\nROUGE Scores with 95% CI:")
r1_mean, r1_low, r1_high = bootstrap_ci(rouge1_scores)
print(f"  ROUGE-1: {r1_mean:.4f} [{r1_low:.4f}, {r1_high:.4f}]")

r2_mean, r2_low, r2_high = bootstrap_ci(rouge2_scores)
print(f"  ROUGE-2: {r2_mean:.4f} [{r2_low:.4f}, {r2_high:.4f}]")

rL_mean, rL_low, rL_high = bootstrap_ci(rougeL_scores)
print(f"  ROUGE-L: {rL_mean:.4f} [{rL_low:.4f}, {rL_high:.4f}]")

print("\nBLEU Scores with 95% CI:")
b1_mean, b1_low, b1_high = bootstrap_ci(bleu1_scores)
print(f"  BLEU-1: {b1_mean:.4f} [{b1_low:.4f}, {b1_high:.4f}]")

b4_mean, b4_low, b4_high = bootstrap_ci(bleu4_scores)
print(f"  BLEU-4: {b4_mean:.4f} [{b4_low:.4f}, {b4_high:.4f}]")

print("\nBERTScore with 95% CI:")
bert_f1_list = F1.numpy().tolist()
bf1_mean, bf1_low, bf1_high = bootstrap_ci(bert_f1_list)
print(f"  F1: {bf1_mean:.4f} [{bf1_low:.4f}, {bf1_high:.4f}]")

print("\n" + "="*60)
print("✓ CONFIDENCE INTERVALS COMPUTED")
print("="*60)


COMPUTING 95% CONFIDENCE INTERVALS (Bootstrap)

ROUGE Scores with 95% CI:
  ROUGE-1: 0.5691 [0.5649, 0.5733]
  ROUGE-2: 0.3090 [0.3045, 0.3135]
  ROUGE-L: 0.3971 [0.3924, 0.4019]

BLEU Scores with 95% CI:
  BLEU-1: 0.4219 [0.4174, 0.4264]
  BLEU-4: 0.1906 [0.1868, 0.1946]

BERTScore with 95% CI:
  F1: 0.9115 [0.9106, 0.9124]

✓ CONFIDENCE INTERVALS COMPUTED


## 5.4: Hybrid System

In [None]:
import pandas as pd

print("="*60)
print("COMPUTING HYBRID SYSTEM SCORE")
print("="*60)

# Load test dataset to get proportions
df_test = pd.read_csv('/content/drive/MyDrive/NLP_Project/data/test_dataset.csv')

# Calculate label distribution
total_test = len(df_test)
det_count = len(df_test[df_test['label'] == 0])
indet_count = len(df_test[df_test['label'] == 1])

det_proportion = det_count / total_test
indet_proportion = indet_count / total_test

print(f"\nTest Set Distribution:")
print(f"  Total examples: {total_test}")
print(f"  Deterministic (label=0): {det_count} ({det_proportion*100:.1f}%)")
print(f"  Indeterministic (label=1): {indet_count} ({indet_proportion*100:.1f}%)")

# Component scores
classification_accuracy = 0.9994  # Binary classifier accuracy
retrieval_accuracy = 0.9994       # Top-1 category accuracy on retrieval
bertscore_f1 = 0.9126             # BERTScore F1 on LLM generations

print(f"\nComponent Scores:")
print(f"  Classification Accuracy: {classification_accuracy:.4f} (99.94%)")
print(f"  Retrieval Accuracy (Top-1): {retrieval_accuracy:.4f} (99.94%)")
print(f"  LLM Quality (BERTScore F1): {bertscore_f1:.4f} (91.26%)")

# Method 1: Simple weighted average (routes are assumed correct)
hybrid_score_simple = (det_proportion * retrieval_accuracy) + (indet_proportion * bertscore_f1)

print(f"\n{'-'*60}")
print("METHOD 1: Simple Weighted Score")
print(f"{'-'*60}")
print(f"  Formula: (P_det × Retrieval) + (P_indet × BERTScore)")
print(f"  Calculation:")
print(f"    Deterministic contribution: {det_proportion:.3f} × {retrieval_accuracy:.4f} = {det_proportion * retrieval_accuracy:.4f}")
print(f"    Indeterministic contribution: {indet_proportion:.3f} × {bertscore_f1:.4f} = {indet_proportion * bertscore_f1:.4f}")
print(f"\n  Hybrid Score: {hybrid_score_simple:.4f} ({hybrid_score_simple*100:.2f}%)")

# Method 2: Factor in classification accuracy (more conservative)
hybrid_score_with_classifier = classification_accuracy * hybrid_score_simple

print(f"\n{'-'*60}")
print("METHOD 2: Including Classification Accuracy")
print(f"{'-'*60}")
print(f"  Formula: Classification × [(P_det × Retrieval) + (P_indet × BERTScore)]")
print(f"  Calculation:")
print(f"    Base score: {hybrid_score_simple:.4f}")
print(f"    With classification: {classification_accuracy:.4f} × {hybrid_score_simple:.4f} = {hybrid_score_with_classifier:.4f}")
print(f"\n  Hybrid Score: {hybrid_score_with_classifier:.4f} ({hybrid_score_with_classifier*100:.2f}%)")

# Summary
print(f"\n{'='*60}")
print("HYBRID SYSTEM SCORE SUMMARY")
print(f"{'='*60}")
print(f"\nRecommended Score (Method 1): {hybrid_score_simple:.4f}")
print(f"  Interpretation:")
print(f"    • Deterministic path ({det_proportion*100:.1f}% of queries): {retrieval_accuracy*100:.2f}% accuracy")
print(f"    • Indeterministic path ({indet_proportion*100:.1f}% of queries): {bertscore_f1*100:.2f}% semantic similarity")
print(f"    • Overall weighted performance: {hybrid_score_simple*100:.2f}%")

print(f"\nConservative Score (Method 2): {hybrid_score_with_classifier:.4f}")
print(f"  Factors in potential classification errors: {hybrid_score_with_classifier*100:.2f}%")

# Save scores
scores = {
    'test_set_distribution': {
        'total': total_test,
        'deterministic_count': det_count,
        'indeterministic_count': indet_count,
        'deterministic_proportion': float(det_proportion),
        'indeterministic_proportion': float(indet_proportion)
    },
    'component_scores': {
        'classification_accuracy': float(classification_accuracy),
        'retrieval_accuracy': float(retrieval_accuracy),
        'llm_bertscore_f1': float(bertscore_f1)
    },
    'hybrid_scores': {
        'simple_weighted': float(hybrid_score_simple),
        'with_classifier': float(hybrid_score_with_classifier)
    }
}

import json
output_path = '/content/drive/MyDrive/NLP_Project/results/'
with open(f'{output_path}/hybrid_system_score.json', 'w') as f:
    json.dump(scores, f, indent=2)

print(f"\n✓ Scores saved to {output_path}/hybrid_system_score.json")

COMPUTING HYBRID SYSTEM SCORE

Test Set Distribution:
  Total examples: 3978
  Deterministic (label=0): 1584 (39.8%)
  Indeterministic (label=1): 2394 (60.2%)

Component Scores:
  Classification Accuracy: 0.9994 (99.94%)
  Retrieval Accuracy (Top-1): 0.9994 (99.94%)
  LLM Quality (BERTScore F1): 0.9126 (91.26%)

------------------------------------------------------------
METHOD 1: Simple Weighted Score
------------------------------------------------------------
  Formula: (P_det × Retrieval) + (P_indet × BERTScore)
  Calculation:
    Deterministic contribution: 0.398 × 0.9994 = 0.3980
    Indeterministic contribution: 0.602 × 0.9126 = 0.5492

  Hybrid Score: 0.9472 (94.72%)

------------------------------------------------------------
METHOD 2: Including Classification Accuracy
------------------------------------------------------------
  Formula: Classification × [(P_det × Retrieval) + (P_indet × BERTScore)]
  Calculation:
    Base score: 0.9472
    With classification: 0.9994 × 0.