# 🚀 Phase 2: Multi-Model Embedding Generation & FAISS Search

**Objective**: Generate embeddings using multiple models, create FAISS indices, and evaluate embedding similarity performance for our AI-enhanced Saber category descriptions.

## 🎯 **What We'll Do:**

1. **Load AI-Enhanced Data** → Saber categories with rich semantic descriptions
2. **Multi-Model Embedding Generation** → Test OpenAI, Sentence Transformers, and Arabic-specific models
3. **FAISS Index Creation** → Optimize for fast similarity search
4. **Embedding Quality Evaluation** → Compare models on real user queries
5. **Performance Benchmarking** → Speed vs accuracy trade-offs

## 📊 **Expected Outcome:**
A production-ready embedding pipeline, with the best-performing model selected for Arabic-English incident classification.

In [2]:
# Import required libraries
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import yaml
from dotenv import load_dotenv
import json
import time
from datetime import datetime

# Load environment variables
# The path is relative to the current working directory, so the notebook is
# expected to run from its own folder with the .env file one level up.
load_dotenv('../.env')

# Set style for plots
# NOTE(review): the 'seaborn-v0_8' style name is presumably only available in
# newer matplotlib releases — confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Status report: only the presence of each API key is shown, never its value.
print("✅ Basic libraries imported successfully")
print(f"📂 Current working directory: {os.getcwd()}")
print(f"🔑 OpenAI API Key: {'✅ Found' if os.getenv('OPENAI_API_KEY') else '❌ Not Found'}")
print(f"🔑 Gemini API Key: {'✅ Found' if os.getenv('GEMINI_API_KEY') else '❌ Not Found'}")

# We'll import custom modules later as needed

✅ Basic libraries imported successfully
📂 Current working directory: c:\Users\ASUS\Classification\notebooks
🔑 OpenAI API Key: ✅ Found
🔑 Gemini API Key: ✅ Found


## 📊 1. Load AI-Enhanced Saber Categories Data

Load the data with rich semantic descriptions generated in Phase 1.

In [None]:
# Load AI-enhanced data and experiment results from Phase 1

def load_latest_experiment(experiment_type='user_optimized'):
    """Return the newest Phase 1 experiment as ``(DataFrame, source)``.

    Looks in ``../results/experiments/phase1_descriptions`` for CSV files
    named ``<experiment_type>_*.csv`` and reads the one with the most recent
    modification time. When the directory is missing or holds no matching
    file, the main results CSV is read instead.

    Args:
        experiment_type: Filename prefix identifying the experiment family.

    Returns:
        A tuple of the loaded DataFrame and the location it came from: a
        ``Path`` for an experiment file, a plain string for the fallback file.
    """
    phase1_dir = Path('../results/experiments/phase1_descriptions')

    # A missing directory is treated the same as "no matching files".
    candidates = []
    if phase1_dir.exists():
        candidates = list(phase1_dir.glob(f'{experiment_type}_*.csv'))

    if candidates:
        def _modified_at(path):
            return path.stat().st_mtime

        latest_file = max(candidates, key=_modified_at)
        print(f"📊 Found experiment files: {len(candidates)}")
        print(f"📁 Loading latest: {latest_file.name}")
        frame = pd.read_csv(latest_file, encoding='utf-8')
        return frame, latest_file

    # Nothing experiment-specific was found: use the main results file.
    data_file = '../results/saber_categories_with_user_style_descriptions.csv'
    print(f"📊 Loading main results file: {data_file}")
    frame = pd.read_csv(data_file, encoding='utf-8')
    return frame, data_file

# Load the data
# Binds the notebook-level globals `df` (the category table) and `data_source`
# (where it was read from). Later cells read `df` directly.
df, data_source = load_latest_experiment()

print(f"✅ Data loaded successfully!")
print(f"📋 Dataset shape: {df.shape}")
print(f"📁 Source: {data_source}")
print(f"📝 Columns: {list(df.columns)}")

# Check which description column to use
# Any column whose name contains "description" (case-insensitive) qualifies.
description_columns = [col for col in df.columns if 'description' in col.lower()]
print(f"📄 Available description columns: {description_columns}")

# Use the generated description column
# Preference order: 'generated_description', then 'user_style_description',
# then the first description-like column, and finally 'raw_text'.
# `description_col` is a notebook global that the embedding cells rely on.
if 'generated_description' in df.columns:
    description_col = 'generated_description'
elif 'user_style_description' in df.columns:
    description_col = 'user_style_description'
else:
    description_col = description_columns[0] if description_columns else 'raw_text'

print(f"🎯 Using description column: {description_col}")

# Display sample descriptions
print(f"\n📄 Sample AI-Generated Descriptions:")
print("="*70)

# Preview at most the first three rows.
# NOTE(review): this assumes the 'SubCategory' and 'Service' columns exist.
for i in range(min(3, len(df))):
    row = df.iloc[i]
    description = str(row[description_col])
    print(f"\n📋 Category {i+1}: {row['SubCategory']}")
    print(f"   Service: {row['Service']}")
    print(f"   Description Length: {len(description)} chars")
    print(f"   Description: {description[:200]}...")
    print("-" * 50)

# Length statistics are measured in characters of the stringified description.
# NOTE: min()/max() below raise ValueError if the dataset has no rows.
print(f"\n📊 Description Statistics:")
descriptions = df[description_col].astype(str)
desc_lengths = [len(desc) for desc in descriptions]
print(f"   Total categories: {len(df)}")
print(f"   Average description length: {np.mean(desc_lengths):.0f} characters")
print(f"   Min length: {min(desc_lengths)} characters")
print(f"   Max length: {max(desc_lengths)} characters")
print(f"   Median length: {np.median(desc_lengths):.0f} characters")

# Check for any failed descriptions
# Rows are flagged when their text contains the marker
# 'Error generating description' (presumably written by Phase 1 on failure —
# TODO confirm against the Phase 1 generator).
failed_descriptions = df[df[description_col].astype(str).str.contains('Error generating description', na=False)]
print(f"\n🔍 Quality Check:")
print(f"   Successful descriptions: {len(df) - len(failed_descriptions)}")
print(f"   Failed descriptions: {len(failed_descriptions)}")
if len(failed_descriptions) > 0:
    print(f"   Failed categories: {list(failed_descriptions['SubCategory'])}")

print(f"\n✅ Data ready for embedding generation!")
print(f"🎯 Using '{description_col}' for embedding generation")

📊 Loading AI-enhanced Saber categories data...
✅ Data loaded successfully!
📋 Dataset shape: (100, 12)
📝 Columns: ['Service', 'Category', 'SubCategory', 'SubCategory_Prefix ', 'SubCategory_Keywords', 'SubCategory2', 'SubCategory2_Prefix ', 'SubCategory2_Keywords', 'raw_text', 'structured_text', 'user_query_format', 'user_style_description']

📄 Sample AI-Generated Descriptions:

📋 Category 1: الشهادات الصادرة من الهيئة
   Service: SASO - Products Safety and Certification
   Description Length: 2032 chars
   Description: Here's a semantically rich description designed for high embedding similarity with user queries related to SASO Saber, specifically focusing on "الشهادات الصادرة من الهيئة" (Certificates Issued by the...
--------------------------------------------------

📋 Category 2: جهات المطابقة
   Service: SASO - Products Safety and Certification
   Description Length: 2207 chars
   Description: Okay, here's a semantically rich description designed for high embedding similarity with 

## 🤖 2. Systematic Embedding Model Comparison Framework

We'll test multiple embedding models and save results systematically for comparison:

### 📊 **Embedding Models to Test:**

1. **OpenAI Models** (if available):
   - `text-embedding-3-large` (High quality, expensive)
   - `text-embedding-3-small` (Good quality, cost-effective)
   - `text-embedding-ada-002` (Baseline)

2. **Multilingual Sentence Transformers**:
   - `AIDA-UPM/mstsb-paraphrase-multilingual-mpnet-base-v2` (Arabic-English optimized)
   - `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` (Fast multilingual)
   - `sentence-transformers/all-MiniLM-L6-v2` (Lightweight baseline)

3. **Arabic-Specific Models**:
   - `aubmindlab/bert-base-arabertv02` (Arabic BERT)
   - `CAMeL-Lab/bert-base-arabic-camelbert-mix` (Arabic specialized)

### 🎯 **Evaluation Metrics:**
- **Generation Speed** (embeddings/second)
- **Model Size** (memory usage)
- **Similarity Quality** (manual validation)
- **Arabic-English Handling** (code-switching performance)

In [None]:
# 🚀 Systematic Embedding Generation Framework

import time
from datetime import datetime
import json
import gc
import psutil
import logging

# Import custom modules for embedding generation
sys.path.append('../src')
from embedding_manager import EmbeddingManager
from faiss_handler import FAISSHandler

def save_embedding_experiment(embeddings, model_name, metadata, df):
    """Persist one embedding run under a timestamped set of files.

    Three files are written to ``../results/experiments/phase2_embeddings``
    (created on demand): the embeddings as ``.npy``, the run metadata as JSON,
    and a CSV mapping each row's category columns to its embedding index.

    Args:
        embeddings: Array to store; its ``shape`` is recorded in the metadata.
        model_name: Model identifier; ``/`` and ``-`` become ``_`` in filenames.
        metadata: Dict describing the run. It is updated IN PLACE with the
            ``timestamp``, ``model_name``, ``embeddings_file`` and
            ``data_shape`` keys before being written out.
        df: DataFrame providing the ``SubCategory``, ``Service`` and
            ``SubCategory2`` columns for the mapping file.

    Returns:
        Tuple ``(embeddings_file, metadata_file, mapping_file)`` of ``Path``s.
    """
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Make sure the output folder exists before anything is written.
    out_dir = Path('../results/experiments/phase2_embeddings')
    out_dir.mkdir(parents=True, exist_ok=True)

    # Model ids such as "org/some-model" are not filename friendly.
    clean_model_name = model_name.translate(str.maketrans('/-', '__'))
    run_stem = f'embeddings_{clean_model_name}_{run_stamp}'

    # 1) Raw vectors.
    embeddings_file = out_dir / f'{run_stem}.npy'
    np.save(embeddings_file, embeddings)

    # 2) Run metadata (the caller's dict is enriched, then serialized).
    metadata['timestamp'] = run_stamp
    metadata['model_name'] = model_name
    metadata['embeddings_file'] = str(embeddings_file)
    metadata['data_shape'] = embeddings.shape

    metadata_file = out_dir / f'{run_stem}_metadata.json'
    with metadata_file.open('w', encoding='utf-8') as handle:
        json.dump(metadata, handle, ensure_ascii=False, indent=2)

    # 3) Row -> embedding index lookup table.
    mapping_columns = ['SubCategory', 'Service', 'SubCategory2']
    data_mapping = df[mapping_columns].assign(embedding_index=range(len(df)))

    mapping_file = out_dir / f'data_mapping_{clean_model_name}_{run_stamp}.csv'
    data_mapping.to_csv(mapping_file, index=False, encoding='utf-8')

    print(f"💾 Saved embedding experiment '{clean_model_name}' to:")
    print(f"   📄 Embeddings: {embeddings_file}")
    print(f"   📄 Metadata: {metadata_file}")
    print(f"   📄 Mapping: {mapping_file}")

    return embeddings_file, metadata_file, mapping_file

def get_available_models():
    """Return the catalogue of embedding models, grouped by provider.

    Returns:
        A fresh dict with two provider keys, ``'openai'`` and
        ``'sentence_transformers'``. Each maps a model name to a small spec
        dict: ``size`` plus ``cost``/``quality`` for OpenAI models, and
        ``size`` plus ``specialization``/``quality`` for sentence-transformers.
    """
    openai_rows = (
        ('text-embedding-3-large', 3072, 'high', 'excellent'),
        ('text-embedding-3-small', 1536, 'medium', 'good'),
        ('text-embedding-ada-002', 1536, 'low', 'baseline'),
    )
    sentence_transformer_rows = (
        ('AIDA-UPM/mstsb-paraphrase-multilingual-mpnet-base-v2', 768, 'Arabic-English', 'excellent'),
        ('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', 384, 'Multilingual', 'good'),
        ('sentence-transformers/all-MiniLM-L6-v2', 384, 'General', 'baseline'),
    )

    openai_specs = {
        name: {'size': size, 'cost': cost, 'quality': quality}
        for name, size, cost, quality in openai_rows
    }
    sentence_transformer_specs = {
        name: {'size': size, 'specialization': focus, 'quality': quality}
        for name, size, focus, quality in sentence_transformer_rows
    }

    return {
        'openai': openai_specs,
        'sentence_transformers': sentence_transformer_specs,
    }

def benchmark_embedding_generation(embedding_manager, texts, model_name):
    """Generate embeddings for *texts* and measure how long it takes.

    Args:
        embedding_manager: Object exposing ``generate_embeddings(texts)``.
        texts: Sequence of strings to embed.
        model_name: Label used in the printed report and in the metadata.

    Returns:
        Tuple ``(embeddings, metadata)``. ``metadata`` records the number of
        texts, elapsed seconds, throughput, the change in process RSS (MB),
        and the embedding dimension and dtype.

    Notes:
        * The embeddings are assumed to be a 2-D array exposing ``shape`` and
          ``dtype`` — TODO confirm against ``generate_embeddings``.
        * The memory figure is the difference in resident set size before and
          after the call, so it is a rough estimate and can be negative.
    """
    print(f"🚀 Benchmarking {model_name}...")

    # Memory before
    process = psutil.Process()
    memory_before = process.memory_info().rss / 1024 / 1024  # MB

    # Time the embedding generation. perf_counter() is monotonic and
    # high-resolution, unlike the wall clock, which can be coarse or jump.
    start_time = time.perf_counter()
    embeddings = embedding_manager.generate_embeddings(texts)
    end_time = time.perf_counter()

    # Memory after
    memory_after = process.memory_info().rss / 1024 / 1024  # MB

    # Calculate metrics
    generation_time = end_time - start_time
    # Clamp the divisor so a zero-length interval (e.g. empty or cached input)
    # cannot raise ZeroDivisionError; the result also stays finite for JSON.
    texts_per_second = len(texts) / max(generation_time, 1e-9)
    memory_used = memory_after - memory_before

    metadata = {
        'model_name': model_name,
        'total_texts': len(texts),
        'generation_time_seconds': generation_time,
        'texts_per_second': texts_per_second,
        'memory_used_mb': memory_used,
        'embedding_dimension': embeddings.shape[1],
        'embedding_dtype': str(embeddings.dtype)
    }

    print(f"   ⏱️  Generation time: {generation_time:.2f} seconds")
    print(f"   🚀 Speed: {texts_per_second:.2f} texts/second")
    print(f"   💾 Memory used: {memory_used:.1f} MB")
    print(f"   📊 Embedding shape: {embeddings.shape}")

    return embeddings, metadata

print("🤖 EMBEDDING GENERATION FRAMEWORK READY")
print("="*60)

# Show available models
# `available_models` is the provider -> model -> specs mapping returned by
# get_available_models(); it stays bound as a notebook global.
available_models = get_available_models()

# Print every model with its spec key/value pairs, grouped by provider.
# NOTE: the loop variables (provider, models, model_name, specs, key, value)
# run at module level and therefore remain defined after the loop.
print("📊 Available Embedding Models:")
for provider, models in available_models.items():
    print(f"\n🔧 {provider.upper()}:")
    for model_name, specs in models.items():
        print(f"   • {model_name}")
        for key, value in specs.items():
            print(f"     - {key}: {value}")

print(f"\n✅ Framework ready for systematic embedding generation!")
print(f"🎯 Will test multiple models and save all results with timestamps")

In [None]:
# 🎯 Generate Embeddings with Specified HuggingFace Model
#
# Flow of this cell:
#   1. Build the list of texts to embed from `df[description_col]`.
#   2. Try the project's EmbeddingManager (benchmarked and saved).
#   3. If that fails for any reason, fall back to loading the same model
#      directly through sentence-transformers.
#   4. If both fail, `embeddings` is set to None so later cells can detect it.

# Primary model specified in requirements
PRIMARY_MODEL = 'AIDA-UPM/mstsb-paraphrase-multilingual-mpnet-base-v2'

print(f"🎯 GENERATING EMBEDDINGS WITH PRIMARY MODEL")
print("="*60)
print(f"📊 Model: {PRIMARY_MODEL}")
print(f"📄 Data: {len(df)} categories")
print(f"📝 Using column: {description_col}")

# Prepare texts for embedding BEFORE entering the try block. The fallback
# path below and later cells both read `texts`; if it were assigned only
# after EmbeddingManager was constructed, a constructor failure would leave
# `texts` undefined and the fallback would die with a NameError.
texts = df[description_col].astype(str).tolist()

try:
    # Initialize embedding manager with HuggingFace model
    print(f"\n🚀 Initializing EmbeddingManager...")
    embedding_manager = EmbeddingManager(
        provider='huggingface',
        model_name=PRIMARY_MODEL
    )

    print(f"✅ EmbeddingManager initialized successfully!")
    print(f"📊 Model loaded: {embedding_manager.model_name}")

    print(f"📝 Prepared {len(texts)} texts for embedding")

    # Show sample texts
    print(f"\n📄 Sample texts to embed:")
    for i, text in enumerate(texts[:3]):
        print(f"   {i+1}. {text[:100]}...")

    # Generate embeddings with benchmarking
    print(f"\n🚀 Generating embeddings...")
    embeddings, metadata = benchmark_embedding_generation(
        embedding_manager, texts, PRIMARY_MODEL
    )

    print(f"\n✅ EMBEDDING GENERATION SUCCESSFUL!")
    print(f"📊 Generated {embeddings.shape[0]} embeddings")
    print(f"📏 Embedding dimension: {embeddings.shape[1]}")
    print(f"🔢 Data type: {embeddings.dtype}")

    # Save the experiment
    print(f"\n💾 Saving experiment results...")
    embeddings_file, metadata_file, mapping_file = save_embedding_experiment(
        embeddings, PRIMARY_MODEL, metadata, df
    )

    print(f"\n🎉 PRIMARY MODEL EMBEDDING GENERATION COMPLETE!")
    print(f"📁 Files saved successfully")
    print(f"🎯 Ready for FAISS index creation and similarity testing")

except Exception as e:
    # Deliberately broad: this is the notebook's top-level boundary and any
    # failure (import, download, encoding, disk) should trigger the fallback.
    print(f"❌ Error generating embeddings: {e}")
    import traceback
    traceback.print_exc()

    print(f"\n🔧 Will try alternative approach...")

    # Fallback: Try with sentence-transformers directly
    try:
        print(f"🔄 Trying direct sentence-transformers approach...")
        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer(PRIMARY_MODEL)
        print(f"✅ Model loaded directly: {PRIMARY_MODEL}")

        # Generate embeddings (perf_counter: monotonic, high resolution)
        start_time = time.perf_counter()
        embeddings = model.encode(texts, show_progress_bar=True)
        end_time = time.perf_counter()

        # Create metadata
        metadata = {
            'model_name': PRIMARY_MODEL,
            'total_texts': len(texts),
            'generation_time_seconds': end_time - start_time,
            # Clamp the divisor so a zero-length interval cannot raise
            # ZeroDivisionError and discard otherwise valid embeddings.
            'texts_per_second': len(texts) / max(end_time - start_time, 1e-9),
            'embedding_dimension': embeddings.shape[1],
            'method': 'direct_sentence_transformers'
        }

        print(f"✅ Fallback successful!")
        print(f"📊 Shape: {embeddings.shape}")
        print(f"⏱️  Time: {metadata['generation_time_seconds']:.2f}s")

        # Save the experiment
        embeddings_file, metadata_file, mapping_file = save_embedding_experiment(
            embeddings, PRIMARY_MODEL, metadata, df
        )

        print(f"💾 Fallback results saved successfully!")

    except Exception as e2:
        print(f"❌ Fallback also failed: {e2}")
        # Sentinel for later cells: no embeddings are available.
        embeddings = None

## 🔄 3. Additional Embedding Models Comparison

Now let's systematically test additional models and save all results for comparison.

In [None]:
# 🔄 Systematic Multi-Model Embedding Comparison

def _record_model_result(status, embeddings, model_name, metadata):
    """Save one model's embeddings and build its entry for the results dict."""
    # `df` is the notebook-level DataFrame that supplies the category mapping.
    embeddings_file, metadata_file, mapping_file = save_embedding_experiment(
        embeddings, model_name, metadata, df
    )
    return {
        'status': status,
        'embeddings': embeddings,
        'metadata': metadata,
        'files': {
            'embeddings': embeddings_file,
            'metadata': metadata_file,
            'mapping': mapping_file
        }
    }


def test_multiple_models(texts, models_to_test):
    """Test multiple embedding models and save results.

    Each model is first run through ``EmbeddingManager`` (benchmarked). If
    that raises, the model is loaded directly with sentence-transformers. A
    failure of one model never stops the remaining models from being tested.

    Args:
        texts: List of strings to embed with every model.
        models_to_test: Iterable of model identifiers.

    Returns:
        Dict keyed by model name. Successful entries carry ``status``
        (``'success'`` or ``'success_fallback'``), ``embeddings``,
        ``metadata`` and ``files``. Failed entries carry ``status='failed'``,
        ``error`` (the fallback's error) and ``primary_error`` (the error
        that triggered the fallback).
    """
    results = {}

    for model_name in models_to_test:
        print(f"\n🤖 Testing model: {model_name}")
        print("-" * 50)

        try:
            # Try with EmbeddingManager first
            embedding_manager = EmbeddingManager(
                provider='huggingface',
                model_name=model_name
            )

            embeddings, metadata = benchmark_embedding_generation(
                embedding_manager, texts, model_name
            )

            # Save experiment
            results[model_name] = _record_model_result(
                'success', embeddings, model_name, metadata
            )

            print(f"✅ {model_name} completed successfully!")

            # Drop the local references before loading the next model. The
            # embeddings themselves stay alive through `results`.
            del embedding_manager, embeddings
            gc.collect()

        except Exception as e:
            # Broad on purpose: any failure should trigger the fallback
            # rather than abort the whole comparison run.
            print(f"❌ {model_name} failed: {e}")

            # Try direct sentence-transformers approach
            try:
                print(f"🔄 Trying fallback for {model_name}...")
                from sentence_transformers import SentenceTransformer

                model = SentenceTransformer(model_name)
                # perf_counter() is monotonic and high-resolution.
                start_time = time.perf_counter()
                embeddings = model.encode(texts, show_progress_bar=True)
                generation_time = time.perf_counter() - start_time

                metadata = {
                    'model_name': model_name,
                    'total_texts': len(texts),
                    'generation_time_seconds': generation_time,
                    # Clamp the divisor: a zero-length interval must not
                    # raise and mark a successful run as failed.
                    'texts_per_second': len(texts) / max(generation_time, 1e-9),
                    'embedding_dimension': embeddings.shape[1],
                    'method': 'direct_sentence_transformers'
                }

                # Save experiment
                results[model_name] = _record_model_result(
                    'success_fallback', embeddings, model_name, metadata
                )

                print(f"✅ {model_name} completed with fallback!")

                # Clean up memory
                del model, embeddings
                gc.collect()

            except Exception as e2:
                print(f"❌ {model_name} fallback also failed: {e2}")
                results[model_name] = {
                    'status': 'failed',
                    'error': str(e2),
                    # Keep the original cause too; it is usually the more
                    # informative of the two.
                    'primary_error': str(e)
                }

    return results

# Define models to test (in addition to the primary model)
ADDITIONAL_MODELS = [
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',  # Fast multilingual
    'sentence-transformers/all-MiniLM-L6-v2',  # Lightweight baseline
    'sentence-transformers/distiluse-base-multilingual-cased'  # DistilUSE multilingual
]

print(f"🔄 TESTING ADDITIONAL EMBEDDING MODELS")
print("="*60)
print(f"📊 Primary model already tested: {PRIMARY_MODEL}")
print(f"🔄 Additional models to test: {len(ADDITIONAL_MODELS)}")

# NOTE(review): `model` is a module-level loop variable here, so after this
# loop it is bound to the last model-name string and replaces any earlier
# global called `model` (e.g. a SentenceTransformer instance left by the
# primary-model fallback path).
for i, model in enumerate(ADDITIONAL_MODELS, 1):
    print(f"   {i}. {model}")

# Option to test additional models (set to True to run)
# Kept off by default; the branch below only runs when it is flipped.
TEST_ADDITIONAL_MODELS = False  # Change to True to test additional models

if TEST_ADDITIONAL_MODELS:
    print(f"\n🚀 Starting additional model testing...")
    
    # Test additional models
    # `texts` is the notebook global prepared by the primary-model cell.
    additional_results = test_multiple_models(texts, ADDITIONAL_MODELS)
    
    # Print summary
    # One section per model; the detail shown depends on the entry's 'status'.
    print(f"\n📊 ADDITIONAL MODELS TESTING SUMMARY:")
    print("="*60)
    
    for model_name, result in additional_results.items():
        status = result['status']
        if status == 'success':
            metadata = result['metadata']
            print(f"\n✅ {model_name}")
            print(f"   Status: Success")
            print(f"   Dimension: {metadata['embedding_dimension']}")
            print(f"   Speed: {metadata['texts_per_second']:.2f} texts/sec")
            print(f"   Time: {metadata['generation_time_seconds']:.2f}s")
        elif status == 'success_fallback':
            metadata = result['metadata']
            print(f"\n🔄 {model_name}")
            print(f"   Status: Success (fallback)")
            print(f"   Dimension: {metadata['embedding_dimension']}")
            print(f"   Speed: {metadata['texts_per_second']:.2f} texts/sec")
        else:
            print(f"\n❌ {model_name}")
            print(f"   Status: Failed")
            print(f"   Error: {result.get('error', 'Unknown error')}")
    
    print(f"\n🎉 All additional model testing complete!")
    
else:
    print(f"\n⏸️  Additional model testing skipped (TEST_ADDITIONAL_MODELS = False)")
    print(f"💡 To test additional models, set TEST_ADDITIONAL_MODELS = True and re-run")
    # NOTE(review): this message is printed without checking whether the
    # primary run actually succeeded (`embeddings` may be None).
    print(f"🎯 Primary model ({PRIMARY_MODEL}) results are already saved and ready!")

# NOTE(review): the completion banner below is unconditional as well; it does
# not reflect whether any embeddings were generated or saved.
print(f"\n✅ EMBEDDING GENERATION PHASE COMPLETE")
print(f"📁 All results saved with timestamps in: ../results/experiments/phase2_embeddings/")
print(f"🔄 No data overwritten - all experiments preserved!")
print(f"\n🚀 READY FOR FAISS INDEX CREATION AND SIMILARITY TESTING")