In [None]:
# Import python packages

import streamlit as st
import pandas as pd

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# Handle on the currently active Snowpark session; the stored-procedure calls
# (session.call) and SQL statements (session.sql) in the cells below go through it.
session = get_active_session()


In [None]:
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import StringType, StructType, StructField
import pandas as pd
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns

# === CONFIGURATION ===
# Everything a reader may want to tune for the benchmark lives in this cell.

# Table & index names handed to the search procedures as arguments.
# NOTE(review): these are schema-qualified only, whereas the procedure names below
# are fully qualified — presumably they resolve against the session's current
# database; confirm.
SOURCE_TABLE = "ENRICHED.RECIPES_SAMPLE_50K"
BM25_INDEX = "VECTORS.RECIPES_SAMPLE_50K_BM25_INDEX"
EMBEDDINGS_TABLE = "VECTORS.RECIPES_50K_EMBEDDINGS"

# Fully qualified names of the stored procedures invoked through session.call
PROC_BM25 = "NUTRIRAG_PROJECT.VECTORS.SEARCH_BM25"
PROC_SEMANTIC = "NUTRIRAG_PROJECT.VECTORS.SEARCH_SEMANTIC"
PROC_HYBRID = "NUTRIRAG_PROJECT.VECTORS.SEARCH_SIMILAR_RECIPES"

# Default weights passed to the hybrid procedure (vector part / BM25 part)
VECTOR_WEIGHT = 0.7
BM25_WEIGHT = 0.3

# Cortex model used as the LLM judge
JUDGE_MODEL = "llama3.1-70b" # or 'mistral-large' (noted by the author as very expensive)

# Number of results requested from every search procedure
TOP_K = 5

In [None]:
# Benchmark queries; the trailing comment on each line names the kind of intent
# the query is meant to exercise.
test_queries = [
    "chocolate cake",                     # Keyword heavy
    "healthy vegetarian dinner quick",    # Semantic + Constraint
    "gluten free pasta",                  # Dietary constraint
    "refreshing summer dessert lemon",    # Semantic / Vibe
    "traditional italian lasagna",        # Specific entity
    "high protein breakfast",             # Nutrition constraint
    "comfort food for rainy day",         # Purely semantic/abstract
    "romantic dinner for two italian style",  # Vibe + Cuisine
    "something easy to digest when sick",     # Functional need (BM25 will look for the literal word "sick"...)
    "meal to impress my boss",                # Social / Abstract
    "post workout high protein recovery",     # Functional / Nutrition
    "cozy comfort food for winter night",     # Vibe / Season
    "light lunch that won't make me sleepy",  # Desired effect
    "kids friendly vegetables dish"           # Target audience
]

In [None]:
def _call_search_procedure(label, proc_name, *proc_args):
    """Call a search stored procedure and normalise what it returns.

    Args:
        label: Engine name used in the error message ("BM25", "Semantic", "Hybrid").
        proc_name: Name of the stored procedure to invoke.
        *proc_args: Positional arguments forwarded to the procedure unchanged.

    Returns:
        The procedure result, JSON-decoded when it arrives as a string. An empty
        list is returned when the call or the decoding fails, and also when the
        procedure yields NULL (``None`` or the JSON text ``"null"``), so callers
        can always slice and iterate the value.
    """
    try:
        res = session.call(proc_name, *proc_args)
        parsed = json.loads(res) if isinstance(res, str) else res
    except Exception as e:
        # Best-effort benchmark: report the failure and keep going with no results.
        print(f"Error in {label}: {e}")
        return []
    # A NULL result used to leak out as None and break `results[:3]` downstream.
    return [] if parsed is None else parsed


def run_bm25(query):
    """Executes BM25 Search Procedure (no filters) and returns its decoded results."""
    # Args: Query, IndexTable, SourceTable, Limit, Filters
    # Note: Filters are NULL (None) for this benchmark
    return _call_search_procedure(
        "BM25", PROC_BM25, query, BM25_INDEX, SOURCE_TABLE, TOP_K, None
    )


def run_semantic(query):
    """Executes Semantic Search Procedure (no filters) and returns its decoded results."""
    # Args: Query, EmbeddingsTable, Limit, Filters
    return _call_search_procedure(
        "Semantic", PROC_SEMANTIC, query, EMBEDDINGS_TABLE, TOP_K, None
    )


def run_hybrid(query):
    """Executes Hybrid Search Procedure with the default weights and returns its decoded results."""
    # Args: Query, Limit, Filters, VecWeight, BM25Weight, Index, Source, Embeddings
    return _call_search_procedure(
        "Hybrid",
        PROC_HYBRID,
        query,
        TOP_K,
        None,
        VECTOR_WEIGHT,
        BM25_WEIGHT,
        BM25_INDEX,
        SOURCE_TABLE,
        EMBEDDINGS_TABLE,
    )

In [None]:
def _timed_ms(search_fn, query):
    """Run ``search_fn(query)`` and return ``(results, elapsed_milliseconds)``.

    time.perf_counter() is used instead of time.time(): it is monotonic, so a
    system clock adjustment during the run cannot distort the measured interval.
    """
    started = time.perf_counter()
    results = search_fn(query)
    return results, (time.perf_counter() - started) * 1000  # ms


# One record per query: the raw results of each engine plus its latency in ms.
results_data = []

print(f"Starting benchmark for {len(test_queries)} queries...")

for q in test_queries:
    # The run_* helpers return [] on failure, so a failed call is still timed
    # and stored; its error message is printed by the helper itself.
    res_bm25, dur_bm25 = _timed_ms(run_bm25, q)    # 1. BM25
    res_sem, dur_sem = _timed_ms(run_semantic, q)  # 2. Semantic
    res_hyb, dur_hyb = _timed_ms(run_hybrid, q)    # 3. Hybrid

    # Store raw results for the LLM Judge
    results_data.append({
        "query": q,
        "bm25_res": res_bm25,
        "bm25_time": dur_bm25,
        "sem_res": res_sem,
        "sem_time": dur_sem,
        "hyb_res": res_hyb,
        "hyb_time": dur_hyb
    })

print("Benchmark execution complete.")

In [None]:
# Section header printed for each engine, paired with the results_data key it reads.
# Only the first header has no leading blank line.
sections = (
    ("--- BM25 Found: ---", 'bm25_res'),
    ("\n--- SEMANTIC Found: ---", 'sem_res'),
    ("\n--- HYBRID Found: ---", 'hyb_res'),
)

for position, entry in enumerate(results_data, start=1):
    print("\n==================================================")
    print(f"QUERY {position}: '{entry['query']}'")
    print("==================================================")

    # Keep only the top 3 recipe names per engine; all three lists are built
    # before any section is printed.
    top_names = [
        (header, [hit.get('NAME', 'Unknown') for hit in entry[field][:3]])
        for header, field in sections
    ]

    # Side-by-side listing of what each engine found
    for header, names in top_names:
        print(header)
        for recipe_name in names:
            print(f"  • {recipe_name}")

In [None]:

# Judge prompt (written in French) for the three-way comparison. It is filled in
# with str.format: {query_text}, {bm25_docs}, {sem_docs} and {hyb_docs} are the
# placeholders, while the doubled braces {{ }} are escapes that come out as
# literal braces in the JSON example. The evaluation cell reads the keys "BM25",
# "SEMANTIQUE", "HYBRIDE" (each with a "score") and "winner" from the reply.
PROMPT_TEMPLATE = """
Tu es un expert en recherche d’information (Information Retrieval) et en évaluation de moteurs de recherche.
Ton objectif est d’évaluer objectivement la qualité des résultats fournis par trois systèmes différents pour une même requête utilisateur.

=== REQUÊTE UTILISATEUR ===
{query_text}

=== RÉSULTATS DU SYSTÈME A (BM25) ===
{bm25_docs}

=== RÉSULTATS DU SYSTÈME B (SEMANTIQUE) ===
{sem_docs}

=== RÉSULTATS DU SYSTÈME C (HYBRIDE) ===
{hyb_docs}

=== CRITÈRES D’ÉVALUATION ===
1. Pertinence globale
2. Couverture de l’intention
3. Précision du top 3
4. Absence de résultats hors sujet

=== FORMAT DE SORTIE OBLIGATOIRE (JSON STRICT) ===
Retourne uniquement un JSON valide, sans texte supplémentaire :
{{
  "BM25": {{ "score": <float 0-5>, "justification": "..." }},
  "SEMANTIQUE": {{ "score": <float 0-5>, "justification": "..." }},
  "HYBRIDE": {{ "score": <float 0-5>, "justification": "..." }},
  "winner": "BM25 | SEMANTIQUE | HYBRIDE | TIE"
}}
"""

def format_docs(results):
    """Render a list of recipe dicts as a numbered text block for the LLM prompt.

    Args:
        results: Iterable of dicts read through the 'NAME' and 'DESCRIPTION' keys.
            ``None`` or an empty list is accepted.

    Returns:
        "Aucun résultat." when there is nothing to show; otherwise one line per
        recipe of the form "<rank>. <name> (Desc: <first 100 chars>...)", joined
        with newlines. A missing or null name is shown as 'Unknown' and a missing
        or null description as an empty string.
    """
    if not results:
        return "Aucun résultat."

    lines = []
    for rank, recipe in enumerate(results, start=1):
        # `or` (rather than a .get default) also covers keys that are present
        # with a None value, which previously raised TypeError on slicing.
        name = recipe.get('NAME') or 'Unknown'
        description = str(recipe.get('DESCRIPTION') or '')
        lines.append(f"{rank}. {name} (Desc: {description[:100]}...)")
    return "\n".join(lines)


In [None]:
from snowflake.cortex import Complete


def _parse_judge_reply(raw_reply):
    """Decode the JSON object contained in the judge's raw text reply.

    Markdown code fences are stripped and, when the reply wraps the JSON in extra
    prose, only the span from the first '{' to the last '}' is decoded.

    Raises:
        json.JSONDecodeError: if no valid JSON object can be decoded.
    """
    cleaned = raw_reply.replace("```json", "").replace("```", "").strip()
    first, last = cleaned.find("{"), cleaned.rfind("}")
    if first != -1 and last > first:
        cleaned = cleaned[first:last + 1]
    return json.loads(cleaned)


# One record per successfully judged query.
eval_metrics = []

print("Running Cortex Evaluation (LLM-as-a-Judge)...")

for row in results_data:
    # Prepare Prompt
    prompt = PROMPT_TEMPLATE.format(
        query_text=row['query'],
        bm25_docs=format_docs(row['bm25_res']),
        sem_docs=format_docs(row['sem_res']),
        hyb_docs=format_docs(row['hyb_res'])
    )

    # Call Cortex; a failure on one query is reported and the loop moves on.
    try:
        eval_json = _parse_judge_reply(Complete(JUDGE_MODEL, prompt))

        eval_metrics.append({
            "query": row['query'],
            # float() keeps the score columns numeric even if the judge quotes
            # the number; a non-numeric score is reported by the except below.
            "bm25_score": float(eval_json['BM25']['score']),
            "sem_score": float(eval_json['SEMANTIQUE']['score']),
            "hyb_score": float(eval_json['HYBRIDE']['score']),
            "winner": eval_json['winner']
        })
    except Exception as e:
        print(f"Error evaluating query '{row['query']}': {e}")

# Create DataFrames. Explicit column lists keep the "query" join key present
# even when a list is empty, so the merge below cannot fail on a missing column.
df_eval = pd.DataFrame(
    eval_metrics,
    columns=["query", "bm25_score", "sem_score", "hyb_score", "winner"],
)
df_time = pd.DataFrame(
    [{
        "query": r['query'],
        "bm25_time": r['bm25_time'],
        "sem_time": r['sem_time'],
        "hyb_time": r['hyb_time']
    } for r in results_data],
    columns=["query", "bm25_time", "sem_time", "hyb_time"],
)

# Merge (inner join): queries whose evaluation failed are absent from final_df.
final_df = pd.merge(df_time, df_eval, on="query")
final_df

In [None]:
# Long format: one row per (query, method) pair so the bars can be grouped by method.
df_melted_time = pd.melt(
    final_df,
    id_vars=["query"],
    value_vars=["bm25_time", "sem_time", "hyb_time"],
    var_name="Method",
    value_name="Time (ms)",
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_melted_time, x="query", y="Time (ms)", hue="Method", ax=ax)
# Slant the query labels so long queries stay legible.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title("Execution Time by Method")
fig.tight_layout()
plt.show()

In [None]:
# Long format: one row per (query, method) pair holding the judge's score.
df_melted_score = pd.melt(
    final_df,
    id_vars=["query"],
    value_vars=["bm25_score", "sem_score", "hyb_score"],
    var_name="Method",
    value_name="Quality Score (0-5)",
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=df_melted_score,
    x="query",
    y="Quality Score (0-5)",
    hue="Method",
    palette="viridis",
    ax=ax,
)
# Slant the query labels so long queries stay legible.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title("LLM Quality Evaluation Scores")
# Upper limit sits slightly above the maximum score of 5.
ax.set_ylim(0, 5.5)
fig.tight_layout()
plt.show()

In [None]:
# --- 1. Strategic Trade-off Scatter Plot ---
fig, ax = plt.subplots(figsize=(8, 6))

# Per-column averages over all evaluated queries
avg_data = final_df.mean(numeric_only=True)
methods = [('BM25', 'bm25'), ('Semantic', 'sem'), ('Hybrid', 'hyb')]

# One point per method: x = mean latency, y = mean judge score
for label, key in methods:
    score = avg_data[f'{key}_score']
    time_val = avg_data[f'{key}_time']
    ax.scatter(time_val, score, s=300, label=label)
    # The label is offset in screen points rather than in data units: the former
    # fixed +1000 ms shift pushed labels off the plot whenever latencies were
    # small compared with one second.
    ax.annotate(
        f"{label}\n(Score: {score:.2f})",
        xy=(time_val, score),
        xytext=(14, 0),
        textcoords='offset points',
        va='center',
        fontweight='bold',
    )

# Dashed guide line joining the methods in BM25 -> Semantic -> Hybrid order
ax.plot(
    [avg_data[f'{key}_time'] for _, key in methods],
    [avg_data[f'{key}_score'] for _, key in methods],
    '--', color='gray', alpha=0.5,
)

# Extra horizontal room so the label of the right-most point stays inside the axes
ax.margins(x=0.2)
ax.set_title("Strategic Trade-off: Latency vs. Quality")
ax.set_xlabel("Latency (ms)")
ax.set_ylabel("Quality Score (0-5)")
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

# --- 2. Net Improvement Calculation ---
# Per-query score difference between the hybrid engine and each baseline
final_df['delta_vs_sem'] = final_df['hyb_score'] - final_df['sem_score']
final_df['delta_vs_bm25'] = final_df['hyb_score'] - final_df['bm25_score']

print("\n=== NET IMPROVEMENT ANALYSIS ===")
# The "+" format flag prints the real sign; a literal "+" prefix rendered
# regressions as "+-0.30".
print(f"Average Improvement (Hybrid vs Semantic): {final_df['delta_vs_sem'].mean():+.2f} points")
print(f"Average Improvement (Hybrid vs BM25):     {final_df['delta_vs_bm25'].mean():+.2f} points")

In [None]:
# Latency columns first, then score columns, each in BM25 / semantic / hybrid order.
metric_columns = [
    f"{method}_{metric}"
    for metric in ("time", "score")
    for method in ("bm25", "sem", "hyb")
]

# Mean of every metric column across the evaluated queries
summary = final_df.agg({column: 'mean' for column in metric_columns}).transpose()

print("=== AVERAGE METRICS ===")
print(summary)

# How many queries each engine won according to the judge
winner_counts = final_df['winner'].value_counts()
print("\n=== WINNER DISTRIBUTION ===")
print(winner_counts)

In [None]:
def format_docs(results):
    """Turn the list of recipe dicts into readable text for the LLM.

    Note: this rebinds the `format_docs` name defined earlier in the notebook;
    this version keeps at most 5 recipes, truncates descriptions to 150
    characters and ends every line (including the last) with a newline.

    Args:
        results: Sliceable sequence of dicts read through the 'NAME' and
            'DESCRIPTION' keys. ``None`` or an empty list is accepted.

    Returns:
        "Aucun résultat trouvé." when there is nothing to show; otherwise one
        "<rank>. <name> (Desc: <description>...)" line per recipe. A missing or
        null name is shown as 'Recette Inconnue' and a missing or null
        description as 'Pas de description'.
    """
    if not results:
        return "Aucun résultat trouvé."

    lines = []
    # Only the first 5 recipes are kept so the prompt context is not saturated.
    for rank, recipe in enumerate(results[:5], start=1):
        # `or` (rather than a .get default) also covers keys present with a
        # None value, which previously raised TypeError on slicing.
        name = recipe.get('NAME') or 'Recette Inconnue'
        # The description is truncated to save tokens and flattened to one line.
        desc = str(recipe.get('DESCRIPTION') or 'Pas de description')[:150].replace("\n", " ")
        lines.append(f"{rank}. {name} (Desc: {desc}...)\n")

    return "".join(lines)

# Hybrid-only judge prompt (written in French) used by the weight-tuning loop.
# Filled in with str.format: {query_text} and {hyb_docs} are the placeholders and
# the doubled braces {{ }} come out as literal braces in the JSON example.
# NOTE(review): this rebinds the PROMPT_TEMPLATE name that the three-way
# evaluation cell also formats; if that cell is re-run after this one it will get
# this template, whose expected reply has no "BM25", "SEMANTIQUE" or "winner" key.
PROMPT_TEMPLATE = """
Tu es un expert impartial en évaluation de moteurs de recherche (Information Retrieval).
Ta mission est de noter la qualité des résultats pour une requête culinaire donnée.

=== REQUÊTE UTILISATEUR ===
"{query_text}"

=== RÉSULTATS À ÉVALUER (HYBRIDE) ===
{hyb_docs}

=== CRITÈRES DE NOTATION (0 à 5) ===
- 5.0 : Parfait. Résultats pertinents, respectent toutes les contraintes (ingrédients, régime).
- 4.0 : Très bon. Pertinent mais ordre perfectible.
- 3.0 : Acceptable. Quelques résultats pertinents, d'autres moins.
- 1.0 - 2.0 : Mauvais. Hors sujet ou non respect des contraintes critiques (ex: allergènes).
- 0.0 : Aucun résultat ou résultats vides.

=== FORMAT DE SORTIE OBLIGATOIRE ===
Tu dois répondre UNIQUEMENT avec un JSON valide.
Ne mets PAS de balises Markdown (pas de ```json).
Ne mets PAS de phrase d'introduction.

Structure attendue :
{{
  "HYBRIDE": {{
    "score": <nombre flottant entre 0 et 5>,
    "justification": "<courte explication en 1 phrase>"
  }}
}}
"""

In [None]:
# (vector_weight, bm25_weight) pairs to evaluate
weight_candidates = [
    (0.3, 0.7),
    (0.4, 0.6),
    (0.6, 0.4),
    (0.7, 0.3),
    (0.8, 0.2),
]

# Filters forwarded to the hybrid procedure as a JSON string
filters_dict = {
  "numeric_filters": [
        {"name": "minutes", "operator": "<=", "value": 50},
        {"name": "servings", "operator": ">=", "value": 2}
    ]
}
filters = json.dumps(filters_dict)

# One record per weight configuration that produced at least one valid score
tuning_results = []

# Sentinel meaning "skip this query"; distinct from any value a search can return.
_SKIP = object()


def _search_hybrid(query, vec_w, bm25_w, filters_json):
    """Run the hybrid search for one query and weight pair.

    Returns:
        The decoded procedure result, or ``_SKIP`` when the call fails, returns
        an empty value or returns a string that is not valid JSON. The reason is
        printed in each of those cases.
    """
    try:
        res_hyb = session.call(
            PROC_HYBRID, query, TOP_K, filters_json, vec_w, bm25_w,
            BM25_INDEX, SOURCE_TABLE, EMBEDDINGS_TABLE
        )
    except Exception as e:
        print(f"   [CRASH SP] Erreur d'appel procédure sur '{query}': {e}")
        return _SKIP

    # NOTE(review): this test runs before decoding, so a JSON string such as "[]"
    # is truthy and is still sent to the judge as an empty result list.
    if not res_hyb:
        print(f"   [WARN] Recherche vide pour '{query}'")
        return _SKIP

    try:
        return json.loads(res_hyb) if isinstance(res_hyb, str) else res_hyb
    except json.JSONDecodeError:
        print(f"   [ERREUR SP] La procédure n'a pas renvoyé du JSON pour '{query}'.")
        # Show the beginning of the payload to help diagnose the problem
        print(f"   [CONTENU REÇU] : {str(res_hyb)[:100]}...")
        return _SKIP


def _judge_hybrid(query, docs):
    """Ask the Cortex judge to score the hybrid results of one query.

    Returns:
        The score as a float, or ``None`` when the judge call fails or its reply
        cannot be used. The reason is printed in each of those cases.
    """
    try:
        # bm25_docs / sem_docs are surplus for a hybrid-only template (str.format
        # ignores unused keyword arguments) but are needed if the three-system
        # template is the one currently bound to PROMPT_TEMPLATE.
        prompt = PROMPT_TEMPLATE.format(
            query_text=query,
            bm25_docs="[]",
            sem_docs="[]",
            hyb_docs=format_docs(docs)
        )

        cmd = "SELECT snowflake.cortex.COMPLETE(?, ?)"
        cortex_res = session.sql(cmd, params=[JUDGE_MODEL, prompt]).collect()[0][0]

        if not cortex_res:
            print(f"   [WARN] Cortex a renvoyé une réponse vide pour '{query}'")
            return None

        # Strip Markdown fences, then keep only the outermost {...} span so an
        # introductory or closing sentence does not break the decoding.
        clean_json = cortex_res.replace("```json", "").replace("```", "").strip()
        first, last = clean_json.find("{"), clean_json.rfind("}")
        if first != -1 and last > first:
            clean_json = clean_json[first:last + 1]

        try:
            eval_data = json.loads(clean_json)
        except json.JSONDecodeError:
            print(f"   [ERREUR LLM] Cortex n'a pas renvoyé du JSON valide pour '{query}'.")
            print(f"   [CONTENU REÇU] : {clean_json[:100]}...")
            return None

        raw_score = eval_data['HYBRIDE']['score']
        justif = eval_data['HYBRIDE']['justification']
        # Converted here, inside the try: a quoted or non-numeric score used to
        # reach sum() below and abort the whole sweep with a TypeError.
        score = float(raw_score)

        # Detailed per-query output
        print(f"   Query: {query}")
        print(f"   Note : {raw_score}/5")
        print(f"   Avis : {justif}")
        print("   ---")
        return score

    except Exception as e:
        print(f"   [CRASH LLM] Erreur Cortex sur '{query}': {e}")
        return None


print(f"=== DÉMARRAGE ({len(weight_candidates)} configs) ===\n")

for vec_w, bm25_w in weight_candidates:
    print(f"\n>> CONFIG: Vector={vec_w} / BM25={bm25_w}")
    current_scores = []

    for q in test_queries:
        res_json = _search_hybrid(q, vec_w, bm25_w, filters)
        if res_json is _SKIP:
            continue

        score = _judge_hybrid(q, res_json)
        if score is not None:
            current_scores.append(score)

    # Average over the queries that were actually scored for this configuration
    if current_scores:
        avg = sum(current_scores) / len(current_scores)
        tuning_results.append({
            "vector_weight": vec_w,
            "bm25_weight": bm25_w,
            "avg_score": avg,
            # Skipped queries are excluded from the average, so configurations
            # may be averaged over different numbers of queries.
            "n_scored": len(current_scores),
        })
        print(f"   -> Score Moyen: {avg:.2f}/5.0")
    else:
        print("   -> Config ignorée (pas assez de données valides).")

In [None]:
# Results visualisation (sensitivity curve)
df_tuning = pd.DataFrame(tuning_results)

if df_tuning.empty:
    # With no scored configuration the frame has no columns, so the plotting and
    # idxmax calls below would raise.
    print("No tuning results to plot: every weight configuration was skipped.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.lineplot(
        data=df_tuning, x="vector_weight", y="avg_score",
        marker="o", markersize=10, linewidth=2.5, color="#2ecc71", ax=ax,
    )

    ax.set_title("Impact du Poids Vectoriel sur la Qualité (Sensitivity Analysis)")
    ax.set_xlabel("Poids Vectoriel (0.0 = BM25 pur, 1.0 = Sémantique pur)")
    ax.set_ylabel("Score Moyen de Qualité (0-5)")
    ax.grid(True, linestyle='--', alpha=0.6)
    # Keep the original 2.5 floor when every score sits above it, but extend the
    # axis downwards when a configuration scores lower, so no point is hidden.
    ax.set_ylim(min(2.5, df_tuning['avg_score'].min() - 0.2), 5.2)

    # Annotate the best-scoring configuration
    best_config = df_tuning.loc[df_tuning['avg_score'].idxmax()]
    ax.annotate(f"Meilleur: {best_config['avg_score']:.2f}\n(Vec: {best_config['vector_weight']})",
                xy=(best_config['vector_weight'], best_config['avg_score']),
                xytext=(0, 40), textcoords='offset points', ha='center',
                arrowprops=dict(facecolor='black', shrink=0.05))

    plt.show()