In [None]:
import json
import os
import re
import sys

import numpy as np
import pandas as pd


# Remember the directory the notebook was started from; relative paths used
# later in the notebook resolve against it.
original_cwd = os.getcwd()

# Absolute path of the "backend" directory that sits next to the working directory.
backend_path = os.path.abspath(os.path.join(original_cwd, "../backend"))
added_backend = False

# Test for the exact path. A substring test such as `"backend" in p` also matches
# unrelated sys.path entries that merely contain the word "backend", which would
# silently skip the insertion and make the import below fail.
if backend_path not in sys.path:
    sys.path.insert(0, backend_path)
    added_backend = True
    print(f"Added backend to sys.path: {backend_path}")
else:
    print("Backend already in sys.path, skipping.")

from shared.snowflake.client import SnowflakeClient

# Defensive reset of the working directory (nothing above changes it).
os.chdir(original_cwd)
print(f"Returned to original working directory: {os.getcwd()}")

**load the config file**

In [None]:
# Base experiment configuration; the relative path resolves against the
# current working directory.
CONFIG_FILE_PATH = "config/base_config.json"

with open(CONFIG_FILE_PATH, mode="r", encoding="utf-8") as config_handle:
    config = json.loads(config_handle.read())

**set the experiment id**

In [None]:
# Identifier of the current experiment, read from the loaded config. It is passed
# as `experiment_id` when the path templates stored in the config are formatted.
EXPERIENCE_ID = config["experiments_specifique_params"]["experiment_id"]

print(EXPERIENCE_ID)

**load the embedding file**

In [None]:
# Resolve the embeddings CSV for this experiment and load it.
INPUT_EMBEDDINGS_FILE = config["output_recipies_embedding_file"].format(
    experiment_id=EXPERIENCE_ID
)
df_recipes_cleaned = pd.read_csv(INPUT_EMBEDDINGS_FILE)


def _parse_embedding(cell):
    """Turn a bracketed, space-separated vector string into a float32 array."""
    return np.fromstring(cell.strip('[]'), sep=' ', dtype=np.float32)


# Columns whose name ends with "_EMB" are treated as serialized embeddings.
emb_columns = [name for name in df_recipes_cleaned.columns if name.endswith('_EMB')]
print(f"Found embedding columns: {emb_columns}")

# First pass: decode every embedding column from text to numpy arrays.
for emb_col in emb_columns:
    df_recipes_cleaned[emb_col] = df_recipes_cleaned[emb_col].apply(_parse_embedding)

# Second pass: report the shape of the first decoded vector of each column.
for emb_col in emb_columns:
    first_shape = df_recipes_cleaned[emb_col][0].shape
    print(f"{emb_col} -> first embedding shape: {first_shape}")

## Evaluating the LLM and the prompt system against the ground truth ##

**define the function that calls the LLM**

In [None]:
def get_llm_response(client: "SnowflakeClient", model: str, prompt: str, response_format: dict) -> str:
    """
    Query Snowflake's AI_COMPLETE with a JSON schema for structured output.

    Args:
        client: SnowflakeClient instance; only its ``execute`` method is used.
        model: Model name (e.g., 'mistral-large2')
        prompt: The prompt text
        response_format: JSON schema dict describing the expected output

    Returns:
        str: First column of the single row returned by the query (the LLM response).

    Raises:
        RuntimeError: If the query returns no row.
    """
    # The three values are bound as query parameters, not interpolated into the SQL text.
    query = """
        SELECT AI_COMPLETE(
            model => %s,
            prompt => %s,
            response_format => PARSE_JSON(%s)
        ) AS response;
    """

    # Format the response_format as Snowflake expects
    response_format_json = {
        'type': 'json',
        'schema': response_format
    }

    result = client.execute(
        query,
        params=(model, prompt, json.dumps(response_format_json)),
        fetch="one"
    )

    # Fail with an explicit message rather than an opaque
    # "'NoneType' object is not subscriptable" when no row comes back.
    if result is None:
        raise RuntimeError(f"AI_COMPLETE returned no row for model {model!r}")
    return result[0]

**load the test dataset**

In [None]:
# Path of the query / ground-truth file, taken from the config.
QUERY_TEST_FILE_PATH = config["query_test_file_path"]

with open(QUERY_TEST_FILE_PATH, mode="r", encoding="utf-8") as test_file:
    query_documents_dicts = json.loads(test_file.read())

print(QUERY_TEST_FILE_PATH)

**load the prompt**

In [None]:
# Path of the evaluation prompt template, taken from the config.
PROMPT_EVAL_PATH = config["eval_prompt_file"]

# Read explicitly as UTF-8, like every other file in this notebook; without it the
# decoding depends on the platform's default locale and non-ASCII prompt text may
# be mis-decoded.
with open(PROMPT_EVAL_PATH, "r", encoding="utf-8") as f:
    prompt_template = f.read()
    
def build_prompt(query_text, doc_entries):
    """
    Fill the evaluation prompt template with a query and its candidate documents.

    Args:
        query_text: Query string substituted for ``{input_query_text}``.
        doc_entries: JSON-serializable list/dict, substituted as indented JSON
            text for ``{doc_entries}``.

    Returns:
        str: ``prompt_template`` with both placeholders replaced.
    """
    # doc_entries should be a python list/dict → convert to JSON text
    doc_json = json.dumps(doc_entries, indent=2, ensure_ascii=False)

    substitutions = {
        "{input_query_text}": query_text,
        "{doc_entries}": doc_json,
    }

    # Literal placeholder substitution (no .format(), so any other braces in the
    # template are left alone), done in a single pass: inserted text is never
    # re-scanned, so a query that itself contains "{doc_entries}" stays verbatim
    # instead of being overwritten by the document JSON.
    placeholder_pattern = re.compile("|".join(re.escape(key) for key in substitutions))
    return placeholder_pattern.sub(
        lambda match: substitutions[match.group(0)], prompt_template
    )

In [None]:
# Structured-output schema for the relevance ratings: an object carrying a
# query text string and a list of {doc_id, relevance_score} judgments.
json_schema = {
    "type": "object",
    "properties": {
        "query_text": {"type": "string"},
        "relevance_judgments": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "doc_id": {"type": "integer"},
                    "relevance_score": {"type": "number"},
                },
                "required": ["doc_id", "relevance_score"],
            },
        },
    },
    "required": ["query_text", "relevance_judgments"],
}

In [None]:
import json
import re

# Mapping that describes the recipe text columns; the rating loop in this cell
# reads the "start_text" and "column_name" entries of each of its values.
COLUMNS_TEXT = config["columns_to_clean"]
# Model name handed to the LLM call.
LLM_MODEL = config["llm_model"]

def normalize_json_response(text: str) -> str:
    r"""
    Normalize an LLM JSON response to a compact, single-line JSON string.

    The text is first parsed as-is, after removing only markdown code fences and
    surrounding whitespace, so escape sequences inside string values (such as \n)
    are preserved. A response that is a JSON-encoded string wrapping a JSON
    document is unwrapped. Only when strict parsing fails is the more aggressive
    cleanup applied (dropping literal \n sequences, newlines and surrounding quotes).

    Args:
        text: Raw JSON string from LLM (may contain \n, extra spaces, etc.)

    Returns:
        str: Compact, single-line JSON string. If the text cannot be parsed even
        after cleanup, a whitespace-squeezed version of the cleaned text is
        returned; that fallback is NOT guaranteed to be valid JSON.
    """

    def _dump(parsed) -> str:
        """Serialize without insignificant whitespace, keeping non-ASCII characters."""
        return json.dumps(parsed, separators=(',', ':'), ensure_ascii=False)

    fence_pattern = r'```(?:json)?'

    # 1) Strict attempt: JSON parsing already ignores whitespace and newlines
    #    between tokens, so nothing inside string values needs to be touched.
    unfenced = re.sub(fence_pattern, '', text).strip('` \t\r\n')
    try:
        parsed = json.loads(unfenced)
        if isinstance(parsed, str):
            # Double-encoded payload: the outer JSON string wraps the real document.
            parsed = json.loads(parsed)
        return _dump(parsed)
    except json.JSONDecodeError:
        pass

    # 2) Aggressive cleanup for malformed responses.
    # Remove literal \n escape sequences and actual newlines
    cleaned = text.replace('\\n', '').replace('\n', '').replace('\r', '')
    # Remove markdown code blocks
    cleaned = re.sub(fence_pattern, '', cleaned).strip('` ')
    # Remove unwanted surrounding quotes
    cleaned = cleaned.strip().strip('"').strip("'")
    try:
        return _dump(json.loads(cleaned))
    except json.JSONDecodeError:
        pass

    # 3) Last resort: squeeze whitespace around JSON punctuation. This may also
    #    alter whitespace inside string values and the result may not parse.
    squeeze_rules = (
        (r'\s*:\s*', ':'),
        (r'\s*,\s*', ','),
        (r'\{\s+', '{'),
        (r'\s+\}', '}'),
        (r'\[\s+', '['),
        (r'\s+\]', ']'),
    )
    for pattern, replacement in squeeze_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


ratings_results = []

# Create the client once and reuse it for every query instead of constructing a
# new one per iteration.
# NOTE(review): assumes a SnowflakeClient instance can serve several execute() calls — confirm.
snowflake_client = SnowflakeClient()

for query in query_documents_dicts:
    query_text = query["query_text"]

    # Build doc entries for the prompt
    doc_entries = []
    for document in query["relevance_documents"]:
        doc_id = document['doc_id']
        recipe_row = df_recipes_cleaned[df_recipes_cleaned["ID"] == doc_id]
        if recipe_row.empty:
            # Ground-truth documents absent from the recipes table are left out of the prompt.
            continue

        # Only the first matching row is used.
        recipe = recipe_row.iloc[0]
        recipe_info = {
            col_info["start_text"]: recipe[col_info["column_name"]]
            for col_info in COLUMNS_TEXT.values()
        }

        doc_entries.append({"doc_id": doc_id, "recipe_info": recipe_info})

    # Create the prompt
    prompt = build_prompt(query_text, doc_entries)

    # Call the LLM
    llm_response = get_llm_response(
        client=snowflake_client,
        model=LLM_MODEL,
        prompt=prompt,
        response_format=json_schema,
    )

    # normalize_json_response returns a compact JSON *string*; json.loads then
    # turns it into a Python object (and raises if the text is not valid JSON).
    llm_response_clean = normalize_json_response(llm_response)
    json_output = json.loads(llm_response_clean)
    print(json_output)

    ratings_results.append(json_output)
    print(f"✓ Parsed JSON for query: {query_text}")

In [None]:
# Show every parsed LLM judgment collected in ratings_results.
print(ratings_results)

## compare LLM response to the ground truth ##

**compare the LLM response with the ground truth**

In [None]:
def compare_ground_truth_vs_llm(ground_truth: list, llm_results: list) -> tuple[float, list]:
    """
    Compute how closely the LLM relevance scores agree with the ground truth.

    Queries are paired by position, so both lists must be in the same order;
    trailing entries of the longer list are ignored. For each pair, every doc_id
    present on either side gets an agreement of ``1 - abs(gt_score - llm_score)``,
    where a doc_id missing on one side counts as score 0 on that side.
    NOTE(review): the agreement only stays within [0, 1] if both score scales
    are within [0, 1] — confirm against the prompt and the ground-truth file.

    Args:
        ground_truth (list): One dict per query, each with a "relevance_documents"
            list of dicts holding "doc_id" and "relevance_score".
        llm_results (list): One dict per query, each with a "relevance_judgments"
            list of dicts holding "doc_id" and "relevance_score".

    Returns:
        tuple[float, list]: The mean agreement over all compared queries and the
        list of per-query mean agreements, in input order.

    Raises:
        ValueError: If there is no query pair to compare.
    """
    query_scores = []

    for gt_query, llm_query in zip(ground_truth, llm_results):

        # Convert lists to dict {doc_id: score}
        gt_scores = {d["doc_id"]: d["relevance_score"] for d in gt_query["relevance_documents"]}
        llm_scores = {d["doc_id"]: d["relevance_score"] for d in llm_query["relevance_judgments"]}

        # Use union of doc_ids so nothing is skipped
        all_doc_ids = set(gt_scores.keys()) | set(llm_scores.keys())

        if not all_doc_ids:
            # Neither side lists a document: nothing to disagree on, and this
            # avoids a ZeroDivisionError on the empty average.
            query_scores.append(1.0)
            continue

        # Per-document agreement; a missing score is assumed to be 0.
        agreements = [
            1 - abs(gt_scores.get(doc_id, 0) - llm_scores.get(doc_id, 0))
            for doc_id in all_doc_ids
        ]

        # average agreement for this query
        query_scores.append(sum(agreements) / len(agreements))

    if not query_scores:
        raise ValueError("No query pairs to compare: ground_truth or llm_results is empty.")

    # overall average agreement across all queries
    final_score = sum(query_scores) / len(query_scores)
    return final_score, query_scores


**calculate coherence between LLM and test (ground truth)**

In [None]:
# Output file for the LLM ratings of this experiment.
QUERY_LLM_RESULTS_PATH = config['query_llm_file_path'].format(
    experiment_id=EXPERIENCE_ID
)

final_score, per_query_scores = compare_ground_truth_vs_llm(query_documents_dicts, ratings_results)

print("Final coherence score:", final_score)
print("Per-query coherence:", per_query_scores)

# Build the payload as a new list instead of appending to ratings_results, so the
# cell can be re-run without piling up COHERENCE_SCORE entries in the results.
results_to_save = ratings_results + [{'COHERENCE_SCORE': final_score}]

# Make sure the target directory exists before writing.
results_dir = os.path.dirname(QUERY_LLM_RESULTS_PATH)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

# Save results - parsed objects plus the final score, not raw strings
with open(QUERY_LLM_RESULTS_PATH, "w", encoding="utf-8") as f:
    json.dump(results_to_save, f, indent=2, ensure_ascii=False)



**write the config file for this experiment**

In [68]:
# Directory holding the outputs of this experiment.
OUPUT_EXPERIMENT_DIR = config["output_experiments_dir"].format(
    experiment_id=EXPERIENCE_ID
)

# Create the directory if needed so the write below does not fail with
# FileNotFoundError on a fresh experiment id.
if OUPUT_EXPERIMENT_DIR:
    os.makedirs(OUPUT_EXPERIMENT_DIR, exist_ok=True)

# Write the config file
with open(os.path.join(OUPUT_EXPERIMENT_DIR, "config.json"), "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)