# Job App Assistant Evaluation Notebook

This notebook evaluates the matching pipeline with example requests,
proxy precision metrics, and latency charts.


## Project overview

- Step 1: scrape or parse job offers into `data/jobs_raw.csv`.
- Step 2: rewrite job offers with an LLM into `data/jobs_rewritten.csv`.
- Step 3: convert a PDF CV into `data/cv_converted.txt`.
- Step 4: rewrite and summarize the CV into `data/cv_synthesized.txt`.
- Step 5: match the CV against the rewritten jobs with embeddings, writing `data/final_matches.csv`.
- Step 6: rerank with a cross-encoder, writing `data/final_matches_cross.csv`.


In [None]:
import os
import re
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# All pipeline artifacts are looked up in a "data" folder under the current
# working directory, so the kernel's working directory must contain it.
DATA_DIR = os.path.join(os.getcwd(), "data")


In [None]:
def safe_read_csv(path):
    """Load the CSV at ``path`` as a DataFrame, or return ``None`` if nothing exists there."""
    if not os.path.exists(path):
        return None
    return pd.read_csv(path)

# Locations of the artifacts written by the individual pipeline steps.
(
    jobs_raw_path,
    jobs_rewritten_path,
    matches_embed_path,
    matches_cross_path,
    cv_path,
) = (
    os.path.join(DATA_DIR, filename)
    for filename in (
        "jobs_raw.csv",
        "jobs_rewritten.csv",
        "final_matches.csv",
        "final_matches_cross.csv",
        "cv_synthesized.txt",
    )
)

# Each frame is None when its CSV has not been produced yet.
jobs_raw, jobs_rewritten, matches_embed, matches_cross = (
    safe_read_csv(csv_location)
    for csv_location in (
        jobs_raw_path,
        jobs_rewritten_path,
        matches_embed_path,
        matches_cross_path,
    )
)

# The synthesized CV is plain text; it stays None when the file is absent.
cv_text = None
if os.path.exists(cv_path):
    with open(cv_path, "r", encoding="utf-8") as f:
        cv_text = f.read()


## Example requests

These examples mirror the Flask API in `app.py`. Run the server separately
if you want to execute them against `http://localhost:5000`.


In [None]:
# One entry per API call: the pipeline step, its endpoint, and a sample payload.
examples = [
    {"step": step, "endpoint": endpoint, "payload": payload}
    for step, endpoint, payload in (
        (
            "step1_scrape",
            "/api/step1",
            {"mode": "scrape", "keyword": "Data Analyst", "num_jobs": 5},
        ),
        (
            "step1_text",
            "/api/step1",
            {"mode": "text", "text": "Paste raw job offer text here..."},
        ),
        ("step2_rewrite_jobs", "/api/step2", {}),
        # The upload is a multipart request, so the payload is only described.
        ("step3_upload_cv", "/api/step3/upload", "multipart/form-data (file)"),
        ("step3_convert_cv", "/api/step3", {"filename": "cv.pdf"}),
        ("step4_rewrite_cv", "/api/step4", {}),
        ("step5_match", "/api/step5", {}),
        ("step6_cross_match", "/api/step6", {}),
    )
]

pd.DataFrame(examples)


## Data preview

Quick look at the raw and rewritten job data.


In [None]:
# Show the first three rows of each job table, or say which file is absent.
for preview_frame, missing_note in (
    (jobs_raw, "Missing data/jobs_raw.csv"),
    (jobs_rewritten, "Missing data/jobs_rewritten.csv"),
):
    if preview_frame is None:
        print(missing_note)
    else:
        display(preview_frame.head(3))


## Matching results preview

Top results from the embedding matcher and cross-encoder reranker.


In [None]:
# Show the first five rows of each ranking, limited to the columns that exist.
for ranked_preview, missing_note in (
    (matches_embed, "Missing data/final_matches.csv"),
    (matches_cross, "Missing data/final_matches_cross.csv"),
):
    if ranked_preview is None:
        print(missing_note)
        continue
    cols = [
        c
        for c in ("Poste", "Entreprise", "match_score")
        if c in ranked_preview.columns
    ]
    display(ranked_preview[cols].head(5))


## Precision metrics (manual or proxy labels)

If `data/eval_labels.csv` exists, it is used as manual labels (columns:
Poste, Entreprise, relevant). Otherwise, this notebook creates proxy labels
from keyword overlap between the CV and job text.


In [None]:
# Optional hand-made relevance labels; when this file exists it is used
# instead of the keyword-overlap proxy labels.
labels_path = os.path.join(DATA_DIR, "eval_labels.csv")

# French and English function words that carry no signal for CV/job overlap.
STOPWORDS = {
    "de", "la", "le", "les", "des", "du", "un", "une",
    "et", "ou", "en", "au", "aux", "pour", "avec", "sur",
    "the", "and", "for", "with", "to", "in", "of", "a", "an",
}

# Runs of Unicode letters and digits (underscore excluded). An ASCII-only
# class would cut accented French words apart ("données" -> "donn" + "es").
_TOKEN_PATTERN = re.compile(r"[^\W_]+")


def tokenize(text):
    """Split ``text`` into lowercase keyword tokens.

    Accented letters are kept, so French words stay in one piece. Tokens that
    are stopwords or shorter than three characters are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in order of appearance (duplicates preserved).
    """
    # Function-scope import keeps this cell independent of the import cell.
    import unicodedata

    # Compose "e" + combining accent into a single code point first; text
    # extracted from PDFs is frequently in decomposed form, and a bare
    # combining mark would otherwise split the word.
    normalized = unicodedata.normalize("NFC", text).lower()
    tokens = _TOKEN_PATTERN.findall(normalized)
    return [t for t in tokens if t not in STOPWORDS and len(t) > 2]

def build_job_text(df):
    """Concatenate the descriptive columns of each job row into one string.

    Only the known text columns that are present in ``df`` are used, in a
    fixed order; missing values become empty strings. When none of the
    columns exist, every row maps to an empty string.

    Returns:
        A Series of strings aligned with ``df.index``.
    """
    wanted = ("Poste", "Entreprise", "Resume_IA", "Missions", "Profil_Recherche")
    present = [name for name in wanted if name in df.columns]
    if present:
        as_text = df[present].fillna("").astype(str)
        return as_text.apply(" ".join, axis=1)
    return pd.Series([""] * len(df), index=df.index)

# The rewritten jobs are the evaluation universe; nothing can be labeled without them.
if jobs_rewritten is None:
    raise ValueError("jobs_rewritten.csv is required for evaluation")

job_text = build_job_text(jobs_rewritten)
jobs_eval = jobs_rewritten[["Poste", "Entreprise"]].copy()
jobs_eval["job_text"] = job_text

cv_tokens = set(tokenize(cv_text)) if cv_text else set()

if os.path.exists(labels_path):
    # Manual labels: one row per (Poste, Entreprise) with a 0/1 "relevant" flag.
    labels = pd.read_csv(labels_path)
    # A blank "relevant" cell is read as NaN; count it as not relevant
    # instead of letting the integer cast raise.
    labels["relevant"] = labels["relevant"].fillna(0).astype(int)
    # Keep a single label per job (the highest one). Repeated keys would
    # otherwise duplicate job rows in the left merge below.
    labels = labels.sort_values(
        "relevant", ascending=False, kind="stable"
    ).drop_duplicates(subset=["Poste", "Entreprise"])
    jobs_labeled = jobs_eval.merge(labels, on=["Poste", "Entreprise"], how="left")
    # Jobs without a manual label are treated as not relevant.
    jobs_labeled["relevant"] = jobs_labeled["relevant"].fillna(0).astype(int)
    label_source = "manual"
else:
    # The CV only matters for proxy labels, so the warning belongs here.
    if not cv_tokens:
        print("Warning: CV text missing or empty. Proxy labels may be all zeros.")
    # Proxy labels: a job is relevant when it shares at least THRESHOLD
    # distinct tokens with the CV.
    THRESHOLD = 1
    jobs_eval["overlap"] = job_text.apply(lambda t: len(set(tokenize(t)) & cv_tokens))
    jobs_eval["relevant"] = (jobs_eval["overlap"] >= THRESHOLD).astype(int)
    jobs_labeled = jobs_eval
    label_source = "proxy"

print(f"Label source: {label_source} (relevant count = {jobs_labeled['relevant'].sum()})")


In [None]:
def precision_at_k(df, k):
    """Return the share of relevant rows among the first ``k`` rows of ``df``.

    ``k`` is capped at the number of rows; an empty selection scores 0.0.
    ``df`` must carry a 0/1 ``relevant`` column and already be in rank order.
    """
    cutoff = min(k, len(df))
    if cutoff == 0:
        return 0.0
    top_flags = df["relevant"].iloc[:cutoff]
    return float(top_flags.mean())

def average_precision(df):
    """Return the mean of precision@rank taken at every relevant row of ``df``.

    ``df`` must carry a ``relevant`` column and already be in rank order.
    The result is 0.0 when no row is relevant.
    """
    # Truthiness per row, exactly as a plain ``if r:`` test would see it.
    flags = np.array([bool(r) for r in df["relevant"].values], dtype=bool)
    if not flags.any():
        return 0.0
    hits_so_far = np.cumsum(flags)
    ranks = np.arange(1, flags.size + 1)
    # Precision at each relevant position = hits up to that rank / rank.
    return float(np.mean(hits_so_far[flags] / ranks[flags]))

def compute_metrics(ranked_df, label_df, name):
    """Attach relevance labels to a ranking and score it.

    Args:
        ranked_df: Ranked matches, best first, with ``Poste`` and
            ``Entreprise`` columns.
        label_df: Labels with ``Poste``, ``Entreprise`` and ``relevant``
            columns.
        name: Model name stored in the returned metrics.

    Returns:
        A ``(metrics, merged)`` tuple: ``metrics`` is a dict with the model
        name, precision@5, precision@10 and average precision; ``merged`` is
        ``ranked_df`` with a 0/1 ``relevant`` column added.
    """
    keys = ["Poste", "Entreprise"]
    # Keep one label per job (the highest one). If the same (Poste, Entreprise)
    # pair appears several times in the labels, a plain left merge would
    # repeat the ranked rows and distort every rank-based metric.
    unique_labels = (
        label_df[keys + ["relevant"]]
        .sort_values("relevant", ascending=False, kind="stable")
        .drop_duplicates(subset=keys)
    )
    merged = ranked_df.merge(unique_labels, on=keys, how="left")
    # Ranked jobs without any label count as not relevant.
    merged["relevant"] = merged["relevant"].fillna(0).astype(int)
    metrics = {
        "model": name,
        "precision_at_5": precision_at_k(merged, 5),
        "precision_at_10": precision_at_k(merged, 10),
        "avg_precision": average_precision(merged),
    }
    return metrics, merged

metrics_rows = []
ranked_frames = {}

# Score whichever rankings are available, embedding matcher first.
for model_name, ranked_matches in (
    ("embedding_match", matches_embed),
    ("cross_encoder", matches_cross),
):
    if ranked_matches is None:
        continue
    metrics, merged = compute_metrics(ranked_matches, jobs_labeled, model_name)
    metrics_rows.append(metrics)
    ranked_frames[model_name] = merged

metrics_df = pd.DataFrame(metrics_rows)
metrics_df


In [None]:
# Bar chart comparing the models on the three precision metrics.
if not metrics_df.empty:
    metrics_plot = metrics_df.set_index("model")[
        ["precision_at_5", "precision_at_10", "avg_precision"]
    ]
    metrics_plot.plot(kind="bar")
    # All three metrics are fractions, so pin the axis to the full 0-1 range.
    plt.ylim(0, 1)
    # Name the label source actually in use ("manual" or "proxy") rather
    # than always claiming proxy labels.
    plt.title(f"Precision metrics ({label_source} labels)")
    plt.ylabel("score")
    plt.tight_layout()
else:
    print("No metrics to plot.")


## Latency benchmark

Set `RUN_HEAVY = True` to benchmark the full embedding and cross-encoder
models. With `RUN_HEAVY = False`, the notebook measures lightweight
operations as a proxy so the chart still renders quickly.


In [None]:
# Switch to True to time the real embedding and cross-encoder matchers.
RUN_HEAVY = False


def time_call(label, fn):
    """Invoke ``fn`` once and report how long the call took.

    Returns:
        A dict with ``step`` (the given label), ``seconds`` (elapsed time
        measured with ``time.perf_counter``) and ``result`` (what ``fn``
        returned).
    """
    started = time.perf_counter()
    outcome = fn()
    elapsed = time.perf_counter() - started
    return {"step": label, "seconds": elapsed, "result": outcome}

# (label, zero-argument callable) pairs to be timed below.
latency_tasks = []

if RUN_HEAVY:
    # Imported lazily so the notebook still runs without the model stack.
    from services.matcher import calculate_matches
    from services.cross_encoder_matcher import calculate_cross_matches

    latency_tasks.append((
        "embedding_match",
        lambda: calculate_matches(
            cv_txt_path=cv_path,
            jobs_csv_path=jobs_rewritten_path,
            progress_callback=None,
        ),
    ))
    latency_tasks.append((
        "cross_encoder",
        lambda: calculate_cross_matches(
            cv_txt_path=cv_path,
            jobs_csv_path=jobs_rewritten_path,
            progress_callback=None,
        ),
    ))
else:
    def _read_cv_file():
        # Context manager so the timed read does not leave the file open.
        with open(cv_path, "r", encoding="utf-8") as cv_file:
            return cv_file.read()

    if os.path.exists(jobs_rewritten_path):
        latency_tasks.append(("load_jobs_rewritten", lambda: pd.read_csv(jobs_rewritten_path)))
    if os.path.exists(cv_path):
        latency_tasks.append(("load_cv", _read_cv_file))
    # Skip the ranking task when the score column is missing, as the preview
    # cell does, instead of failing with a KeyError inside the timed call.
    if matches_embed is not None and "match_score" in matches_embed.columns:
        latency_tasks.append((
            "rank_top_5",
            lambda: matches_embed.sort_values("match_score", ascending=False).head(5)
        ))

latencies = []
for label, fn in latency_tasks:
    latencies.append(time_call(label, fn))

lat_df = pd.DataFrame(latencies)
if not lat_df.empty:
    lat_df["ms"] = lat_df["seconds"] * 1000
    # Table is sorted slowest first; the chart keeps execution order.
    display(lat_df[["step", "ms"]].sort_values("ms", ascending=False))
    lat_df.plot(kind="bar", x="step", y="ms", legend=False)
    plt.ylabel("latency (ms)")
    plt.title("Latency by step")
    plt.tight_layout()
else:
    print("No latency tasks to run.")
