In [None]:
# %%
# Requirements / environment (copy & run in shell if needed)
# pip install -r requirements.txt
# requirements.txt should include:
# sentence-transformers==5.1.0
# scikit-learn==1.2.2
# pandas==2.1.0
# numpy==1.25.0
# joblib==1.3.1
# fastapi==0.95.2
# uvicorn==0.23.2
# %%


In [4]:
import os
import json
from pathlib import Path
from typing import List, Dict

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    accuracy_score,
    precision_recall_curve,
    roc_curve,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import train_test_split
import joblib

# Embedding model
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:

DATA_DIR = Path('.') / 'input_files'
BRIEF_PATH = DATA_DIR / 'brief.txt'
LABELED_PATH = DATA_DIR / 'labeled_examples.csv'
PAGES_PATH = DATA_DIR / 'pages.csv'
TEST_PATH = DATA_DIR / 'test_set_1.csv'

EMB_MODEL_NAME = 'all-MiniLM-L6-v2'
MIN_CPM = 0.5
MAX_CPM = 6.0
MODEL_PATH = Path('relevance_model.joblib')
CALIBRATOR_PATH = Path('relevance_calibrator.joblib')

<span style="font-size:30px;">Utility functions</span>

In [6]:

def read_text(path: Path) -> str:
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().strip()


def compute_embeddings(model: SentenceTransformer, texts: List[str], batch_size: int = 32) -> np.ndarray:
    return np.array(model.encode(texts, show_progress_bar=False, convert_to_numpy=True, batch_size=batch_size))


def cosine_sim(a: np.ndarray, b: np.ndarray):
    # a: (n, d) or (d,)
    # b: (d,) or (m, d)
    a_norm = a / np.linalg.norm(a, axis=-1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=-1, keepdims=True)
    return np.dot(a_norm, b_norm.T)


def map_score_to_cpm(score: float, min_cpm=MIN_CPM, max_cpm=MAX_CPM) -> float:
    # score in [0,1] -> linear map to [min_cpm, max_cpm]
    return float(min_cpm + score * (max_cpm - min_cpm))


def choose_threshold_by_f1(scores: np.ndarray, labels: np.ndarray) -> float:
    precision, recall, thresholds = precision_recall_curve(labels, scores)
    f1 = (2 * precision * recall) / (precision + recall + 1e-12)
    best_idx = np.nanargmax(f1)
    # precision_recall_curve returns thresholds of length len(precision)-1
    if best_idx >= len(thresholds):
        return 0.5
    return float(thresholds[best_idx])



<span style="font-size:30px;">Load Inputs</span>

In [7]:

assert BRIEF_PATH.exists(), f"Missing brief at {BRIEF_PATH}"
brief_text = read_text(BRIEF_PATH)
print('Loaded brief (len=%d chars)' % len(brief_text))

labeled = pd.read_csv(LABELED_PATH)
pages = pd.read_csv(PAGES_PATH)
testset = pd.read_csv(TEST_PATH)

print('labeled:', labeled.shape, 'pages:', pages.shape, 'testset:', testset.shape)


Loaded brief (len=617 chars)
labeled: (87, 3) pages: (158, 2) testset: (51, 3)


<span style="font-size:30px;">Initialize embedding model</span>

In [8]:
print('Loading embedding model:', EMB_MODEL_NAME)
emb_model = SentenceTransformer(EMB_MODEL_NAME)

Loading embedding model: all-MiniLM-L6-v2


In [9]:

all_snippets = list(pages['snippet'].fillna(''))  #   (158)  ['Stre…', 'Nurses rave about…', 'Review: n…',]
snippet_embs = compute_embeddings(emb_model, all_snippets)  #(158, 384)  array([[-0.00971121,  0.00492362,  0.06036666, ..., -0.08424693,
      #  -0.02608203,  0.04787878],
      # [-0.04969076, -0.01702726, -0.02019079, ..., -0.00679669,
      #  -0.01795055,  0.04061359]],dtype=float32)
brief_emb = compute_embeddings(emb_model, [brief_text])[0]  # (384,)  array([-4.02793400e-02, -3.91791016e-03, -5.85291721e-02,], dtype=float32)

sims = cosine_sim(snippet_embs, brief_emb).squeeze()  # (158,)  array([ 3.55501950e-01,  6.30325973e-01,  4.16991338e-02,  5.67493021e-01,] dtype=float32)

# If you need normalized score in [0,1], rescale using min/max observed in the labeled set or pages
sims_min, sims_max = sims.min(), sims.max()
sims_norm = (sims - sims_min) / (sims_max - sims_min + 1e-12)  # (158,)  array([0.5689435 , 0.94204515, 0.1429241 , 0.856743  , 1 ] dtype=float32)

In [11]:
# Baseline threshold tuning using labeled examples
# From the labeled examples of each (cosine_score, label) pair, determine the optimal threshold.
labeled_snippets = list(labeled['snippet'].fillna(''))  # (87) ['Str Week…', 'Nurses…', 'Review: n"]
labeled_embs = compute_embeddings(emb_model, labeled_snippets)  # (87, 384)
labeled_sims = cosine_sim(labeled_embs, brief_emb).squeeze()  # (87,)  array([ 3.55501950e-01,  6.30325973e-01, ], dtype=float32)

labeled_sims_norm = (labeled_sims - labeled_sims.min()) / (labeled_sims.max() - labeled_sims.min() + 1e-12) #(87) array([0.5689435 , 0.94204515, 0.1429241 , 0.85674)
labels = labeled['label'].values  # 87  array([1, 1, 0, 1, 0, 1, 0, 1, 0, 1,])

baseline_threshold = choose_threshold_by_f1(labeled_sims_norm, labels)
print(f'Baseline tuned threshold (similarity normalized): {baseline_threshold:.4f}')


Baseline tuned threshold (similarity normalized): 0.2919


In [16]:
# Map baseline scores to CPM
baseline_prices = [map_score_to_cpm(s, MIN_CPM, MAX_CPM) for s in sims_norm]


In [17]:
# Prepare baseline results DataFrame (for the 150 pages)
# Apply the prediction using the threshold to the 150 pages.
baseline_df = pages.copy()
baseline_df['score'] = sims_norm
baseline_df['bid'] = (baseline_df['score'] >= baseline_threshold).astype(int)
baseline_df['price'] = baseline_df['score'].apply(lambda s: map_score_to_cpm(s))



<span style="font-size:30px;">Learnable model: embeddings -> LogisticRegression</span><br>
<span style="font-size:18px;">Train Logistic regression on the labeled data</span>


In [18]:
X = labeled_embs
y = labels


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [19]:
clf = LogisticRegression(solver='liblinear', class_weight='balanced', max_iter=1000)
clf.fit(X_train, y_train)

In [20]:
# Calibration for better probability estimates
calibrator = CalibratedClassifierCV(clf, method='sigmoid', cv=3)
calibrator.fit(X_train, y_train)


In [93]:
# Save model objects
joblib.dump(clf, MODEL_PATH)
joblib.dump(calibrator, CALIBRATOR_PATH)
print('Saved model and calibrator')


Saved model and calibrator


ROC the curve
The AUC (Area Under the Curve) is a single number that summarizes the performance of the classifier across all possible thresholds.

In [29]:
val_probs = calibrator.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)
# average precision focuses on the model's ability to find positive examples
# average_precision_score is especially useful for highly imbalanced datasets where the positive class is rare. Unlike roc_auc_score
val_ap = average_precision_score(y_val, val_probs)
# Test using a threshold of 0.5, which we later realized was not optimal — the best threshold turned out to be 0.3.
val_pred = (val_probs >= 0.5).astype(int)
val_acc = accuracy_score(y_val, val_pred)

print(f'Validation ROC AUC: {val_auc:.4f}, PR AUC: {val_ap:.4f}, Acc: {val_acc*100:.2f}%')


Validation ROC AUC: 1.0000, PR AUC: 1.0000, Acc: 94.44%


<span style="font-size:18px;">Find the optinal threshold</span>

In [30]:
# Choose threshold on the labeled set (using all labeled examples)
all_probs = calibrator.predict_proba(X)[:, 1]
learned_threshold = choose_threshold_by_f1(all_probs, y)
print(f'Learned threshold (probability): {learned_threshold:.4f}')


Learned threshold (probability): 0.3544


<span style="font-size:30px;">Evaluate on test_set_1.csv</span><br>

In [32]:

# Compute embeddings for test set
test_snippets = list(testset['snippet'].fillna(''))
test_embs = compute_embeddings(emb_model, test_snippets)

# Two modes: baseline sims vs learned model probabilities
# Baseline scores for test set
test_sims = cosine_sim(test_embs, brief_emb).squeeze()
test_sims_norm = (test_sims - sims_min) / (sims_max - sims_min + 1e-12)

# Learned model probabilities for test set
test_probs = calibrator.predict_proba(test_embs)[:, 1]



In [34]:
# Metrics
test_labels = testset['label'].values


# Baseline metrics (using baseline_threshold)
baseline_test_pred = (test_sims_norm >= baseline_threshold).astype(int)
baseline_acc = accuracy_score(test_labels, baseline_test_pred)
baseline_auc = roc_auc_score(test_labels, test_sims_norm)
baseline_ap = average_precision_score(test_labels, test_sims_norm)

# Learned model metrics (using learned_threshold)
learned_test_pred = (test_probs >= learned_threshold).astype(int)
learned_acc = accuracy_score(test_labels, learned_test_pred)
learned_auc = roc_auc_score(test_labels, test_probs)
learned_ap = average_precision_score(test_labels, test_probs)

print('\nTest set performance:')
print(f'Baseline - Acc: {baseline_acc*100:.2f}%, ROC AUC: {baseline_auc:.4f}, PR AUC: {baseline_ap:.4f}')
print(f'Learned  - Acc: {learned_acc*100:.2f}%, ROC AUC: {learned_auc:.4f}, PR AUC: {learned_ap:.4f}')

pages_embs = compute_embeddings(emb_model, list(pages['snippet'].fillna('')))
pages_probs = calibrator.predict_proba(pages_embs)[:, 1]

pages_results = []
for url, score in zip(pages['url'], pages_probs):
    bid = int(score >= learned_threshold)
    price = map_score_to_cpm(score)
    pages_results.append({'url': url, 'bid': bid, 'price': round(price, 3), 'score': float(round(score, 4))})

with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(pages_results, f, indent=2)

print('Wrote results.json (N=%d)' % len(pages_results))


Test set performance:
Baseline - Acc: 88.24%, ROC AUC: 0.9431, PR AUC: 0.9126
Learned  - Acc: 100.00%, ROC AUC: 1.0000, PR AUC: 1.0000
Wrote results.json (N=158)
