In [1]:
                                                                      
# Standard-library imports used throughout the notebook.
import os, random, math, warnings
# NOTE(review): this hides every warning for the rest of the session,
# including pandas/scikit-learn deprecation and chained-assignment warnings.
warnings.filterwarnings("ignore")

# One fixed seed shared by the stdlib and NumPy global random generators.
SEED = 42
random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up; setting it here
# is only inherited by child processes and does not change hashing in this kernel.
os.environ["PYTHONHASHSEED"] = str(SEED)

import numpy as np, pandas as pd
np.random.seed(SEED)

# Third-party building blocks: TF-IDF text features, min-max scaling, sparse matrices.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import minmax_scale
from scipy import sparse
from tqdm import tqdm

                                                                      
                       
         
                                                
                                
                                
                                     
                                   
                                  
    


In [2]:
                                                             
# Input locations: all three CSV files are expected inside DATA_DIR.
DATA_DIR = "./"
USERS_CSV = os.path.join(DATA_DIR, "user.csv")
PT_CSV    = os.path.join(DATA_DIR, "PT_data.cleaned.csv")
ENG_CSV   = os.path.join(DATA_DIR, "engagement_data.csv")

# Load the raw tables: users, trainers/physios (pt) and user-trainer interactions (eng).
users = pd.read_csv(USERS_CSV)
pt    = pd.read_csv(PT_CSV)
eng   = pd.read_csv(ENG_CSV)

# Echo the column names so the schema checks in the next cell can be cross-read.
print("user.csv columns:", list(users.columns))
print("PT_data.cleaned.csv columns:", list(pt.columns))
print("engagement_data.csv columns:", list(eng.columns))


user.csv columns: ['Unnamed: 0', 'user_id', 'age_group', 'health_goal', 'baseline_activity_level', 'engagement_persona']
PT_data.cleaned.csv columns: ['trainer_id', 'name', 'videos_count', 'total_likes', 'workout_recommendations', 'athlete_rating', 'years_experience', 'specialities']
engagement_data.csv columns: ['interaction_id', 'timestamp', 'user_id', 'trainer_id', 'video_completion_rate', 'adherence_score', 'feedback_score']


In [3]:
                                                                        

                                                             
# Fail fast if user.csv lacks a column the pipeline reads later.
REQUIRED_USER = ["user_id", "health_goal"]
for c in REQUIRED_USER:
    if c not in users.columns:
        raise ValueError(f"Missing '{c}' in user.csv")

# Trainer table: id, free-text specialities, rating and display name are all mandatory.
if "trainer_id" not in pt.columns:
    raise ValueError("Missing 'trainer_id' in PT_data.cleaned.csv")
if "specialities" not in pt.columns:
    raise ValueError("Missing 'specialities' in PT_data.cleaned.csv")
if "athlete_rating" not in pt.columns:
    raise ValueError("Missing 'athlete_rating' in PT_data.cleaned.csv")
if "name" not in pt.columns:
    raise ValueError("Missing 'name' (physio name) in PT_data.cleaned.csv")

# Engagement table: both join keys are mandatory; the timestamp is optional.
if "user_id" not in eng.columns or "trainer_id" not in eng.columns:
    raise ValueError("engagement_data.csv must have 'user_id' and 'trainer_id'")
if "timestamp" not in eng.columns:
    # Informational only: execution continues without a timestamp column.
    print("Note: 'timestamp' not found in engagement_data.csv; will use random/leave-one-out split fallback.")

                                                                   
                                     
# From here on the trainer key is called `physio_id` in both tables.
# NOTE(review): after this rename, re-running the validation cell above raises
# "Missing 'trainer_id'" because the original column name no longer exists.
pt = pt.rename(columns={"trainer_id": "physio_id"})
eng = eng.rename(columns={"trainer_id": "physio_id"})

# Column holding the display name that is shown in the final recommendation table.
physio_name_col = "name"

# Provider-quality metrics available for the weighted-sum model (WSM); absent columns are dropped.
PT_PROVIDER_FEATURES = [c for c in ["total_likes",
                                    "workout_recommendations",
                                    "athlete_rating",
                                    "years_experience"] if c in pt.columns]

print("Using physio name column:", physio_name_col)
print("Provider metric columns for WSM:", PT_PROVIDER_FEATURES)

# Force integer ids on every table so that joins and dictionary look-ups agree.
users["user_id"]  = users["user_id"].astype(int)
pt["physio_id"]   = pt["physio_id"].astype(int)
eng["user_id"]    = eng["user_id"].astype(int)
eng["physio_id"]  = eng["physio_id"].astype(int)

# Parse timestamps when present; values that cannot be parsed become NaT.
ts_col = "timestamp" if "timestamp" in eng.columns else None
if ts_col:
    eng[ts_col] = pd.to_datetime(eng[ts_col], errors="coerce")


Using physio name column: name
Provider metric columns for WSM: ['total_likes', 'workout_recommendations', 'athlete_rating', 'years_experience']


In [4]:
                                                              
                           
                                
                                    

def _norm_text(x):
    if pd.isna(x): 
        return ""
    return str(x).lower().strip()

# Normalised text used for matching: the user's goal vs. the physio's specialities.
users["_goal_text"] = users["health_goal"].apply(_norm_text)
pt["_spec_text"]    = pt["specialities"].apply(_norm_text)

# The vocabulary (unigrams + bigrams) is learned from the specialities only;
# user goals are projected into that vocabulary, so goal words that never
# occur in any speciality contribute nothing.
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=1)
X_spec = tfidf.fit_transform(pt["_spec_text"])
X_goal = tfidf.transform(users["_goal_text"])

# users x physios similarity. TfidfVectorizer L2-normalises rows by default,
# so this dot product is the cosine similarity.
cosine = X_goal @ X_spec.T
# Keep it sparse (CSR, float32) and clamp stored values into [0, 1].
cosine = cosine.tocsr().astype(np.float32)
cosine.data = np.clip(cosine.data, 0.0, 1.0)

                                                        
def token_set(s):
    """Return the distinct whitespace-separated tokens of the normalised text of ``s``."""
    return {tok for tok in _norm_text(s).split() if tok}

# Token sets for the Jaccard part of the match score.
user_sets = users["_goal_text"].apply(token_set).tolist()
pt_sets   = pt["_spec_text"].apply(token_set).tolist()

# Build the sparse users x physios Jaccard matrix as COO triplets.
# Both the cosine row and the token set of a user are functions of the goal
# text alone, so the (column, value) hits are computed once per distinct goal
# text and reused for every user sharing it, instead of being recomputed in
# pure Python for every single user.
goal_texts = users["_goal_text"].tolist()
jaccard_by_goal = {}
rows, cols, vals = [], [], []
for i, uset in enumerate(user_sets):
    if not uset:
        continue
    goal_key = goal_texts[i]
    if goal_key not in jaccard_by_goal:
        # Candidates: physios with non-zero cosine; if there are none, fall back to all physios.
        row = cosine.getrow(i)
        cand_idx = row.indices if row.nnz > 0 else np.arange(len(pt))
        hit_cols, hit_vals = [], []
        for j in cand_idx:
            pset = pt_sets[j]
            if not pset:
                continue
            # uset is non-empty here, so the union is never empty.
            jacc = len(uset & pset) / len(uset | pset)
            if jacc > 0:
                hit_cols.append(j)
                hit_vals.append(jacc)
        jaccard_by_goal[goal_key] = (hit_cols, hit_vals)
    hit_cols, hit_vals = jaccard_by_goal[goal_key]
    rows.extend([i] * len(hit_cols))
    cols.extend(hit_cols)
    vals.extend(hit_vals)

J = sparse.csr_matrix((vals, (rows, cols)), shape=(len(users), len(pt)), dtype=np.float32)

# Final content match: 70% TF-IDF cosine + 30% token Jaccard, clamped into [0, 1].
match_score = 0.7 * cosine + 0.3 * J
match_score.data = np.clip(match_score.data, 0.0, 1.0)


In [5]:
                                                                          
                                                                
                                                                       
# Relative importance of each provider metric in the weighted-sum model (WSM).
WSM_WEIGHTS = {
    "athlete_rating":         0.60,
    "total_likes":            0.15,
    "workout_recommendations":0.15,
    "years_experience":       0.10
}
# Keep only metrics that exist in the trainer table ...
WSM_WEIGHTS = {k:v for k,v in WSM_WEIGHTS.items() if k in PT_PROVIDER_FEATURES}
weight_sum = sum(WSM_WEIGHTS.values())
if weight_sum == 0:
    raise ValueError("No provider metric columns found for WSM reuse. Ensure at least one of: total_likes, workout_recommendations, athlete_rating, years_experience")

# ... and renormalise the surviving weights so they sum to 1.
for k in list(WSM_WEIGHTS):
    WSM_WEIGHTS[k] /= weight_sum

# Scratch copy so the intermediate ws_* / wsm_raw columns do not end up in `pt`.
pt_w = pt.copy()
                                            
def winsorize_minmax(s, p=5):
    """Clip a numeric Series to its [p, 100 - p] percentile band and rescale to [0, 1].

    Percentiles ignore NaN. When both percentile bounds coincide, a Series of
    zeros with the same index is returned instead of dividing by zero.
    """
    values = s.astype(float)
    lower, upper = np.nanpercentile(values, [p, 100 - p])
    clipped = values.clip(lower, upper)
    span = upper - lower
    if float(span) == 0:
        # Degenerate band: every value maps to 0.
        return pd.Series(np.zeros_like(clipped, dtype=float), index=clipped.index)
    return (clipped - lower) / span

# Winsorised, min-max scaled version of every provider metric.
for col in WSM_WEIGHTS:
    pt_w[f"ws_{col}"] = winsorize_minmax(pt_w[col])

# Weighted sum of the scaled metrics (NaN if any contributing metric is NaN).
pt_w["wsm_raw"] = 0.0
for col, w in WSM_WEIGHTS.items():
    pt_w["wsm_raw"] += w * pt_w[f"ws_{col}"]

# Fill gaps with the median raw score, then rescale the final score to [0, 1].
pt_w["wsm_score"] = minmax_scale(pt_w["wsm_raw"].fillna(pt_w["wsm_raw"].median()))
# `pt_w` is a row-for-row copy of `pt`, so the score is attached positionally.
# A merge on physio_id would multiply rows if an id were duplicated and would
# create wsm_score_x / wsm_score_y columns when this cell is executed twice.
pt = pt.assign(wsm_score=pt_w["wsm_score"].to_numpy())


In [6]:
                                           
# Share of the text match in the content score; the remainder goes to the WSM quality score.
ALPHA = 0.40

# Long-format table with one row per (user, physio) pair that has a non-zero match score.
U, P = match_score.shape
# Read the stored triplets directly from the COO form. This yields the same
# pairs as `match_score.nonzero()` without a second fancy-indexing pass and
# without relying on the np.matrix-only `.A1` attribute.
ms_coo = match_score.tocoo()
nz_mask = ms_coo.data != 0
usr_idx, phy_idx = ms_coo.row[nz_mask], ms_coo.col[nz_mask]
content_pairs = pd.DataFrame({"u_idx": usr_idx, "p_idx": phy_idx})
content_pairs["match_score"] = ms_coo.data[nz_mask]

# Matrix rows/columns are positions in `users` / `pt`, so ids are looked up
# positionally rather than by index label.
content_pairs["user_id"]  = users["user_id"].to_numpy()[usr_idx]
content_pairs["physio_id"]= pt["physio_id"].to_numpy()[phy_idx]

# Attach provider quality; physios without a score receive the median.
content_pairs = content_pairs.merge(pt[["physio_id","wsm_score"]], on="physio_id", how="left")
content_pairs["wsm_score"] = content_pairs["wsm_score"].fillna(content_pairs["wsm_score"].median())

content_pairs["content_score"] = ALPHA*content_pairs["match_score"] + (1-ALPHA)*content_pairs["wsm_score"]


In [7]:

                                                                                       
def split_last7_min2_fallback(eng_df, ts_col_name: str | None = "timestamp"):
    """
    Returns (train_eng, test_eng):
      • Test = each user's events in the last 7 days IF that yields ≥2 test events
      • Else Test = that user's single last interaction
      • Train = everything else
    Expects at least 'user_id' and a timestamp column (default 'timestamp').
    Optional: 'interaction_id' for deterministic ordering in ties.
    """
    import pandas as pd

    if eng_df is None or len(eng_df) == 0:
        return eng_df.copy(), eng_df.copy()

                                                                            
    if not ts_col_name or ts_col_name not in eng_df.columns:
        tmp = eng_df.copy()
        order_cols = ["user_id"]
        if "interaction_id" in tmp.columns:
            order_cols.append("interaction_id")
        tmp = tmp.sort_values(order_cols)
        test = tmp.groupby("user_id", as_index=False).tail(1)
        train = tmp.drop(test.index)
        return train.reset_index(drop=True), test.reset_index(drop=True)

                              
    ev = eng_df.copy()
    ev[ts_col_name] = pd.to_datetime(ev[ts_col_name], errors="coerce")
    order_cols = ["user_id", ts_col_name]
    if "interaction_id" in ev.columns:
        order_cols.append("interaction_id")
    ev = ev.sort_values(order_cols)

                                 
    last_ts = ev.groupby("user_id")[ts_col_name].max()
    cutoff  = (last_ts - pd.Timedelta(days=7)).rename("cutoff")
    ev = ev.merge(cutoff.reset_index(), on="user_id", how="left")
    ev["_in_last7"] = ev[ts_col_name] >= ev["cutoff"]

    win_counts = ev.groupby("user_id")["_in_last7"].sum()
    ok_users = win_counts[win_counts >= 2].index
    fb_users = win_counts[win_counts <  2].index

                                           
    last_idx_fb = (
        ev.loc[ev["user_id"].isin(fb_users)]
          .groupby("user_id", as_index=False)
          .tail(1)
          .index
    )

    test_mask  = (ev["_in_last7"] & ev["user_id"].isin(ok_users)) | (ev.index.isin(last_idx_fb))
    train_mask = ~test_mask

    train = ev.loc[train_mask].drop(columns=["cutoff","_in_last7"]).copy()
    test  = ev.loc[test_mask ].drop(columns=["cutoff","_in_last7"]).copy()
    return train.reset_index(drop=True), test.reset_index(drop=True)


In [8]:
                                                                                    
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.decomposition import NMF

                                                                                                      
# Implicit-feedback strength: weighted sum of the engagement signals present in `eng`.
signal_weights = {
    "video_completion_rate": 1.0,
    "adherence_score":       5.0,
    "feedback_score":        4.0,
}
sig_cols_present = [c for c in eng.columns if c in signal_weights]

if len(sig_cols_present) == 0:
    # No signal columns at all: every interaction counts equally.
    eng["_strength"] = 1.0
else:
    e = np.zeros(len(eng), dtype=float)
    for c in sig_cols_present:
        s = pd.to_numeric(eng[c], errors="coerce")
        # Min-max scale each signal to [0, 1]; a constant or all-missing column contributes 0.
        # NOTE(review): min/max are taken over all of `eng`, i.e. before the train/test split.
        if s.notna().any() and s.max() != s.min():
            s = (s - s.min()) / (s.max() - s.min())
        else:
            s = pd.Series(0.0, index=eng.index)
        e += signal_weights[c] * s.fillna(0.0).to_numpy()
    eng["_strength"] = e

# Exponential recency decay: an interaction HALF_LIFE_DAYS older than the newest one counts half.
HALF_LIFE_DAYS = 30.0
ts_col = "timestamp" if "timestamp" in eng.columns else None
if ts_col:
    eng[ts_col] = pd.to_datetime(eng[ts_col], errors="coerce")
    # Age is measured against the newest timestamp in the whole table.
    max_ts = eng[ts_col].max()
    age_days = (max_ts - eng[ts_col]).dt.total_seconds().div(86400.0)
    decay = np.exp(-np.log(2.0) * age_days / HALF_LIFE_DAYS)
    # Rows without a valid timestamp keep their undecayed strength.
    eng["_strength"] = eng["_strength"] * decay.fillna(1.0)

                                                            
def build_split_from_eng(eng_df, ts_col_name):
    """Split interactions into (train, test) using the first applicable rule.

    1. If an 'is_test' column exists, it decides (truthy rows go to test).
    2. Else, if ``ts_col_name`` is given, each user's latest row by that
       column is held out.
    3. Else one row per user is sampled with a fixed seed (42).

    Both returned frames are copies that keep the original index labels.
    """
    if "is_test" in eng_df.columns:
        # Coerce to bool first: `~` on a 0/1 integer column is a bitwise NOT
        # (-1/-2) and would not work as a row mask.
        is_test = eng_df["is_test"].astype(bool)
        return eng_df.loc[~is_test].copy(), eng_df.loc[is_test].copy()
    if ts_col_name:
        tmp = eng_df.sort_values(["user_id", ts_col_name])
        test = tmp.groupby("user_id", as_index=False).tail(1)
        train = tmp.drop(test.index)
        return train.copy(), test.copy()
    # Random leave-one-out; every group is sampled with the same fixed seed.
    test_idx = eng_df.groupby("user_id").apply(lambda g: g.sample(1, random_state=42).index[0]).values
    test = eng_df.loc[test_idx].copy()
    train = eng_df.drop(test_idx).copy()
    return train, test
# Hold out each user's most recent events; everything else is training data.
train_eng, test_eng = split_last7_min2_fallback(eng, (ts_col or "timestamp"))
# One summed strength per (user, physio) training pair.
train_ui = train_eng.groupby(["user_id","physio_id"], as_index=False)["_strength"].sum()
# Matrix row/column order follows the row order of `users` and `pt`.
all_user_ids  = users["user_id"].astype(int).tolist()
all_item_ids  = pt["physio_id"].astype(int).tolist()
u2i = {u:i for i,u in enumerate(all_user_ids)}
p2j = {p:j for j,p in enumerate(all_item_ids)}

# Translate ids to matrix positions. Ids that are missing from `users` / `pt`
# map to NaN and would break the sparse-matrix construction, so those pairs
# are left out (with a warning) instead.
row_pos = train_ui["user_id"].map(u2i)
col_pos = train_ui["physio_id"].map(p2j)
known = row_pos.notna() & col_pos.notna()
if not known.all():
    print(f"[warn] Ignoring {int((~known).sum())} training pairs whose user_id/physio_id is absent from the user/physio tables")
rows = row_pos[known].astype(int).to_numpy()
cols = col_pos[known].astype(int).to_numpy()
vals = train_ui.loc[known, "_strength"].astype(float).to_numpy()
U, P = len(all_user_ids), len(all_item_ids)
R = sparse.csr_matrix((vals, (rows, cols)), shape=(U, P), dtype=float)

# Non-negative matrix factorisation of the interaction-strength matrix: R ≈ W @ H.
nmf = NMF(
    n_components=64,
    init="nndsvd",
    random_state=42,
    max_iter=300,
    alpha_W=0.01, alpha_H=0.01, l1_ratio=0.0
)
W = nmf.fit_transform(R)
H = nmf.components_

# Dense reconstructed affinities for every (user, physio) pair.
# NOTE(review): this materialises a U x P dense array and, below, a U*P-row long table.
cf_raw = W @ H
# Min-max scale each user's row to [0, 1]; constant rows become all zeros.
cf_min = cf_raw.min(axis=1, keepdims=True)
cf_max = cf_raw.max(axis=1, keepdims=True)
denom  = np.where((cf_max - cf_min) == 0, 1.0, (cf_max - cf_min))
cf_scaled = (cf_raw - cf_min) / denom

# Long format (user_id, physio_id, cf_score) for joining onto the content candidates.
cf_df = pd.DataFrame(cf_scaled, columns=all_item_ids, index=all_user_ids)
cf_df.index.name = "user_id"
cf_df = cf_df.reset_index()
cf_long = cf_df.melt(id_vars=["user_id"], var_name="physio_id", value_name="cf_score")
cf_long["physio_id"] = cf_long["physio_id"].astype(int)


In [9]:
                                                                                                   
# Hybrid blend weights: content-based score vs. collaborative-filtering score.
W_CONTENT = 0.35
W_CF      = 0.65

# Candidates are the content pairs; pairs without a CF score get 0.
cand = content_pairs.merge(cf_long, on=["user_id","physio_id"], how="left")
cand["cf_score"] = cand["cf_score"].fillna(0.0)
cand["score"] = W_CONTENT*cand["content_score"] + W_CF*cand["cf_score"]

# (user, physio) pairs already present in the training interactions.
had_eng = set(map(tuple, train_eng[["user_id","physio_id"]]
                  .drop_duplicates()
                  .itertuples(index=False, name=None)))
# A candidate is "cold" for a user when that pair never occurs in training.
# This is done as a left join against the distinct training pairs rather than
# a row-by-row `apply(tuple, axis=1)`, which runs Python code once per candidate.
seen_pairs = (train_eng[["user_id","physio_id"]]
              .drop_duplicates()
              .assign(_seen_in_train=True))
cand = cand.merge(seen_pairs, on=["user_id","physio_id"], how="left")
cand["is_cold_for_user"] = cand.pop("_seen_in_train").isna()


# Tiny seeded noise, used only as the last sort key to break exact ties reproducibly.
tie_noise = np.random.RandomState(SEED).rand(len(cand)) * 1e-9
cand["tie_noise"] = tie_noise

# Years of experience as a secondary tie-breaker (0 when the column is missing).
if "years_experience" in pt.columns:
    cand = cand.merge(
        pt[["physio_id","years_experience"]].rename(columns={"years_experience":"years_experience_tb"}),
        on="physio_id",
        how="left"
    )
else:
    cand["years_experience_tb"] = 0.0

def top5_with_one_cold(df_u):
    """Pick up to five recommendations for one user, at most one of them "cold".

    Parameters
    ----------
    df_u : DataFrame
        Candidate rows of a single user with the columns 'score',
        'content_score', 'wsm_score', 'years_experience_tb', 'tie_noise' and
        the boolean 'is_cold_for_user'.

    Returns
    -------
    DataFrame
        Selected rows ordered by descending 'score' (ties broken by
        'wsm_score', 'years_experience_tb', 'tie_noise').

    Selection rule
    --------------
    * If at least one cold candidate exists: the 4 best non-cold rows by
      'score' plus the single best cold row by 'content_score'.
    * If none exists: the 5 best non-cold rows. Previously the top row was
      appended a second time as a stand-in cold item, which produced a
      duplicated recommendation.

    Fewer than five rows are returned when there are not enough non-cold
    candidates; extra cold rows are never used as filler.
    """
    rank_keys = ["score", "wsm_score", "years_experience_tb", "tie_noise"]
    cold_keys = ["content_score", "wsm_score", "years_experience_tb", "tie_noise"]
    descending = [False, False, False, False]

    df_u = df_u.sort_values(by=rank_keys, ascending=descending).copy()

    cold_mask = df_u["is_cold_for_user"].astype(bool)
    non_cold = df_u[~cold_mask]
    cold = df_u[cold_mask]

    if len(cold) > 0:
        # Reserve one slot for the cold item with the best content score.
        best_cold = cold.sort_values(by=cold_keys, ascending=descending).head(1)
        outu = pd.concat([non_cold.head(4), best_cold], axis=0)
    else:
        # No cold candidate: all five slots go to previously seen physios.
        outu = non_cold.head(5)

    # At most one cold row is present by construction, so only the final ordering remains.
    return outu.sort_values(by=rank_keys, ascending=descending).head(5)

# Run the per-user selection and stack the results.
top_rows = []
for uid, g in cand.groupby("user_id"):
    top_rows.append(top5_with_one_cold(g))
top = pd.concat(top_rows, axis=0).copy()

# Optional trainer attributes to show next to each recommendation (only those present in `pt`).
extras_available = [c for c in ["years_experience","total_likes","videos_count","workout_recommendations"] if c in pt.columns]

# Enrich with the user's goal and the physio's descriptive columns.
merge_cols = ["physio_id", physio_name_col, "specialities", "athlete_rating"] + extras_available
out = top.merge(users[["user_id","health_goal"]], on="user_id", how="left")\
         .merge(pt[merge_cols], on="physio_id", how="left")

# Presentation column order.
# NOTE(review): 'physio_id' is not part of this list, so the exported table
# identifies physios by name only.
ordered_cols = ["user_id","health_goal", physio_name_col, "specialities","athlete_rating"] + extras_available + ["score"]
out = out[ordered_cols].copy()

# Rank 1..n within each user by descending score; equal scores keep their row order.
out["rank"] = out.groupby("user_id")["score"].rank(method="first", ascending=False).astype(int)
out["score"] = out["score"].round(3)

# Friendlier header for the name column.
out = out.rename(columns={physio_name_col: "physio name"})

# Final ordering and preview.
out = out.sort_values(by=["user_id","rank"])
print(out.head(10))

# Persist the recommendations.
out.to_csv("hybrid_top5_per_user.csv", index=False)


   user_id                 health_goal  physio name  \
0        1      Flexibility & Mobility  Trainer 593   
1        1      Flexibility & Mobility  Trainer 838   
2        1      Flexibility & Mobility  Trainer 657   
3        1      Flexibility & Mobility  Trainer 339   
4        1      Flexibility & Mobility   Trainer 92   
5        2  General Fitness & Wellness    Trainer 7   
6        2  General Fitness & Wellness  Trainer 416   
7        2  General Fitness & Wellness  Trainer 174   
8        2  General Fitness & Wellness    Trainer 1   
9        2  General Fitness & Wellness  Trainer 635   

                                        specialities  athlete_rating  \
0                                        Flexibility             4.7   
1                                        Flexibility             4.7   
2                                        Flexibility             4.8   
3                                        Flexibility             4.0   
4                                 

In [10]:
                                                                         
                                                                                
# Popularity baseline: total training interaction strength per physio.
# The strengths are summed straight from `train_ui`; joining `pt` before the
# aggregation added nothing and would inflate the sums if an id were duplicated in `pt`.
pop = train_ui.groupby("physio_id")["_strength"].sum().reset_index()
# The quality score is attached only as a secondary sort key.
pop = pop.merge(pt[["physio_id","wsm_score"]], on="physio_id", how="left")
pop = pop.sort_values(by=["_strength","wsm_score"], ascending=[False,False])

def top5_baseline_for_user(uid):
    """Popularity baseline: the five most popular physios among the user's candidates.

    Candidates are the physios with a non-zero ``match_score`` for the user,
    or every physio when the user's row is empty. Popularity order comes from
    the pre-sorted ``pop`` table.

    Raises
    ------
    IndexError
        If ``uid`` does not occur in ``users``.
    """
    # Rows/columns of `match_score` are positions in `users` / `pt`, so both
    # look-ups are positional instead of relying on the frames' index labels.
    i = np.flatnonzero(users["user_id"].to_numpy() == uid)[0]
    row = match_score.getrow(i)
    physio_ids = pt["physio_id"].to_numpy()
    cand_ids = physio_ids[row.indices].tolist() if row.nnz > 0 else physio_ids.tolist()
    sub = pop[pop["physio_id"].isin(cand_ids)]
    return sub.head(5)["physio_id"].tolist()

                                                                                          
def top5_wsm_for_user(uid):
    """Quality-only baseline: ids of the user's five content candidates with the highest ``wsm_score``."""
    user_pairs = content_pairs.loc[content_pairs["user_id"] == uid]
    best_first = user_pairs.sort_values(by="wsm_score", ascending=False)
    return best_first["physio_id"].head(5).tolist()

                                                   
# NOTE(review): `hybrid_top` looks like an unfinished placeholder: the transform
# writes the constant 0 into 'physio_id_list' for every row, and the frame is
# not read again in the visible cells.
hybrid_top = out[["user_id"]].copy()
hybrid_top["physio_id_list"] = out.groupby("user_id")["physio name"].transform(lambda s: 0)
# user_id -> list of recommended physio_ids, best score first; this is what the evaluation uses.
hybrid_map = top.sort_values(["user_id","score"], ascending=[True,False]).groupby("user_id")["physio_id"].apply(list).to_dict()


In [11]:
                                                                                                        
                                                  

import json, re, nbformat, math
import numpy as np
import pandas as pd

                                                                               
# Notebooks that are scanned for the metric names to report.
# The defaults are the original author's local paths; set the environment
# variables WSM_NOTEBOOK_PATH / BASELINE_NOTEBOOK_PATH to point elsewhere
# without editing this cell.
WSM_NOTEBOOK = os.environ.get(
    "WSM_NOTEBOOK_PATH",
    "C:/Users/DELL/OneDrive/Desktop/Code_files/WSM_Model.ipynb",
)
BASE_NOTEBOOK = os.environ.get(
    "BASELINE_NOTEBOOK_PATH",
    "C:/Users/DELL/OneDrive/Desktop/Code_files/Baseline_models_cleaned.ipynb",
)

# Regular expressions (applied case-insensitively) for metric mentions.
# "@k" metrics capture k; the remaining patterns have no capture group.
METRIC_NAME_PATTERNS = [
    r"Precision@ ?(\d+)", r"Recall@ ?(\d+)", r"nDCG@ ?(\d+)", r"MRR@ ?(\d+)", r"MAP@ ?(\d+)",
    r"HitRate@ ?(\d+)", r"HR@ ?(\d+)", r"AUC", r"ROC[- ]?AUC", r"RMSE", r"MAE"
]

def extract_metric_names(ipynb_path):
    """Scan a notebook for metric mentions and return their canonical names, sorted.

    Every code and markdown cell is searched with each regex in
    ``METRIC_NAME_PATTERNS`` (case-insensitive). "@k" metrics are reported as
    e.g. ``"Recall@10"``; ``HR@k`` is reported as ``HitRate@k`` and any AUC
    variant as ``"AUC"``. If anything goes wrong while reading or scanning,
    a warning is printed and an empty list is returned.
    """
    # (text looked for inside the regex, capture must be non-empty?, label builder).
    # The first matching rule wins, in this order.
    label_rules = (
        ("Precision", False, lambda m: f"Precision@{m}"),
        ("Recall",    False, lambda m: f"Recall@{m}"),
        ("nDCG",      False, lambda m: f"nDCG@{m}"),
        ("MRR",       True,  lambda m: f"MRR@{m}"),
        ("MAP",       True,  lambda m: f"MAP@{m}"),
        ("HitRate",   True,  lambda m: f"HitRate@{m}"),
        ("HR@",       True,  lambda m: f"HitRate@{m}"),
        ("AUC",       False, lambda m: "AUC"),
        ("RMSE",      False, lambda m: "RMSE"),
        ("MAE",       False, lambda m: "MAE"),
    )
    found = set()
    try:
        notebook = nbformat.read(ipynb_path, as_version=4)
        for cell in notebook.cells:
            if cell.cell_type in ("code", "markdown"):
                text = cell.source
                for pattern in METRIC_NAME_PATTERNS:
                    for captured in re.findall(pattern, text, flags=re.IGNORECASE):
                        for tag, needs_capture, make_label in label_rules:
                            if tag in pattern and (captured or not needs_capture):
                                found.add(make_label(captured))
                                break
        return sorted(found)
    except Exception as e:
        print(f"[warn] Could not read {ipynb_path}: {e}")
        return []

# Union of the metric names mentioned in the two reference notebooks.
metric_names = set()
for p in [WSM_NOTEBOOK, BASE_NOTEBOOK]:
    metric_names.update(extract_metric_names(p))

# Neither notebook was readable or mentioned a metric: use a default @5 set.
if not metric_names:
    metric_names = {"Precision@5","Recall@5","nDCG@5","MRR@5","MAP@5","HitRate@5"}

# From here on `metric_names` is a sorted list rather than a set.
metric_names = sorted(metric_names)
print("Detected/evaluated metrics:", metric_names)

                                                                 
                                                           
# Ground truth: user_id -> set of physio_ids the user interacted with in the held-out events.
test_truth = test_eng.groupby("user_id")["physio_id"].apply(set).to_dict()

                                      
def topk_hybrid(u, K):
    """Return the first ``K`` hybrid recommendations for user ``u`` (empty list if there are none)."""
    ranked = hybrid_map.get(u, [])
    if not ranked:
        ranked = []
    return ranked[:K]

# Evaluation population: every user in the user table, including users with
# no held-out events (their relevant set is empty).
USERS = users["user_id"].tolist()

                                                               
def precision_at_k(recommended, relevant, k):
    """Fraction of the k slots filled with relevant items; 0.0 for an empty recommendation list.

    The denominator is always ``k``, even when fewer than ``k`` items were recommended.
    """
    if not recommended:
        return 0.0
    n_hits = len([item for item in recommended[:k] if item in relevant])
    return n_hits / k

def recall_at_k(recommended, relevant, k):
    """Share of the relevant items that appear in the top k; 0.0 when nothing is relevant."""
    if not relevant:
        return 0.0
    n_hits = len([item for item in recommended[:k] if item in relevant])
    return n_hits / len(relevant)

def dcg_at_k(recommended, relevant, k):
    """Discounted cumulative gain over the top k with binary relevance.

    A hit at 1-based position ``i`` contributes ``1 / log2(i + 1)``.
    """
    gains = (
        1.0 / math.log2(position + 1)
        for position, item in enumerate(recommended[:k], start=1)
        if item in relevant
    )
    return sum(gains, 0.0)

def ndcg_at_k(recommended, relevant, k):
    """DCG@k divided by the ideal DCG, i.e. the DCG of ``min(k, len(relevant))`` leading hits.

    Returns 0.0 when there are no relevant items (or ``k`` is 0).
    """
    n_ideal_hits = min(k, len(relevant))
    if n_ideal_hits == 0:
        return 0.0
    ideal_dcg = 0
    for position in range(1, n_ideal_hits + 1):
        ideal_dcg += 1.0 / math.log2(position + 1)
    if ideal_dcg > 0:
        return dcg_at_k(recommended, relevant, k) / ideal_dcg
    return 0.0

def mrr_at_k(recommended, relevant, k):
    """Reciprocal of the 1-based rank of the first relevant item in the top k, or 0.0 if there is none."""
    reciprocal_ranks = (
        1.0 / position
        for position, item in enumerate(recommended[:k], start=1)
        if item in relevant
    )
    return next(reciprocal_ranks, 0.0)

def ap_at_k(recommended, relevant, k):
    """Average Precision@k: the precision at each hit position is summed and
    divided by ``min(k, len(relevant))``; 0.0 when nothing is relevant."""
    if not relevant:
        return 0.0
    precision_sum = 0.0
    hits_so_far = 0
    for position, item in enumerate(recommended[:k], start=1):
        if item not in relevant:
            continue
        hits_so_far += 1
        precision_sum += hits_so_far / position
    return precision_sum / min(k, len(relevant))

def hitrate_at_k(recommended, relevant, k):
    """Return 1.0 when at least one of the top-k items is relevant, otherwise 0.0."""
    for item in recommended[:k]:
        if item in relevant:
            return 1.0
    return 0.0

                                            
# Metric names that cannot be computed from ranked lists.
# NOTE(review): not referenced in the visible cells; the driver loop below
# uses its own inline list of tags.
UNSUPPORTED = {"AUC","ROC-AUC","RMSE","MAE"}

def compute_metric_for_system(name, topk_fn, K):
    """Average a ranking metric over every user in ``USERS``.

    ``name`` selects the metric by prefix ("Precision@", "Recall@", "nDCG@",
    "MRR@", "MAP@", "HitRate@" / "HR@"). ``topk_fn(user, K)`` supplies each
    user's ranked list. Users absent from ``test_truth`` are scored against an
    empty relevant set. Returns ``None`` for an unrecognised metric name or
    when there are no users.
    """
    dispatch = (
        (("Precision@",), precision_at_k),
        (("Recall@",), recall_at_k),
        (("nDCG@",), ndcg_at_k),
        (("MRR@",), mrr_at_k),
        (("MAP@",), ap_at_k),
        (("HitRate@", "HR@"), hitrate_at_k),
    )
    scorer = next((fn for prefixes, fn in dispatch if name.startswith(prefixes)), None)

    per_user = []
    for user in USERS:
        relevant = test_truth.get(user, set())
        recommended = topk_fn(user, K)
        if scorer is None:
            # Unknown metric name.
            return None
        per_user.append(scorer(recommended, relevant, K))
    return float(np.mean(per_user)) if per_user else None

                                                                               
def parse_k(name):
    """Extract the cut-off ``k`` from a name such as "Recall@10"; default to 5 when there is no "@<digits>"."""
    found = re.search(r"@(\d+)", name)
    if found is None:
        return 5
    return int(found.group(1))

# Evaluate every detected ranking metric for the hybrid recommender.
rows = []
skipped = []
# Longest list the recommender produced. A metric with a larger K is computed
# on shorter lists, which lowers Precision@K and nDCG@K by construction.
max_list_len = max((len(recs) for recs in hybrid_map.values()), default=0)
beyond_list_len = []
for mname in metric_names:
    # Score/label based metrics cannot be derived from ranked lists.
    if any(tag in mname.upper() for tag in ["AUC","RMSE","MAE"]):
        skipped.append(mname)
        continue
    K = parse_k(mname)
    if K > max_list_len:
        beyond_list_len.append(mname)
    rows.append({
        "Metric": mname,
        "Hybrid": compute_metric_for_system(mname, topk_hybrid, K),
    })

# Explicit columns keep `set_index` working even when every metric was skipped.
metrics_table = pd.DataFrame(rows, columns=["Metric", "Hybrid"]).set_index("Metric").sort_index()
display(metrics_table)

if skipped:
    print("[info] Skipped metrics that require explicit labels/scores:", skipped)
if beyond_list_len:
    print(f"[warn] K exceeds the longest recommendation list ({max_list_len}) for:", beyond_list_len)

# Persist the metric table.
metrics_table.to_csv("hybrid_metrics_using_WSM_Baseline_metric_names.csv")


Detected/evaluated metrics: ['Precision@10', 'Precision@5', 'Recall@10', 'Recall@5', 'nDCG@10', 'nDCG@5']


Unnamed: 0_level_0,Hybrid
Metric,Unnamed: 1_level_1
Precision@10,0.007667
Precision@5,0.015335
Recall@10,0.019247
Recall@5,0.019247
nDCG@10,0.018062
nDCG@5,0.018902


In [12]:
# Rich preview of the final recommendation table (first five rows).
out.head()


Unnamed: 0,user_id,health_goal,physio name,specialities,athlete_rating,years_experience,total_likes,videos_count,workout_recommendations,score,rank
0,1,Flexibility & Mobility,Trainer 593,Flexibility,4.7,19,199143,317,191,0.301,1
1,1,Flexibility & Mobility,Trainer 838,Flexibility,4.7,22,78568,453,192,0.285,2
2,1,Flexibility & Mobility,Trainer 657,Flexibility,4.8,11,11264,28,67,0.264,3
3,1,Flexibility & Mobility,Trainer 339,Flexibility,4.0,22,247027,457,64,0.247,4
4,1,Flexibility & Mobility,Trainer 92,Flexibility,4.0,15,295069,420,97,0.238,5


In [13]:
                                                                    
# Score breakdown for one user: the ten best candidates with every component of the hybrid score.
uid = 1
diag_cols = ["user_id","physio_id","score","content_score","match_score","wsm_score","cf_score","is_cold_for_user","years_experience_tb"]
(
    cand[cand["user_id"]==uid]
      # add the display name, then show it as the first column
      .merge(pt[["physio_id", physio_name_col]], on="physio_id", how="left")
      .rename(columns={physio_name_col: "physio name"})
      .loc[:, ["physio name"] + diag_cols]
      .sort_values("score", ascending=False)
      .head(10)
)


Unnamed: 0,physio name,user_id,physio_id,score,content_score,match_score,wsm_score,cf_score,is_cold_for_user,years_experience_tb
76,Trainer 593,1,593,0.301267,0.834737,0.8,0.857895,0.014013,False,19
104,Trainer 838,1,838,0.284888,0.810129,0.8,0.816882,0.002066,True,22
13,Trainer 106,1,106,0.280821,0.629806,0.286634,0.858586,0.092906,True,21
96,Trainer 771,1,771,0.278208,0.762426,0.8,0.737377,0.017475,True,19
32,Trainer 298,1,298,0.274149,0.737496,0.8,0.695827,0.024655,True,14
26,Trainer 256,1,256,0.27184,0.757366,0.8,0.728944,0.010404,True,16
85,Trainer 657,1,657,0.263919,0.711624,0.8,0.652707,0.022847,False,11
12,Trainer 105,1,105,0.26313,0.643538,0.298187,0.873771,0.058295,True,22
109,Trainer 873,1,873,0.259819,0.738551,0.8,0.697584,0.002041,True,5
34,Trainer 305,1,305,0.258701,0.682555,0.3648,0.894391,0.030472,True,11


In [14]:
                                                                           
# How many distinct users ended up on each side of the split.
n_test_users = test_eng["user_id"].nunique()
n_train_users = train_eng["user_id"].nunique()
print("users in test:", n_test_users, " | users in train:", n_train_users)

# Every user with at least one interaction must own at least one held-out event.
# (Checking group sizes of `test_eng` alone can never fail, because a user
# without test rows simply has no group.)
users_without_test = set(eng["user_id"]) - set(test_eng["user_id"])
assert not users_without_test, f"{len(users_without_test)} users with interactions have no test event"


users in test: 40000  | users in train: 40000


In [15]:
                                                                   
# How much of the test set is "new physio for this user" vs. a repeat of a training pair.
train_pairs = set(map(tuple, train_eng[["user_id","physio_id"]].itertuples(index=False, name=None)))
test_pairs  = list(test_eng[["user_id","physio_id"]].itertuples(index=False, name=None))

# One flag per test interaction: True when the (user, physio) pair never occurs in training.
is_new = np.array([pair not in train_pairs for pair in test_pairs])
print(f"Test interactions that are FIRST-EVER with that physio for the user: {is_new.sum():,} / {len(is_new):,} "
      f"({is_new.mean():.2%})")
print(f"…and repeats of a physio seen in train: {(~is_new).sum():,} ({(~is_new).mean():.2%})")


Test interactions that are FIRST-EVER with that physio for the user: 120,563 / 160,324 (75.20%)
…and repeats of a physio seen in train: 39,761 (24.80%)
