In [21]:
# Flag set BEFORE the %run below. Because of `-i`, the other notebook executes in
# this kernel's namespace and can see it. Presumably its heavy cells check
# LIGHT_IMPORT and skip themselves when it is True — TODO confirm in that notebook.
LIGHT_IMPORT = True

# If the file is in the current folder:
# `%run -i` runs the notebook inside the current interactive namespace, so whatever it
# defines (e.g. `df` and `get_matches`, checked below) becomes a global here.
%run -i "Calc_Embeddings_Model Training.ipynb"

# quick sanity checks
# Both names are expected to come from the notebook executed above; later cells rely on them.
print("df shape:", df.shape if 'df' in globals() else 'df missing')
print("have get_matches?", 'get_matches' in globals())

shape: (59946, 31)


You should consider upgrading via the 'C:\Users\veera\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


QUERY user 1234 | age 31 | m straight | palo alto, california
------------------------------------------------------------------------------------------
hello! thanks for stopping by! i hope you enjoy yourself!  i am extremely financially responsible and nice! need a new kidney? i'll give you mine! if the doctor won't let us do that, i'll just buy you a new one! no expense is too much for you!  i definitely have skinny genes! rea...

=== Matches ===

user 44713 | age 23 | m straight | hayward, california | score=0.594
hey my name is jacob i'm a die hard raider! lol i work as a massage therapist at chiropractic offices and my personal clients i do some personal tranning generally weight loss, nutrition, and hypertrophy. i'm currently attending ohlone college working on a physical therapy assist...
------------------------------------------------------------------------------------------
user 10612 | age 28 | m gay | san francisco, california | score=0.564
high energy goofy nerd-jock-goo

In [None]:
"""
If any heavy cells in Main Calc Embeddings are wrapped in a guard, those heavy steps
(e.g., geocoding) will be skipped during this import.
"""

In [22]:
import numpy as np
import pandas as pd

In [48]:
def build_pool(user_id, pool_k=800, max_km=50):
    """
    Make a candidate pool for supervised training:
      - orientation/location/age = hard gates
      - lifestyle = soft (we'll learn its weight)
      - adds essay length as a simple completeness signal

    Parameters
    ----------
    user_id : id of the anchor (query) user; must be present in the global `df`.
    pool_k  : maximum number of candidates requested from `get_matches`
              (capped at len(df) - 1).
    max_km  : radius handed to `get_matches` / `compute_component_scores`; also the
              reference distance for `dist_penalty`.

    Returns
    -------
    A new DataFrame (a copy, safe to mutate) with the columns
    user_id, age, sex, orientation, location, text_sim, age_score, age_diff,
    lifestyle_score, loc_score, dist_penalty, distance_km, essay_len.
    `age_diff` is an integer column when every candidate age is known and a float
    column (with NaN) otherwise.

    Raises
    ------
    IndexError : `user_id` is not present in `df`.
    KeyError   : the candidates carry neither a `text_sim` nor a `score` column.
    """
    pool_k = min(pool_k, len(df) - 1)

    # loose pool: let lifestyle through (we'll score it softly)
    cands = get_matches(user_id=user_id, k=pool_k, use_orientation=True, use_location=True, max_km=max_km,use_age=True,
                        use_diet=False, use_drinks=False, use_smokes=False, use_drugs=False, allow_missing=True).copy()

    # add component scores + distance
    cands = compute_component_scores(user_id=user_id, cands=cands, max_km=max_km,
        drinks_strict=False, smokes_strict=True, drugs_strict=True,allow_missing=True)

    # ensure text_sim present (fall back to the retrieval score when it is the only one available)
    if "text_sim" not in cands.columns and "score" in cands.columns:
        cands["text_sim"] = cands["score"]
    cands["text_sim"] = cands["text_sim"].fillna(0)

    # Extra simple features
    anchor_age = df.loc[df["user_id"] == user_id, "age"]
    if anchor_age.empty:
        # same exception type the bare .iloc[0] used to raise, but with a usable message
        raise IndexError(f"user_id {user_id!r} not found in df")
    q_age = int(anchor_age.iloc[0])

    cand_age = cands["age"].astype(float)
    cands["age"] = cand_age
    # Work in float so a missing candidate age (possible with allow_missing=True) yields
    # NaN instead of crashing the integer cast; np.trunc mirrors the old astype(int).
    age_gap = (np.trunc(cand_age) - q_age).abs()
    cands["age_diff"] = age_gap.astype(int) if np.isfinite(age_gap).all() else age_gap

    # Essay length per candidate. De-duplicating user_id keeps the lookup 1:1 with the
    # candidates; reindex() keeps candidate order and gives length 0 for unknown ids.
    bio_by_user = df.drop_duplicates(subset="user_id").set_index("user_id")["bio_text"]
    bio = bio_by_user.reindex(cands["user_id"]).fillna("").astype(str)
    cands["essay_len"] = bio.str.len().to_numpy()

    # kilometres beyond the allowed radius (0 inside the radius)
    cands["dist_penalty"] = np.maximum(0.0, cands["distance_km"] - max_km)

    # .copy() so callers can add columns without SettingWithCopyWarning
    return cands[["user_id","age","sex","orientation","location","text_sim","age_score","age_diff",
                  "lifestyle_score","loc_score","dist_penalty","distance_km","essay_len"]].copy()

In [49]:
# Smoke test: build the candidate pool for one anchor user and peek at the first rows.
pool = build_pool(
    user_id=1234,
    pool_k=800,
    max_km=50,
)
pool.head()

Start pool: 5000 (max_candidates=5000)
After orientation: 2023
Unfiltered: 2023 | After location filter (≤50 km): 1717
After location (≤50 km): 1717
After age rule: 1342


Unnamed: 0,user_id,age,sex,orientation,location,text_sim,age_score,age_diff,lifestyle_score,loc_score,dist_penalty,distance_km,essay_len
13,57256,33.0,f,straight,"oakland, california",0.544072,1.0,2,1.0,1.0,0.0,41.230915,2505
16,53843,47.0,f,straight,"oakland, california",0.533671,1.0,16,1.0,1.0,0.0,41.230915,2425
18,4015,38.0,f,straight,"san francisco, california",0.531963,1.0,7,1.0,1.0,0.0,43.696864,4344
19,30039,24.0,f,straight,"san francisco, california",0.531841,1.0,7,1.0,1.0,0.0,43.696864,1958
23,52460,24.0,f,straight,"san leandro, california",0.525988,1.0,7,0.75,1.0,0.0,31.203141,1706


In [38]:
# Make heuristic labels for "mutual-like"

# 1) Define what counts as a "positive" (tweak thresholds as you like)
# Cutoffs a candidate row must satisfy (all of them) to be labelled a positive.
POS_THRESH = dict(
    text_sim_min=0.55,      # strong textual affinity
    lifestyle_min=0.50,     # lifestyle fairly aligned
    age_score_min=1.0,      # mutual half+7 satisfied
    dist_penalty_max=0.0,   # no kilometres beyond max_km
)

def is_positive(row, thr=POS_THRESH):
    """
    Heuristic positive test for one candidate row.

    A row is positive only when every rule holds: text_sim, lifestyle_score and
    age_score are at least their minimum, and dist_penalty is at most its maximum.
    A field absent from `row` is treated as 0. Rules are checked in order and the
    first failing comparison ends the evaluation.
    """
    at_least = lambda value, bound: value >= bound
    at_most = lambda value, bound: value <= bound
    rules = (
        ("text_sim",        "text_sim_min",     at_least),
        ("lifestyle_score", "lifestyle_min",    at_least),
        ("age_score",       "age_score_min",    at_least),
        ("dist_penalty",    "dist_penalty_max", at_most),
    )

    verdict = True
    for field, bound_key, passes in rules:
        verdict = passes(row.get(field, 0), thr[bound_key])
        if not verdict:
            break
    return verdict

def label_pool(pool_df, pos_thresh=POS_THRESH, neg_ratio=3, random_state=42):
    """
    Label one anchor user's candidate pool and rebalance the classes.

    Parameters
    ----------
    pool_df      : the pool you built earlier for one anchor user; must contain
                   text_sim, lifestyle_score, age_score and dist_penalty.
    pos_thresh   : which cutoffs to use (passed to `is_positive`).
    neg_ratio    : keep at most neg_ratio negatives per positive; a fractional value
                   is truncated to a whole number of rows.
    random_state : seed for reproducible sampling of which negatives to keep.

    Returns
    -------
    DataFrame with every positive plus the sampled negatives, in pool order, with a
    tidy 0..M-1 index and two extra columns:
      y            : 1 for positive, 0 for negative
      label_reason : "POS", or "NEG[...]" listing the thresholds that were missed
                     (and any features that were missing/NaN).
    The input frame is not modified.
    """
    # NB: deliberately not called `df`, which is the notebook-global profile table.
    work = pool_df.copy().reset_index(drop=True)

    # Run the is_positive test on every row to get a boolean mask,
    # then make the label column y: 1 for pos, 0 for neg.
    pos_mask = work.apply(is_positive, axis=1, thr=pos_thresh)
    work["y"] = pos_mask.astype(int)

    # add a simple explanation for auditing
    feature_cols = ("text_sim", "lifestyle_score", "age_score", "dist_penalty")

    def reason(r):
        if r.y == 1:
            return "POS"
        misses = []
        if r.text_sim        < pos_thresh["text_sim_min"]:    misses.append(f"text<{pos_thresh['text_sim_min']}")
        if r.lifestyle_score < pos_thresh["lifestyle_min"]:    misses.append(f"life<{pos_thresh['lifestyle_min']}")
        if r.age_score       < pos_thresh["age_score_min"]:    misses.append("age_rule=0")
        if r.dist_penalty    > pos_thresh["dist_penalty_max"]: misses.append("dist>R")
        # NaN compares False against every threshold, so such a row is a negative that
        # would otherwise be reported as an unexplained "NEG[]"; name the gaps instead.
        absent = [c for c in feature_cols if pd.isna(r[c])]
        if absent:
            misses.append("missing=" + "+".join(absent))
        return "NEG[" + ",".join(misses) + "]"
    work["label_reason"] = work.apply(reason, axis=1)

    # balance classes: keep all positives; downsample negatives
    rng = np.random.default_rng(random_state)  # seeded so selecting negatives is reproducible
    pos_idx = work.index[work["y"] == 1].tolist()
    neg_idx = work.index[work["y"] == 0].tolist()

    # Decide how many negatives to keep: at most neg_ratio x (#positives).
    # If there are zero positives, max(1, ...) prevents multiplying by zero (keep a few
    # negatives so the code still works). int() keeps a fractional neg_ratio from
    # handing rng.choice a float size; max(0, ...) guards a negative ratio.
    target_negs = min(len(neg_idx), max(0, int(neg_ratio * max(1, len(pos_idx)))))
    # If there are too many negatives, randomly downsample to target_negs.
    neg_keep = rng.choice(neg_idx, size=target_negs, replace=False).tolist() if target_negs < len(neg_idx) else neg_idx

    # Keep all positives + the sampled negatives. Sort indices and reset to a tidy 0..M-1 index.
    keep_idx = set(pos_idx) | set(neg_keep)
    labeled = work.loc[sorted(keep_idx)].reset_index(drop=True)

    # clean up: no infinities / NaN / negative distances in the output
    if "distance_km" in labeled.columns:
        labeled["distance_km"] = labeled["distance_km"].replace([np.inf, -np.inf], np.nan)
        # unknown distances are set to the largest known one (stays NaN if none is known)
        labeled["distance_km"] = labeled["distance_km"].fillna(labeled["distance_km"].max()).clip(lower=0)

    return labeled

# 2) Build a multi-user training set in one go
def build_training_set(anchor_user_ids, pool_k=400, max_km=50,
                       pos_thresh=POS_THRESH, neg_ratio=3, random_state=42):
    """
    Stack the labeled candidate pools of several anchor users into one frame.

    Each anchor gets its own pool (loose lifestyle gates) from `build_pool`, which is
    then labeled by `label_pool` with the given thresholds, negative ratio and seed.

    Returns a single DataFrame with columns:
      anchor_user, candidate_user(=user_id), features..., y, label_reason
    or an empty DataFrame when no anchor ids are supplied.
    """
    def _labeled_candidates(anchor):
        # pool -> labels -> rename the id column -> tag with the anchor it belongs to
        candidates = build_pool(user_id=anchor, pool_k=pool_k, max_km=max_km)
        frame = label_pool(candidates, pos_thresh=pos_thresh,
                           neg_ratio=neg_ratio, random_state=random_state)
        frame = frame.rename(columns={"user_id": "candidate_user"})
        frame.insert(0, "anchor_user", anchor)
        return frame

    per_anchor = [_labeled_candidates(anchor) for anchor in anchor_user_ids]
    if not per_anchor:
        return pd.DataFrame()
    return pd.concat(per_anchor, ignore_index=True)

# 3) Quick check on a single anchor: build one pool, label it, and look at the class balance
pool = build_pool(user_id=10, pool_k=5000, max_km=50)
labeled = label_pool(pool, pos_thresh=POS_THRESH, neg_ratio=3)
(labeled["y"].value_counts(), labeled.head())


Start pool: 5000 (max_candidates=5000)
After orientation: 2206
Unfiltered: 2206 | After location filter (≤50 km): 2186
After location (≤50 km): 2186
After age rule: 1872


(y
 0    1349
 1     523
 Name: count, dtype: int64,
    user_id   age sex orientation                   location  text_sim  \
 0    37350  52.0   f    straight   redwood city, california  0.718680   
 1    26930  25.0   f    straight     menlo park, california  0.694204   
 2    36162  44.0   f    straight       martinez, california  0.671570   
 3    49065  25.0   f    straight       pacifica, california  0.664452   
 4    52547  25.0   f    straight  san francisco, california  0.661309   
 
    age_score  age_diff  lifestyle_score  loc_score  dist_penalty  distance_km  \
 0        1.0        17         0.666667        1.0           0.0    36.491073   
 1        1.0        10         0.666667        1.0           0.0    42.146441   
 2        1.0         9         1.000000        1.0           0.0    36.167769   
 3        1.0        10         0.666667        1.0           0.0    21.350781   
 4        1.0        10         0.500000        1.0           0.0     0.000000   
 
    ess

In [45]:
# === Train Hinge-style logistic baseline (flat-inside-radius) ===
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# 1) choose anchor users (bio + coords)
# Only users with a non-trivial bio (> 50 chars) and known coordinates can act as anchors.
has_bio    = df["bio_text"].fillna("").astype(str).str.len() > 50
has_coords = df["lat"].notna() & df["lon"].notna()
anchor_pool = df.loc[has_bio & has_coords, "user_id"].astype(int).tolist()

rng = np.random.default_rng(42)  # fixed seed -> same anchors on every run
N_ANCHORS = min(300, len(anchor_pool))
anchor_user_ids = rng.choice(anchor_pool, size=N_ANCHORS, replace=False).tolist()

# 2) build labeled dataset
MAX_KM = 50
train_df = build_training_set(
    anchor_user_ids=anchor_user_ids,
    pool_k=400, max_km=MAX_KM,
    pos_thresh=POS_THRESH, neg_ratio=3, random_state=42
).copy()
if train_df.empty:
    # build_training_set returns a column-less frame when there are no anchors;
    # fail here with a clear message instead of a KeyError on the feature selection.
    raise ValueError("Training set is empty: no anchor users produced labeled candidates.")

# 3) features/target/groups (NO raw distance/loc; use dist_penalty)
X_cols = ["text_sim", "lifestyle_score", "age_diff", "essay_len", "dist_penalty"]

# Derive dist_penalty BEFORE selecting X_cols: selecting first would raise KeyError and
# make this fallback unreachable.
if "dist_penalty" not in train_df.columns and "distance_km" in train_df.columns:
    train_df["dist_penalty"] = np.maximum(0.0, train_df["distance_km"] - MAX_KM)

X = train_df[X_cols].copy()
X["essay_len"]    = X["essay_len"].fillna(0).clip(lower=0)
X["dist_penalty"] = X["dist_penalty"].fillna(0).clip(lower=0)

# The remaining features are not imputed; surface gaps here rather than deep inside sklearn.
nan_cols = X.columns[X.isna().any()].tolist()
if nan_cols:
    raise ValueError(f"NaN values in training features: {nan_cols}")

y = train_df["y"].astype(int).values
groups = train_df["anchor_user"].values  # all rows of one anchor stay in the same CV fold

# 4) pipeline (scaler + LR) + grouped CV
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=300, C=0.8, class_weight="balanced", solver="lbfgs")
)
cv = GroupKFold(n_splits=5)
roc = cross_val_score(pipe, X, y, groups=groups, cv=cv, scoring="roc_auc")
pr  = cross_val_score(pipe, X, y, groups=groups, cv=cv, scoring="average_precision")
print(f"AUC-ROC: {roc.mean():.3f} ± {roc.std():.3f} | PR-AUC: {pr.mean():.3f} ± {pr.std():.3f}")

# 5) fit final model on ALL data
pipe.fit(X, y)
scaler = pipe.named_steps["standardscaler"]
clf    = pipe.named_steps["logisticregression"]

# 6) Inspect learned weights (standardized)
coef = pd.Series(clf.coef_[0], index=X_cols).sort_values(ascending=False)
display(coef.to_frame("coef (standardized)"))
display(coef.sort_values().to_frame("coef (standardized)").head(5))  # most negative

# 7) package for inference
# The scaler and classifier are stored separately; inference must apply them in that
# order to columns in X_cols order.
hinge_model = {"clf": clf, "scaler": scaler, "X_cols": X_cols, "params": {"max_km": MAX_KM}}


Start pool: 5000 (max_candidates=5000)
After orientation: 2024
Unfiltered: 2024 | After location filter (≤50 km): 1873
After location (≤50 km): 1873
After age rule: 1502
Start pool: 5000 (max_candidates=5000)
After orientation: 2708
Unfiltered: 2708 | After location filter (≤50 km): 2670
After location (≤50 km): 2670
After age rule: 2434
Start pool: 5000 (max_candidates=5000)
After orientation: 1497
Unfiltered: 1497 | After location filter (≤50 km): 1486
After location (≤50 km): 1486
After age rule: 1091
Start pool: 5000 (max_candidates=5000)
After orientation: 2053
Unfiltered: 2053 | After location filter (≤50 km): 2035
After location (≤50 km): 2035
After age rule: 813
Start pool: 5000 (max_candidates=5000)
After orientation: 1986
Unfiltered: 1986 | After location filter (≤50 km): 1799
After location (≤50 km): 1799
After age rule: 1076
Start pool: 5000 (max_candidates=5000)
After orientation: 2045
Unfiltered: 2045 | After location filter (≤50 km): 2040
After location (≤50 km): 2040
Af

Unnamed: 0,coef (standardized)
text_sim,3.135953
lifestyle_score,1.671583
essay_len,0.145017
dist_penalty,0.0
age_diff,-0.049669


Unnamed: 0,coef (standardized)
age_diff,-0.049669
dist_penalty,0.0
essay_len,0.145017
lifestyle_score,1.671583
text_sim,3.135953


In [None]:
"""
text_sim = +3.14 → by far the strongest signal. 
Higher text similarity sharply increases the mutual-like probability.

lifestyle_score = +1.67 → solid positive effect. Matching lifestyle helps meaningfully.

essay_len = +0.15 → tiny positive: longer/more complete bios help a bit.

age_diff = −0.05 → very small negative: being further apart in age slightly hurts.

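dist_penalty = 0.00 → ignored, because the feature is constant.
build_pool keeps only candidates within max_km (50 km) and dist_penalty = max(0, distance_km − max_km),
so every training pair has dist_penalty = 0 (missing values are filled with 0) and the model
cannot learn any within-radius distance preference.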

"""

In [46]:
def hinge_prob(user_id, k=20, model=hinge_model, pool_k=800, max_km=50):
    """
    Build a pool, make the SAME features used in training, predict, return top-k.

    Parameters
    ----------
    user_id : anchor user to recommend for.
    k       : number of rows to return.
    model   : dict with "clf", "scaler" and "X_cols" (as packaged after training).
    pool_k  : candidate pool size passed to `build_pool`.
    max_km  : radius passed to `build_pool`. Note that dist_penalty is measured against
              THIS value, while the model was fitted with model["params"]["max_km"].

    Returns
    -------
    The pool's columns plus `prob_mutual_like` and `feature_breakdown`, sorted by
    probability (descending), limited to k rows, with a fresh 0..k-1 index.
    An empty pool yields an empty frame with the same columns instead of an error.
    """
    # 1) pool (orientation/location/age hard; lifestyle soft)
    pool = build_pool(user_id=user_id, pool_k=pool_k, max_km=max_km).copy()

    # Derive dist_penalty BEFORE selecting the model columns: selecting first would raise
    # KeyError and make this fallback unreachable.
    if "dist_penalty" not in pool.columns and "distance_km" in pool.columns:
        pool["dist_penalty"] = np.maximum(0.0, pool["distance_km"] - max_km)

    def _whole(value, rounded=False):
        """Integer for display, or '?' when the value is missing or infinite."""
        if pd.isna(value) or np.isinf(value):
            return "?"
        return int(round(value)) if rounded else int(value)

    # human-readable breakdown (uses flat-inside-radius rule)
    def _explain(r):
        dist_txt = _whole(r.get("distance_km", np.nan), rounded=True)
        pen = r.get("dist_penalty", 0.0)
        if pen <= 0:
            loc_score = 1.0
        elif max_km <= 0:
            loc_score = 0.0  # no radius to scale against; avoids dividing by zero
        else:
            loc_score = max(0.0, 1.0 - min(1.0, pen / max_km))
        return (f"text={r.text_sim:.2f}, life={r.lifestyle_score:.2f}, "
                f"age={_whole(r.age_score)}, loc={loc_score:.2f}, "
                f"Δage={_whole(r.age_diff)}, dist~{dist_txt}km")

    out = pool.copy()
    if pool.empty:
        # Nothing survived the hard gates: the scaler would reject a 0-row matrix,
        # so return an empty result with the expected columns.
        out["prob_mutual_like"] = np.array([], dtype=float)
        out["feature_breakdown"] = np.array([], dtype=object)
    else:
        # 2) features in EXACT order the model expects
        X = pool[model["X_cols"]].copy()

        # 3) same cleaning as training
        for col in ("essay_len", "dist_penalty"):
            if col in X.columns:
                X[col] = X[col].fillna(0).clip(lower=0)

        # 4) scale + predict (probability of the positive class)
        X_scaled = model["scaler"].transform(X)
        out["prob_mutual_like"] = model["clf"].predict_proba(X_scaled)[:, 1]

        # 5) per-row explanation
        out["feature_breakdown"] = out.apply(_explain, axis=1)

    # 6) make sure the headline columns exist (added as NaN if absent). Column order and
    #    any extra columns are intentionally left as they are so existing consumers keep
    #    seeing the same frame layout.
    cols = ["user_id","prob_mutual_like","feature_breakdown",
            "text_sim","lifestyle_score","age_score",
            "dist_penalty","distance_km","age","sex","orientation","location"]
    for c in cols:
        if c not in out.columns:
            out[c] = np.nan

    # 7) return top-k
    return out.sort_values("prob_mutual_like", ascending=False).head(k).reset_index(drop=True)


In [47]:
# Top-10 predicted mutual-like candidates for user 12
hinge_top = hinge_prob(
    user_id=12,
    k=10,
    model=hinge_model,
    pool_k=800,
    max_km=50,
)
hinge_top

Start pool: 5000 (max_candidates=5000)
After orientation: 1561
Unfiltered: 1561 | After location filter (≤50 km): 1552
After location (≤50 km): 1552
After age rule: 1081


Unnamed: 0,user_id,age,sex,orientation,location,text_sim,age_score,age_diff,lifestyle_score,loc_score,dist_penalty,distance_km,essay_len,prob_mutual_like,feature_breakdown
0,27365,24.0,f,straight,"san francisco, california",0.596096,1.0,0,1.0,1.0,0.0,0.0,59,0.845938,"text=0.60, life=1.00, age=1, loc=1.00, Δage=0,..."
1,2695,21.0,f,bisexual,"novato, california",0.592071,1.0,3,1.0,1.0,0.0,38.624527,69,0.812392,"text=0.59, life=1.00, age=1, loc=1.00, Δage=3,..."
2,15562,21.0,f,straight,"berkeley, california",0.591714,1.0,3,1.0,1.0,0.0,16.407002,55,0.809331,"text=0.59, life=1.00, age=1, loc=1.00, Δage=3,..."
3,58866,22.0,f,straight,"san mateo, california",0.591408,1.0,2,1.0,1.0,0.0,32.302391,54,0.80848,"text=0.59, life=1.00, age=1, loc=1.00, Δage=2,..."
4,51829,30.0,f,straight,"san francisco, california",0.586222,1.0,6,1.0,1.0,0.0,0.0,72,0.756461,"text=0.59, life=1.00, age=1, loc=1.00, Δage=6,..."
5,49159,24.0,f,straight,"san francisco, california",0.583664,1.0,0,1.0,1.0,0.0,0.0,45,0.742836,"text=0.58, life=1.00, age=1, loc=1.00, Δage=0,..."
6,48293,24.0,f,straight,"san francisco, california",0.583664,1.0,0,1.0,1.0,0.0,0.0,45,0.742834,"text=0.58, life=1.00, age=1, loc=1.00, Δage=0,..."
7,40249,24.0,f,straight,"san francisco, california",0.583664,1.0,0,1.0,1.0,0.0,0.0,45,0.742834,"text=0.58, life=1.00, age=1, loc=1.00, Δage=0,..."
8,32359,24.0,f,straight,"san bruno, california",0.583664,1.0,0,1.0,1.0,0.0,17.174141,45,0.742834,"text=0.58, life=1.00, age=1, loc=1.00, Δage=0,..."
9,42193,24.0,f,straight,"oakland, california",0.583664,1.0,0,1.0,1.0,0.0,13.300941,45,0.742834,"text=0.58, life=1.00, age=1, loc=1.00, Δage=0,..."


In [51]:
#with build_pool settings: drinks_strict=False, smokes_strict=True, drugs_strict=True
hinge = hinge_prob(
    user_id=15,
    k=100,
    model=hinge_model,
    pool_k=500,
    max_km=45,
)
hinge

Start pool: 5000 (max_candidates=5000)
After orientation: 2005
Unfiltered: 2005 | After location filter (≤45 km): 1941
After location (≤45 km): 1941
After age rule: 1478


Unnamed: 0,user_id,age,sex,orientation,location,text_sim,age_score,age_diff,lifestyle_score,loc_score,dist_penalty,distance_km,essay_len,prob_mutual_like,feature_breakdown
0,28566,30.0,m,straight,"san francisco, california",0.682167,1.0,9,1.0,1.0,0.0,0.000000,1740,0.998035,"text=0.68, life=1.00, age=1, loc=1.00, Δage=9,..."
1,49783,42.0,m,straight,"oakland, california",0.681057,1.0,3,1.0,1.0,0.0,13.300941,1062,0.997894,"text=0.68, life=1.00, age=1, loc=1.00, Δage=3,..."
2,58244,48.0,m,straight,"moraga, california",0.672608,1.0,9,1.0,1.0,0.0,26.260308,2489,0.997040,"text=0.67, life=1.00, age=1, loc=1.00, Δage=9,..."
3,20973,43.0,m,straight,"san mateo, california",0.665855,1.0,4,1.0,1.0,0.0,32.302391,713,0.995174,"text=0.67, life=1.00, age=1, loc=1.00, Δage=4,..."
4,33449,50.0,m,straight,"san francisco, california",0.665802,1.0,11,1.0,1.0,0.0,0.000000,953,0.994933,"text=0.67, life=1.00, age=1, loc=1.00, Δage=11..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,11758,30.0,m,straight,"el cerrito, california",0.608280,1.0,9,1.0,1.0,0.0,18.340803,2751,0.926389,"text=0.61, life=1.00, age=1, loc=1.00, Δage=9,..."
96,21076,35.0,m,straight,"hayward, california",0.610922,1.0,4,1.0,1.0,0.0,32.207482,782,0.924489,"text=0.61, life=1.00, age=1, loc=1.00, Δage=4,..."
97,56383,29.0,m,straight,"vallejo, california",0.608997,1.0,10,1.0,1.0,0.0,38.834789,2226,0.924257,"text=0.61, life=1.00, age=1, loc=1.00, Δage=10..."
98,20893,40.0,m,straight,"san francisco, california",0.592386,1.0,1,1.0,1.0,0.0,0.000000,9045,0.923098,"text=0.59, life=1.00, age=1, loc=1.00, Δage=1,..."


In [54]:
import pandas as pd, json, os
from datetime import datetime, timezone

# ensure folder
os.makedirs("results", exist_ok=True)

# standardize + annotate
hinge = hinge.reset_index(drop=True).copy()
hinge["rank"] = range(1, len(hinge) + 1)  # 1-based rank in the current (sorted) order
hinge["app"]  = "hinge"

# metadata relevant to hinge
# NOTE: user_id / top_n / pool_k / max_km must match the hinge_prob(...) call that
# produced `hinge`; they are repeated here by hand.
meta = {
    "app": "hinge",
    "user_id": 15,
    "top_n": 100,
    "pool_k": 500,
    "max_km": 45,
    # record model + pool filter context (optional but useful)
    "model": type(hinge_model.get("clf", hinge_model) if isinstance(hinge_model, dict) else hinge_model).__name__,
    "pool_filters": {"drinks_strict": False, "smokes_strict": True, "drugs_strict": True},
    # timezone-aware "now" (utcnow() is deprecated); rendered in the same naive-ISO + "Z" form
    "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
}

# No ':' in the file name: it is a reserved character on Windows, where such a path
# either fails or silently writes to an alternate data stream.
stem = f"results/hinge_user_id_{meta['user_id']}_k{meta['top_n']}_pool{meta['pool_k']}_km{meta['max_km']}"

# save
hinge.to_parquet(f"{stem}.parquet", index=False)
with open(f"{stem}.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print("Saved:", f"{stem}.parquet and {stem}.json")

Saved: results/hinge_user_id:15_k100_pool500_km45.parquet and results/hinge_user_id:15_k100_pool500_km45.json
