In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load your base notebook (embeddings, df, helpers)
LIGHT_IMPORT = True
%run -i "Calc_Embeddings_Model Training.ipynb"

# quick sanity
print("df shape:", df.shape if 'df' in globals() else 'df missing')
print("have get_matches?", 'get_matches' in globals())

shape: (59946, 31)


You should consider upgrading via the 'C:\Users\veera\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


QUERY user 1234 | age 31 | m straight | palo alto, california
------------------------------------------------------------------------------------------
hello! thanks for stopping by! i hope you enjoy yourself!  i am extremely financially responsible and nice! need a new kidney? i'll give you mine! if the doctor won't let us do that, i'll just buy you a new one! no expense is too much for you!  i definitely have skinny genes! rea...

=== Matches ===

user 44713 | age 23 | m straight | hayward, california | score=0.594
hey my name is jacob i'm a die hard raider! lol i work as a massage therapist at chiropractic offices and my personal clients i do some personal tranning generally weight loss, nutrition, and hypertrophy. i'm currently attending ohlone college working on a physical therapy assist...
------------------------------------------------------------------------------------------
user 10612 | age 28 | m gay | san francisco, california | score=0.564
high energy goofy nerd-jock-goo

In [37]:
CUPID_TOP_K   = 100
CUPID_POOL_K  = 500
CUPID_MAX_KM  = 45

def cupid_candidate_pool(user_id: int,pool_k: int = CUPID_POOL_K,max_km: float = CUPID_MAX_KM) -> pd.DataFrame:
    """
    Build the candidate pool using only hard gates:
      - orientation/sex as the user specifies (inside get_matches)
      - age range
      - location radius
    NOTE: No lifestyle gating here. We'll handle lifestyle softly during scoring.
    """
    cands = get_matches(user_id=user_id, k=pool_k,use_orientation=True,use_location=True, max_km=max_km,use_age=True,
        # all lifestyle OFF here -> soft handling later
        use_diet=False, use_drinks=False, use_smokes=False, use_drugs=False,
        drinks_strict=False, smokes_strict=False, drugs_strict=False,
        allow_missing=True).copy()

    # ensure a minimal set of columns is present (keep others if they exist)
    base_cols = ["user_id", "age", "sex", "orientation", "location"]
    for col in base_cols:
        if col not in cands.columns:
            cands[col] = np.nan

    # return with base cols first, then any extra columns that came along
    ordered = base_cols + [c for c in cands.columns if c not in base_cols]
    return cands[ordered]

In [None]:
"""
sets a few simple knobs (CUPID_TOP_K, CUPID_POOL_K, CUPID_MAX_KM).
defines cupid_candidate_pool(...), which calls your get_matches with only hard gates (orientation, age, location). 
lifestyle is intentionally left out here, it will be soft later in the scoring step.
runs a quick smoke test so you can see the pool size and first 10 rows
"""

In [48]:
u = 15
pool = cupid_candidate_pool(user_id=u)
print("pool size:", len(pool))
pool

Start pool: 5000 (max_candidates=5000)
After orientation: 2005
Unfiltered: 2005 | After location filter (≤45 km): 1941
After location (≤45 km): 1941
After age rule: 1478
pool size: 500


Unnamed: 0,user_id,age,sex,orientation,location,text_sim,diet_c,drinks_c,smokes_c,drugs_c,lat,lon
5,35029,33,m,straight,"san francisco, california",0.690010,omnivore,heavy,heavy,sometimes,37.779259,-122.419329
11,28566,30,m,straight,"san francisco, california",0.682167,omnivore,light,none,none,37.779259,-122.419329
13,49783,42,m,straight,"oakland, california",0.681057,omnivore,none,none,none,37.804456,-122.271356
21,58244,48,m,straight,"moraga, california",0.672608,,light,none,none,37.834897,-122.128830
23,20571,37,m,straight,"san mateo, california",0.668853,other,light,none,none,37.496904,-122.333057
...,...,...,...,...,...,...,...,...,...,...,...,...
1769,6273,32,m,straight,"san francisco, california",0.572045,,light,light,none,37.779259,-122.419329
1772,29951,38,m,straight,"san mateo, california",0.572012,omnivore,light,none,none,37.496904,-122.333057
1779,21686,63,m,straight,"san rafael, california",0.571808,omnivore,heavy,none,none,37.974779,-122.531669
1783,4346,30,m,straight,"san francisco, california",0.571759,omnivore,heavy,none,sometimes,37.779259,-122.419329


In [7]:
#Base compatibility (text/age/lifestyle/location)
# Uses your compute_component_scores to get component columns, then blends them into a single base_score (no freshness/explore yet).

# weights for the base
CUPID_W = {"text": 0.60, "age": 0.10, "life": 0.15, "loc": 0.15}

def _ensure_text_sim(cands: pd.DataFrame) -> pd.DataFrame:
    """
    make sure text_sim is numeric and clipped to [0,1].
    """
    c = cands.copy()
    c["text_sim"] = pd.to_numeric(c.get("text_sim", 0.0), errors="coerce").fillna(0.0).clip(0, 1)
    return c

def cupid_base_compat(user_id: int,pool_k: int = CUPID_POOL_K,max_km: float = CUPID_MAX_KM,
                      weights: dict = CUPID_W) -> pd.DataFrame:
    """
    1) Build hard-gated pool (orientation/age/location)
    2) Compute component scores (text_sim, age_score, lifestyle_score, loc_score)
    3) Blend into base_score = 0.60*text + 0.10*age + 0.15*life + 0.15*loc
    Returns a dataframe sorted by base_score (desc).
    """
    # (1) pool
    pool = cupid_candidate_pool(user_id=user_id, pool_k=pool_k, max_km=max_km)

    # (2) component scores (lifestyle kept soft here)
    c = compute_component_scores(user_id=user_id, cands=pool, max_km=max_km,
        drinks_strict=False, smokes_strict=False, drugs_strict=False, allow_missing=True)
    c = _ensure_text_sim(c)

    # make sure required columns exist
    for col in ["age_score", "lifestyle_score", "loc_score"]:
        if col not in c.columns:
            c[col] = 0.0

    # (3) weighted blend
    w = weights
    c["base_score"] = (w["text"] * c["text_sim"] +w["age"]  * c["age_score"] +w["life"] * c["lifestyle_score"] +
        w["loc"]  * c["loc_score"])

    need = ["user_id", "base_score", "text_sim", "age_score", "lifestyle_score", "loc_score",
            "age", "sex", "orientation", "location"]
    for col in need:
        if col not in c.columns:
            c[col] = np.nan

    return c[need].sort_values("base_score", ascending=False).reset_index(drop=True)

In [49]:
u = 15
base = cupid_base_compat(user_id=u)
print("rows:", len(base))
base

Start pool: 5000 (max_candidates=5000)
After orientation: 2005
Unfiltered: 2005 | After location filter (≤50 km): 1990
After location (≤50 km): 1990
After age rule: 1511
rows: 500


Unnamed: 0,user_id,base_score,text_sim,age_score,lifestyle_score,loc_score,age,sex,orientation,location
0,44725,0.822358,0.703931,1.0,1.000000,1.0,54,m,straight,"alameda, california"
1,28566,0.809300,0.682167,1.0,1.000000,1.0,30,m,straight,"san francisco, california"
2,49783,0.808634,0.681057,1.0,1.000000,1.0,42,m,straight,"oakland, california"
3,58244,0.803565,0.672608,1.0,1.000000,1.0,48,m,straight,"moraga, california"
4,31789,0.803546,0.672577,1.0,1.000000,1.0,46,m,straight,"alameda, california"
...,...,...,...,...,...,...,...,...,...,...
495,37355,0.647360,0.578934,1.0,0.333333,1.0,49,m,straight,"san francisco, california"
496,52368,0.645354,0.575591,1.0,0.333333,1.0,27,m,straight,"san francisco, california"
497,32003,0.643842,0.573070,1.0,0.333333,1.0,35,m,straight,"san francisco, california"
498,44076,0.623434,0.622389,1.0,0.000000,1.0,33,m,straight,"san francisco, california"


In [50]:
# Freshness 
# - Format last_online to YYYY-MM-DD
# - Build recency_norm ∈ [0,1] on the master df
# - Add freshness to base_score → cupid_score_fresh

CUPID_G_FRESH = 0.10  # weight for freshness

# 1) Clean + parse dates on master df
df["last_online_clean"] = df["last_online"].astype(str).str.slice(0, 10)  # YYYY-MM-DD
df["last_online_dt"] = pd.to_datetime(df["last_online_clean"], errors="coerce", utc=True)

# 2) Normalize to [0,1] over the dataset
min_dt = df["last_online_dt"].min()
max_dt = df["last_online_dt"].max()

if pd.notna(min_dt) and pd.notna(max_dt) and max_dt > min_dt:
    span_sec = (max_dt - min_dt).total_seconds()
    df["recency_norm"] = ((df["last_online_dt"] - min_dt).dt.total_seconds() / span_sec).fillna(0.0)
else:
    # if all dates are the same or missing, set to 0.0
    df["recency_norm"] = 0.0

# 3) Merge into base and add freshness bump
fresh = base.merge(df[["user_id", "recency_norm"]], on="user_id", how="left")
fresh["recency_norm"] = fresh["recency_norm"].fillna(0.0).clip(0, 1)
fresh["cupid_score_fresh"] = fresh["base_score"] + CUPID_G_FRESH * fresh["recency_norm"]

print("Built recency_norm. Preview:")
fresh[["user_id","base_score","recency_norm","cupid_score_fresh"]]

Built recency_norm. Preview:


Unnamed: 0,user_id,base_score,recency_norm,cupid_score_fresh
0,44725,0.822358,0.997297,0.922088
1,28566,0.809300,0.997297,0.909030
2,49783,0.808634,0.997297,0.908364
3,58244,0.803565,0.997297,0.903295
4,31789,0.803546,0.818919,0.885438
...,...,...,...,...
495,37355,0.647360,0.956757,0.743036
496,52368,0.645354,0.991892,0.744544
497,32003,0.643842,0.781081,0.721950
498,44076,0.623434,1.000000,0.723434


In [51]:
#  Synthesize encounters & popularity, then compute cupid_base
# - Reproducible synthetic schedule_strength and right_swipe_rate
# - Adds UCB bonus and popularity penalty, producing `cupid_base`

CUPID_ALPHA_UCB = 0.15   # exploration strength
CUPID_BETA_POP  = 0.20   # popularity penalty
CUPID_SYN_SEED  = 7      # reproducible

rng = np.random.default_rng(CUPID_SYN_SEED)

def cupid_add_explore_pop_synth(fresh_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build synthetic:
      - schedule_strength: higher if more recent activity, with noise (0..~20)
      - right_swipe_rate: function of text_sim + lifestyle_score, with tiny noise (0..1)
    Then compute:
      - ucb_bonus = α / sqrt(1 + schedule_strength)
      - pop_penalty = β * right_swipe_rate
      - cupid_base  = cupid_score_fresh + ucb_bonus - pop_penalty
    """
    out = fresh_df.copy()

    # Make sure needed columns exist
    for col in ["recency_norm", "text_sim", "lifestyle_score", "cupid_score_fresh"]:
        if col not in out.columns:
            out[col] = 0.0

    # Synthetic encounters: more recent users tend to have more encounters
    # scale to ~0..20 with some randomness
    base_enc = 14 * out["recency_norm"] + 2 * rng.random(len(out))
    out["schedule_strength"] = np.floor(np.clip(base_enc, 0, None)).astype(int)

    # Synthetic popularity: driven by text & lifestyle; small noise
    rrate = (0.10 + 0.65 * out["text_sim"] + 0.15 * out["lifestyle_score"] + 0.03 * rng.standard_normal(len(out)))
    out["right_swipe_rate"] = np.clip(rrate, 0.0, 1.0)

    # Exploration bonus (bigger for low-encounter users)
    out["ucb_bonus"]   = CUPID_ALPHA_UCB / np.sqrt(1.0 + out["schedule_strength"])
    # Popularity penalty (downweight very over-exposed profiles)
    out["pop_penalty"] = CUPID_BETA_POP * out["right_swipe_rate"]

    # Final per-candidate pre-slate score
    out["cupid_base"]  = out["cupid_score_fresh"] + out["ucb_bonus"] - out["pop_penalty"]

    # peek
    print("Synthetic explore+pop added. Preview:")
    display(out[["user_id","recency_norm","text_sim","lifestyle_score","schedule_strength","right_swipe_rate",
                 "ucb_bonus","pop_penalty","cupid_base"]].head(8))
    return out

# %% [markdown]
# CUPID — Cell 3b (alt): Synthesize encounters & popularity, then compute cupid_base
# - No Elo code required
# - Reproducible synthetic schedule_strength and right_swipe_rate
# - Adds UCB bonus and popularity penalty, producing `cupid_base`

# %%
import numpy as np
import pandas as pd

CUPID_ALPHA_UCB = 0.15   # exploration strength
CUPID_BETA_POP  = 0.20   # popularity penalty
CUPID_SYN_SEED  = 7      # reproducible

rng = np.random.default_rng(CUPID_SYN_SEED)

def cupid_add_explore_pop_synth(fresh_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build synthetic:
      - schedule_strength: higher if more recent activity, with noise (0..~20)
      - right_swipe_rate: function of text_sim + lifestyle_score, with tiny noise (0..1)
    Then compute:
      - ucb_bonus = α / sqrt(1 + schedule_strength)
      - pop_penalty = β * right_swipe_rate
      - cupid_base  = cupid_score_fresh + ucb_bonus - pop_penalty
    """
    out = fresh_df.copy()

    # Make sure needed columns exist
    for col in ["recency_norm", "text_sim", "lifestyle_score", "cupid_score_fresh"]:
        if col not in out.columns:
            out[col] = 0.0

    # Synthetic encounters: more recent users tend to have more encounters
    # scale to ~0..20 with some randomness
    base_enc = 14 * out["recency_norm"] + 2 * rng.random(len(out))
    out["schedule_strength"] = np.floor(np.clip(base_enc, 0, None)).astype(int)

    # Synthetic popularity: driven by text & lifestyle; small noise
    rrate = (
        0.10 + 0.65 * out["text_sim"] + 0.15 * out["lifestyle_score"] + 0.03 * rng.standard_normal(len(out))
    )
    out["right_swipe_rate"] = np.clip(rrate, 0.0, 1.0)

    # Exploration bonus (bigger for low-encounter users)
    out["ucb_bonus"]   = CUPID_ALPHA_UCB / np.sqrt(1.0 + out["schedule_strength"])
    # Popularity penalty (downweight very over-exposed profiles)
    out["pop_penalty"] = CUPID_BETA_POP * out["right_swipe_rate"]

    # Final per-candidate pre-slate score
    out["cupid_base"]  = out["cupid_score_fresh"] + out["ucb_bonus"] - out["pop_penalty"]

    # peek
    print("Synthetic explore+pop added. Preview:")
    display(out[["user_id","recency_norm","text_sim","lifestyle_score",
                 "schedule_strength","right_swipe_rate",
                 "ucb_bonus","pop_penalty","cupid_base"]])
    return out

# --- test run it on current `fresh` ---
scored = cupid_add_explore_pop_synth(fresh)

Synthetic explore+pop added. Preview:


Unnamed: 0,user_id,recency_norm,text_sim,lifestyle_score,schedule_strength,right_swipe_rate,ucb_bonus,pop_penalty,cupid_base
0,44725,0.997297,0.703931,1.000000,15,0.729964,0.037500,0.145993,0.813595
1,28566,0.997297,0.682167,1.000000,15,0.718020,0.037500,0.143604,0.802926
2,49783,0.997297,0.681057,1.000000,15,0.663848,0.037500,0.132770,0.813094
3,58244,0.997297,0.672608,1.000000,14,0.645482,0.038730,0.129096,0.812928
4,31789,0.818919,0.672577,1.000000,12,0.676531,0.041603,0.135306,0.791734
...,...,...,...,...,...,...,...,...,...
495,37355,0.956757,0.578934,0.333333,13,0.556998,0.040089,0.111400,0.671726
496,52368,0.991892,0.575591,0.333333,15,0.519882,0.037500,0.103976,0.678067
497,32003,0.781081,0.573070,0.333333,11,0.553931,0.043301,0.110786,0.654465
498,44076,1.000000,0.622389,0.000000,14,0.505092,0.038730,0.101018,0.661145


In [52]:
#  Reciprocal keyword boost (shared interests) — log-scale bonus
# - Extract keyword sets from bio_text (user & candidates)
# - Add diminishing-returns bonus via log1p(shared)
# - Output columns: kw_shared, kw_bonus, cupid_score_kw, cupid_pre_slate

import re

# Small, interpretable keyword lexicon (extend anytime)
CUPID_KEYWORDS = {"hiking","hike","run","running","walk","walking","gym","fitness","yoga","climb","biking",
                  "cycling","swim","travel","trip","explore","roadtrip","camp","music","concert","festival",
                  "dj","guitar","piano","sing","movie","movies","film","netflix","series","tv","anime",
                  "art","museum","gallery","paint","painting","draw","photography","food","cook","cooking",
                  "bake","coffee","brunch","restaurants","reading","books","book","boardgames","games","gaming",
                  "dog","dogs","cat","cats","pets"}
 
CUPID_DELTA_KW = 0.03  # base factor for keyword bonus (tunable)

def _kw_set(text: str) -> set:
    """Lowercase, tokenize, keep only words in CUPID_KEYWORDS."""
    if not isinstance(text, str) or not text.strip():
        return set()
    tokens = re.findall(r"[a-zA-Z]+", text.lower())
    return {t for t in tokens if t in CUPID_KEYWORDS}

def cupid_add_keywords_log(base_df: pd.DataFrame, user_id: int) -> pd.DataFrame:
    """
    Add keyword overlap bonus:
      kw_bonus = δ * log(1 + #shared_keywords)
    Produces: kw_shared, kw_bonus, cupid_score_kw, cupid_pre_slate
    """
    out = base_df.copy()

    # Build a fast lookup for bios
    bio_map = df.set_index("user_id")["bio_text"].to_dict()

    # User keyword set
    user_bio = bio_map.get(int(user_id), "")
    user_kw  = _kw_set(user_bio)

    # Overlap count per candidate
    def _overlap(uid):
        cand_bio = bio_map.get(int(uid), "")
        return len(user_kw & _kw_set(cand_bio))

    out["kw_shared"] = out["user_id"].astype(int).map(_overlap)

    # Log-scale bonus: grows with shared interests but with diminishing returns
    out["kw_bonus"] = CUPID_DELTA_KW * np.log1p(out["kw_shared"])

    # Add into score
    if "cupid_base" not in out.columns:
        raise KeyError("cupid_add_keywords_log expects 'cupid_base' in the input dataframe.")
    out["cupid_score_kw"] = out["cupid_base"] + out["kw_bonus"]

    # Alias for next step (pre-slate score before fairness/diversity)
    out["cupid_pre_slate"] = out["cupid_score_kw"]

    print("Keyword boost (log-scale) added. Preview:")
    display(out[["user_id","kw_shared","kw_bonus","cupid_base","cupid_score_kw"]])
    return out

scored_kw = cupid_add_keywords_log(scored, user_id=15)

Keyword boost (log-scale) added. Preview:


Unnamed: 0,user_id,kw_shared,kw_bonus,cupid_base,cupid_score_kw
0,44725,0,0.0,0.813595,0.813595
1,28566,0,0.0,0.802926,0.802926
2,49783,0,0.0,0.813094,0.813094
3,58244,0,0.0,0.812928,0.812928
4,31789,0,0.0,0.791734,0.791734
...,...,...,...,...,...
495,37355,0,0.0,0.671726,0.671726
496,52368,0,0.0,0.678067,0.678067
497,32003,0,0.0,0.654465,0.654465
498,44076,0,0.0,0.661145,0.661145


In [53]:
# In‑slate diversity (greedy with similarity penalty)
# Input:  scored_kw  (must contain 'cupid_pre_slate'; will fetch bios from df)
# Method:
#   - Start from the highest 'cupid_pre_slate'
#   - For each next pick, compute Jaccard similarity between each remaining
#     candidate's keyword set and the sets already in the slate.
#   - Apply penalty = λ * max_similarity_to_slate
#   - Select argmax( score - penalty )
# Output: 'cupid_final' with diagnostic columns.

DIVERSITY_LAMBDA = 0.30  # λ — strength of the penalty (0.2–0.4 works well)

def _kw_set(text: str, lexicon) -> set:
    if not isinstance(text, str) or not text.strip():
        return set()
    toks = re.findall(r"[a-zA-Z]+", text.lower())
    return {t for t in toks if t in lexicon}

def _jaccard(a: set, b: set) -> float:
    if not a and not b:
        return 0.0
    inter = len(a & b)
    union = len(a | b)
    return inter / union if union else 0.0

def cupid_build_diverse_slate(pre_df: pd.DataFrame,k: int = CUPID_TOP_K,lexicon=CUPID_KEYWORDS,
                              lam: float = DIVERSITY_LAMBDA) -> pd.DataFrame:
    """
    Greedy diverse selection on 'cupid_pre_slate'.
    Returns a DataFrame with columns:
      ['user_id','cupid_pre_slate','div_penalty','cupid_final','kw_set','kw_shared']
    """
    C = pre_df.copy()
    if "cupid_pre_slate" not in C.columns:
        raise KeyError("Expected 'cupid_pre_slate' in input dataframe (run keyword step first).")

    # Bring bios for keyword sets
    if "bio_text" not in C.columns:
        C = C.merge(df[["user_id","bio_text"]], on="user_id", how="left")

    # Precompute keyword sets for each candidate
    bio_map = C.set_index("user_id")["bio_text"].to_dict()
    kw_map  = {int(uid): _kw_set(bio_map.get(int(uid), ""), lexicon) for uid in C["user_id"].astype(int)}

    # Work arrays
    C = C.sort_values("cupid_pre_slate", ascending=False).reset_index(drop=True)
    taken_ids = []
    taken_kw  = []

    # For diagnostics
    penalties = {}
    final_scores = {}

    for _ in range(min(k, len(C))):
        best_idx, best_val, best_pen = None, -1e9, 0.0

        for idx, row in C.iterrows():
            uid = int(row["user_id"])
            if uid in taken_ids:
                continue

            # similarity penalty: compare to already chosen sets
            K = kw_map.get(uid, set())
            if taken_kw:
                sim = max((_jaccard(K, T) for T in taken_kw), default=0.0)
            else:
                sim = 0.0
            pen = lam * sim
            val = float(row["cupid_pre_slate"]) - pen

            if val > best_val:
                best_val, best_idx, best_pen = val, idx, pen

        # take the best one
        if best_idx is None:
            break
        chosen = C.loc[best_idx].copy()
        uid = int(chosen["user_id"])
        taken_ids.append(uid)
        taken_kw.append(kw_map.get(uid, set()))
        penalties[uid] = best_pen
        final_scores[uid] = best_val

    # Build slate frame
    slate = C[C["user_id"].isin(taken_ids)].copy()
    slate["div_penalty"] = slate["user_id"].map(penalties).fillna(0.0)
    slate["cupid_final"] = slate["user_id"].map(final_scores)
    # handy columns for display
    slate["kw_set"] = slate["user_id"].map(kw_map)
    slate = slate.sort_values("cupid_final", ascending=False).reset_index(drop=True)

    cols = ["user_id","cupid_pre_slate","div_penalty","cupid_final","kw_set","sex","age","location"]
    for c in cols:
        if c not in slate.columns:
            slate[c] = np.nan
    return slate[cols]

# --- build a diverse top-k from your current pre-slate ---
diverse_slate = cupid_build_diverse_slate(scored_kw, k=100)
diverse_slate

Unnamed: 0,user_id,cupid_pre_slate,div_penalty,cupid_final,kw_set,sex,age,location
0,20973,0.819688,0.000000,0.819688,"{movie, music}",m,43,"san mateo, california"
1,44725,0.813595,0.000000,0.813595,"{travel, food}",m,54,"alameda, california"
2,49783,0.813094,0.000000,0.813094,"{coffee, walking, run}",m,42,"oakland, california"
3,11636,0.799925,0.000000,0.799925,"{books, hike, tv}",m,63,"berkeley, california"
4,33493,0.796318,0.000000,0.796318,{},m,36,"san francisco, california"
...,...,...,...,...,...,...,...,...
95,29371,0.792236,0.133333,0.658902,"{music, books, hiking, movies, reading, movie,...",m,31,"vallejo, california"
96,33773,0.767608,0.109091,0.658517,"{travel, music, biking, dj, trip, movie, book,...",m,33,"san francisco, california"
97,55125,0.784562,0.126316,0.658246,"{travel, music, food, books, netflix, tv, read...",m,33,"san francisco, california"
98,33399,0.758234,0.100000,0.658234,{tv},m,31,"pacifica, california"


In [45]:
#Pretty final table with shared interests & notes
# Inputs: diverse_slate (from Cell 5b), df (master), u (target user_id)
# Output: final_table (nice display)

DISPLAY_SHARED_MAX = 4

def user_kw_set(user_id: int) -> set:
    bio = df.loc[df["user_id"] == user_id, "bio_text"]
    bio = str(bio.iloc[0]) if len(bio) else ""
    toks = re.findall(r"[a-zA-Z]+", bio.lower())
    return {t for t in toks if t in CUPID_KEYWORDS}

def shared_kw_str(user_kw: set, cand_kw: set, limit: int = DISPLAY_SHARED_MAX) -> str:
    shared = sorted(user_kw & (cand_kw or set()))
    if not shared:
        return ""
    return ", ".join(shared[:limit]) + (" ..." if len(shared) > limit else "")

def cupid_finalize_table_simple(diverse_df: pd.DataFrame, user_id: int, k: int = CUPID_TOP_K, serendipity_n: int = 2) -> pd.DataFrame:
    C = diverse_df.copy()
    C["user_id"] = C["user_id"].astype(int)

    # main picks
    main_n = max(0, k - serendipity_n)
    main_df = C.sort_values("cupid_final", ascending=False).head(main_n).copy()
    main_df["slot"] = "matched"

    # serendipity from remaining (already hard‑filtered earlier)
    remaining = C[~C["user_id"].isin(main_df["user_id"])].copy()
    u_kw = user_kw_set(user_id)
    remaining["shared_cnt"] = remaining["kw_set"].apply(lambda s: len(u_kw & s))
    ser_pool = remaining[(remaining.get("recency_norm", 0) > 0.6) & (remaining["shared_cnt"] >= 1)]
    if len(ser_pool) < serendipity_n:
        ser_pool = pd.concat([ser_pool, remaining]).drop_duplicates("user_id")
    ser_df = ser_pool.sample(n=min(serendipity_n, len(ser_pool)), random_state=42).copy()
    ser_df["slot"] = "suggested"

    # combine
    final = pd.concat([main_df, ser_df], ignore_index=True).drop_duplicates("user_id")

    # drop conflicting cols before merge so we keep master values
    for col in ["age","sex","location","job","religion","bio_text"]:
        if col in final.columns:
            final = final.drop(columns=[col])

    # enrich from master df
    enrich = df[["user_id","age","sex","location","job","religion","bio_text"]].copy()
    enrich["user_id"] = enrich["user_id"].astype(int)
    final = final.merge(enrich, on="user_id", how="left")

    # shared interests column
    final["shared_interests"] = final["kw_set"].apply(lambda s: shared_kw_str(u_kw, s))

    # pick columns + order
    cols = ["user_id","cupid_final","slot","age","sex","location","job","religion","shared_interests"]
    for c in cols:
        if c not in final.columns:
            final[c] = np.nan
    return final[cols].sort_values(["slot","cupid_final"], ascending=[True, False]).reset_index(drop=True)


In [29]:
from openai import OpenAI
import os
import requests

In [33]:
# ==== CUPID: pickup lines + question + date idea (API only) ====
API_KEY = "sk-proj-"
os.environ["OPENAI_API_KEY"] = API_KEY

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def truncate_bio(bio: str, max_words: int = 30) -> str:
    """Return the first max_words words of bio (with ellipsis if longer)."""
    words = str(bio).split()
    return " ".join(words[:max_words]) + (" ..." if len(words) > max_words else "")

def cupid_print_starters(user_id: int, k: int = 3, model: str = "gpt-4o-mini"):
    """
    For the top-k CUPID matches:
      - Show CUPID score + profile details
      - Generate 1 pickup line + 1 playful question + 1 date idea
    Assumes `final_slate` already exists from the CUPID pipeline.
    """
    # grab top-k matches from your CUPID final slate
    ranked = final_slate.sort_values("cupid_final", ascending=False).head(k).copy()

    for _, r in ranked.iterrows():
        uid   = int(r["user_id"])
        score = float(r["cupid_final"])
        age   = r.get("age", "")
        sex   = r.get("sex", "")
        loc   = r.get("location", "")
        job   = r.get("job", "")
        rel   = r.get("religion", "")
        # safe bio fetch + truncate
        bio_series = df.loc[df["user_id"] == uid, "bio_text"]
        bio_full = bio_series.iloc[0] if len(bio_series) else ""
        bio = truncate_bio(bio_full, max_words=30)

        # API call to GPT
        prompt = (
            f"Profile bio:\n{bio}\n\n"
            "Write exactly three lines:\n"
            "PICKUP: one playful pickup line (no emojis)\n"
            "QUESTION: one light, open question (no emojis)\n"
            "DATE: one fun first-date idea tailored to their mutual interests\n"
        )
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role":"user","content": prompt}],
            temperature=0.8,
        )
        text = resp.choices[0].message.content.strip().splitlines()

        # extract three parts
        pickup = next((ln.split(":",1)[1].strip() for ln in text if ln.upper().startswith("PICKUP:")), "")
        question = next((ln.split(":",1)[1].strip() for ln in text if ln.upper().startswith("QUESTION:")), "")
        date_idea = next((ln.split(":",1)[1].strip() for ln in text if ln.upper().startswith("DATE:")), "")

        # formatted printout
        print(f"\nCUPID Score = {score:.4f}")
        print(f"User id = {uid} | age = {age} | sex = {sex} | location = {loc}")
        print(f"job = {job} | religion = {rel}")
        print("bio text =")
        print(bio if bio else "(no bio)")
        print("\nGenerated starters:")
        print(f"  Pickup line: {pickup if pickup else '(n/a)'}")
        print(f"  Question:    {question if question else '(n/a)'}")
        print(f"  Date idea:   {date_idea if date_idea else '(n/a)'}")

# run (top 3 only to limit API usage)
cupid_print_starters(user_id=u, k=3, model="gpt-4o-mini")



CUPID Score = 0.8910
User id = 15046 | age = 36 | sex = m | location = san francisco, california
job =  | religion = 
bio text =
grew up near philadelphia. practiced law for a few years before deciding that i wanted to do something more creative. moving to the west coast has exposed me to a ...

Generated starters:
  Pickup line: Are you a legal document? Because you’ve got all the right clauses to catch my interest.
  Question:    What’s one creative project you’ve always wanted to dive into but haven’t yet?
  Date idea:   Let’s explore a local art gallery followed by a DIY painting class to unleash our inner creatives!

CUPID Score = 0.8547
User id = 18838 | age = 29 | sex = m | location = san francisco, california
job =  | religion = 
bio text =
i love to have fun -- this usually happens while having great discussions, watching sports, playing video games, exercising, reading books, watching movies, and many other things i find fascinating. ...

Generated starters:
  Pickup line: A

In [54]:
# run
final_table = cupid_finalize_table_simple(diverse_slate, user_id=15, k=100, serendipity_n=20)
final_table

Unnamed: 0,user_id,cupid_final,slot,age,sex,location,job,religion,shared_interests
0,20973,0.819688,matched,43,m,"san mateo, california",executive / management,christianity,
1,44725,0.813595,matched,54,m,"alameda, california",medicine / health,catholicism,
2,49783,0.813094,matched,42,m,"oakland, california",banking / financial / real estate,,
3,11636,0.799925,matched,63,m,"berkeley, california",political / government,atheism,
4,33493,0.796318,matched,36,m,"san francisco, california",other,catholicism but not too serious about it,
...,...,...,...,...,...,...,...,...,...
95,29371,0.658902,suggested,31,m,"vallejo, california",education / academia,atheism and laughing about it,
96,33773,0.658517,suggested,33,m,"san francisco, california",executive / management,agnosticism and laughing about it,
97,55125,0.658246,suggested,33,m,"san francisco, california",artistic / musical / writer,atheism and very serious about it,
98,33399,0.658234,suggested,31,m,"pacifica, california",computer / hardware / software,,


In [57]:
import pandas as pd, json, os
from datetime import datetime

# ensure folder exists
os.makedirs("results", exist_ok=True)

# run the algo
final_table = cupid_finalize_table_simple(
    diverse_slate, 
    user_id=15, 
    k=100, 
    serendipity_n=20
)

# standardize + annotate
cupid = final_table.reset_index(drop=True).copy()
cupid["rank"] = range(1, len(cupid) + 1)
cupid["app"]  = "custom"

# metadata
meta = {
    "app": "cupid",
    "user_id": 15,
    "top_n": 100,
    "serendipity_n": 20,
    "timestamp": datetime.utcnow().isoformat() + "Z",
}

stem = f"results/custom_algo_user_id:{meta['user_id']}_k{meta['top_n']}_ser{meta['serendipity_n']}"

# save
cupid.to_parquet(f"{stem}.parquet", index=False)
with open(f"{stem}.json","w") as f: json.dump(meta, f, indent=2)

print("Saved:", f"{stem}.parquet and {stem}.json")

Saved: results/custom_algo_user_id:15_k100_ser20.parquet and results/custom_algo_user_id:15_k100_ser20.json
