In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("okcupid_profiles.csv")
print("shape:", df.shape)
df.columns.tolist()

shape: (59946, 31)


['age',
 'status',
 'sex',
 'orientation',
 'body_type',
 'diet',
 'drinks',
 'drugs',
 'education',
 'ethnicity',
 'height',
 'income',
 'job',
 'last_online',
 'location',
 'offspring',
 'pets',
 'religion',
 'sign',
 'smokes',
 'speaks',
 'essay0',
 'essay1',
 'essay2',
 'essay3',
 'essay4',
 'essay5',
 'essay6',
 'essay7',
 'essay8',
 'essay9']

In [3]:
df.head(5).T

Unnamed: 0,0,1,2,3,4
age,22,35,38,23,29
status,single,single,available,single,single
sex,m,m,m,m,m
orientation,straight,straight,straight,straight,straight
body_type,a little extra,average,thin,thin,athletic
diet,strictly anything,mostly other,anything,vegetarian,
drinks,socially,often,socially,socially,socially
drugs,never,sometimes,,,never
education,working on college/university,working on space camp,graduated from masters program,working on college/university,graduated from college/university
ethnicity,"asian, white",white,,white,"asian, black, other"


In [4]:
(df.isna().mean().sort_values(ascending=False) * 100).round(1)

offspring      59.3
diet           40.7
religion       33.7
pets           33.2
essay8         32.1
drugs          23.5
essay6         23.0
essay9         21.0
essay7         20.8
essay3         19.1
sign           18.4
essay5         18.1
essay4         17.6
essay2         16.1
job            13.7
essay1         12.6
education      11.1
ethnicity       9.5
smokes          9.2
essay0          9.2
body_type       8.8
drinks          5.0
speaks          0.1
height          0.0
status          0.0
location        0.0
last_online     0.0
income          0.0
orientation     0.0
sex             0.0
age             0.0
dtype: float64

In [5]:
essay_cols = [f"essay{i}" for i in range(10)]  # essay0 to essay9

# Replace NaNs with empty strings so join works cleanly
for col in essay_cols:
    df[col] = df[col].fillna("")

# Join into one string per row, with a separator
df["bio_text"] = df[essay_cols].agg(" <e> ".join, axis=1)

df[["bio_text"]].head(5)


Unnamed: 0,bio_text
0,about me: i would love to think that i was so...
1,i am a chef: this is what that means. 1. i am ...
2,"i'm not ashamed of much, but writing public te..."
3,i work in a library and go to school. . . <e> ...
4,hey how's it going? currently vague on the pro...


In [6]:
!pip -q install sentence-transformers

You should consider upgrading via the 'C:\Users\veera\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [8]:
df = df.reset_index().rename(columns={"index":"user_id"})
corpus = df["bio_text"].fillna("").astype(str).str.strip().tolist()

In [9]:
# normalize_embeddings=True makes vectors unit-length,
# so cosine similarity = dot product
# embeddings = model.encode(
    # corpus,
    # batch_size=64,
    # show_progress_bar=True,
    # convert_to_numpy=True,
    # normalize_embeddings=True
 # shape: (n_users, 384)

# save for reuse (avoid recomputing)
#np.save("okcupid_sbert_embeddings.npy", embeddings)

In [10]:
embeddings = np.load("okcupid_sbert_embeddings.npy")

In [11]:
from numpy.linalg import norm

E = embeddings  # alias

def top_k_semantic_neighbors_by_text(user_id, k=5):
    # map user_id -> row index
    i = df.index[df["user_id"] == user_id][0]
    # dot with everyone (cosine, since normalized)
    sims = E @ E[i]
    sims[i] = -1  # exclude self
    # top-k indices
    idx = np.argpartition(-sims, k)[:k]
    idx = idx[np.argsort(-sims[idx])]
    out = df.loc[idx, ["user_id","age","sex","orientation","location"]].copy()
    out["score"] = sims[idx]
    return out

In [12]:
# Pretty-print the query + its matches
# Prints the query user's basic info and a snippet of their bio, then prints the top-k matched users with their own bio snippets."""
def show_query_and_matches(user_id, matches_df, k=5, bio_chars=300):
    # get the query row
    qrow = df.loc[df["user_id"] == user_id].iloc[0]
    print(f"QUERY user {int(qrow.user_id)} | age {qrow.age} | {qrow.sex} {qrow.orientation} | {qrow.location}")
    print("-" * 90)
    qb = (qrow.bio_text or "").strip()
    print(qb[:bio_chars] + ("..." if len(qb) > bio_chars else ""))
    print("\n=== Matches ===\n")

    # loop through top-k rows from the matches_df
    for _, r in matches_df.head(k).iterrows():
        cand_bio = df.loc[df["user_id"] == r.user_id, "bio_text"].iloc[0]
        print(f"user {int(r.user_id)} | age {r.age} | {r.sex} {r.orientation} | {r.location} | score={r.score:.3f}")
        print((cand_bio or "").strip()[:bio_chars] + ("..." if len(cand_bio) > bio_chars else ""))
        print("-" * 90)
m = top_k_semantic_neighbors_by_text(user_id=1234, k=5)
show_query_and_matches(1234, m, k=5, bio_chars=280)

QUERY user 1234 | age 31 | m straight | palo alto, california
------------------------------------------------------------------------------------------
hello! thanks for stopping by! i hope you enjoy yourself!  i am extremely financially responsible and nice! need a new kidney? i'll give you mine! if the doctor won't let us do that, i'll just buy you a new one! no expense is too much for you!  i definitely have skinny genes! rea...

=== Matches ===

user 44713 | age 23 | m straight | hayward, california | score=0.594
hey my name is jacob i'm a die hard raider! lol i work as a massage therapist at chiropractic offices and my personal clients i do some personal tranning generally weight loss, nutrition, and hypertrophy. i'm currently attending ohlone college working on a physical therapy assist...
------------------------------------------------------------------------------------------
user 10612 | age 28 | m gay | san francisco, california | score=0.564
high energy goofy nerd-jock-goo

In [13]:
import numpy as np
import pandas as pd

# 1) Define interest_set (who this person is likely interested in) ---
def interest_set(row):
    sex = str(row.get("sex", "")).strip().lower()
    orient = str(row.get("orientation", "")).strip().lower()
    if orient == "straight":
        return {"m"} if sex == "f" else {"f"}
    if orient == "gay":
        return {"f"} if sex == "f" else {"m"}
    if orient == "bisexual":
        return {"m","f"}
    # unknown/empty → be permissive
    return {"m","f"}
    
# Filters a candidate DataFrame to keep only those with mutual interest.
def filter_by_orientation(user_id, candidates_df):
    # query's sex and interest set
    qrow = df.loc[df["user_id"] == user_id].iloc[0]
    q_interest = interest_set(qrow)
    q_sex = str(qrow.get("sex", "")).strip().lower()

    def mutual_interest(row):
        their_interest = interest_set(row)
        cand_sex = str(row.get("sex", "")).strip().lower()
        return (cand_sex in q_interest) and (q_sex in their_interest)

    filtered = candidates_df[candidates_df.apply(mutual_interest, axis=1)]
    return filtered

In [14]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import numpy as np
import pandas as pd

# --- 1) Geocoding helper ---
geolocator = Nominatim(user_agent="okcupid-matcher")
geocode_rl = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1.5,   
    max_retries=3,           
    error_wait_seconds=2.5,  
    swallow_exceptions=True
)

def geocode_location(loc_str, country_bias="USA", timeout_sec=10):
    """Input: 'city, state' string -> (lat, lon). Returns (nan, nan) on failure."""
    if not isinstance(loc_str, str) or not loc_str.strip():
        return (np.nan, np.nan)
    q = loc_str.strip()
    if country_bias and country_bias.lower() not in q.lower():
        q = f"{q}, {country_bias}"  # e.g., "san lorenzo, california, USA"
    res = geocode_rl(q, timeout=timeout_sec)
    if res is None:
        return (np.nan, np.nan)
    return (res.latitude, res.longitude)

# Pre-geocode unique locations (one-time)
unique_locs = df["location"].dropna().unique()
loc_to_coords = {}
for loc in unique_locs:
    loc_to_coords[loc] = geocode_location(loc)

# Add lat/lon to the main df
df["lat"] = df["location"].map(lambda loc: loc_to_coords.get(loc, (np.nan, np.nan))[0])
df["lon"] = df["location"].map(lambda loc: loc_to_coords.get(loc, (np.nan, np.nan))[1])

# --- 2) Distance helper ---
def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points (in km)."""
    R = 6371.0088  # Earth's mean radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    return R * 2 * np.arcsin(np.sqrt(a))

In [15]:
# --- 3) Reusable filter (FAST: no API calls here) ---
def filter_by_location(user_id, candidates_df, max_km=50):
    """Keep only candidates within max_km of the query user's coords. Preserves order."""

    # get query coords; if missing, skip cleanly
    q = df.loc[df["user_id"] == user_id, ["lat", "lon"]]
    q_lat, q_lon = float(q.iloc[0]["lat"]), float(q.iloc[0]["lon"])

    # bring lat/lon into candidates_df (but only if needed to avoid lat_x/lat_y)
    if ("lat" in candidates_df.columns) and ("lon" in candidates_df.columns):
        cands = candidates_df.copy()
    else:
        cands = candidates_df.merge(df[["user_id", "lat", "lon"]],on="user_id", how="left", suffixes=("", "_df"))
        # if suffix columns appeared, coalesce back to plain 'lat'/'lon'
        if "lat_df" in cands.columns:
            cands["lat"] = cands.get("lat", np.nan).fillna(cands["lat_df"])
            cands.drop(columns=["lat_df"], inplace=True)
        if "lon_df" in cands.columns:
            cands["lon"] = cands.get("lon", np.nan).fillna(cands["lon_df"])
            cands.drop(columns=["lon_df"], inplace=True)

    # 2) distance predicate (use .get to avoid KeyError if a col is missing)
    def mutual_distance_ok(row):
        lat = row.get("lat", np.nan)
        lon = row.get("lon", np.nan)
        if pd.isna(lat) or pd.isna(lon) or pd.isna(q_lat) or pd.isna(q_lon):
            return False
        return haversine_km(q_lat, q_lon, lat, lon) <= max_km

    mask = cands.apply(mutual_distance_ok, axis=1)
    filtered = cands[mask].copy()
    print(f"Unfiltered: {len(candidates_df)} | After location filter (≤{max_km} km): {len(filtered)}")
    return filtered


In [16]:
print("coords in df:", "lat" in df.columns, "lon" in df.columns)
print(df[["location","lat","lon"]].head())

# verify the fresh cands used by get_matches has coords merged:
tmp = top_k_semantic_neighbors_by_text(user_id=1234, k=50).merge(
    df[["user_id","lat","lon"]], on="user_id", how="left")
print("coords in tmp:", "lat" in tmp.columns, "lon" in tmp.columns)
print(tmp[["user_id","lat","lon"]].head())

coords in df: True True
                          location        lat         lon
0  south san francisco, california  37.653540 -122.416866
1              oakland, california  37.804456 -122.271356
2        san francisco, california  37.779259 -122.419329
3             berkeley, california  37.870839 -122.272863
4        san francisco, california  37.779259 -122.419329
coords in tmp: True True
   user_id        lat         lon
0    44713  37.668821 -122.080796
1    10612  37.779259 -122.419329
2    30903  38.106198 -122.568119
3    31197  37.870839 -122.272863
4    23695  37.779259 -122.419329


In [17]:
cands = top_k_semantic_neighbors_by_text(user_id=1234, k=5000).copy()
cands.head(5)

Unnamed: 0,user_id,age,sex,orientation,location,score
44713,44713,23,m,straight,"hayward, california",0.593896
10612,10612,28,m,gay,"san francisco, california",0.564325
30903,30903,22,m,straight,"novato, california",0.560441
31197,31197,20,f,straight,"berkeley, california",0.559427
23695,23695,21,m,straight,"san francisco, california",0.559414


In [18]:
 """Keep only candidates whose age satisfies the half-your-age-plus-seven rule in both directions."""
def filter_by_age(user_id, candidates):
    # Get query user's age
    q_age = int(df.loc[df["user_id"] == user_id, "age"].iloc[0])
    # Query's acceptable range
    q_min = (q_age / 2) + 7
    q_max = (q_age - 7) * 2

    def mutual_age(row):
        cand_age = int(row["age"])
        # Candidate's acceptable range
        c_min = (cand_age / 2) + 7
        c_max = (cand_age - 7) * 2

        # Both must find each other acceptable
        return (q_min <= cand_age <= q_max) and (c_min <= q_age <= c_max)

    filtered = candidates[candidates.apply(mutual_age, axis=1)].copy()
    return filtered

In [19]:
import pandas as pd
import numpy as np

def _norm(s): 
    return str(s).strip().lower() if pd.notna(s) else ""

# ---- Diet → canonical groups ----
def canon_diet(v):
    v = _norm(v)
    if v in {"anything", "mostly anything", "strictly anything"}: return "omnivore"
    if v in {"vegetarian", "mostly vegetarian", "strictly vegetarian"}: return "vegetarian"
    if v in {"vegan", "mostly vegan", "strictly vegan"}: return "vegan"
    if v in {"pescetarian", "pescatarian"}: return "pescetarian"
    if v in {"kosher"}:  return "kosher"
    if v in {"halal"}:   return "halal"
    if v in {"mostly other", "other"}: return "other"
    return ""

# ---- Drinks → buckets ----
def canon_drinks(v):
    v = _norm(v)
    if v in {"not at all", "no"}:                      return "none"
    if v in {"rarely", "socially", "sometimes"}:       return "light"
    if v in {"often", "very often", "desperately"}:    return "heavy"
    return ""

# ---- Smokes → buckets ----
def canon_smokes(v):
    v = _norm(v)
    if v == "no":                                      return "none"
    if v in {"when drinking", "trying to quit", "sometimes"}: return "light"
    if v == "yes":                                     return "heavy"
    return ""

# ---- Drugs → buckets ----
def canon_drugs(v):
    v = _norm(v)
    if v in {"never", "no"}:                           return "none"
    if v in {"sometimes"}:                             return "sometimes"
    if v in {"often", "yes"}:                          return "often"
    return ""

# Create normalized columns once on df
df["diet_c"]   = df["diet"].apply(canon_diet)
df["drinks_c"] = df["drinks"].apply(canon_drinks)
df["smokes_c"] = df["smokes"].apply(canon_smokes)
df["drugs_c"]  = df["drugs"].apply(canon_drugs)

In [20]:
#Filters candidates by diet preference. Matches the query user's diet to each candidate's diet.
def filter_by_diet(user_id, candidates_df, allow_missing=True):
    """Keep candidates whose diet matches the user's diet_c (exact)."""
    user_diet = df.loc[df["user_id"] == user_id, "diet_c"].iloc[0]

    def diet_matches(row):
        cand_diet = row.get("diet_c", "")
        if user_diet and cand_diet:
            return cand_diet == user_diet
        return allow_missing

    out = candidates_df[candidates_df.apply(diet_matches, axis=1)].copy()
    print(f"Diet| before: {len(candidates_df)} → after: {len(out)}")
    return out

#Filters candidates by drinking habits. Matches the query user's 'drinks' field to each candidate's 'drinks' field.
def filter_by_drinks(user_id, candidates_df, allow_missing=True, strict=True):
    user_drinks = df.loc[df["user_id"] == user_id, "drinks_c"].iloc[0]
    neighbors = {"none": {"none","light"}, "light": {"none","light","heavy"}, "heavy": {"light","heavy"}}

    def drinks_match(row):
        cand = row.get("drinks_c", "")
        if not user_drinks or not cand:
            return allow_missing
        return (cand == user_drinks) if strict else (cand in neighbors[user_drinks])

    out = candidates_df[candidates_df.apply(drinks_match, axis=1)].copy()
    print(f"Drinks | before: {len(candidates_df)} → after: {len(out)} (strict={strict})")
    return out

# Keep candidates whose 'smokes' aligns with the user's preference.
#     strict=True  -> exact match (e.g., 'no' == 'no')
#     strict=False -> bucketed match:'no' -> 'no' 'sometimes'/'when drinking'/'trying to quit' -> 'light' 'yes' -> 'yes'
def filter_by_smokes(user_id, candidates_df, allow_missing=True, strict=False):
    user_smokes = df.loc[df["user_id"] == user_id, "smokes_c"].iloc[0]
    neighbors = {"none": {"none","light"}, "light": {"none","light","heavy"}, "heavy": {"light","heavy"}}

    def smokes_match(row):
        cand = row.get("smokes_c", "")
        if not user_smokes or not cand:
            return allow_missing
        return (cand == user_smokes) if strict else (cand in neighbors[user_smokes])

    out = candidates_df[candidates_df.apply(smokes_match, axis=1)].copy()
    print(f"Smokes | before: {len(candidates_df)} → after: {len(out)} (strict={strict})")
    return out

# Keep candidates whose 'drugs' aligns with the user's preference.
# strict=True  -> exact match (e.g., 'never' == 'never')
#strict=False -> bucketed match:'never'/'no' -> 'no' 'sometimes'/'often'/'yes' -> 'yes'
def filter_by_drugs(user_id, candidates_df, allow_missing=True, strict=True):
    user_drugs = df.loc[df["user_id"] == user_id, "drugs_c"].iloc[0]

    def drugs_match(row):
        cand = row.get("drugs_c", "")
        if not user_drugs or not cand:
            return allow_missing
        if strict:
            return cand == user_drugs
        # relaxed: only distinguish 'none' vs 'not-none'
        return (user_drugs == "none" and cand == "none") or (user_drugs != "none" and cand != "none")

    out = candidates_df[candidates_df.apply(drugs_match, axis=1)].copy()
    print(f"Drugs  | before: {len(candidates_df)} → after: {len(out)} (strict={strict})")
    return out

In [21]:
# get base candidates
cands = top_k_semantic_neighbors_by_text(user_id=1234, k=5000).copy()

# bring normalized lifestyle columns into cands (one time for this pool)
cands = cands.merge(
    df[["user_id","diet_c","drinks_c","smokes_c","drugs_c"]],
    on="user_id", how="left"
)

In [22]:
uid = 1234

# 1) Build a stable base pool (semantic neighbors only)
base = top_k_semantic_neighbors_by_text(user_id=uid, k=7000).copy()

# if your lifestyle filters expect *_c columns, attach once:
base = base.merge(
    df[["user_id","diet_c","drinks_c","smokes_c","drugs_c"]],
    on="user_id", how="left"
)

print("Base size:", len(base))

Base size: 7000


In [23]:
# 2) Test EACH filter separately (always starting from 'base')
location_only = filter_by_location(uid, base, 50)
location_only.head(5)

Unfiltered: 7000 | After location filter (≤50 km): 6044


Unnamed: 0,user_id,age,sex,orientation,location,score,diet_c,drinks_c,smokes_c,drugs_c,lat,lon
0,44713,23,m,straight,"hayward, california",0.593896,,light,none,,37.668821,-122.080796
1,10612,28,m,gay,"san francisco, california",0.564325,vegetarian,none,none,none,37.779259,-122.419329
3,31197,20,f,straight,"berkeley, california",0.559427,other,light,none,none,37.870839,-122.272863
4,23695,21,m,straight,"san francisco, california",0.559414,omnivore,light,light,,37.779259,-122.419329
5,353,32,m,straight,"redwood city, california",0.558399,,light,none,none,37.486324,-122.232523


In [24]:
# 2) Test EACH filter separately (always starting from 'base')
age_only = filter_by_age(uid, base)
age_only.head(5)

Unnamed: 0,user_id,age,sex,orientation,location,score,diet_c,drinks_c,smokes_c,drugs_c
0,44713,23,m,straight,"hayward, california",0.593896,,light,none,
1,10612,28,m,gay,"san francisco, california",0.564325,vegetarian,none,none,none
5,353,32,m,straight,"redwood city, california",0.558399,,light,none,none
6,33989,23,m,straight,"san francisco, california",0.556992,omnivore,light,none,none
7,46985,23,m,straight,"hayward, california",0.556756,,light,none,


In [25]:
# 2) Test EACH filter separately (always starting from 'base')
diet_only   = filter_by_diet(uid,base)
diet_only.head(5)

Diet| before: 7000 → after: 6223


Unnamed: 0,user_id,age,sex,orientation,location,score,diet_c,drinks_c,smokes_c,drugs_c
0,44713,23,m,straight,"hayward, california",0.593896,,light,none,
2,30903,22,m,straight,"novato, california",0.560441,omnivore,light,none,none
4,23695,21,m,straight,"san francisco, california",0.559414,omnivore,light,light,
5,353,32,m,straight,"redwood city, california",0.558399,,light,none,none
6,33989,23,m,straight,"san francisco, california",0.556992,omnivore,light,none,none


In [26]:
# 2) Test EACH filter separately (always starting from 'base')
drinks_only = filter_by_drinks(uid, base, allow_missing=True, strict=False)
drinks_only.head(5)

Drinks | before: 7000 → after: 7000 (strict=False)


Unnamed: 0,user_id,age,sex,orientation,location,score,diet_c,drinks_c,smokes_c,drugs_c
0,44713,23,m,straight,"hayward, california",0.593896,,light,none,
1,10612,28,m,gay,"san francisco, california",0.564325,vegetarian,none,none,none
2,30903,22,m,straight,"novato, california",0.560441,omnivore,light,none,none
3,31197,20,f,straight,"berkeley, california",0.559427,other,light,none,none
4,23695,21,m,straight,"san francisco, california",0.559414,omnivore,light,light,


In [27]:
# 2) Test EACH filter separately (always starting from 'base')
smokes_only = filter_by_smokes(uid, base, allow_missing=True, strict=False)
smokes_only.head(5)

Smokes | before: 7000 → after: 6731 (strict=False)


Unnamed: 0,user_id,age,sex,orientation,location,score,diet_c,drinks_c,smokes_c,drugs_c
0,44713,23,m,straight,"hayward, california",0.593896,,light,none,
1,10612,28,m,gay,"san francisco, california",0.564325,vegetarian,none,none,none
2,30903,22,m,straight,"novato, california",0.560441,omnivore,light,none,none
3,31197,20,f,straight,"berkeley, california",0.559427,other,light,none,none
4,23695,21,m,straight,"san francisco, california",0.559414,omnivore,light,light,


In [28]:
# 2) Test EACH filter separately (always starting from 'base')
drugs_only  = filter_by_drugs(uid,  base, allow_missing=True, strict=True)
drugs_only.head(5)

Drugs  | before: 7000 → after: 6152 (strict=True)


Unnamed: 0,user_id,age,sex,orientation,location,score,diet_c,drinks_c,smokes_c,drugs_c
0,44713,23,m,straight,"hayward, california",0.593896,,light,none,
1,10612,28,m,gay,"san francisco, california",0.564325,vegetarian,none,none,none
2,30903,22,m,straight,"novato, california",0.560441,omnivore,light,none,none
3,31197,20,f,straight,"berkeley, california",0.559427,other,light,none,none
4,23695,21,m,straight,"san francisco, california",0.559414,omnivore,light,light,


In [29]:
# 4) Compare sizes
print({
    "base":        len(base),
    "diet_only":   len(diet_only),
    "drinks_only": len(drinks_only),
    "smokes_only": len(smokes_only),
    "drugs_only":  len(drugs_only),
})

{'base': 7000, 'diet_only': 6223, 'drinks_only': 7000, 'smokes_only': 6731, 'drugs_only': 6152}


In [33]:
#test all filters 

def get_matches(
    user_id,
    k=10,
    max_candidates=5000,
    use_orientation=True,
    use_location=True, max_km=50,
    use_age=True,
    use_diet=True,
    use_drinks=False, drinks_strict=False,
    use_smokes=False, smokes_strict=False,
    use_drugs=False, drugs_strict=True,
    allow_missing=True
):
    """
    Full pipeline:
      1) text semantic neighbors
      2) (optional) mutual orientation
      3) (optional) location radius
      4) (optional) age rule (mutual half+7)
      5) (optional) lifestyle filters
    Always returns rows kept in original semantic similarity order.
    """
    # Base semantic candidates
    cands = top_k_semantic_neighbors_by_text(user_id=user_id, k=min(len(df) - 1, int(max_candidates))).copy()
    if "score" in cands.columns:
        cands.rename(columns={"score": "text_sim"}, inplace=True)

    # Merge any extra columns we might filter on
    merge_cols = ["user_id"]
    for col in ["diet_c","drinks_c","smokes_c","drugs_c","lat","lon"]:
        if col in df.columns:
            merge_cols.append(col)
    cands = cands.merge(df[merge_cols], on="user_id", how="left")

    print(f"Start pool: {len(cands)} (max_candidates={max_candidates})")

    # Filters
    if use_orientation:
        cands = filter_by_orientation(user_id, cands)
        print("After orientation:", len(cands))

    if use_location:
        cands = filter_by_location(user_id, cands, max_km=max_km)
        print(f"After location (≤{max_km} km):", len(cands))

    if use_age:
        cands = filter_by_age(user_id, cands)
        print("After age rule:", len(cands))

    if use_diet:
        cands = filter_by_diet(user_id, cands, allow_missing=allow_missing)
        print("After diet:", len(cands))

    if use_drinks:
        cands = filter_by_drinks(user_id, cands, allow_missing=allow_missing, strict=drinks_strict)
        print(f"After drinks (strict={drinks_strict}):", len(cands))

    if use_smokes:
        cands = filter_by_smokes(user_id, cands, allow_missing=allow_missing, strict=smokes_strict)
        print(f"After smokes (strict={smokes_strict}):", len(cands))

    if use_drugs:
        cands = filter_by_drugs(user_id, cands, allow_missing=allow_missing, strict=drugs_strict)
        print(f"After drugs (strict={drugs_strict}):", len(cands))

    # Return top-k after all filters
    return cands.head(k)

In [34]:
matches = get_matches(
    user_id=1234,
    k=10,
    use_orientation=True,
    use_location=True, max_km=50,
    use_age=True,
    use_diet=True,
    use_drinks=True,  drinks_strict=False,
    use_smokes=True,  smokes_strict=False,
    use_drugs=True,   drugs_strict=True,
    allow_missing=True
)
matches.head(5)

Start pool: 5000 (max_candidates=5000)
After orientation: 2023
Unfiltered: 2023 | After location filter (≤50 km): 1717
After location (≤50 km): 1717
After age rule: 1342
Diet| before: 1342 → after: 1152
After diet: 1152
Drinks | before: 1152 → after: 1152 (strict=False)
After drinks (strict=False): 1152
Smokes | before: 1152 → after: 1117 (strict=False)
After smokes (strict=False): 1117
Drugs  | before: 1117 → after: 1008 (strict=True)
After drugs (strict=True): 1008


Unnamed: 0,user_id,age,sex,orientation,location,text_sim,diet_c,drinks_c,smokes_c,drugs_c,lat,lon
13,57256,33,f,straight,"oakland, california",0.544072,,light,none,,37.804456,-122.271356
16,53843,47,f,straight,"oakland, california",0.533671,,light,none,none,37.804456,-122.271356
18,4015,38,f,straight,"san francisco, california",0.531963,,light,none,,37.779259,-122.419329
19,30039,24,f,straight,"san francisco, california",0.531841,,light,none,none,37.779259,-122.419329
26,9558,46,f,straight,"berkeley, california",0.522563,omnivore,none,none,none,37.870839,-122.272863


In [35]:
def compute_component_scores(user_id, cands, max_km=50, drinks_strict=False, smokes_strict=False, drugs_strict=True,
                             allow_missing=True):
    # --- text similarity ---
    if "text_sim" not in cands.columns and "score" in cands.columns:
        cands["text_sim"] = cands["score"]
    cands["text_sim"] = cands["text_sim"].fillna(0).clip(lower=0)

    # --- age score ---
    q_age = int(df.loc[df["user_id"] == user_id, "age"].iloc[0])
    q_min = (q_age / 2) + 7
    q_max = (q_age - 7) * 2

    def mutual_age_ok(a):
        a = int(a)
        a_min = (a / 2) + 7
        a_max = (a - 7) * 2
        return (q_min <= a <= q_max) and (a_min <= q_age <= a_max)

    cands["age_score"] = cands["age"].apply(lambda a: 1.0 if mutual_age_ok(a) else 0.0)

    # --- lifestyle score (0..1 average over available submatches) ---
    user_vals = {
        "diet_c":   df.loc[df["user_id"] == user_id, "diet_c"].iloc[0]  if "diet_c"   in df.columns else "",
        "drinks_c": df.loc[df["user_id"] == user_id, "drinks_c"].iloc[0] if "drinks_c" in df.columns else "",
        "smokes_c": df.loc[df["user_id"] == user_id, "smokes_c"].iloc[0] if "smokes_c" in df.columns else "",
        "drugs_c":  df.loc[df["user_id"] == user_id, "drugs_c"].iloc[0]  if "drugs_c"  in df.columns else "",
    }
    neigh = {"none": {"none","light"}, "light": {"none","light","heavy"}, "heavy": {"light","heavy"}}

    def lifestyle_row(row):
        parts = []
        # diet exact when both present
        if "diet_c" in row and user_vals["diet_c"]:
            if pd.notna(row["diet_c"]) and row["diet_c"]:
                parts.append(1.0 if row["diet_c"] == user_vals["diet_c"] else 0.0)

        # drinks strict/relaxed
        if "drinks_c" in row and user_vals["drinks_c"]:
            if pd.notna(row["drinks_c"]) and row["drinks_c"]:
                if drinks_strict:
                    parts.append(1.0 if row["drinks_c"] == user_vals["drinks_c"] else 0.0)
                else:
                    parts.append(1.0 if row["drinks_c"] in neigh.get(user_vals["drinks_c"], {user_vals["drinks_c"]}) else 0.0)

        # smokes strict/relaxed
        if "smokes_c" in row and user_vals["smokes_c"]:
            if pd.notna(row["smokes_c"]) and row["smokes_c"]:
                if smokes_strict:
                    parts.append(1.0 if row["smokes_c"] == user_vals["smokes_c"] else 0.0)
                else:
                    parts.append(1.0 if row["smokes_c"] in neigh.get(user_vals["smokes_c"], {user_vals["smokes_c"]}) else 0.0)

        # drugs strict or none vs non-none
        if "drugs_c" in row and user_vals["drugs_c"]:
            if pd.notna(row["drugs_c"]) and row["drugs_c"]:
                if drugs_strict:
                    parts.append(1.0 if row["drugs_c"] == user_vals["drugs_c"] else 0.0)
                else:
                    parts.append(1.0 if ((user_vals["drugs_c"] == "none" and row["drugs_c"] == "none") or
                                         (user_vals["drugs_c"] != "none" and row["drugs_c"] != "none")) else 0.0)

        if len(parts) == 0:
            return (1/3) if allow_missing else 0.0  # small neutral bump if everything missing
        return float(np.mean(parts))

    cands["lifestyle_score"] = cands.apply(lifestyle_row, axis=1)

    # --- location score (needs lat/lon) ---
    cands["loc_score"] = 0.0
    if {"lat","lon"}.issubset(cands.columns) and {"lat","lon"}.issubset(df.columns):
        q = df.loc[df["user_id"] == user_id, ["lat","lon"]]
        if not q.empty and not q.isna().any(axis=None):
            q_lat, q_lon = float(q.iloc[0]["lat"]), float(q.iloc[0]["lon"])
            valid = cands["lat"].notna() & cands["lon"].notna()
            dists = np.full(len(cands), np.nan)
            dists[valid.values] = haversine_km(q_lat, q_lon, cands.loc[valid, "lat"].values, cands.loc[valid, "lon"].values)
            cands["distance_km"] = dists
            if max_km > 0:
                cands["loc_score"] = np.clip(1 - (cands["distance_km"] / max_km), 0, 1).fillna(0)

    return cands
 

In [36]:
def apply_weights(cands, w_text=0.70, w_age=0.15, w_life=0.10, w_loc=0.05):
    """
    final_score = 0.70*text_sim + 0.15*age_score + 0.10*lifestyle_score + 0.05*loc_score
    Returns a new DataFrame sorted by final_score desc.
    """
    # normalize weights so any numbers work
    total = float(w_text + w_age + w_life + w_loc)
    if total <= 0:
        w_text, w_age, w_life, w_loc = 1.0, 0.0, 0.0, 0.0
        total = 1.0
    w_text, w_age, w_life, w_loc = (w_text/total, w_age/total, w_life/total, w_loc/total)

    # make sure component cols exist
    for col in ["text_sim","age_score","lifestyle_score","loc_score"]:
        if col not in cands.columns:
            cands[col] = 0.0

    cands = cands.copy()
    cands["final_score"] = (w_text * cands["text_sim"] + w_age  * cands["age_score"] + w_life * cands["lifestyle_score"] + w_loc  * cands["loc_score"])
    return cands.sort_values("final_score", ascending=False)

In [37]:
# 1) Get your filtered candidates with your existing pipeline
cands = get_matches(
    user_id=1234,
    k=5000,                    
    use_orientation=True,
    use_location=True, max_km=50,
    use_age=True,
    use_diet=True,
    use_drinks=True, drinks_strict=False,
    use_smokes=True, smokes_strict=False,
    use_drugs=True, drugs_strict=True,
    allow_missing=True
)

# 2) Compute component scores on that pool
cands = compute_component_scores(
    user_id=1234, cands=cands, max_km=50,
    drinks_strict=False, smokes_strict=False, drugs_strict=True,
    allow_missing=True
)

# 3) Apply weights and get the top-k you actually want to show
ranked = apply_weights(cands, w_text=0.70, w_age=0.15, w_life=0.10, w_loc=0.05)
ranked.head(10)   # your final ranked matches


Start pool: 5000 (max_candidates=5000)
After orientation: 2023
Unfiltered: 2023 | After location filter (≤50 km): 1717
After location (≤50 km): 1717
After age rule: 1342
Diet| before: 1342 → after: 1152
After diet: 1152
Drinks | before: 1152 → after: 1152 (strict=False)
After drinks (strict=False): 1152
Smokes | before: 1152 → after: 1117 (strict=False)
After smokes (strict=False): 1117
Drugs  | before: 1117 → after: 1008 (strict=True)
After drugs (strict=True): 1008


Unnamed: 0,user_id,age,sex,orientation,location,text_sim,diet_c,drinks_c,smokes_c,drugs_c,lat,lon,age_score,lifestyle_score,loc_score,distance_km,final_score
106,4064,34,f,straight,"palo alto, california",0.486174,,light,none,none,37.444329,-122.159847,1.0,1.0,1.0,0.0,0.640322
13,57256,33,f,straight,"oakland, california",0.544072,,light,none,,37.804456,-122.271356,1.0,1.0,0.175382,41.230915,0.639619
111,57586,35,f,straight,"atherton, california",0.484042,omnivore,light,none,none,37.453773,-122.205827,1.0,1.0,0.916146,4.192677,0.634637
67,39282,30,f,straight,"san mateo, california",0.499256,omnivore,light,none,none,37.496904,-122.333057,1.0,1.0,0.672682,16.365892,0.633113
16,53843,47,f,straight,"oakland, california",0.533671,,light,none,none,37.804456,-122.271356,1.0,1.0,0.175382,41.230915,0.632339
33,33605,34,f,straight,"castro valley, california",0.515604,,light,none,none,37.713606,-122.071566,1.0,1.0,0.381272,30.936387,0.629986
18,4015,38,f,straight,"san francisco, california",0.531963,,light,none,,37.779259,-122.419329,1.0,1.0,0.126063,43.696864,0.628677
19,30039,24,f,straight,"san francisco, california",0.531841,,light,none,none,37.779259,-122.419329,1.0,1.0,0.126063,43.696864,0.628592
32,49650,31,f,straight,"oakland, california",0.517111,omnivore,light,none,none,37.804456,-122.271356,1.0,1.0,0.175382,41.230915,0.620747
29,20404,43,f,straight,"san francisco, california",0.518943,,light,none,none,37.779259,-122.419329,1.0,1.0,0.126063,43.696864,0.619563


In [38]:
ranked_local = apply_weights(cands, w_text=0.30, w_age=0.10, w_life=0.10, w_loc=0.50)
ranked_local.head(10)

Unnamed: 0,user_id,age,sex,orientation,location,text_sim,diet_c,drinks_c,smokes_c,drugs_c,lat,lon,age_score,lifestyle_score,loc_score,distance_km,final_score
106,4064,34,f,straight,"palo alto, california",0.486174,,light,none,none,37.444329,-122.159847,1.0,1.0,1.0,0.0,0.845852
386,13202,25,f,straight,"palo alto, california",0.446574,,light,none,none,37.444329,-122.159847,1.0,1.0,1.0,0.0,0.833972
1097,48707,26,f,straight,"palo alto, california",0.415007,,light,none,none,37.444329,-122.159847,1.0,1.0,1.0,0.0,0.824502
1099,47132,28,f,straight,"palo alto, california",0.414993,,light,none,none,37.444329,-122.159847,1.0,1.0,1.0,0.0,0.824498
1532,10289,28,f,straight,"palo alto, california",0.403475,omnivore,light,none,none,37.444329,-122.159847,1.0,1.0,1.0,0.0,0.821042
2080,27216,26,f,straight,"palo alto, california",0.392013,omnivore,light,none,none,37.444329,-122.159847,1.0,1.0,1.0,0.0,0.817604
2497,50486,35,f,straight,"palo alto, california",0.384409,,light,none,none,37.444329,-122.159847,1.0,1.0,1.0,0.0,0.815323
2501,28737,46,f,straight,"palo alto, california",0.384343,,light,none,none,37.444329,-122.159847,1.0,1.0,1.0,0.0,0.815303
2635,51549,28,f,straight,"palo alto, california",0.382397,omnivore,light,none,none,37.444329,-122.159847,1.0,1.0,1.0,0.0,0.814719
2647,40607,23,f,straight,"palo alto, california",0.382205,omnivore,light,none,,37.444329,-122.159847,1.0,1.0,1.0,0.0,0.814662
