In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw OkCupid profile export and report its dimensions.
df = pd.read_csv("okcupid_profiles.csv")
print(f"shape: {df.shape}")
# Last expression of the cell: the column names as a plain list.
list(df.columns)

shape: (59946, 31)


['age',
 'status',
 'sex',
 'orientation',
 'body_type',
 'diet',
 'drinks',
 'drugs',
 'education',
 'ethnicity',
 'height',
 'income',
 'job',
 'last_online',
 'location',
 'offspring',
 'pets',
 'religion',
 'sign',
 'smokes',
 'speaks',
 'essay0',
 'essay1',
 'essay2',
 'essay3',
 'essay4',
 'essay5',
 'essay6',
 'essay7',
 'essay8',
 'essay9']

In [3]:
# First five profiles, transposed so each column of df becomes a row.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
age,22,35,38,23,29
status,single,single,available,single,single
sex,m,m,m,m,m
orientation,straight,straight,straight,straight,straight
body_type,a little extra,average,thin,thin,athletic
diet,strictly anything,mostly other,anything,vegetarian,
drinks,socially,often,socially,socially,socially
drugs,never,sometimes,,,never
education,working on college/university,working on space camp,graduated from masters program,working on college/university,graduated from college/university
ethnicity,"asian, white",white,,white,"asian, black, other"


In [4]:
# Share of missing values per column, most incomplete first, shown as a
# percentage rounded to one decimal place.
missing_share = df.isna().mean().sort_values(ascending=False)
missing_share.mul(100).round(1)

offspring      59.3
diet           40.7
religion       33.7
pets           33.2
essay8         32.1
drugs          23.5
essay6         23.0
essay9         21.0
essay7         20.8
essay3         19.1
sign           18.4
essay5         18.1
essay4         17.6
essay2         16.1
job            13.7
essay1         12.6
education      11.1
ethnicity       9.5
smokes          9.2
essay0          9.2
body_type       8.8
drinks          5.0
speaks          0.1
height          0.0
status          0.0
location        0.0
last_online     0.0
income          0.0
orientation     0.0
sex             0.0
age             0.0
dtype: float64

In [5]:
# The ten free-text essay columns: essay0 ... essay9.
essay_cols = [f"essay{i}" for i in range(10)]

# Missing essays become empty strings so the join below never sees NaN.
df[essay_cols] = df[essay_cols].fillna("")

# Concatenate the essays of each profile into a single string, keeping a
# visible " <e> " marker between consecutive essays.
df["bio_text"] = df[essay_cols].apply(lambda essays: " <e> ".join(essays), axis=1)

df[["bio_text"]].head()


Unnamed: 0,bio_text
0,about me: i would love to think that i was so...
1,i am a chef: this is what that means. 1. i am ...
2,"i'm not ashamed of much, but writing public te..."
3,i work in a library and go to school. . . <e> ...
4,hey how's it going? currently vague on the pro...


In [6]:
!pip -q install sentence-transformers

You should consider upgrading via the 'C:\Users\veera\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np
# Sentence-embedding checkpoint used to encode every bio.
sbert_checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(sbert_checkpoint)

In [8]:
# Expose the DataFrame's current index as an explicit "user_id" column.
# Guarded so that re-running this cell does not append a second "user_id"
# column (which would turn df["user_id"] into a DataFrame and break every
# lookup further down).
if "user_id" not in df.columns:
    df = df.reset_index().rename(columns={"index": "user_id"})

# One cleaned bio string per profile, in the same row order as df.
corpus = df["bio_text"].fillna("").astype(str).str.strip().tolist()

In [10]:
# One-time step, kept commented out so the notebook reuses the saved file
# instead of recomputing the embeddings.
# normalize_embeddings=True makes vectors unit-length,
# so cosine similarity = dot product.
# embeddings = model.encode(
#     corpus,
#     batch_size=64,
#     show_progress_bar=True,
#     convert_to_numpy=True,
#     normalize_embeddings=True,
# )  # shape: (n_users, 384)

# Save for reuse (avoid recomputing).
# np.save("okcupid_sbert_embeddings.npy", embeddings)

In [11]:
# Load the cached SBERT embeddings; if the cache file does not exist yet
# (fresh checkout / first run), compute them from `corpus` and save them so
# later runs are fast.
EMBEDDINGS_PATH = "okcupid_sbert_embeddings.npy"
try:
    embeddings = np.load(EMBEDDINGS_PATH)
except FileNotFoundError:
    # normalize_embeddings=True makes vectors unit-length,
    # so cosine similarity = dot product.
    embeddings = model.encode(
        corpus,
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    np.save(EMBEDDINGS_PATH, embeddings)

# Row i of the matrix must describe corpus[i]; a cache built from a different
# dataset would silently return the wrong neighbours.
if embeddings.shape[0] != len(corpus):
    raise ValueError(
        f"{EMBEDDINGS_PATH} has {embeddings.shape[0]} rows but the corpus has "
        f"{len(corpus)} entries; delete the file to recompute the embeddings."
    )

In [12]:
from numpy.linalg import norm

# Short name for the embedding matrix used by the neighbour search below.
# NOTE(review): rows are presumably in the same order as df's rows — confirm.
E = embeddings  # alias

def top_k_semantic_neighbors_by_text(user_id, k=5):
    """Return the k profiles whose bio embedding is most similar to a user's.

    Similarity is the dot product between rows of the module-level matrix
    ``E`` (equal to cosine similarity if the rows are unit-length, which is
    assumed here). Row ``i`` of ``E`` is taken to describe row ``i`` of ``df``.

    Parameters
    ----------
    user_id : value looked up in ``df["user_id"]``.
    k : number of neighbours wanted. Values larger than the number of other
        profiles are clamped; values <= 0 give an empty result.

    Returns
    -------
    DataFrame with columns user_id, age, sex, orientation, location and
    score, ordered from most to least similar. The query user is never
    included.

    Raises
    ------
    IndexError if ``user_id`` is not present in ``df``.
    """
    # Positional row of the query user (E is indexed by position, not label).
    hits = np.flatnonzero((df["user_id"] == user_id).to_numpy())
    if hits.size == 0:
        raise IndexError(f"user_id {user_id!r} not found in df")
    i = hits[0]

    # dot with everyone (cosine, since normalized); the product is a new array
    sims = E @ E[i]
    sims[i] = -np.inf  # exclude self; -inf cannot tie with a real score

    # argpartition requires kth < len(sims); clamping also keeps the masked
    # query row out of the result when the caller asks for "everyone".
    k = max(0, min(int(k), sims.shape[0] - 1))

    # top-k indices (unordered), then sort just those k by descending score
    idx = np.argpartition(-sims, k)[:k]
    idx = idx[np.argsort(-sims[idx])]

    out = df.iloc[idx][["user_id", "age", "sex", "orientation", "location"]].copy()
    out["score"] = sims[idx]
    return out

In [13]:
def show_query_and_matches(user_id, matches_df, k=5, bio_chars=300):
    """Pretty-print a query user followed by their matches.

    Prints the query user's basic info and a snippet of their bio, then the
    first ``k`` rows of ``matches_df`` with each match's own bio snippet.

    Parameters
    ----------
    user_id : id of the query user, looked up in ``df["user_id"]``.
    matches_df : DataFrame with at least the columns user_id, age, sex,
        orientation, location and score.
    k : maximum number of matches to print.
    bio_chars : maximum number of bio characters shown per profile; longer
        bios are cut and suffixed with "...".
    """
    def snippet(text):
        # Strip first so the "..." marker reflects the text actually printed;
        # anything that is not a string (e.g. NaN) is shown as empty.
        clean = text.strip() if isinstance(text, str) else ""
        return clean[:bio_chars] + ("..." if len(clean) > bio_chars else "")

    # get the query row
    qrow = df.loc[df["user_id"] == user_id].iloc[0]
    print(f"QUERY user {int(qrow.user_id)} | age {qrow.age} | {qrow.sex} {qrow.orientation} | {qrow.location}")
    print("-" * 90)
    print(snippet(qrow.bio_text))
    print("\n=== Matches ===\n")

    # loop through top-k rows from the matches_df
    for _, r in matches_df.head(k).iterrows():
        cand_bio = df.loc[df["user_id"] == r.user_id, "bio_text"].iloc[0]
        print(f"user {int(r.user_id)} | age {r.age} | {r.sex} {r.orientation} | {r.location} | score={r.score:.3f}")
        print(snippet(cand_bio))
        print("-" * 90)
# Example: the five closest bios to user 1234, printed with short snippets.
m = top_k_semantic_neighbors_by_text(1234, 5)
show_query_and_matches(user_id=1234, matches_df=m, k=5, bio_chars=280)

QUERY user 1234 | age 31 | m straight | palo alto, california
------------------------------------------------------------------------------------------
hello! thanks for stopping by! i hope you enjoy yourself!  i am extremely financially responsible and nice! need a new kidney? i'll give you mine! if the doctor won't let us do that, i'll just buy you a new one! no expense is too much for you!  i definitely have skinny genes! rea...

=== Matches ===

user 44713 | age 23 | m straight | hayward, california | score=0.594
hey my name is jacob i'm a die hard raider! lol i work as a massage therapist at chiropractic offices and my personal clients i do some personal tranning generally weight loss, nutrition, and hypertrophy. i'm currently attending ohlone college working on a physical therapy assist...
------------------------------------------------------------------------------------------
user 10612 | age 28 | m gay | san francisco, california | score=0.564
high energy goofy nerd-jock-goo

In [14]:
import numpy as np
import pandas as pd

# 1) Define interest_set (who this person is likely interested in) ---
def interest_set(row):
    """Return the set of sexes ("m"/"f") this profile is likely interested in.

    ``row`` is anything with a dict-style ``.get`` (a pandas row or a plain
    dict) holding "sex" and "orientation". Values are compared after
    stripping whitespace and lower-casing.

    - straight -> the other sex
    - gay      -> the same sex
    - bisexual, unknown orientation, or unknown sex -> {"m", "f"} (permissive)
    """
    everyone = {"m", "f"}
    sex = str(row.get("sex", "")).strip().lower()
    orient = str(row.get("orientation", "")).strip().lower()

    # Without a known sex "same"/"other" is undefined, so be permissive
    # instead of silently treating the profile as male.
    if sex not in everyone:
        return everyone

    other = "m" if sex == "f" else "f"
    if orient == "straight":
        return {other}
    if orient == "gay":
        return {sex}
    # bisexual / unknown / empty → be permissive
    return everyone
    
def filter_by_orientation(user_id, candidates_df):
    """Keep only the candidates with mutual interest.

    A candidate is kept when their sex is in the query user's interest set
    and the query user's sex is in the candidate's interest set (both sets
    come from ``interest_set``). Row order of ``candidates_df`` is preserved.
    """
    # The query user's own sex and the sexes they are interested in.
    query = df.loc[df["user_id"] == user_id].iloc[0]
    wanted_by_query = interest_set(query)
    query_sex = str(query.get("sex", "")).strip().lower()

    def is_mutual(cand):
        cand_sex = str(cand.get("sex", "")).strip().lower()
        if cand_sex not in wanted_by_query:
            return False
        return query_sex in interest_set(cand)

    keep = candidates_df.apply(is_mutual, axis=1)
    return candidates_df[keep]

In [15]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import numpy as np

# --- 1) Geocoding helper ---
# Nominatim client, wrapped so that consecutive lookups are spaced at least
# `geocode_min_delay_s` seconds apart.
geocode_min_delay_s = 1.0
geolocator = Nominatim(user_agent="okcupid-matcher")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=geocode_min_delay_s)

def geocode_location(loc_str):
    """Look up coordinates for a 'city, state' string.

    Returns (lat, lon) as reported by the module-level ``geocode`` callable,
    or (np.nan, np.nan) when the input is not a non-blank string, the lookup
    returns nothing, or the lookup raises.
    """
    not_found = (np.nan, np.nan)

    query = loc_str.strip() if isinstance(loc_str, str) else ""
    if not query:
        return not_found

    try:
        hit = geocode(query)
        return not_found if hit is None else (hit.latitude, hit.longitude)
    except Exception:
        # Best effort: a failed lookup is reported as "no coordinates".
        return not_found

# Pre-geocode unique locations
# Pre-geocode each distinct location once (one lookup per unique string
# instead of one per profile).
unique_locs = df["location"].dropna().unique()
loc_to_coords = {loc: geocode_location(loc) for loc in unique_locs}

# Add lat/lon columns. Mapping through dicts (instead of indexing
# loc_to_coords inside a lambda) yields NaN for rows whose location is
# missing rather than raising KeyError, consistent with the dropna() above.
lat_by_loc = {loc: coords[0] for loc, coords in loc_to_coords.items()}
lon_by_loc = {loc: coords[1] for loc, coords in loc_to_coords.items()}
df["lat"] = df["location"].map(lat_by_loc).astype(float)
df["lon"] = df["location"].map(lon_by_loc).astype(float)

# --- 2) Distance helper ---
def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points (in km).

    Arguments are latitudes/longitudes in degrees. Scalars and array-likes
    are both accepted and follow NumPy broadcasting. NaN in any input yields
    NaN in the corresponding output.
    """
    R = 6371.0088  # Earth's mean radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal points,
    # which would make sqrt/arcsin return NaN; clip keeps NaN inputs as NaN.
    a = np.clip(a, 0.0, 1.0)
    return R * 2 * np.arcsin(np.sqrt(a))

# --- 3) Reusable filter ---
def filter_by_location(user_id, candidates_df, max_km=50):
    """Keep only candidates within ``max_km`` of the query user's location.

    Coordinates are read from the "lat"/"lon" columns of ``candidates_df``
    when it has them; otherwise they are looked up in ``df`` through the
    candidates' "user_id" column (the frames produced by
    ``top_k_semantic_neighbors_by_text`` carry no coordinates).

    Candidates with unknown coordinates are dropped, and if the query user's
    own coordinates are unknown nobody is kept. Row order is preserved.
    Prints the candidate counts before and after filtering and returns the
    filtered rows as a copy.
    """
    query_coords = df.loc[df["user_id"] == user_id, ["lat", "lon"]].iloc[0]
    q_lat, q_lon = query_coords["lat"], query_coords["lon"]

    if "lat" in candidates_df.columns and "lon" in candidates_df.columns:
        coords = candidates_df[["lat", "lon"]]
    else:
        # Look the coordinates up by user_id, keeping candidate order.
        by_user = df.drop_duplicates("user_id").set_index("user_id")[["lat", "lon"]]
        coords = by_user.reindex(candidates_df["user_id"])

    cand_lat = coords["lat"].to_numpy(dtype=float)
    cand_lon = coords["lon"].to_numpy(dtype=float)

    # One vectorised distance computation; NaN distances compare False and
    # are therefore filtered out.
    with np.errstate(invalid="ignore"):
        dist_km = haversine_km(q_lat, q_lon, cand_lat, cand_lon)
        keep = dist_km <= max_km

    filtered = candidates_df.loc[keep].copy()
    print(f"Unfiltered: {len(candidates_df)} | After location filter: {len(filtered)}")
    return filtered

In [16]:
# Start from a wide pool of the 5000 most similar bios for user 1234;
# the filters in the following cells narrow it down.
cands = top_k_semantic_neighbors_by_text(1234, k=5000).copy()
cands.head()

Unnamed: 0,user_id,age,sex,orientation,location,score
44713,44713,23,m,straight,"hayward, california",0.593896
10612,10612,28,m,gay,"san francisco, california",0.564325
30903,30903,22,m,straight,"novato, california",0.560441
31197,31197,20,f,straight,"berkeley, california",0.559427
23695,23695,21,m,straight,"san francisco, california",0.559414


In [17]:
# Narrow the pool to candidates with mutual sex/orientation interest.
cands = filter_by_orientation(user_id=1234, candidates_df=cands)
cands.head()

Unnamed: 0,user_id,age,sex,orientation,location,score
31197,31197,20,f,straight,"berkeley, california",0.559427
20714,20714,18,f,straight,"richmond, california",0.555317
57256,57256,33,f,straight,"oakland, california",0.544072
26993,26993,20,f,bisexual,"oakland, california",0.534152
53843,53843,47,f,straight,"oakland, california",0.533671


In [18]:
# Narrow the pool to candidates within 50 km of the query user.
cands = filter_by_location(user_id=1234, candidates_df=cands, max_km=50)
cands.head()

KeyboardInterrupt: 

In [19]:
# Keep only candidates whose age satisfies the half-your-age-plus-seven rule in both directions.
def filter_by_age_half_plus_seven(user_id, candidates):
    """Keep candidates that pass the half-your-age-plus-seven rule both ways.

    A person of age ``a`` accepts partner ages in [a / 2 + 7, (a - 7) * 2].
    A candidate is kept only if the query user accepts the candidate's age
    and the candidate accepts the query user's age. Ages are converted with
    ``int``. Returns the surviving rows as a copy, in their original order.
    """
    query_age = int(df.loc[df["user_id"] == user_id, "age"].iloc[0])

    def accepts(own_age, other_age):
        # True when `other_age` lies inside `own_age`'s acceptable range.
        return (own_age / 2) + 7 <= other_age <= (own_age - 7) * 2

    def both_accept(cand):
        cand_age = int(cand["age"])
        return accepts(query_age, cand_age) and accepts(cand_age, query_age)

    keep = candidates.apply(both_accept, axis=1)
    return candidates[keep].copy()


In [21]:
# Narrow the pool to candidates whose age is mutually acceptable.
cands = filter_by_age_half_plus_seven(user_id=1234, candidates=cands)
cands.head()

Unnamed: 0,user_id,age,sex,orientation,location,score
57256,57256,33,f,straight,"oakland, california",0.544072
53843,53843,47,f,straight,"oakland, california",0.533671
4015,4015,38,f,straight,"san francisco, california",0.531963
30039,30039,24,f,straight,"san francisco, california",0.531841
52460,52460,24,f,straight,"san leandro, california",0.525988
