In [1]:
import pandas as pd

In [10]:
# Read the raw OkCupid profile export, then report its dimensions and column names.
csv_path = "okcupid_profiles.csv"
df = pd.read_csv(csv_path)
print("shape:", df.shape)
list(df.columns)

shape: (59946, 31)


['age',
 'status',
 'sex',
 'orientation',
 'body_type',
 'diet',
 'drinks',
 'drugs',
 'education',
 'ethnicity',
 'height',
 'income',
 'job',
 'last_online',
 'location',
 'offspring',
 'pets',
 'religion',
 'sign',
 'smokes',
 'speaks',
 'essay0',
 'essay1',
 'essay2',
 'essay3',
 'essay4',
 'essay5',
 'essay6',
 'essay7',
 'essay8',
 'essay9']

In [11]:
# Show the first five profiles transposed, so each column of `df` becomes a row.
df.iloc[:5].transpose()

Unnamed: 0,0,1,2,3,4
age,22,35,38,23,29
status,single,single,available,single,single
sex,m,m,m,m,m
orientation,straight,straight,straight,straight,straight
body_type,a little extra,average,thin,thin,athletic
diet,strictly anything,mostly other,anything,vegetarian,
drinks,socially,often,socially,socially,socially
drugs,never,sometimes,,,never
education,working on college/university,working on space camp,graduated from masters program,working on college/university,graduated from college/university
ethnicity,"asian, white",white,,white,"asian, black, other"


In [12]:
# Percentage of missing values in each column, listed from most to least incomplete.
missing_share = df.isna().mean()
missing_share.sort_values(ascending=False).mul(100).round(1)

offspring      59.3
diet           40.7
religion       33.7
pets           33.2
essay8         32.1
drugs          23.5
essay6         23.0
essay9         21.0
essay7         20.8
essay3         19.1
sign           18.4
essay5         18.1
essay4         17.6
essay2         16.1
job            13.7
essay1         12.6
education      11.1
ethnicity       9.5
smokes          9.2
essay0          9.2
body_type       8.8
drinks          5.0
speaks          0.1
height          0.0
status          0.0
location        0.0
last_online     0.0
income          0.0
orientation     0.0
sex             0.0
age             0.0
dtype: float64

In [16]:
# Names of the ten free-text essay columns: essay0 ... essay9
essay_cols = ["essay" + str(i) for i in range(10)]

# Blank out missing essays (in place) so every cell is a string and can be joined
df[essay_cols] = df[essay_cols].fillna("")

# Concatenate each profile's essays into a single document, separated by " <e> "
df["bio_text"] = df[essay_cols].apply(lambda essays: " <e> ".join(essays), axis=1)

df[["bio_text"]].head(5)


Unnamed: 0,bio_text
0,about me: i would love to think that i was so...
1,i am a chef: this is what that means. 1. i am ...
2,"i'm not ashamed of much, but writing public te..."
3,i work in a library and go to school. . . <e> ...
4,hey how's it going? currently vague on the pro...


In [17]:
!pip -q install sentence-transformers

You should consider upgrading via the 'C:\Users\veera\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [18]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Pretrained sentence encoder used to embed the profile text.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [19]:
# Expose the row index as an explicit `user_id` column.
# The guard keeps this cell safe to re-run: without it, a second execution would insert
# another "index" column and rename it too, leaving two columns called `user_id`
# (after which df["user_id"] returns a DataFrame instead of a Series).
if "user_id" not in df.columns:
    df = df.reset_index().rename(columns={"index": "user_id"})

# One whitespace-trimmed document per profile, in the same row order as `df`.
corpus = df["bio_text"].fillna("").astype(str).str.strip().tolist()

In [20]:
# Embed every bio. normalize_embeddings=True makes the vectors unit-length,
# so cosine similarity = dot product.
#
# Encoding the whole corpus is the expensive step of this notebook, so the saved array is
# reused when it exists and still has one row per document. Delete the file to force a
# recompute (e.g. after changing the model or the text preprocessing): the row-count
# check alone cannot detect those changes.
EMBEDDINGS_PATH = "okcupid_sbert_embeddings.npy"

try:
    embeddings = np.load(EMBEDDINGS_PATH)
except FileNotFoundError:
    embeddings = None  # nothing cached yet

if embeddings is None or embeddings.shape[0] != len(corpus):
    embeddings = model.encode(
        corpus,
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )  # shape: (n_users, 384)

    # save for reuse (avoid recomputing)
    np.save(EMBEDDINGS_PATH, embeddings)

Batches:   0%|          | 0/937 [00:00<?, ?it/s]

In [22]:
from numpy.linalg import norm

E = embeddings  # short alias for the embedding matrix; read as a global by top_k_semantic_neighbors

def top_k_semantic_neighbors(user_id, k=5, embeddings=None, profiles=None):
    """Return the ``k`` profiles whose embeddings are most similar to ``user_id``'s.

    Similarity is the dot product between embedding rows, which equals cosine
    similarity when the rows are unit-length.

    Parameters
    ----------
    user_id
        Value to look up in the ``user_id`` column of the profile table.
    k : int, default 5
        Number of neighbours to return. Values larger than the number of other
        profiles are clamped instead of raising.
    embeddings : array-like, optional
        Matrix with one row per profile. Defaults to the notebook-level ``E``.
    profiles : pandas.DataFrame, optional
        Profile table whose rows are in the same order as ``embeddings``.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``age``, ``sex``, ``orientation``, ``location`` and
        ``score``, sorted by descending score. The queried profile itself is
        never included.

    Raises
    ------
    IndexError
        If ``user_id`` is not present in the profile table.
    ValueError
        If ``k`` is negative, or the profile table and the embedding matrix
        have a different number of rows.
    """
    vectors = np.asarray(E if embeddings is None else embeddings)
    frame = df if profiles is None else profiles

    if k < 0:
        raise ValueError(f"k must be non-negative, got {k!r}")
    if len(frame) != vectors.shape[0]:
        raise ValueError(
            f"profiles has {len(frame)} rows but embeddings has {vectors.shape[0]}"
        )

    # Map user_id -> *positional* row, so the lookup stays correct even when the
    # frame's index labels are not 0..n-1.
    hits = np.flatnonzero(frame["user_id"].to_numpy() == user_id)
    if hits.size == 0:
        raise IndexError(f"user_id {user_id!r} not found")
    i = int(hits[0])

    # Dot with everyone; the matmul result is a fresh array, so editing it is safe.
    sims = vectors @ vectors[i]
    # Exclude self. -1 is a legitimate cosine value, so use -inf as the sentinel.
    sims[i] = -np.inf

    # At most n - 1 other profiles exist; clamping keeps argpartition in bounds.
    k = min(int(k), sims.shape[0] - 1)
    if k > 0:
        idx = np.argpartition(-sims, k - 1)[:k]  # unordered top-k
        idx = idx[np.argsort(-sims[idx])]        # best match first
    else:
        idx = np.array([], dtype=np.intp)

    out = frame.iloc[idx][["user_id", "age", "sex", "orientation", "location"]].copy()
    out["score"] = sims[idx]
    return out

# Example query: the five profiles whose embeddings score highest against user_id 1234.
top_k_semantic_neighbors(user_id=1234, k=5)


Unnamed: 0,user_id,age,sex,orientation,location,score
44713,44713,23,m,straight,"hayward, california",0.593896
10612,10612,28,m,gay,"san francisco, california",0.564325
30903,30903,22,m,straight,"novato, california",0.560441
31197,31197,20,f,straight,"berkeley, california",0.559427
23695,23695,21,m,straight,"san francisco, california",0.559414
