In [8]:
import boto3
import pandas as pd
import pyarrow.parquet as pq
import s3fs
import numpy as np


# S3 location (bucket/prefix) of the Parquet dataset to embed.
silver_path = "krish-redditadhd-silver/adhd_posts_silver/"

# Create an S3 filesystem
fs = s3fs.S3FileSystem()

# Load Parquet dataset (automatically reads all partitions)
dataset = pq.ParquetDataset(silver_path, filesystem=fs)

# Convert to Pandas DataFrame
df = dataset.read().to_pandas()

# Missing titles/bodies become empty strings so the concatenation below
# never produces NaN for a post that has only one of the two parts.
df["clean_title"] = df["clean_title"].fillna("")
df["clean_body"] = df["clean_body"].fillna("")

# Join title and body with a blank line, then strip the joined string as a whole.
# Without the outer strip, a post with neither title nor body becomes "\n\n",
# which the empty-string check below can never match, and a post missing only
# one part would carry a dangling separator into the embedding input.
df["text_for_embedding"] = (
    df["clean_title"].str.strip()
    + "\n\n"
    + df["clean_body"].str.strip()
).str.strip()

# Drop posts that have no text left to embed.
df["text_for_embedding"] = df["text_for_embedding"].replace("", np.nan)
df = df.dropna(subset=["text_for_embedding"])
df.head()




Unnamed: 0,post_id,subreddit,author,title,body,score,num_comments,created_utc,created_at,title_length,...,has_body,clean_body,clean_title,body_lower,title_lower,post_age_days,year,month,day,text_for_embedding
0,1ola0cc,ADHD,AcyanidePancake,My room feels like a monument to financial ruin,"I (23) have a ton of stuff (I'm not a hoarder,...",10,7,1761955000,2025-10-31 23:56:40,47,...,True,"I (23) have a ton of stuff (I'm not a hoarder,...",My room feels like a monument to financial ruin,"i (23) have a ton of stuff (i'm not a hoarder,...",my room feels like a monument to financial ruin,11,2025,10,31,My room feels like a monument to financial rui...
1,1ol9qc9,ADHD,Minute_Personality79,Does medikinet work for you?,Can anyone tell me their experience with medik...,0,3,1761954188,2025-10-31 23:43:08,28,...,True,Can anyone tell me their experience with medik...,Does medikinet work for you?,can anyone tell me their experience with medik...,does medikinet work for you?,11,2025,10,31,Does medikinet work for you?\n\nCan anyone tel...
2,1ol9kvc,ADHD,skyrimisagood,I forgot my groceries in the mall,I did some light shopping and then I had lunch...,39,14,1761953746,2025-10-31 23:35:46,33,...,True,I did some light shopping and then I had lunch...,I forgot my groceries in the mall,i did some light shopping and then i had lunch...,i forgot my groceries in the mall,11,2025,10,31,I forgot my groceries in the mall\n\nI did som...
3,1ol9khx,ADHD,knockedownupagain,They say a person with ADHD can focus on somet...,"Ive been struggling with interest, ""drive"", co...",39,58,1761953717,2025-10-31 23:35:17,118,...,True,"Ive been struggling with interest, ""drive"", co...",They say a person with ADHD can focus on somet...,"ive been struggling with interest, ""drive"", co...",they say a person with adhd can focus on somet...,11,2025,10,31,They say a person with ADHD can focus on somet...
4,1ol8rll,ADHD,Junior-Hair3363,Why are generics allowed to be made differently,Might be a very vague and general question but...,82,39,1761951461,2025-10-31 22:57:41,47,...,True,Might be a very vague and general question but...,Why are generics allowed to be made differently,might be a very vague and general question but...,why are generics allowed to be made differently,11,2025,10,31,Why are generics allowed to be made differentl...


In [9]:
!pip install -q sentence-transformers

In [10]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-embedding checkpoint used throughout the notebook.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

  import pynvml  # type: ignore[import]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
# Smoke-test the model on the first post's text.
sample_text = df["text_for_embedding"].iloc[0]
embedding = model.encode(sample_text)

# Show the result's type, its length, and its first ten components, one per line.
print(type(embedding), len(embedding), embedding[:10], sep="\n")

<class 'numpy.ndarray'>
384
[ 0.07754449 -0.02071184 -0.03151453  0.05262284  0.03956795  0.0212282
  0.07508712  0.03332366 -0.0101432   0.00728439]


In [12]:
# Keep only rows whose embedding text is present and not whitespace-only.
_text = df["text_for_embedding"]
_is_present = _text.notna()
_is_nonblank = _text.str.strip().ne("")

df_usable = df.loc[_is_present & _is_nonblank].copy()

df_usable.shape[0]

1320

In [13]:
# Upper bound on how many posts get embedded, and the seed that fixes which ones.
MAX_SAMPLE_ROWS = 2000
SAMPLE_SEED = 42

# Never request more rows than are available.
sample_size = min(MAX_SAMPLE_ROWS, len(df_usable))
df_sample = df_usable.sample(n=sample_size, random_state=SAMPLE_SEED).copy()

# Plain Python list of strings, in the sampled row order, for the encoder.
texts = df_sample["text_for_embedding"].to_list()
len(texts)

1320

In [14]:
import numpy as np

# Encode every sampled text in batches, showing a progress bar, and hold the
# result as a NumPy array with one row per text.
encode_options = {"batch_size": 32, "show_progress_bar": True}
embeddings = np.array(model.encode(texts, **encode_options))

embeddings.shape

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

(1320, 384)

In [15]:
# Renumber the sampled rows 0..n-1, then attach the vectors positionally:
# list(embeddings) yields one row of the array per DataFrame row.
df_sample = (
    df_sample
    .reset_index(drop=True)
    .assign(embedding=list(embeddings))
)

df_sample.head(2)

Unnamed: 0,post_id,subreddit,author,title,body,score,num_comments,created_utc,created_at,title_length,...,clean_body,clean_title,body_lower,title_lower,post_age_days,year,month,day,text_for_embedding,embedding
0,1oo05nq,ADHD,Livid_Ad8826,ADHD kinship?,High level summary of my situation: M48 ADHD h...,2,3,1762237254,2025-11-04 06:20:54,13,...,High level summary of my situation: M48 ADHD h...,ADHD kinship?,high level summary of my situation: m48 adhd h...,adhd kinship?,7,2025,11,4,ADHD kinship?\n\nHigh level summary of my situ...,"[0.034755897, -0.013595211, -0.017276961, 0.04..."
1,1onw7vu,ADHD,hockeyguy1115,Recommended brand or generic for Adderall xr o...,Wanted to give either of those a try but hear ...,0,3,1762224669,2025-11-04 02:51:09,60,...,Wanted to give either of those a try but hear ...,Recommended brand or generic for Adderall xr o...,wanted to give either of those a try but hear ...,recommended brand or generic for adderall xr o...,7,2025,11,4,Recommended brand or generic for Adderall xr o...,"[-0.05174595, -0.1365774, 0.036687538, -0.0734..."


In [16]:
# Free-text search query to compare against the embedded posts.
query = "I feel overwhelmed at work and can't focus on anything"
# Encode the query with the same `model` object that encoded the posts.
query_emb = model.encode(query)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

# Score every embedded post against the query: the query becomes a single-row
# matrix and the resulting (1, n) similarity matrix is flattened to length n.
sims = cosine_similarity(query_emb.reshape(1, -1), embeddings).ravel()

# Row positions of the top_k highest similarities, best match first.
top_k = 5
top_idx = np.argsort(sims)[::-1][:top_k]

df_sample.iloc[top_idx][["title", "body"]]


Unnamed: 0,title,body
1216,To tired to work and force myself to focus but...,It is really frustrating. I want to get stuff ...
662,I am overwhelmed by the amount of work at work,Whenever someone at work or school gives me a ...
1258,never feel like i get enough done,no matter how much i accomplish in a day i don...
353,Daily tasks for work,I started limiting myself to 5 tasks max and i...
911,Feeling useless and anxiety for not finishing ...,I’m curious that do you experience this too? M...


In [None]:
# Top matches with their similarity score alongside; assign() returns a new
# frame, so df_sample itself is left untouched.
results = (
    df_sample
    .iloc[top_idx][["title", "body"]]
    .assign(similarity=sims[top_idx])
)
results

: 