### Load data

In [27]:
import pandas as pd

data = pd.read_json("../AwareData/reddit.json")

### Preprocess

Add a column of reddit subreddit labels.

In [47]:
subreddits = data['reddit_subreddit'].unique()
label_dict = dict(zip(subreddits, range(len(subreddits))))
data['reddit_subreddit_label'] = data['reddit_subreddit'].apply(lambda x: label_dict[x])

Add a column of indices.

In [48]:
data.loc[:, 'index'] = data.index

Make a new dataframe of submissions only.

In [49]:
subs = data.loc[data['aware_post_type']=='submission'].reset_index(drop=True)

Combine submission title and text.

In [50]:
subs.loc[:, 'text_and_title'] = [subs['reddit_title'].iloc[i] + ' [SEP] ' + subs['reddit_text'].iloc[i] for i in range(len(subs))]

Add a column of comment indices

In [51]:
subs['comment_indices'] = [[] for _ in range(len(subs))]
for i in range(len(subs)):
    sub_index = subs.loc[i, 'index']
    next_sub_index = len(data) if i == len(subs)-1 else subs.loc[i+1,'index']
    for j in range(sub_index+1, next_sub_index):
        if data.loc[j, 'reddit_parent_id'] == subs.loc[i, 'reddit_name']:
            subs['comment_indices'].iloc[i].append(data.loc[j,'index'])

Drop those submissons with no comments

In [52]:
subs = subs.drop(subs[subs['comment_indices'].str.len()==0].index)

In [53]:
len(subs)

20613

### Embedding

In [54]:
import torch

if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [55]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2", device)

In [56]:
vectors = model.encode(subs['text_and_title'].values.tolist(),
                       convert_to_numpy=True,
                       normalize_embeddings=True)
subs['vector'] = vectors.tolist()

### KNN

In [57]:
import lancedb

In [58]:
db = lancedb.connect("../.lancedb")
table = db.create_table("reddit", subs, mode="overwrite")
# table = db.create_table("reddit", subs, exist_ok=True)

[2024-04-19T04:51:34Z WARN  lance::dataset] No existing dataset at /Users/yangxiaoluo/Documents/projects/aware_nlp/code/../.lancedb/reddit.lance, it will be created


In [34]:
# table.create_index(
#      num_partitions=256,
#      num_sub_vectors=96,
#      accelerator="mps"
# )  

In [59]:
query_string = "What is chai made of?" 
query = model.encode(query_string,
                       convert_to_numpy=True,
                       normalize_embeddings=True).tolist()

In [60]:
response = table.search(query).limit(4).to_pandas()
response

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,reddit_subreddit_label,index,text_and_title,comment_indices,vector,_distance
0,submission,2023-04-08T09:17:03,12flsj2,t3_12flsj2,1680959823,GeminiDragon60,Newbie here. How is a hot Chai latte made? Tha...,/r/starbucks/comments/12flsj2/chai_latte/,Chai latte,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,0,69813,Chai latte [SEP] Newbie here. How is a hot Cha...,[69814],"[-0.082629226, -0.044056986, -0.059876796, 0.0...",0.864564
1,submission,2023-03-31T09:00:33,127kqyb,t3_127kqyb,1680267633,rabby10,Hot or iced!,/r/starbucksbaristas/comments/127kqyb/what_is_...,What is your favorite way to enjoy a chai?,https://www.reddit.com/r/starbucksbaristas/com...,starbucksbaristas,,,,1,47626,What is your favorite way to enjoy a chai? [SE...,"[47627, 47628, 47629, 47630, 47631, 47632, 476...","[-0.069242835, -0.0132296635, -0.029213658, 0....",1.117867
2,submission,2023-04-21T00:00:23,12tpnpv,t3_12tpnpv,1682049623,black_whitecookie23,Hey there I like the iced chai tea latte but t...,/r/starbucks/comments/12tpnpv/iced_chai_tea_la...,Iced chai tea latte,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,0,235168,Iced chai tea latte [SEP] Hey there I like the...,"[235169, 235170, 235171, 235172]","[-0.059432235, -0.09943229, -0.04372349, -0.02...",1.124923
3,submission,2023-04-08T18:30:16,12g0k3f,t3_12g0k3f,1680993016,Outrageous_Ad798,I made a drink for my friend who loves gingerb...,/r/starbucks/comments/12g0k3f/my_best_creation/,My best creation,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,0,74149,My best creation [SEP] I made a drink for my f...,"[74150, 74151, 74152, 74153, 74154]","[-0.110355295, 0.021308728, 2.632368e-05, -0.0...",1.144603


In [74]:
for i in range(len(response)):
    sub_index = response.loc[i, 'index']
    comment_indices = response.loc[i, 'comment_indices']
    print('Submission title: ', data.loc[sub_index, 'reddit_title'])
    print('Submission text', data.loc[sub_index, 'reddit_text'])
    # for j, k in enumerate(comment_indices):
    #     print('Comment', j+1, ':\n', data.loc[k, 'reddit_text'])
    print('Comment:\n', data.loc[comment_indices[0], 'reddit_text'])
    print()

Submission title:  Chai latte
Submission text Newbie here. How is a hot Chai latte made? Thanks!
Comment:
 1. Steam 2% milk
2. Pump chai: 2/3/4/5 for short/tall/grande/venti 
3. Fill cup halfway with hot water
4. Fill remaining half with steamed milk

(https://sbuxdates.com is a great site for checking recipes)

Submission title:  What is your favorite way to enjoy a chai?
Submission text Hot or iced!
Comment:
 iced with oatmilk and brown sugar

Submission title:  Iced chai tea latte
Submission text Hey there I like the iced chai tea latte but the chai tea concentrate in stores in so expensive and barely lasts a week. Is there anyway I can make the chai tea at home? Like actual homemade chai tea with spices and tea? Also do you know what brand is the most similar to the Starbucks brand soy milk? Thanks so much!
Comment:
 I mean chai tea is pretty easy to find in the grocery store, or higher end tea suppliers. Just add a fuckton of sugar 🤷‍♀️

And Silk is most similar to sbux.

Submissi

### Finetune the pretrained model

In [62]:
from sentence_transformers import losses
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

Use (text and title, subreddit label) pairs to finetune the model, so that submissions in same subreddit are closer to each other in the embedding.

In [63]:
train_loss = losses.BatchAllTripletLoss(model=model)
train_data = [InputExample(texts=[subs['text_and_title'].iloc[i]], label=subs['reddit_subreddit_label'].iloc[i]) for i in range(len(subs))]
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)

In [64]:
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1289 [00:00<?, ?it/s]

In [65]:
model.save(path='../model', model_name='finetuned')

### KNN with the finetuned embedding

In [67]:
model_finetuned = SentenceTransformer("../model", device='mps')

vectors_finetuned = model_finetuned.encode(subs['text_and_title'].values.tolist(),
                       convert_to_numpy=True,
                       normalize_embeddings=True)
subs['vector_finetuned'] = vectors_finetuned.tolist()

In [68]:
table = db.create_table("reddit_finetuned", subs, mode="overwrite")

[2024-04-19T05:09:00Z WARN  lance::dataset] No existing dataset at /Users/yangxiaoluo/Documents/projects/aware_nlp/code/../.lancedb/reddit_finetuned.lance, it will be created


In [69]:
query_string = "What is chai made of?" 
query = model_finetuned.encode(query_string,
                       convert_to_numpy=True,
                       normalize_embeddings=True).tolist()

In [70]:
response2 = table.search(query).limit(4).to_pandas()
response2

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,...,reddit_link_id,reddit_parent_id,reddit_submission,reddit_subreddit_label,index,text_and_title,comment_indices,vector,vector_finetuned,_distance
0,submission,2023-04-08T09:17:03,12flsj2,t3_12flsj2,1680959823,GeminiDragon60,Newbie here. How is a hot Chai latte made? Tha...,/r/starbucks/comments/12flsj2/chai_latte/,Chai latte,https://www.reddit.com/r/starbucks/comments/12...,...,,,,0,69813,Chai latte [SEP] Newbie here. How is a hot Cha...,[69814],"[-0.082629226, -0.044056986, -0.059876796, 0.0...","[-0.07382379472255707, -0.06260678172111511, -...",0.854807
1,submission,2023-03-31T09:00:33,127kqyb,t3_127kqyb,1680267633,rabby10,Hot or iced!,/r/starbucksbaristas/comments/127kqyb/what_is_...,What is your favorite way to enjoy a chai?,https://www.reddit.com/r/starbucksbaristas/com...,...,,,,1,47626,What is your favorite way to enjoy a chai? [SE...,"[47627, 47628, 47629, 47630, 47631, 47632, 476...","[-0.069242835, -0.0132296635, -0.029213658, 0....","[-0.04132311791181564, -0.016466502100229263, ...",1.050847
2,submission,2023-04-11T21:00:12,12j4gjk,t3_12j4gjk,1681261212,nicolelynnejones,,/r/starbucksbaristas/comments/12j4gjk/iced_cha...,iced chai custies ♥️🫶🏻😭,https://i.redd.it/83iq8ker8eta1.jpg,...,,,,1,88998,iced chai custies ♥️🫶🏻😭 [SEP],"[88999, 89000, 89001]","[-0.114683196, -0.013103594, -0.013807522, 0.0...","[-0.08317598700523376, -0.022117894142866135, ...",1.086236
3,submission,2023-04-21T00:00:23,12tpnpv,t3_12tpnpv,1682049623,black_whitecookie23,Hey there I like the iced chai tea latte but t...,/r/starbucks/comments/12tpnpv/iced_chai_tea_la...,Iced chai tea latte,https://www.reddit.com/r/starbucks/comments/12...,...,,,,0,235168,Iced chai tea latte [SEP] Hey there I like the...,"[235169, 235170, 235171, 235172]","[-0.059432235, -0.09943229, -0.04372349, -0.02...","[-0.046944793313741684, -0.1008346751332283, -...",1.087873


In [73]:
for i in range(len(response2)):
    sub_index = response.loc[i, 'index']
    comment_indices = response.loc[i, 'comment_indices']
    print('Submission title: ', data.loc[sub_index, 'reddit_title'])
    print('Submission text:', data.loc[sub_index, 'reddit_text'])
    # for j, k in enumerate(comment_indices):
    #     print('Comment', j+1, ':\n', data.loc[k, 'reddit_text'])
    print('Comment:\n', data.loc[comment_indices[0], 'reddit_text'])
    print()

Submission title:  Chai latte
Submission text: Newbie here. How is a hot Chai latte made? Thanks!
Comment:
 1. Steam 2% milk
2. Pump chai: 2/3/4/5 for short/tall/grande/venti 
3. Fill cup halfway with hot water
4. Fill remaining half with steamed milk

(https://sbuxdates.com is a great site for checking recipes)

Submission title:  What is your favorite way to enjoy a chai?
Submission text: Hot or iced!
Comment:
 iced with oatmilk and brown sugar

Submission title:  Iced chai tea latte
Submission text: Hey there I like the iced chai tea latte but the chai tea concentrate in stores in so expensive and barely lasts a week. Is there anyway I can make the chai tea at home? Like actual homemade chai tea with spices and tea? Also do you know what brand is the most similar to the Starbucks brand soy milk? Thanks so much!
Comment:
 I mean chai tea is pretty easy to find in the grocery store, or higher end tea suppliers. Just add a fuckton of sugar 🤷‍♀️

And Silk is most similar to sbux.

Submi