### Load data

In [1]:
import pandas as pd

data = pd.read_json("../AwareData/reddit.json")

### Preprocess

Add a column of reddit subreddit labels.

In [2]:
subreddits = data['reddit_subreddit'].unique()
label_dict = dict(zip(subreddits, range(len(subreddits))))
data['reddit_subreddit_label'] = data['reddit_subreddit'].apply(lambda x: label_dict[x])

Add a column of indices.

In [5]:
data.loc[:, 'index'] = data.index

Make a new dataframe of submissions only.

In [6]:
subs = data.loc[data['aware_post_type']=='submission']

Combine submission title and text.

In [7]:
subs.loc[:, 'text_and_title'] = [subs['reddit_title'].iloc[i] + ' ' + subs['reddit_text'].iloc[i] for i in range(len(subs))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subs.loc[:, 'text_and_title'] = [subs['reddit_title'].iloc[i] + ' ' + subs['reddit_text'].iloc[i] for i in range(len(subs))]


Drop those submissons with no comments

In [8]:
for i in range(len(subs)):
    ind = subs.iloc[i]['index']
    subs.loc[ind, 'has_comment'] = data.loc[ind + 1, 'aware_post_type'] == 'comment'

subs = subs.drop(subs[subs['has_comment']==False].index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subs.loc[ind, 'has_comment'] = data.loc[ind + 1, 'aware_post_type'] == 'comment'


In [9]:
len(subs)

20710

### Embedding

In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2", device="mps")

In [11]:
vectors = model.encode(subs['text_and_title'].values.tolist(),
                       convert_to_numpy=True,
                       normalize_embeddings=True)
subs['vector'] = vectors.tolist()

### KNN

In [12]:
import lancedb

In [13]:
db = lancedb.connect("~/.lancedb")
table = db.create_table("reddit", subs, mode="overwrite")
# table = db.create_table("reddit", subs, exist_ok=True)

In [34]:
# table.create_index(
#      num_partitions=256,
#      num_sub_vectors=96,
#      accelerator="mps"
# )  

In [14]:
query_string = "What is chai made of?" 
query = model.encode(query_string,
                       convert_to_numpy=True,
                       normalize_embeddings=True).tolist()

In [15]:
response = table.search(query).limit(4).to_pandas()
response

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,reddit_subreddit_label,index,text_and_title,has_comment,vector,_distance
0,submission,2023-04-08T09:17:03,12flsj2,t3_12flsj2,1680959823,GeminiDragon60,Newbie here. How is a hot Chai latte made? Tha...,/r/starbucks/comments/12flsj2/chai_latte/,Chai latte,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,0,69813,Chai latte Newbie here. How is a hot Chai latt...,True,"[-0.082613766, -0.047615167, -0.059928607, 0.0...",0.871238
1,submission,2023-03-31T09:00:33,127kqyb,t3_127kqyb,1680267633,rabby10,Hot or iced!,/r/starbucksbaristas/comments/127kqyb/what_is_...,What is your favorite way to enjoy a chai?,https://www.reddit.com/r/starbucksbaristas/com...,starbucksbaristas,,,,1,47626,What is your favorite way to enjoy a chai? Hot...,True,"[-0.07057586, -0.014952984, -0.032797873, 0.05...",1.105819
2,submission,2023-04-21T00:00:23,12tpnpv,t3_12tpnpv,1682049623,black_whitecookie23,Hey there I like the iced chai tea latte but t...,/r/starbucks/comments/12tpnpv/iced_chai_tea_la...,Iced chai tea latte,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,0,235168,Iced chai tea latte Hey there I like the iced ...,True,"[-0.064972535, -0.099334784, -0.046199642, -0....",1.11111
3,submission,2023-04-08T18:30:16,12g0k3f,t3_12g0k3f,1680993016,Outrageous_Ad798,I made a drink for my friend who loves gingerb...,/r/starbucks/comments/12g0k3f/my_best_creation/,My best creation,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,0,74149,My best creation I made a drink for my friend ...,True,"[-0.10863018, 0.014598468, -0.00557208, -0.018...",1.129766


In [25]:
for i in response['index']:
    print('Submission title: ', data.loc[i, 'reddit_title'])
    print('Submission text: ', data.loc[i, 'reddit_text'])
    print('Comment:\n', data.loc[i + 1, 'reddit_text'])
    print()

Submission title:  Chai latte
Submission text:  Newbie here. How is a hot Chai latte made? Thanks!
Comment:
 1. Steam 2% milk
2. Pump chai: 2/3/4/5 for short/tall/grande/venti 
3. Fill cup halfway with hot water
4. Fill remaining half with steamed milk

(https://sbuxdates.com is a great site for checking recipes)

Submission title:  What is your favorite way to enjoy a chai?
Submission text:  Hot or iced!
Comment:
 iced with oatmilk and brown sugar

Submission title:  Iced chai tea latte
Submission text:  Hey there I like the iced chai tea latte but the chai tea concentrate in stores in so expensive and barely lasts a week. Is there anyway I can make the chai tea at home? Like actual homemade chai tea with spices and tea? Also do you know what brand is the most similar to the Starbucks brand soy milk? Thanks so much!
Comment:
 I mean chai tea is pretty easy to find in the grocery store, or higher end tea suppliers. Just add a fuckton of sugar 🤷‍♀️

And Silk is most similar to sbux.

Su

### Finetune the pretrained model

In [31]:
from sentence_transformers import losses
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

Use (text and title, subreddit label) pairs to finetune the model, so that submissions in same subreddit are closer to each other in the embedding.

In [32]:
train_loss = losses.BatchAllTripletLoss(model=model)
train_data = [InputExample(texts=[subs['text_and_title'].iloc[i]], label=subs['reddit_subreddit_label'].iloc[i]) for i in range(len(subs))]
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)

In [33]:
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1295 [00:00<?, ?it/s]

In [34]:
model.save(path='../model', model_name='finetuned')

### KNN with the finetuned embedding

In [35]:
model_finetuned = SentenceTransformer("../model", device='mps')

vectors = model_finetuned.encode(subs['text_and_title'].values.tolist(),
                       convert_to_numpy=True,
                       normalize_embeddings=True)
subs['vector'] = vectors.tolist()

In [36]:
table = db.create_table("reddit2", subs, mode="overwrite")

In [37]:
query_string = "What is chai made of?" 
query = model.encode(query_string,
                       convert_to_numpy=True,
                       normalize_embeddings=True).tolist()

In [38]:
response2 = table.search(query).limit(4).to_pandas()
response2

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,reddit_subreddit_label,index,text_and_title,has_comment,vector,_distance
0,submission,2023-03-31T09:00:33,127kqyb,t3_127kqyb,1680267633,rabby10,Hot or iced!,/r/starbucksbaristas/comments/127kqyb/what_is_...,What is your favorite way to enjoy a chai?,https://www.reddit.com/r/starbucksbaristas/com...,starbucksbaristas,,,,1,47626,What is your favorite way to enjoy a chai? Hot...,True,"[-0.047044724, -0.02101022, -0.017901227, 0.07...",0.927126
1,submission,2023-04-08T09:17:03,12flsj2,t3_12flsj2,1680959823,GeminiDragon60,Newbie here. How is a hot Chai latte made? Tha...,/r/starbucks/comments/12flsj2/chai_latte/,Chai latte,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,0,69813,Chai latte Newbie here. How is a hot Chai latt...,True,"[-0.07323092, -0.067822, -0.04523185, 0.047510...",0.998666
2,submission,2023-04-15T19:53:52,12nnkli,t3_12nnkli,1681602832,TwistyTacos,,/r/starbucks/comments/12nnkli/20_pumps_of_chai/,20 pumps of chai….,https://i.redd.it/742smopkg6ua1.jpg,starbucks,,,,0,166955,20 pumps of chai….,True,"[-0.045237705, 0.07069547, -0.041915175, 0.005...",1.033151
3,submission,2023-04-11T21:00:12,12j4gjk,t3_12j4gjk,1681261212,nicolelynnejones,,/r/starbucksbaristas/comments/12j4gjk/iced_cha...,iced chai custies ♥️🫶🏻😭,https://i.redd.it/83iq8ker8eta1.jpg,starbucksbaristas,,,,1,88998,iced chai custies ♥️🫶🏻😭,True,"[-0.08417167, -0.02565384, -0.002591417, 0.039...",1.047739


In [39]:
for i in response2['index']:
    print('Submission title: ', data.loc[i, 'reddit_title'])
    print('Submission text: ', data.loc[i, 'reddit_text'])
    print('Comment:\n', data.loc[i + 1, 'reddit_text'])
    print()

Submission title:  What is your favorite way to enjoy a chai?
Submission text:  Hot or iced!
Comment:
 iced with oatmilk and brown sugar

Submission title:  Chai latte
Submission text:  Newbie here. How is a hot Chai latte made? Thanks!
Comment:
 1. Steam 2% milk
2. Pump chai: 2/3/4/5 for short/tall/grande/venti 
3. Fill cup halfway with hot water
4. Fill remaining half with steamed milk

(https://sbuxdates.com is a great site for checking recipes)

Submission title:  20 pumps of chai….
Submission text:  
Comment:
 This person is getting enough chai for the week. Starbucks is cutting all this out soon.

Submission title:  iced chai custies ♥️🫶🏻😭
Submission text:  
Comment:
 I say “slay 😐” whenever customers are rude in drive thru

