In [1]:
#installation and import statements
from __future__ import annotations
import numpy as np
import time
import lancedb # our vector database
from lancedb.embeddings import get_registry #embedding registry
from lancedb.pydantic import LanceModel, Vector, pydantic_to_schema
from lancedb.embeddings import TextEmbeddingFunction
from datasets import load_dataset
import pandas as pd
import pyarrow as pa
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from datasets import load_dataset



  from .autonotebook import tqdm as notebook_tqdm


# Get Dataset

In [2]:
data = pd.read_json("../AwareData/reddit.json")
data.head()
print(len(data))

404686


Add a column of submission titles.

In [3]:
data['submission_title'] = data['reddit_title'].copy()
for i in range(len(data)):
    if data.loc[i, 'submission_title'] == None:
        data.loc[i, 'submission_title'] = data.loc[i - 1, 'submission_title']

In [9]:
# data['text_and_title'] = [data.submission_title.values[i] + ' [SEP] ' + data.reddit_text.values[i] if data.reddit_title.values[i] != None else data.reddit_text.values[i] for i in range(len(data))]
data['text_and_title'] = [data.submission_title.values[i] + ' [SEP] ' + data.reddit_text.values[i] for i in range(len(data))]

In [11]:
data.head()

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,submission_title,text_and_title
0,submission,2023-04-02T13:58:03,129sqka,t3_129sqka,1680458283,MoodyStarGirl,That's it.,/r/starbucks/comments/129sqka/hot_chai_lattes_...,Hot chai lattes shouldn't have water,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,Hot chai lattes shouldn't have water,Hot chai lattes shouldn't have water [SEP] Tha...
1,comment,2023-04-02T14:32:57,jeounwc,t1_jeounwc,1680460377,Lost_Treat_6296,We should make the chai tea latte with the sam...,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t3_129sqka,129sqka,Hot chai lattes shouldn't have water,Hot chai lattes shouldn't have water [SEP] We ...
2,comment,2023-04-02T14:48:18,jeowus2,t1_jeowus2,1680461298,MoodyStarGirl,Oh like using the chai tea bags?,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeounwc,129sqka,Hot chai lattes shouldn't have water,Hot chai lattes shouldn't have water [SEP] Oh ...
3,comment,2023-04-02T14:48:49,jeowxe5,t1_jeowxe5,1680461329,Lost_Treat_6296,"No, the whole half water and half milk thing",/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeowus2,129sqka,Hot chai lattes shouldn't have water,"Hot chai lattes shouldn't have water [SEP] No,..."
4,comment,2023-04-02T21:59:22,jeqiuw3,t1_jeqiuw3,1680487162,MoodyStarGirl,That's a lot of water :(,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeowxe5,129sqka,Hot chai lattes shouldn't have water,Hot chai lattes shouldn't have water [SEP] Tha...


In [21]:
sb = data.loc[data['reddit_subreddit'].str.contains('starbucks')].copy()
len(sb)
sb['id'] = np.arange(len(sb))

# Connect to LanceDB and create table

In [22]:
db = lancedb.connect("~/.lancedb")

table = db.create_table("reddit", sb, mode="overwrite")


# Embeddings
1) Define the embedding function
2) Define the embedding model or schema
3) Create table and add data
4) Query table

## Define the embedding function

In [23]:
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
vectors = model.encode(sb.text_and_title.values.tolist(),
                       convert_to_numpy=True,
                       normalize_embeddings=True).tolist()

In [25]:
vectors[0][0:10]

[-0.014269068837165833,
 -0.04020996019244194,
 -0.020817028358578682,
 0.005154034588485956,
 0.02226731926202774,
 -0.060844842344522476,
 0.05836697295308113,
 -0.06468521803617477,
 -0.04143568500876427,
 -0.0431983545422554]

### Save embedding vectors to table

In [26]:
from lance.vector import vec_to_table
import numpy as np
import pyarrow as pa

embeddings = vec_to_table(vectors)
embeddings = embeddings.append_column("id", pa.array(np.arange(len(table))+1))
embeddings.to_pandas().head()

Unnamed: 0,vector,id
0,"[-0.014269069, -0.04020996, -0.020817028, 0.00...",1
1,"[-0.02594117, -0.04166178, -0.020809583, -0.02...",2
2,"[-0.0116989, -0.020555083, -0.051307622, 0.004...",3
3,"[0.0046593216, -0.04497111, 0.0058308006, 0.00...",4
4,"[-0.00542182, -0.048168555, -0.0035562972, -0....",5


In [27]:
table.merge(embeddings, left_on="id")
table.head().to_pandas()

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,text_and_title,id,vector
0,submission,2023-04-02T13:58:03,129sqka,t3_129sqka,1680458283,MoodyStarGirl,That's it.,/r/starbucks/comments/129sqka/hot_chai_lattes_...,Hot chai lattes shouldn't have water,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,Hot chai lattes shouldn't have water [SEP] Tha...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,comment,2023-04-02T14:32:57,jeounwc,t1_jeounwc,1680460377,Lost_Treat_6296,We should make the chai tea latte with the sam...,/r/starbucks/comments/129sqka/hot_chai_lattes_...,Hot chai lattes shouldn't have water,,starbucks,t3_129sqka,t3_129sqka,129sqka,Hot chai lattes shouldn't have water [SEP] We ...,1,"[-0.014269069, -0.04020996, -0.020817028, 0.00..."
2,comment,2023-04-02T14:48:18,jeowus2,t1_jeowus2,1680461298,MoodyStarGirl,Oh like using the chai tea bags?,/r/starbucks/comments/129sqka/hot_chai_lattes_...,Hot chai lattes shouldn't have water,,starbucks,t3_129sqka,t1_jeounwc,129sqka,Hot chai lattes shouldn't have water [SEP] Oh ...,2,"[-0.02594117, -0.04166178, -0.020809583, -0.02..."
3,comment,2023-04-02T14:48:49,jeowxe5,t1_jeowxe5,1680461329,Lost_Treat_6296,"No, the whole half water and half milk thing",/r/starbucks/comments/129sqka/hot_chai_lattes_...,Hot chai lattes shouldn't have water,,starbucks,t3_129sqka,t1_jeowus2,129sqka,"Hot chai lattes shouldn't have water [SEP] No,...",3,"[-0.0116989, -0.020555083, -0.051307622, 0.004..."
4,comment,2023-04-02T21:59:22,jeqiuw3,t1_jeqiuw3,1680487162,MoodyStarGirl,That's a lot of water :(,/r/starbucks/comments/129sqka/hot_chai_lattes_...,Hot chai lattes shouldn't have water,,starbucks,t3_129sqka,t1_jeowxe5,129sqka,Hot chai lattes shouldn't have water [SEP] Tha...,4,"[0.0046593216, -0.04497111, 0.0058308006, 0.00..."


## Build Approximate Nearest Neighbors Index

In [28]:
# !pip install tantivy

# table.create_fts_index("text")
table.create_index(
     num_partitions=6, #6
     num_sub_vectors=6, #6
     accelerator="mps"
)    


 22%|██▏       | 11/50 [00:00<00:00, 72.28it/s]
3it [00:00, 10.82it/s]


## Embed the Query

In [29]:
query_string = "What is chai made of?" 
query = model.encode(query_string,
                       convert_to_numpy=True,
                       normalize_embeddings=True).tolist()
print(query[0:10])


[-0.09422026574611664, 0.034345194697380066, -0.04547582566738129, 0.02944718673825264, 0.03271962329745293, -0.08513285219669342, 0.15210065245628357, -0.049321118742227554, -0.006353107281029224, 0.00408329488709569]


## Search for Approximate Nearest Neighbors
By default, LanceDB runs a brute-force scan over dataset to find the K nearest neighbours (KNN). For tables with more than 50K vectors, creating an ANN index is recommended to speed up search performance.

Using an ANN index is faster, but less accurate than kNN or brute force search because, in essence, the index is a lossy representation of the data.

In [30]:
response = table.search(query).limit(4).nprobes(20).refine_factor(10).to_pandas()
print(response["reddit_text"][0])

My friend got it and sent me the pic! He didn’t ask what it was unfortunately. He’s saying the green was actually minty and it tasted like a thin mint.


In [31]:
print(query_string)
print(response["reddit_text"][0:5])

What is chai made of?
0    My friend got it and sent me the pic! He didn’...
1    That’s fair. Tbh, I was responding to the tail...
2                                    Literally so ugly
3            My first thought was a chicken foot lolol
Name: reddit_text, dtype: object
