In [1]:
#installation and import statements
from __future__ import annotations
import numpy as np
import time
import lancedb # our vector database
from lancedb.embeddings import get_registry #embedding registry
from lancedb.pydantic import LanceModel, Vector, pydantic_to_schema
from lancedb.embeddings import TextEmbeddingFunction
from datasets import load_dataset
import pandas as pd
import pyarrow as pa
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from datasets import load_dataset



# Get Dataset

In [2]:
data = pd.read_json("../AwareData/reddit.json")
data.head()
print(len(data))

404686


In [3]:
data['text_and_title'] = [data.reddit_title.values[i] + ' [SEP] ' + data.reddit_text.values[i] if data.reddit_title.values[i] != None else data.reddit_text.values[i] for i in range(len(data))]

In [4]:
data.head()

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,text_and_title
0,submission,2023-04-02T13:58:03,129sqka,t3_129sqka,1680458283,MoodyStarGirl,That's it.,/r/starbucks/comments/129sqka/hot_chai_lattes_...,Hot chai lattes shouldn't have water,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,Hot chai lattes shouldn't have water [SEP] Tha...
1,comment,2023-04-02T14:32:57,jeounwc,t1_jeounwc,1680460377,Lost_Treat_6296,We should make the chai tea latte with the sam...,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t3_129sqka,129sqka,We should make the chai tea latte with the sam...
2,comment,2023-04-02T14:48:18,jeowus2,t1_jeowus2,1680461298,MoodyStarGirl,Oh like using the chai tea bags?,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeounwc,129sqka,Oh like using the chai tea bags?
3,comment,2023-04-02T14:48:49,jeowxe5,t1_jeowxe5,1680461329,Lost_Treat_6296,"No, the whole half water and half milk thing",/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeowus2,129sqka,"No, the whole half water and half milk thing"
4,comment,2023-04-02T21:59:22,jeqiuw3,t1_jeqiuw3,1680487162,MoodyStarGirl,That's a lot of water :(,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeowxe5,129sqka,That's a lot of water :(


In [5]:
sb = data.loc[data['reddit_subreddit'].str.contains('starbucks')]
len(sb)
sb['id'] = np.arange(len(sb))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sb['id'] = np.arange(len(sb))


# Connect to LanceDB and create table

In [6]:
db = lancedb.connect("~/.lancedb")

table = db.create_table("reddit", sb, mode="overwrite")


# Embeddings
1) Define the embedding function
2) Define the embedding model or schema
3) Create table and add data
4) Query table

## Define the embedding function

In [7]:
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
vectors = model.encode(sb.text_and_title.values.tolist(),
                       convert_to_numpy=True,
                       normalize_embeddings=True).tolist()

In [8]:
vectors[0][0:10]

[-0.014269034378230572,
 -0.040209949016571045,
 -0.020817002281546593,
 0.005154069978743792,
 0.022267308086156845,
 -0.06084483489394188,
 0.058367032557725906,
 -0.06468523293733597,
 -0.04143567010760307,
 -0.04319833591580391]

### Save embedding vectors to table

In [9]:
from lance.vector import vec_to_table
import numpy as np
import pyarrow as pa

embeddings = vec_to_table(vectors)
embeddings = embeddings.append_column("id", pa.array(np.arange(len(table))+1))
embeddings.to_pandas().head()

Unnamed: 0,vector,id
0,"[-0.014269034, -0.04020995, -0.020817002, 0.00...",1
1,"[-0.049435094, -0.036684588, -0.03507152, -0.0...",2
2,"[-0.049866207, 0.058588855, -0.045397144, 0.02...",3
3,"[-0.011465802, 0.007062405, 0.06518951, 0.0355...",4
4,"[0.026740449, 0.00762673, 0.07855508, 0.029588...",5


In [10]:
table.merge(embeddings, left_on="id")
table.head().to_pandas()

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,text_and_title,id,vector
0,submission,2023-04-02T13:58:03,129sqka,t3_129sqka,1680458283,MoodyStarGirl,That's it.,/r/starbucks/comments/129sqka/hot_chai_lattes_...,Hot chai lattes shouldn't have water,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,,Hot chai lattes shouldn't have water [SEP] Tha...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,comment,2023-04-02T14:32:57,jeounwc,t1_jeounwc,1680460377,Lost_Treat_6296,We should make the chai tea latte with the sam...,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t3_129sqka,129sqka,We should make the chai tea latte with the sam...,1,"[-0.014269034, -0.04020995, -0.020817002, 0.00..."
2,comment,2023-04-02T14:48:18,jeowus2,t1_jeowus2,1680461298,MoodyStarGirl,Oh like using the chai tea bags?,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeounwc,129sqka,Oh like using the chai tea bags?,2,"[-0.049435094, -0.036684588, -0.03507152, -0.0..."
3,comment,2023-04-02T14:48:49,jeowxe5,t1_jeowxe5,1680461329,Lost_Treat_6296,"No, the whole half water and half milk thing",/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeowus2,129sqka,"No, the whole half water and half milk thing",3,"[-0.049866207, 0.058588855, -0.045397144, 0.02..."
4,comment,2023-04-02T21:59:22,jeqiuw3,t1_jeqiuw3,1680487162,MoodyStarGirl,That's a lot of water :(,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeowxe5,129sqka,That's a lot of water :(,4,"[-0.011465802, 0.007062405, 0.06518951, 0.0355..."


## Build Approximate Nearest Neighbors Index

In [12]:
# !pip install tantivy

# table.create_fts_index("text")
table.create_index(
     num_partitions=6, #6
     num_sub_vectors=6, #6
     accelerator="mps"
)    


 38%|████████████████▎                          | 19/50 [00:00<00:00, 49.64it/s]


0it [00:00, ?it/s]

  tensor = torch.from_numpy(arr.to_numpy(zero_copy_only=False))


## Embed the Query

In [13]:
query_string = "What is chai made of?" 
query = model.encode(query_string,
                       convert_to_numpy=True,
                       normalize_embeddings=True).tolist()
print(query[0:10])


[-0.09422024339437485, 0.03434516116976738, -0.04547581821680069, 0.029447222128510475, 0.032719649374485016, -0.08513286709785461, 0.15210068225860596, -0.049321141093969345, -0.006353082600980997, 0.004083271604031324]


## Search for Approximate Nearest Neighbors
By default, LanceDB runs a brute-force scan over dataset to find the K nearest neighbours (KNN). For tables with more than 50K vectors, creating an ANN index is recommended to speed up search performance.

Using an ANN index is faster, but less accurate than kNN or brute force search because, in essence, the index is a lossy representation of the data.

In [14]:
response = table.search(query).limit(4).nprobes(20).refine_factor(10).to_pandas()
print(response["reddit_text"][0])

I get this and I’m Black! 🤣🤣🤣😩


In [15]:
print(query_string)
print(response["reddit_text"][0:5])

What is chai made of?
0                       I get this and I’m Black! 🤣🤣🤣😩
1    Iced is my favorite but a chai Frappuccino is ...
2    iced chai with oatmilk and cinnamon dolche syr...
3    Yeah, working here made me realize that I can ...
Name: reddit_text, dtype: object
