In [48]:
import sqlite3
import pandas as pd
import random

In [49]:
df = pd.read_csv('chunked_data.csv')

In [50]:
df.shape

(71994, 3)

In [51]:
! pip install sentence-transformers



In [52]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

In [55]:
sampled_df = df.sample(frac=0.05)
sampled_df = sampled_df.reset_index(drop=True)

In [56]:
# Encode the chunk text to get semantic embeddings
chunk_embeddings = model.encode(sampled_df['chunk_text'].tolist())

# Add the embeddings to the DataFrame
sampled_df['embeddings'] = chunk_embeddings.tolist()

In [57]:
sampled_df.head()

Unnamed: 0,name,chunk_index,chunk_text,embeddings
0,the.repair.shop.s08.e05.painting.of.queen.henr...,4,fit bent lever yellow submarine popup hatch me...,"[-0.059462856501340866, -0.07083207368850708, ..."
1,diagnosis.murder.s07.e15.jakes.women.(2000).en...,6,yeah hard live uh uh france wanted call say hi...,"[-0.09489841014146805, -0.0891512930393219, 0...."
2,jehanabad.of.love.war.s01.e08.milan.().eng.1cd,1,lift lift happened sonu happened happen munawa...,"[-0.07662401348352432, -0.002949650166556239, ..."
3,teen.wolf.s05.e03.dreamcatchers.(2015).eng.1cd,1,think dont know yet previously teen wolf cant ...,"[-0.04792579263448715, -0.11751582473516464, 0..."
4,welcome.to.eden.s02.e02.episode.2.2.(2023).eng...,2,come oh cant excuse havent finished match yet ...,"[-0.07520384341478348, -0.0391005203127861, 0...."


In [58]:
sampled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         3600 non-null   object
 1   chunk_index  3600 non-null   int64 
 2   chunk_text   3600 non-null   object
 3   embeddings   3600 non-null   object
dtypes: int64(1), object(3)
memory usage: 112.6+ KB


In [59]:
!pip install chromadb



In [60]:
import chromadb
from chromadb.utils import embedding_functions
import pandas as pd

In [61]:
chroma_client = chromadb.PersistentClient(path="my_chromadb")

In [62]:
# using the distilbert-base-nli-mean-tokens model for embedding function
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="distilbert-base-nli-mean-tokens")
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef, metadata={"hnsw:space": "cosine"})

In [63]:
collection.add(
    documents=sampled_df['name'].tolist(),
    metadatas=[{"item_id": str(idx)} for idx in range(len(sampled_df))],
    ids=[str(idx) for idx in range(len(sampled_df))],
)

#note: while running this keep you laptop charging , becaz it will heatup !!!..
#hm it took me 4 minutes



In [64]:
# getting user input
user_query = input("Enter your search query: ")

# querying the collection
results = collection.query(
    query_texts=[user_query],
    n_results=10,
    include=['documents', 'distances', 'metadatas']
)

# displaying the user input
print(f"Your search query: {user_query}")

# displaying output documents
for document in results['documents'][0]:
    print(f"{document}")

Enter your search query: why are you hurt so much
Your search query: why are you hurt so much
 *** down.and.dangerous.(2013).eng.1cd *** 
 *** bad.behaviour.s01.e03.exeat.(2022).eng.1cd *** 
 *** the.enemy.within.(1994).eng.1cd *** 
 *** the.enemy.within.(1994).eng.1cd *** 
 *** according.to.jim.s04.e14.a.crying.shame.(2005).eng.1cd *** 
 *** a.different.world.s04.e22.monet.is.the.root.of.all.evil.(1991).eng.1cd *** 
 *** betrayal.(1983).eng.1cd *** 
 *** victoria.s03.e07.a.public.inconvenience.(2019).eng.1cd *** 
 *** bad.behaviour.s01.e01.moth.to.a.flame.(2023).eng.1cd *** 
 *** moron.5.and.the.crying.lady.(2012).eng.1cd *** 
