In [1]:
import pandas as pd

In [2]:
# Load the cleaned subtitle table; the path is relative to the notebook's working directory.
df = pd.read_csv('cleaned_data2.csv')

In [3]:
df

Unnamed: 0,num,name,clean_file_content
0,9251120,maybe.this.time.(2014),watch video online opensubtitles free browser ...
1,9211589,down.the.shore.s01.e10.and.justice.for.all.(1992),oh know getting late dont wan na go home im hu...
2,9380845,uncontrollably.fond.s01.e07.heartache.(2016),itiming subtitle uncontrollable lovebird team ...
3,9301436,screen.two.s13.e04.the.precious.blood.(1996),ethereal music apiopensubtitlesorg deprecated ...
4,9408707,battlebots.(2015),chris oh minibots yelling oh leave little bot ...
...,...,...,...
16495,9418128,saranghanda.saranghaji.anneunda.(2011),advertise product brand contact wwwopensubtitl...
16496,9238476,slasher.s01.e04.as.water.is.corrupted.unless.i...,phone ringing ipreviously slasheri cant stay r...
16497,9413962,naruto.shippuden.s01.e136.mangekyo.sharingan.n...,support u become vip member remove ad wwwopens...
16498,9229663,tales.of.the.walking.dead.s01.e05.davon.(2022),man davon wheres boy groaning gasping wheezing...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16500 entries, 0 to 16499
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   num                 16500 non-null  int64 
 1   name                16500 non-null  object
 2   clean_file_content  16500 non-null  object
dtypes: int64(1), object(2)
memory usage: 386.8+ KB


In [5]:
# ChromaDB ids need to be strings, so cast the integer 'num' ids in one vectorised step
# (astype) instead of calling a Python lambda once per row.
df['num'] = df['num'].astype(str)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16500 entries, 0 to 16499
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   num                 16500 non-null  object
 1   name                16500 non-null  object
 2   clean_file_content  16500 non-null  object
dtypes: object(3)
memory usage: 386.8+ KB


In [7]:
#!pip install sentence-transformers

In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding checkpoint used for chunking; the trailing comment names an alternative.
model_name = 'paraphrase-MiniLM-L3-v2' #all-MiniLM-L6-v2
# Load the model on CPU. `model` is a notebook-level global read by semantic_chunking.
model = SentenceTransformer(model_name, device='cpu')

In [9]:
def semantic_chunking(document, similarity_threshold=0.9, encoder=None):
    """Group consecutive, semantically similar sentences of ``document`` into chunks.

    The text is split on '.', every non-blank sentence is embedded, and each
    sentence is compared with the sentence immediately before it using cosine
    similarity. A sentence whose similarity is at least ``similarity_threshold``
    is appended to the running chunk (re-joined with '.'); otherwise the
    running chunk is closed and the sentence starts a new one.

    Parameters
    ----------
    document : str
        Text to split.
    similarity_threshold : float, default 0.9
        Minimum cosine similarity for a sentence to stay in the current chunk.
    encoder : object, optional
        Anything with an ``encode(list_of_str)`` method returning one vector
        per sentence. Defaults to the notebook-level ``model``.

    Returns
    -------
    list of str
        The chunks in document order. Blank segments (for example the empty
        string after a trailing '.') are dropped, so no empty chunk is ever
        returned; a document with no non-blank text yields ``[]``.
    """
    if encoder is None:
        encoder = model  # notebook-level SentenceTransformer

    # Drop blank segments so that a trailing '.' or a run like '..' does not
    # produce empty sentences that get embedded and stored as empty chunks.
    sentences = [segment for segment in document.split('.') if segment.strip()]
    if not sentences:
        return []

    # Embed once and compute each vector's norm once, instead of recomputing
    # both norms for every adjacent pair inside the loop.
    sentence_embeddings = np.asarray(encoder.encode(sentences))
    norms = [np.linalg.norm(vector) for vector in sentence_embeddings]

    chunks = []
    current_chunk = sentences[0]

    for i in range(1, len(sentences)):
        denominator = norms[i] * norms[i - 1]
        if denominator:
            similarity_score = np.dot(sentence_embeddings[i], sentence_embeddings[i - 1]) / denominator
        else:
            # A zero-length embedding has no direction; treat it as dissimilar
            # rather than dividing by zero and comparing against NaN.
            similarity_score = 0.0

        if similarity_score >= similarity_threshold:
            current_chunk += '.' + sentences[i]
        else:
            chunks.append(current_chunk)
            current_chunk = sentences[i]

    # The loop only appends a chunk when the next one starts, so flush the last.
    chunks.append(current_chunk)

    return chunks

In [10]:
# temporary dataframes to split the data into two parts
# (each one receives its own half of the 'num' ids and, later, the matching chunks)

temp_1 = pd.DataFrame()
temp_2 = pd.DataFrame()

In [11]:
# First half of the ids: rows at positions 0..8249.
temp_1['num'] = df['num'].iloc[:8250]

In [12]:
# Second half of the ids: rows from position 8250 onwards (their index labels are kept as-is).
temp_2['num'] = df['num'].iloc[8250:]

In [15]:
# 1st section using joblib for parallel processing on the first part of the data

from joblib import Parallel, delayed
import time
# Wall-clock timestamps; used only for the elapsed-time report printed at the end of the cell.
start = time.time()
# Apply semantic_chunking to each of the first 8250 documents; n_jobs=-1 asks joblib to use all cores.
# The resulting list is assigned as the 'chunks' column next to the ids already stored in temp_1.
temp_1['chunks'] = Parallel(n_jobs=-1)(delayed(semantic_chunking)(item) for item in df['clean_file_content'].values[:8250])
end=time.time()
print(f"Total time in seconds = {end-start}")

Total time in seconds = 1786.451331615448


In [16]:
#saving to json file

temp_1.to_json("database.json") #saving data to a json file so the kernel can be restarted to free RAM

In [13]:
# 2nd section - using joblib for parallel processing on the second part of the data
from joblib import Parallel, delayed
import time
# Wall-clock timestamps; used only for the elapsed-time report printed at the end of the cell.
start = time.time()
# Apply semantic_chunking to the documents from position 8250 onwards; n_jobs=-1 asks joblib to use all cores.
# The resulting list is assigned as the 'chunks' column next to the ids already stored in temp_2.
temp_2['chunks'] = Parallel(n_jobs=-1)(delayed(semantic_chunking)(item) for item in df['clean_file_content'].values[8250:])
end=time.time()
print(f"Total time in seconds = {end-start}")

Total time in seconds = 1762.5092811584473


In [14]:
temp_2.to_json("database_p2.json") #saving data to a json file so the kernel can be restarted to free RAM

In [14]:
# restarting the kernel
# interacting with each part of the json file

import json

# Process one saved half at a time: switch this to "database_p2.json" for the second half.
json_file_path = "database.json" #database_p2.json
with open(json_file_path, 'r') as f:
  data = json.load(f)

# Rebuild the frame from the parsed JSON. This rebinds `df`, which earlier cells used for the full CSV frame.
# NOTE(review): JSON object keys are strings, so the row labels presumably come back as strings
# ('0', '1', ... or '8250', ... for the second file) - verify before relying on the index.
df = pd.DataFrame(data)

In [15]:
df

Unnamed: 0,num,chunks
0,9251120,[watch video online opensubtitles free browser...
1,9211589,[oh know getting late dont wan na go home im h...
2,9380845,[itiming subtitle uncontrollable lovebird team...
3,9301436,[ethereal music apiopensubtitlesorg deprecated...
4,9408707,[chris oh minibots yelling oh leave little bot...
...,...,...
8245,9194538,[watch video online opensubtitles free browser...
8246,9461187,[hey twizzle meow hey twizzle twizzle girl huh...
8247,9309873,[script info title english u original script t...
8248,9255721,[life span link eternity begun must end advert...


In [16]:
df['chunks'][0] # First row of chunked data

  df['chunks'][0] # First row of chunked data


['watch video online opensubtitles free browser extension osdblinkext iit couldve another summeri ibut set foot sandi ithat summer suddenly felt differenti ilike going summeri ithat would change lifei ithe summer freedomi ithe summer endless possibilitiesi ithe summer ooh aah ooh oh oh oh ooh ithat summer mei youre quite dancer stop come keep dancing whatever im kidding dont get mad huh hey im going get towel stop thought gon na kiss excuse wan na kiss yet mean yet youre girl mean girl girlfriend miss wish dont call miss dont pretend gentleman youre clearly call rude snob bitch douche handsome conceited like huh jerk exactly type leave alone steph aha steph ill call tep remove f way im tonio still tomorrow dont leave yet im going court ii chose walk away youi ibut fate different plani councilor teaching basic english literacy well teaching child read write english yes long program run okay entire summer ii thought could escape youi ibut somehow found againi tep couldnt resist huh youre

## Generating Text Embeddings

### Part 1: Preparing the indexes

In [17]:
def indexer(item, frame=None):
    """Build the unique chunk ids ``"<num>-<j>"`` for one subtitle file.

    Parameters
    ----------
    item : str
        The 'num' id of the file whose chunks should be numbered.
    frame : pandas.DataFrame, optional
        Frame with 'num' and 'chunks' columns. Defaults to the notebook-level
        ``df``.

    Returns
    -------
    list of str
        One id per chunk of the first row whose 'num' equals ``item``, in
        chunk order: ``item-0``, ``item-1``, ...

    Raises
    ------
    IndexError
        If no row has ``num == item``.
    """
    if frame is None:
        frame = df  # notebook-level frame holding 'num' and 'chunks'

    # Select the matching row with a boolean mask and take the first hit by
    # position. The previous version converted the row's index *label* to int
    # and used it as a *position*, which goes out of bounds whenever labels do
    # not start at 0 (e.g. the second half of the data, labelled from 8250).
    chunk_list = frame.loc[frame['num'] == item, 'chunks'].iloc[0]

    # The id must be unique per chunk, so append the chunk position after a hyphen.
    return [f"{item}-{j}" for j in range(len(chunk_list))]

In [18]:
# Build the per-chunk ids for every row by passing each 'num' value straight to indexer.
df['num_list'] = df['num'].apply(indexer)

In [19]:
df

Unnamed: 0,num,chunks,num_list
0,9251120,[watch video online opensubtitles free browser...,[9251120-0]
1,9211589,[oh know getting late dont wan na go home im h...,[9211589-0]
2,9380845,[itiming subtitle uncontrollable lovebird team...,[9380845-0]
3,9301436,[ethereal music apiopensubtitlesorg deprecated...,[9301436-0]
4,9408707,[chris oh minibots yelling oh leave little bot...,[9408707-0]
...,...,...,...
8245,9194538,[watch video online opensubtitles free browser...,[9194538-0]
8246,9461187,[hey twizzle meow hey twizzle twizzle girl huh...,[9461187-0]
8247,9309873,[script info title english u original script t...,[9309873-0]
8248,9255721,[life span link eternity begun must end advert...,[9255721-0]


In [23]:
df['num_list'][0]

  df['num_list'][0]


['9251120-0']

### Part 2: Creating the text embeddings

In [20]:
from sentence_transformers import SentenceTransformer

# Same checkpoint as used for chunking; it is loaded again here because the kernel was restarted.
model_name = 'paraphrase-MiniLM-L3-v2' #all-MiniLM-L6-v2
# Load the model on CPU. `model` is a notebook-level global read by embedding_gen.
model = SentenceTransformer(model_name, device='cpu')

In [21]:
def embedding_gen(data):
  """Encode ``data`` with the notebook-level ``model`` and return plain Python lists.

  ``data`` is passed unchanged to ``model.encode``; the result is converted
  with ``.tolist()`` so it no longer holds NumPy objects.
  """
  vectors = model.encode(data)
  return vectors.tolist()

In [22]:
from joblib import Parallel, delayed
import time
# Wall-clock timestamps; used only for the elapsed-time report printed at the end of the cell.
start = time.time()
# Embed each row's list of chunks with embedding_gen; n_jobs=-1 asks joblib to use all cores.
# Each row of the new 'embeddings' column receives the nested list returned for that row's chunks.
df['embeddings'] = Parallel(n_jobs=-1)(delayed(embedding_gen)(item) for item in df['chunks'].values)
end = time.time()
print(f"Total time in seconds = {end-start}")

Total time in seconds = 1719.1567568778992


In [24]:
df

Unnamed: 0,num,chunks,num_list,embeddings
0,9251120,[watch video online opensubtitles free browser...,[9251120-0],"[[0.008589413948357105, -0.08819494396448135, ..."
1,9211589,[oh know getting late dont wan na go home im h...,[9211589-0],"[[0.09815378487110138, -0.08552570641040802, 0..."
2,9380845,[itiming subtitle uncontrollable lovebird team...,[9380845-0],"[[0.27235913276672363, -0.10438156127929688, 0..."
3,9301436,[ethereal music apiopensubtitlesorg deprecated...,[9301436-0],"[[0.22709397971630096, 0.060651995241642, 0.02..."
4,9408707,[chris oh minibots yelling oh leave little bot...,[9408707-0],"[[-0.151687353849411, -0.15321648120880127, 0...."
...,...,...,...,...
8245,9194538,[watch video online opensubtitles free browser...,[9194538-0],"[[-0.12424875050783157, 0.029957635328173637, ..."
8246,9461187,[hey twizzle meow hey twizzle twizzle girl huh...,[9461187-0],"[[0.07549671828746796, -0.03596583753824234, 0..."
8247,9309873,[script info title english u original script t...,[9309873-0],"[[-0.08611387014389038, 0.06799688935279846, -..."
8248,9255721,[life span link eternity begun must end advert...,[9255721-0],"[[-0.18428510427474976, 0.16856567561626434, 0..."


In [25]:
df["embeddings"][0]

  df["embeddings"][0]


[[0.008589413948357105,
  -0.08819494396448135,
  0.3474327325820923,
  0.04450773820281029,
  -0.13636669516563416,
  -0.027391467243433,
  0.4440867602825165,
  -0.22191067039966583,
  -0.04648255929350853,
  -0.05163887143135071,
  -0.017414862290024757,
  0.02495034597814083,
  0.13749660551548004,
  0.07140470296144485,
  -0.14921823143959045,
  0.08758902549743652,
  0.1677783727645874,
  0.14777332544326782,
  0.07618239521980286,
  0.18247151374816895,
  0.02071760967373848,
  -0.21852253377437592,
  0.07975809276103973,
  0.0476725697517395,
  -0.10989607125520706,
  -0.1418619304895401,
  0.09663974493741989,
  0.20419131219387054,
  0.11492633819580078,
  -0.041010405868291855,
  0.1857718974351883,
  0.14292101562023163,
  -0.06957104802131653,
  0.0025930972769856453,
  0.04487820714712143,
  -0.0954572781920433,
  -0.13995306193828583,
  -0.270480215549469,
  -0.10864361375570297,
  -0.04571092873811722,
  0.06139260530471802,
  -0.02180173620581627,
  -0.1480872482061386

### Storing Data in Chromadb

#### Setting up chromadb

In [27]:
pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl (526 kB)
     ------------------------------------ 526.8/526.8 kB 393.4 kB/s eta 0:00:00
Collecting opentelemetry-api>=1.2.0
  Downloading opentelemetry_api-1.24.0-py3-none-any.whl (60 kB)
     ---------------------------------------- 60.1/60.1 kB 3.1 MB/s eta 0:00:00
Collecting uvicorn[standard]>=0.18.3
  Downloading uvicorn-0.29.0-py3-none-any.whl (60 kB)
     ---------------------------------------- 60.8/60.8 kB 1.1 MB/s eta 0:00:00
Collecting chroma-hnswlib==0.7.3
  Downloading chroma_hnswlib-0.7.3-cp39-cp39-win_amd64.whl (150 kB)
     -------------------------------------- 150.6/150.6 kB 1.1 MB/s eta 0:00:00
Collecting orjson>=3.9.12
  Downloading orjson-3.10.1-cp39-none-win_amd64.whl (138 kB)
     ------------------------------------ 138.9/138.9 kB 748.2 kB/s eta 0:00:00
Collecting onnxruntime>=1.14.1
  Downloading onnxruntime-1.17.3-cp39-cp39-win_amd64.whl (5.6 MB)
     ---------------------------------------- 5.6

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.2.2 requires pyqt5<5.13, which is not installed.
spyder 5.2.2 requires pyqtwebengine<5.13, which is not installed.
anaconda-project 0.11.1 requires ruamel-yaml, which is not installed.


In [30]:
import chromadb
# Open an on-disk Chroma database at the given folder.
# NOTE(review): this is a machine-specific absolute path; the cell will not run elsewhere unchanged.
client = chromadb.PersistentClient(path=r"C:\Users\shetk\OneDrive\Desktop\Innomatics_Internship\Task_9_Search Engine\Webapp_Hanna_Reference")
# Fetch the two collections, creating them on first use. Both pass "hnsw:space": "cosine" as metadata.
# "search_engine_FileName" is the collection add_func_v1 fills with file names.
collection = client.get_or_create_collection(name="search_engine", metadata={"hnsw:space": "cosine"})
collection_2 = client.get_or_create_collection(name="search_engine_FileName", metadata={"hnsw:space": "cosine"})

#### Creating a function to add the filenames of our subtitles

In [33]:
# Re-open the on-disk Chroma database and fetch the two existing collections.
# NOTE(review): this is a machine-specific absolute path; the cell will not run elsewhere unchanged.
client = chromadb.PersistentClient(path=r"C:\Users\shetk\OneDrive\Desktop\Innomatics_Internship\Task_9_Search Engine\Webapp_Hanna_Reference") #_test_db
collection = client.get_collection(name="search_engine") #test_collection
# Despite its name, `collection_name` holds the collection object for "search_engine_FileName".
collection_name = client.get_collection(name="search_engine_FileName")
# Spell the checkpoint id exactly as the other cells do ('...-v2', lowercase v) so that every
# cell refers to the same model under the same name.
model_name="paraphrase-MiniLM-L3-v2"
model = SentenceTransformer(model_name, device="cpu")

### Creating a function to add the chunks, embeddings and unique identifiers for our subtitle files

In [34]:
# Ran this part already before splitting data into 2 temporary dataframes
def add_func_v1():
    """Add the file name of every row of the global ``df`` to ``collection_2``.

    Takes no arguments and reads the notebook-level ``df`` (columns 'name'
    and 'num') and ``collection_2``. One ``add`` call is issued per row.

    NOTE(review): this needs a ``df`` that still has the 'name' column; the
    frame rebuilt from the saved JSON is displayed with only 'num' and
    'chunks', so confirm which frame is bound to ``df`` before calling.
    """
    for i in range(df.shape[0]): #setting the range as total no. of rows in dataframe
        collection_2.add(
            documents=[df['name'].iloc[i]], # adding each filename
            embeddings=[[1,2,34,45]], # fixed placeholder vector, as we don't need it when retrieving file_name
            ids=[df['num'].iloc[i]] # entering unique 'num' id
        )