In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("clean_data.csv")

In [3]:
df.drop(columns=["Unnamed: 0"], inplace=True)

In [4]:
df.head()

Unnamed: 0,subtitle_id,name,clean_data
0,9251120,maybe.this.time.(2014).eng.1cd,watch any video online with open subtitle free...
1,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,oh i know that it s getting late but i don t w...
2,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,i timing and subtitle by the uncontrollable lo...
3,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,ethereal music api opensubtitles org is deprec...
4,9408707,battlebots.(2015).eng.1cd,chris oh no not the minibots yelling oh you le...


In [5]:
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()
def chunk_text(text, chunk_size=500, overlap_size=50):
    chunks = []
    words = text.split()
    start_idx = 0
    while start_idx < len(words):
        end_idx = min(start_idx + chunk_size, len(words))
        chunk = ' '.join(words[start_idx:end_idx])
        chunks.append(chunk)
        start_idx += chunk_size - overlap_size
    return chunks

# Apply chunking function to the 'clean_subtitles' column
chunk_size = 500  # Number of tokens per chunk
overlap_size = 50  # Number of tokens to overlap between chunks

df['chunks'] = df['clean_data'].progress_apply(lambda x: chunk_text(x, chunk_size, overlap_size))

100%|██████████████████████████████████████████████████████████████████████████| 24749/24749 [00:17<00:00, 1393.57it/s]


In [6]:
df.shape

(24749, 4)

In [7]:
df_exploded = df.explode('chunks')

In [8]:
df_exploded.head()

Unnamed: 0,subtitle_id,name,clean_data,chunks
0,9251120,maybe.this.time.(2014).eng.1cd,watch any video online with open subtitle free...,watch any video online with open subtitle free...
0,9251120,maybe.this.time.(2014).eng.1cd,watch any video online with open subtitle free...,a point oh tonio seventy snack seven seven sil...
0,9251120,maybe.this.time.(2014).eng.1cd,watch any video online with open subtitle free...,like it yes you know this is my favorite spot ...
0,9251120,maybe.this.time.(2014).eng.1cd,watch any video online with open subtitle free...,i wa suddenly given you surprised me a vacatio...
0,9251120,maybe.this.time.(2014).eng.1cd,watch any video online with open subtitle free...,presentation in fairness that s so good your p...


In [9]:
df_exploded.shape

(311548, 4)

In [10]:
pip install -U sentence-transformers




In [7]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

In [12]:
from tqdm import tqdm
tqdm.pandas()

def encode_and_convert_to_list(text):
    encoded_vector = model.encode(text).tolist()
    return encoded_vector

# Apply the function to each element of 'chunks'
df_exploded['doc_vector_pretrained_bert'] = df_exploded['chunks'].progress_apply(encode_and_convert_to_list)

100%|███████████████████████████████████████████████████████████████████████| 311548/311548 [16:23:31<00:00,  5.28it/s]


In [13]:
df_bert_pretrained = list(df_exploded.doc_vector_pretrained_bert)

In [15]:
# Initialize empty lists
documents = []
metadatas = []
ids = []
id = 1

# Iterate over DataFrame rows
for index, row in df_exploded.iterrows():
    # Append document from DataFrame to documents list
    documents.append(row['chunks'])

    # Append metadata (subtitle_id) from DataFrame to metadatas list
    metadatas.append({"subtitle_id": row['subtitle_id'], "name": row['name']})

    # Append unique ID to ids list
    ids.append(str(id))

    # Increment ID
    id += 1

In [16]:
pip install chromadb




In [17]:
import chromadb

In [18]:
client = chromadb.PersistentClient(path="vectordb")

In [19]:
client.heartbeat()

1713843177545200400

In [20]:
collection = client.create_collection(
        name="Data",
        metadata={"hnsw:space": "cosine"}
    )

In [21]:
from tqdm import tqdm

batch_size = 5000
total_batches = (len(documents) + batch_size - 1) // batch_size

for i in tqdm(range(total_batches), desc="Adding batches"):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(documents))

    batch_doc = documents[start_idx:end_idx]
    batch_metadatas = metadatas[start_idx:end_idx]
    batch_ids = ids[start_idx:end_idx]
    embed = df_bert_pretrained[start_idx:end_idx]

    collection.add(
        embeddings=embed,
        documents=batch_doc,
        metadatas=batch_metadatas,
        ids=batch_ids
    )


Adding batches: 100%|██████████████████████████████████████████████████████████████| 63/63 [14:09:20<00:00, 808.90s/it]


In [2]:
import chromadb
client = chromadb.PersistentClient(path="vectordb")
collection = client.get_collection(name="Data")

In [3]:
def encode_query(query_point, model):
    """
    Encode the query point using the provided model.
    """
    return model.encode(query_point).tolist()

In [4]:
def print_movie_info(result):
    """
    Extract movie names and subtitle IDs from the query results.
    """
    movie_info = [(meta['name'], meta['subtitle_id']) for sublist in result['metadatas'] for meta in sublist if 'name' in meta and 'subtitle_id' in meta]
    return movie_info

In [5]:
prompt="Horror"

In [8]:
doc_vector = encode_query(prompt, model)

In [17]:
result = collection.query(
    query_embeddings=doc_vector,
    n_results=10
)

In [18]:
movie_info = result

In [19]:
movie_info

{'ids': [['165679',
   '301121',
   '116303',
   '165643',
   '265161',
   '301123',
   '108003',
   '126093',
   '165680',
   '116300']],
 'distances': [[0.6076998710632324,
   0.613140881061554,
   0.6222995519638062,
   0.6291930079460144,
   0.6295484304428101,
   0.6315937638282776,
   0.6480051279067993,
   0.6513936519622803,
   0.6526635885238647,
   0.6533650159835815]],
 'metadatas': [[{'name': 'in.search.of.darkness.part.iii.(2022).eng.2cd',
    'subtitle_id': 9417477},
   {'name': 'blumhouses.compendium.of.horror.s01.e01.american.monsters.(2022).eng.1cd',
    'subtitle_id': 9260194},
   {'name': 'living.with.chucky.(2022).eng.1cd', 'subtitle_id': 9493886},
   {'name': 'in.search.of.darkness.part.iii.(2022).eng.2cd',
    'subtitle_id': 9417477},
   {'name': 'man.to.man.with.dean.learner.s01.e01.garth.marenghi.(2006).eng.1cd',
    'subtitle_id': 9383265},
   {'name': 'blumhouses.compendium.of.horror.s01.e01.american.monsters.(2022).eng.1cd',
    'subtitle_id': 9260194},
   {'

In [22]:
movie_info =print_movie_info(result)

In [23]:
movie_info

[('in.search.of.darkness.part.iii.(2022).eng.2cd', 9417477),
 ('blumhouses.compendium.of.horror.s01.e01.american.monsters.(2022).eng.1cd',
  9260194),
 ('living.with.chucky.(2022).eng.1cd', 9493886),
 ('in.search.of.darkness.part.iii.(2022).eng.2cd', 9417477),
 ('man.to.man.with.dean.learner.s01.e01.garth.marenghi.(2006).eng.1cd',
  9383265),
 ('blumhouses.compendium.of.horror.s01.e01.american.monsters.(2022).eng.1cd',
  9260194),
 ('unknown.dimension.the.story.of.paranormal.activity.(2021).eng.1cd',
  9295030),
 ('one.of.us.is.lying.s02.e01.simon.says.game.on.(2022).eng.1cd', 9281598),
 ('in.search.of.darkness.part.iii.(2022).eng.2cd', 9417477),
 ('living.with.chucky.(2022).eng.1cd', 9493886)]