In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("phiitm/marvel-cinematic-universe-dialogue-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/phiitm/marvel-cinematic-universe-dialogue-dataset?dataset_version_number=1...


100%|██████████| 474k/474k [00:01<00:00, 444kB/s]

Extracting files...
Path to dataset files: C:\Users\abhay\.cache\kagglehub\datasets\phiitm\marvel-cinematic-universe-dialogue-dataset\versions\1





In [9]:
import os
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter

In [None]:
subtitle_dir = r"C:\Users\abhay\OneDrive\Desktop\PJ_Revamp\Advanced-Semantic-Search-Engine\version_2\subtitles"
documents = []
# make sure you change the path in subtitles_dir

def read_file_with_fallback(filepath):
    encodings = ["utf-8", "latin-1", "cp1252"]
    for enc in encodings:
        try:
            with open(filepath, "r", encoding=enc) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    print(f"Failed to decode {filepath}")
    return ""

for filename in os.listdir(subtitle_dir):
    if filename.endswith(".txt"):
        filepath = os.path.join(subtitle_dir, filename)
        text = read_file_with_fallback(filepath)
        if text:
            documents.append(Document(page_content=text, metadata={"source": filename}))

In [14]:
splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = []
for doc in documents:
    splits = splitter.split_text(doc.page_content)
    for chunk in splits:
        docs.append(Document(page_content=chunk, metadata={"source": doc.metadata["source"]}))

Created a chunk of size 25878, which is longer than the specified 500


In [15]:
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")

# Save ChromaDB locally in ./chroma_subtitles
db = Chroma.from_documents(docs, embedding=embedding_function, persist_directory="./chroma_subtitles")
db.persist()

In [16]:
def find_movie_from_line(subtitle_line: str, top_k: int = 3):
    results = db.similarity_search_with_score(subtitle_line, k=top_k)
    for doc, score in results:
        print(f"Movie: {doc.metadata['source']} | Score: {score:.4f}")
        print(f"Matched Subtitle: {doc.page_content[:200]}")
        print("-" * 80)

# Example usage:
user_input = "I can do this all day."
find_movie_from_line(user_input)

Movie: Avengers.Endgame.txt | Score: 45.3334
Matched Subtitle: Okay, hold on, don't shoot.
- You see where you're going? - Mm-hmm.
Okay.
Now, let's worry about how you get there.
Gotta move your foot here.
Point your toe this way.
Your hips here.
Okay?
- Can you 
--------------------------------------------------------------------------------
Movie: Spider-Man.Homecoming.txt | Score: 45.4038
Matched Subtitle: Things are never gonna be the same now.
I mean, look at this.
You got aliens.
You got big green guys tearing down buildings.
When I was a kid, I used to draw cowboys and Indians.
Actually, it's native
--------------------------------------------------------------------------------
Movie: Captain.America.The.First.Avenger.txt | Score: 45.7433
Matched Subtitle: Are you the guys from Washington?
You get many other visitors out here?
How long you been on-site?
Since this morning.
A Russian oil team called it in about 18 hours ago.
How come nobody spotted it be
------------------------

In [18]:
# Example usage:
user_input = "I can do this all day"
find_movie_from_line(user_input)

Movie: Avengers.Endgame.txt | Score: 47.0558
Matched Subtitle: Okay, hold on, don't shoot.
- You see where you're going? - Mm-hmm.
Okay.
Now, let's worry about how you get there.
Gotta move your foot here.
Point your toe this way.
Your hips here.
Okay?
- Can you 
--------------------------------------------------------------------------------
Movie: Spider-Man.Homecoming.txt | Score: 47.2353
Matched Subtitle: Things are never gonna be the same now.
I mean, look at this.
You got aliens.
You got big green guys tearing down buildings.
When I was a kid, I used to draw cowboys and Indians.
Actually, it's native
--------------------------------------------------------------------------------
Movie: Captain.America.The.First.Avenger.txt | Score: 47.3899
Matched Subtitle: Are you the guys from Washington?
You get many other visitors out here?
How long you been on-site?
Since this morning.
A Russian oil team called it in about 18 hours ago.
How come nobody spotted it be
------------------------

In [19]:
# Example usage:
user_input = "You got big green guys tearing down buildings."
find_movie_from_line(user_input)

Movie: Spider-Man.Homecoming.txt | Score: 43.6696
Matched Subtitle: Things are never gonna be the same now.
I mean, look at this.
You got aliens.
You got big green guys tearing down buildings.
When I was a kid, I used to draw cowboys and Indians.
Actually, it's native
--------------------------------------------------------------------------------
Movie: Spider-Man.Homecoming.txt | Score: 49.8828
Matched Subtitle: Things are never gonna be the same now.
I mean, look at this.
You got aliens.
You got big green guys tearing down buildings.
When I was a kid, I used to draw cowboys and Indians.
Actually, it's native
--------------------------------------------------------------------------------
Movie: Captain.America.The.First.Avenger.txt | Score: 51.1737
Matched Subtitle: Are you the guys from Washington?
You get many other visitors out here?
How long you been on-site?
Since this morning.
A Russian oil team called it in about 18 hours ago.
How come nobody spotted it be
-------------------