In [None]:
!pip install sentence-transformers
!pip install torch
!pip install transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import nltk
# Download the NLTK tokenizer data packages (punkt and punkt_tab);
# presumably these are what sent_tokenize needs at run time.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
from nltk.tokenize import sent_tokenize

# Sentence-embedding model, used to encode both paragraphs and queries.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Question-answering pipeline (PyTorch backend) that is given a question
# together with a context passage.
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    framework="pt",
)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Device set to use cpu


In [None]:
# Load movie plots.
# The file is parsed as JSON into `plots`; the cells below iterate over it and
# read the 'text' and 'title' keys of every entry.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding (JSON text is normally UTF-8).
with open("/content/plots.json", "r", encoding="utf-8") as f:
    plots = json.load(f)

In [None]:
chunk_size = 4  # sentences per paragraph (tunable)

paragraphs = []
metadata = []  # metadata[k] is the title of the movie that paragraphs[k] came from

# Split every plot into sentences, then glue consecutive runs of up to
# `chunk_size` sentences back together into one paragraph each.
for plot in plots:
    plot_sentences = sent_tokenize(plot['text'])
    for start in range(0, len(plot_sentences), chunk_size):
        paragraphs.append(' '.join(plot_sentences[start:start + chunk_size]))
        metadata.append(plot['title'])

# Encode all paragraphs in a single call; the result is kept as a tensor so
# it can be compared against query embeddings later.
paragraph_embeddings = embedder.encode(paragraphs, convert_to_tensor=True)

In [None]:
def find_best_paragraph(query):
    """Return the ``(movie title, paragraph)`` pair that best matches ``query``.

    The query is encoded with ``embedder`` and scored by cosine similarity
    against the precomputed ``paragraph_embeddings``. The index of the highest
    score selects the matching entries of ``metadata`` and ``paragraphs``.

    Args:
        query: The question text to search for.

    Returns:
        A tuple ``(title, paragraph)`` taken from ``metadata`` and
        ``paragraphs`` at the best-scoring index.
    """
    encoded_query = embedder.encode(query, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(encoded_query, paragraph_embeddings)
    # Only the first row of the similarity result is used for ranking.
    query_scores = similarity[0]
    top_match = torch.argmax(query_scores).item()
    title = metadata[top_match]
    paragraph = paragraphs[top_match]
    return title, paragraph

def answer_question(query):
    """Answer ``query`` from the best-matching plot paragraph.

    The paragraph chosen by ``find_best_paragraph`` is passed to
    ``qa_pipeline`` as the context for the same question.

    Args:
        query: The question text.

    Returns:
        A tuple ``(movie_title, answer_text)``, where ``answer_text`` is the
        ``'answer'`` entry of the pipeline result.
    """
    title, context_paragraph = find_best_paragraph(query)
    qa_result = qa_pipeline(question=query, context=context_paragraph)
    answer_text = qa_result['answer']
    return title, answer_text


In [None]:
# Demo query: run the retrieve-then-answer flow for one question and print the
# movie title picked by retrieval together with the extracted answer.
question = "What happens to Cooper in the end?"
movie, answer = answer_question(question)
print(f"Movie: {movie}\nAnswer: {answer}")

Movie: Interstellar
Answer: He reunites with Murph


In [None]:
# Demo query: same flow as above, for a "how" question.
# NOTE(review): the output recorded with this cell names a dream rather than
# an escape method. Since only the single best-scoring paragraph is used as
# context, it may not contain the actual answer; verify the retrieved paragraph.
question = "How does Andy escape from prison?"
movie, answer = answer_question(question)
print(f"Movie: {movie}\nAnswer: {answer}")

Movie: The Shawshank Redemption
Answer: he dreams of living in Zihuatanejo


In [None]:
# Demo query: the movie title appears in the question itself; prints the
# retrieved movie title and the extracted answer.
question = "What is the moon in Moonfall?"
movie, answer = answer_question(question)
print(f"Movie: {movie}\nAnswer: {answer}")

Movie: Moonfall
Answer: hollow
