In [71]:
from sentence_transformers import SentenceTransformer
import json
import numpy as np
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import os
from langchain.chains.retrieval import create_retrieval_chain
from langchain import hub
from langchain_groq import ChatGroq
from langchain.chains.combine_documents import create_stuff_documents_chain


In [53]:
# 1. Load the scenes from the JSONL file (one JSON object per line)
scene_chunks_path = "../data/scene_chunks.jsonl"
with open(scene_chunks_path, "r", encoding="utf-8") as f:
    # Skip whitespace-only lines (e.g. a trailing blank line at the end of the
    # file): json.loads would raise JSONDecodeError on them.
    scene_chunks = [json.loads(line) for line in f if line.strip()]

In [None]:
# 2. Create LangChain Documents
def _scene_to_document(position, chunk):
    """Wrap one scene record in a Document.

    The scene text becomes the page content. The metadata carries the scene's
    "scene_id" (falling back to its position in the list when the key is
    missing) and its "speakers" (falling back to an empty list).
    """
    text = chunk["text"]
    scene_metadata = {
        "scene_id": chunk.get("scene_id", position),
        "speakers": chunk.get("speakers", []),
    }
    return Document(page_content=text, metadata=scene_metadata)


documents = [
    _scene_to_document(position, chunk)
    for position, chunk in enumerate(scene_chunks)
]

In [59]:
# Sanity check: report how many documents were built, then preview the first.
print(f"Loaded {len(documents)} documents.")
first_document = documents[0]
print("Scene Text:", first_document.page_content)
print("Metadata:", first_document.metadata)

Loaded 8157 documents.
Scene Text: Michael: All right Jim. Your quarterlies look very good. How are things at the library?
Jim: Oh, I told you. I couldn't close it. So...
Michael: So you've come to the master for guidance? Is this what you're saying, grasshopper?
Jim: Actually, you called me in here, but yeah.
Michael: All right. Well, let me show you how it's done.
Metadata: {'scene_id': 'S1E1_Scene1', 'speakers': ['Michael', 'Jim']}


In [56]:
# 3. Load HuggingFace Embedding Model
# The model identifier is kept in its own variable so it is easy to spot and swap.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [57]:
# 4. Build FAISS vector database
# Index every scene Document using the embedding model loaded above.
vectorstore = FAISS.from_documents(documents, embedding_model)

In [58]:
# 5. Save the vector database locally
# The index is written to a "scene_db" sub-folder of the output folder.
output_folder = "../data/vector_databases"
scene_db_path = os.path.join(output_folder, "scene_db")
vectorstore.save_local(scene_db_path)

Build a Retriever

In [60]:
# 1. Load your saved vector database
# NOTE: allow_dangerous_deserialization=True explicitly opts in to loading
# serialized data from disk; only keep it for index folders you created yourself.
scene_db = FAISS.load_local(
    folder_path="../data/vector_databases/scene_db",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

# Expose the loaded store through the retriever interface (default settings).
retriever = scene_db.as_retriever()

In [None]:
# 2. Test a basic retrieval
# Search for scenes related to "Jim's prank"
query = "Tell me about Jim's pranks on Dwight."
top_k = 5

# Retrieve top k
# The number of results is a retriever setting (search_kwargs["k"]), so build a
# retriever for top_k on the same vector store and call it through `invoke`.
# `get_relevant_documents(query, k=5)` is deprecated, and depending on the
# installed langchain-core version its per-call `k` is silently ignored in
# favour of the retriever's default.
top_k_retriever = retriever.vectorstore.as_retriever(search_kwargs={"k": top_k})
retrieved_docs = top_k_retriever.invoke(query)

# Show the results
for idx, doc in enumerate(retrieved_docs):
    print(f"\nScene {idx+1}:")
    print("Text:", doc.page_content)
    print("Metadata:", doc.metadata)


Scene 1:
Text: Jim: Dwight has made me his bestisch mensch. Which is Schrute for best man. He's putting himself entirely in my hands tonight. And I know for over 12 years I've done nothing but trick and prank him but tonight...only good surprises. "Guten Pranken". 
Metadata: {'scene_id': 'S9E24_Scene8055', 'speakers': ['Jim']}

Scene 2:
Text: Jim: You know, when I saw Dwight, I realized how stupid and petty all those pranks I pulled on him were. And then he spoke. I wonder how hard it would be to get a copy of his room key.
Metadata: {'scene_id': 'S3E2_Scene1282', 'speakers': ['Jim']}

Scene 3:
Text: Pam: I feel horrible for blowing Jim's prank. I don't know if you can tell, but he's mildly upset. And Dwight hasn't been messed with in a while, so he's become a monster. I need to make this right.
Metadata: {'scene_id': 'S7E1_Scene5137', 'speakers': ['Pam']}

Scene 4:
Text: Jim: Oh no, this is different. The CIA thing, that was a prank on Dwight. This is more like a umm... OK, it's pret

In [80]:
# 3. Chain the Retriever to LLM for a nice answer
llm = ChatGroq(
    model="llama3-8b-8192",
    temperature=0.5,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Prompt template pulled from the LangChain hub; it is used to combine the
# retrieved documents with the user's input.
qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Stuff documents chain (for combining retrieved docs)
stuff_documents_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full Retrieval-Augmented chain
scene_retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=stuff_documents_chain,
)

# Query the system
# NOTE(review): the persona instructions are part of the "input" string, so they
# are presumably also used as the retrieval query -- confirm this is intended.
question = "You are Jim Halpert. Answer in a witty, sarcastic tone like Jim. Now, tell me about your pranks on Dwight."
output = scene_retrieval_chain.invoke({"input": question})
print(output["answer"])


  for url, api_key in _write_api_urls.items():


You want to know about the good old days, huh? Well, let me tell you, I've spent a small fortune on staplers, and a significant amount of time perfecting the art of desk-jockeying. I mean, who needs actual work when you can spend your days tormenting the Assistant (to the) Regional Manager?

But, in all seriousness, I've got to admit, some of those pranks were pretty genius. I mean, who else could make a stapler look like a snake? Or turn Dwight's desk into a giant Jenga game? (Okay, maybe that one was a bit of a stretch, but still.)

But, as I was saying, I've come to realize that all those pranks were just a bit...petty. I mean, what's the point of filling Dwight's stapler with Jell-O or putting his chair on wheels? It's not like it's going to change the course of human history or anything. Although, I do have to admit, it's been a pretty good way to pass the time.

So, now that I've had a chance to reflect on my prankster days, I've decided to take a more...diplomatic approach. I me