In [1]:
# Load the jupyter_black IPython extension to apply Black formatting to the
# notebook's code (optional, but recommended for consistent style).
# NOTE(review): presumably requires the `jupyter_black` package to be installed
# in the kernel's environment — confirm before sharing the notebook.
%load_ext jupyter_black

In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_mistralai import ChatMistralAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import textwrap


# Populate the process environment from the project's .env file (python-dotenv)
load_dotenv()

# The Mistral client created below needs this key, so fail fast when it is
# missing or empty instead of failing later inside an API call.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
if MISTRAL_API_KEY:
    print("Mistral API key loaded from .env")
else:
    raise ValueError(
        "Mistral API key not found. Check your .env file and ensure MISTRAL_API_KEY is set."
    )

# Initialize the Mistral chat model once; the connection test below and the
# RetrievalQA chain later in the notebook both reuse this `llm` object.
# (The original cell built the identical client twice; the copy was redundant.)
llm = ChatMistralAI(
    model="mistral-small-latest",
    temperature=0,
    max_retries=2,
    api_key=MISTRAL_API_KEY,  # Pass the key read from the environment explicitly
)

# Smoke-test the Mistral connection with one short prompt.
# Any failure is reported on stdout rather than stopping the notebook.
try:
    chat_response = llm.invoke("Hello, how are you?")
    # Print the whole response object (not only its text) so its structure can be inspected
    print("Test response from Mistral:", chat_response, sep="\n")
except Exception as e:
    print(f"Error testing Mistral connection: {e}")

# Location of the raw dialogue CSV, relative to the notebook's working directory
CSV_FILE_PATH = os.path.join("..", "data", "The-Office-Lines-V4.csv")

# Read the CSV and preview it. On any failure a message is printed and `df`
# falls back to an empty DataFrame so the name is always defined afterwards.
try:
    df = pd.read_csv(CSV_FILE_PATH)
    print(f"CSV data loaded successfully from {CSV_FILE_PATH}")
    # Preview the first rows to verify the load
    print(df.head())
except Exception as e:
    if isinstance(e, FileNotFoundError):
        # A missing file gets its own, more actionable message
        print(
            f"Error: File not found at {CSV_FILE_PATH}. Make sure the path is correct and the file exists."
        )
    else:
        print(f"Error reading CSV file: {e}")
    df = pd.DataFrame()

Mistral API key loaded from .env
Test response from Mistral:
content="Hello! I'm functioning as intended, thank you. How about you? How are you doing today?" additional_kwargs={} response_metadata={'token_usage': {'prompt_tokens': 9, 'total_tokens': 31, 'completion_tokens': 22}, 'model_name': 'mistral-small-latest', 'model': 'mistral-small-latest', 'finish_reason': 'stop'} id='run-2936ae82-18de-4744-9939-44eb3fddb1cf-0' usage_metadata={'input_tokens': 9, 'output_tokens': 22, 'total_tokens': 31}
CSV data loaded successfully from ../data/The-Office-Lines-V4.csv
   season  episode  title  scene  speaker  \
0       1        1  Pilot      1  Michael   
1       1        1  Pilot      1      Jim   
2       1        1  Pilot      1  Michael   
3       1        1  Pilot      1      Jim   
4       1        1  Pilot      1  Michael   

                                                line Unnamed: 6  
0  All right Jim. Your quarterlies look very good...        NaN  
1         Oh, I told you. I cou

In [7]:
# Re-read the CSV, ignoring blank lines and skipping rows the parser rejects
df = pd.read_csv(CSV_FILE_PATH, skip_blank_lines=True, on_bad_lines="skip")

# The first preview showed a stray "Unnamed: 6" column; keep only the columns
# whose name does not start with "Unnamed".
unnamed_mask = df.columns.str.contains("^Unnamed")
df = df.loc[:, ~unnamed_mask]

print(df.head())

   season  episode  title  scene  speaker  \
0       1        1  Pilot      1  Michael   
1       1        1  Pilot      1      Jim   
2       1        1  Pilot      1  Michael   
3       1        1  Pilot      1      Jim   
4       1        1  Pilot      1  Michael   

                                                line  
0  All right Jim. Your quarterlies look very good...  
1         Oh, I told you. I couldn't close it. So...  
2  So you've come to the master for guidance? Is ...  
3         Actually, you called me in here, but yeah.  
4    All right. Well, let me show you how it's done.  


In [8]:
# Build one row per (season, episode, scene) whose `line` is every spoken line
# of that scene joined with single spaces. Only the `line` column is kept, so
# speaker names are not part of the combined text.
scene_keys = ["season", "episode", "scene"]
lines_by_scene = df.groupby(scene_keys)["line"]
grouped_dialogues = lines_by_scene.apply(" ".join).reset_index()

print(grouped_dialogues.head())

   season  episode  scene                                               line
0       1        1      1  All right Jim. Your quarterlies look very good...
1       1        1      2   Yes, I'd like to speak to your office manager...
2       1        1      3  I've, uh, I've been at Dunder Mifflin for 12 y...
3       1        1      4  People say I am the best boss. They go, "God w...
4       1        1      5   Shall I play for you? Pa rum pump um pum  I h...


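The code cell below is commented out because the vector database has already been created and saved to `../vector_databases/the_office_vector_db`. Uncomment and run it only if the database needs to be rebuilt.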

In [None]:
# NOTE: This cell is intentionally left commented out. Uncomment and run it only
# to rebuild the FAISS index saved under ../vector_databases/the_office_vector_db.
# It needs `grouped_dialogues` from the scene-grouping cell above.

# from langchain.vectorstores import FAISS  # LangChain's FAISS wrapper
# from langchain.schema import Document
# from langchain_huggingface import HuggingFaceEmbeddings
# import os

# # Prepare the data for FAISS
# # Convert grouped dialogues into a list of `Document` objects
# chunks = [
#     Document(
#         page_content=row["line"],  # The text to embed
#         metadata={
#             "season": row["season"],
#             "episode": row["episode"],
#             "scene": row["scene"],
#         },
#     )
#     for _, row in grouped_dialogues.iterrows()
# ]


# # Define the function to create and save the FAISS vector database
# def create_embedding_vector_db(
#     chunks, db_name, target_directory="../vector_databases"
# ):
#     """
#     Embed `chunks` with the open-source HuggingFaceEmbeddings model and store
#     the embeddings in a FAISS vector database, which allows for efficient
#     similarity search. The database is saved locally as
#     `{target_directory}/{db_name}_vector_db`.
#     """
#     # Instantiate embedding model
#     embedding = HuggingFaceEmbeddings(
#         model_name="sentence-transformers/all-mpnet-base-v2"
#     )
#     # Create the vector store
#     vectorstore = FAISS.from_documents(documents=chunks, embedding=embedding)
#     # Save vector database locally (create the target directory if needed)
#     os.makedirs(target_directory, exist_ok=True)
#     vectorstore.save_local(f"{target_directory}/{db_name}_vector_db")
#     print(f"Vector database saved at {target_directory}/{db_name}_vector_db")


# # Use the function to create and save the vector database
# create_embedding_vector_db(
#     chunks, db_name="the_office", target_directory="../vector_databases"
# )

Vector database saved at ../vector_databases/the_office_vector_db


In [9]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model handed to the vector store; the model name matches the one in
# the (commented-out) cell that built the database.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Load the FAISS vector database that was saved to disk earlier.
# NOTE(security): allow_dangerous_deserialization=True opts in to loading
# serialized data from disk; only use it on index files you created yourself
# or otherwise trust.
loaded_vectorstore = FAISS.load_local(
    folder_path="../vector_databases/the_office_vector_db",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

# Sanity check: fetch the five scenes most similar to a sample question
query = "What does Michael Scott say about leadership?"
results = loaded_vectorstore.similarity_search(query=query, k=5)

# Print each returned document (page content plus season/episode/scene metadata)
for result in results:
    print(result)

page_content='Michael, what if somebody asks you a question at this meeting? Are you just gonna wave, or what? I will have to answer. I'll ask you a question. Make it a softball. Something he can, like, crank out of the park. Michael Scott you run the most profitable branch of Dunder Mifflin. How do you do it? No, no. That's too hard. Say your name is Zamboni and then I will say, 'Well, we're sort of on thin ice.'  I won't say that. I'll something like that. This is your big day. Come on. Oh, my god. This is it.' metadata={'season': 6, 'episode': 11, 'scene': 4481}
page_content='You know, Michael? You want to succeed? You got to apply the same- ' metadata={'season': 5, 'episode': 22, 'scene': 3852}
page_content='Oh my god. He's Michael Scott.' metadata={'season': 9, 'episode': 16, 'scene': 7725}
page_content='Here's the thing. Michael is doing something right. And in this economic climate, no method of success can be ignored. It's not really time for executives to start getting judgmen

In [21]:
# Expose the loaded FAISS vector store through the retriever interface
retriever = loaded_vectorstore.as_retriever()

# Prompt sent to the LLM: `{context}` and `{question}` are the two variables
# declared for the template below.
prompt_template = """
You can be any character from the office who can be chatted with.
Use the following context to answer the question:
{context}

Question: {question}
Answer:
"""
prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Build the RetrievalQA chain around the Mistral `llm` and the FAISS retriever.
# Source documents are excluded, so only the answer is of interest in the output.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=False,  # Exclude source documents
    chain_type_kwargs={"prompt": prompt},
)

# Ask a sample question. `Chain.run` is deprecated in LangChain, so use
# `invoke` (as the LLM smoke test does): it takes the inputs as a dict and
# returns a dict whose "result" entry holds the answer text.
query = "Hey Michael, how can we increase sales?"
response = qa_chain.invoke({"query": query})["result"]

# Display the response wrapped to 80 columns (adjust width as needed);
# textwrap.fill is shorthand for "\n".join(textwrap.wrap(...)).
print(textwrap.fill(response, width=80))

Well, Pammy, first of all, you gotta believe in yourself. You're good at sales,
you just gotta stick to the script. Make the call, say the lines, make the sale.
That's the key. And remember, it's not just about the price. It's about the
relationship, the personal touch. You gotta put a little more face-to-face time
with your clients.  And hey, we're offering a $50 bonus tonight to the person
with the most sales. So, there's a little extra motivation for you. Just keep
pushing, keep smiling, and keep making those calls. You got this, Pammy! That's
what she said!
