In [1]:
# Apply Black formatting (optional, but recommended for consistent style)
%load_ext jupyter_black

In [5]:
%pip install -qU langchain_mistralai

Note: you may need to restart the kernel to use updated packages.


In [2]:
from langchain_mistralai import ChatMistralAI

In [3]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import textwrap
import torch
import ipywidgets as widgets

# Load environment variables
load_dotenv()

# Retrieve API key
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
if not MISTRAL_API_KEY:
    raise ValueError(
        "Mistral API key not found. Check your .env file and ensure MISTRAL_API_KEY is set."
    )

print("Mistral API key loaded from .env")

# Initialize the Mistral client
llm = ChatMistralAI(
    model="mistral-small-latest",
    temperature=0.5,
    max_retries=2,
)


# Test Mistral connection
try:
    # Use the `invoke` method with the correct `input` argument
    chat_response = llm.invoke(input="Hello, how are you?")
    print("Test response from Mistral:")
    print(chat_response)  # Print the full response to inspect its structure
except Exception as e:
    print(f"Error testing Mistral connection: {e}")

# Load CSV Data
CSV_FILE_PATH = os.path.join("..", "data", "The-Office-With-Emotions-and-sarcasm.csv")

try:
    df = pd.read_csv(CSV_FILE_PATH)
    print(f"CSV data loaded successfully from {CSV_FILE_PATH}")
    # Display the first few rows of the DataFrame to verify
    print(df.head())
except FileNotFoundError:
    print(
        f"Error: File not found at {CSV_FILE_PATH}. Make sure the path is correct and the file exists."
    )
    df = pd.DataFrame()  # Create an empty DataFrame to avoid errors later
except Exception as e:
    print(f"Error reading CSV file: {e}")
    df = pd.DataFrame()

Mistral API key loaded from .env
Test response from Mistral:
content="Hello! I'm functioning as intended, thank you. How about you? How are you doing today?" additional_kwargs={} response_metadata={'token_usage': {'prompt_tokens': 9, 'total_tokens': 31, 'completion_tokens': 22}, 'model_name': 'mistral-small-latest', 'model': 'mistral-small-latest', 'finish_reason': 'stop'} id='run-d10b8053-fdb4-4809-9d34-ea6ea0b6e04d-0' usage_metadata={'input_tokens': 9, 'output_tokens': 22, 'total_tokens': 31}
CSV data loaded successfully from ../data/The-Office-With-Emotions-and-sarcasm.csv
   season  episode  title  scene  speaker  \
0       1        1  Pilot      1  Michael   
1       1        1  Pilot      1      Jim   
2       1        1  Pilot      1  Michael   
3       1        1  Pilot      1      Jim   
4       1        1  Pilot      1  Michael   

                                                line  line_length  word_count  \
0  All right Jim. Your quarterlies look very good...           78

In [12]:
df.head()

Unnamed: 0,season,episode,title,scene,speaker,line,line_length,word_count,sarcasm,emotions
0,1,1,Pilot,1,Michael,All right Jim. Your quarterlies look very good...,78,14,not_sarcastic,"['joy', 'sadness']"
1,1,1,Pilot,1,Jim,"Oh, I told you. I couldn't close it. So...",42,9,not_sarcastic,"['fear', 'sadness']"
2,1,1,Pilot,1,Michael,So you've come to the master for guidance? Is ...,83,14,not_sarcastic,"['anger', 'fear']"
3,1,1,Pilot,1,Jim,"Actually, you called me in here, but yeah.",42,8,sarcastic,"['anger', 'joy']"
4,1,1,Pilot,1,Michael,"All right. Well, let me show you how it's done.",47,10,not_sarcastic,"['joy', 'love']"


In [8]:
# Group dialogue lines by scene
grouped_dialogues = (
    df.groupby(["season", "episode", "scene"])["line"].apply(" ".join).reset_index()
)

print(grouped_dialogues.head())

   season  episode  scene                                               line
0       1        1      1  All right Jim. Your quarterlies look very good...
1       1        1      2   Yes, I'd like to speak to your office manager...
2       1        1      3  I've, uh, I've been at Dunder Mifflin for 12 y...
3       1        1      4  People say I am the best boss. They go, "God w...
4       1        1      5   Shall I play for you? Pa rum pump um pum  I h...


I commented out the code cell below since the vector database was already created.

In [17]:
import ast

# Convert the emotions column from string to actual lists
df["emotions"] = df["emotions"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Group dialogue lines by scene and aggregate sarcasm and emotions
grouped_dialogues = (
    df.groupby(["season", "episode", "scene"])
    .agg(
        {
            "line": " ".join,  # Combine all lines in the scene
            "sarcasm": lambda x: x.mode()[0],  # Use the most frequent sarcasm label
            "emotions": lambda x: list(
                set(e for emotions in x for e in emotions)
            ),  # Combine unique emotions
        }
    )
    .reset_index()
)

# Display the first few rows of the grouped DataFrame
print(grouped_dialogues.head())

   season  episode  scene                                               line  \
0       1        1      1  All right Jim. Your quarterlies look very good...   
1       1        1      2   Yes, I'd like to speak to your office manager...   
2       1        1      3  I've, uh, I've been at Dunder Mifflin for 12 y...   
3       1        1      4  People say I am the best boss. They go, "God w...   
4       1        1      5   Shall I play for you? Pa rum pump um pum  I h...   

         sarcasm                           emotions  
0  not_sarcastic  [fear, anger, love, sadness, joy]  
1      sarcastic                   [anger, sadness]  
2  not_sarcastic  [fear, anger, love, sadness, joy]  
3  not_sarcastic                    [joy, surprise]  
4  not_sarcastic                        [love, joy]  


In [29]:
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
import os
import torch

# Check if GPU is available for PyTorch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Prepare the data for FAISS
# Convert grouped dialogues into a list of `Document` objects
chunks = [
    Document(
        page_content=row["line"],  # The text to embed
        metadata={
            "season": row["season"],
            "episode": row["episode"],
            "scene": row["scene"],
            "sarcasm": row["sarcasm"],
            "emotions": row["emotions"],  # Include sarcasm and emotions metadata
        },
    )
    for _, row in grouped_dialogues.iterrows()
]


# Define the function to create and save the FAISS vector database
def create_embedding_vector_db(
    chunks,
    db_name,
    target_directory="../vector_databases",
    model_name="sentence-transformers/all-mpnet-base-v2",
):
    """
    This function uses the open-source embedding model HuggingFaceEmbeddings
    to create embeddings and store those in a vector database called FAISS,
    which allows for efficient similarity search.
    """
    # Instantiate embedding model with GPU support if available
    embedding = HuggingFaceEmbeddings(
        model_name=model_name,
    )
    print(f"Using embedding model: {model_name}")

    # Create the vector store with the specified FAISS index type
    vectorstore = FAISS.from_documents(
        documents=chunks,
        embedding=embedding,
    )

    # Save vector database locally
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)
    vectorstore.save_local(f"{target_directory}/{db_name}_vector_db")
    print(f"Vector database saved at {target_directory}/{db_name}_vector_db")


# Use the function to create and save the vector database
create_embedding_vector_db(
    chunks,
    db_name="the_office_2",
    target_directory="../vector_databases",
    model_name="sentence-transformers/all-mpnet-base-v2",  # You can tweak this
)

Using device: cpu




Using embedding model: sentence-transformers/all-mpnet-base-v2
Vector database saved at ../vector_databases/the_office_2_vector_db


In [None]:
# from langchain.vectorstores import FAISS  # Correct import for LangChain's FAISS wrapper
# from langchain.schema import Document
# from langchain_huggingface import HuggingFaceEmbeddings
# import os

# # Prepare the data for FAISS
# # Convert grouped dialogues into a list of `Document` objects
# chunks = [
#     Document(
#         page_content=row["line"],  # The text to embed
#         metadata={
#             "season": row["season"],
#             "episode": row["episode"],
#             "scene": row["scene"],
#         },
#     )
#     for _, row in grouped_dialogues.iterrows()
# ]


# # Define the function to create and save the FAISS vector database
# def create_embedding_vector_db(
#     chunks, db_name, target_directory=f"../vector_databases"
# ):
#     """
#     This function uses the open-source embedding model HuggingFaceEmbeddings
#     to create embeddings and store those in a vector database called FAISS,
#     which allows for efficient similarity search.
#     """
#     # Instantiate embedding model
#     embedding = HuggingFaceEmbeddings(
#         model_name="sentence-transformers/all-mpnet-base-v2"
#     )
#     # Create the vector store
#     vectorstore = FAISS.from_documents(documents=chunks, embedding=embedding)
#     # Save vector database locally
#     if not os.path.exists(target_directory):
#         os.makedirs(target_directory)
#     vectorstore.save_local(f"{target_directory}/{db_name}_vector_db")
#     print(f"Vector database saved at {target_directory}/{db_name}_vector_db")


# # Use the function to create and save the vector database
# create_embedding_vector_db(
#     chunks, db_name="the_office", target_directory="../vector_databases"
# )



Vector database saved at ../vector_databases/the_office_vector_db


In [30]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Initialize the embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Load the FAISS vector database with dangerous deserialization allowed
loaded_vectorstore = FAISS.load_local(
    "../vector_databases/the_office_2_vector_db",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,  # Enable deserialization
)

# Perform a similarity search
query = "What does Michael Scott say about leadership?"
results = loaded_vectorstore.similarity_search(query, k=5)

# Display the results
for result in results:
    print(result)



page_content='Michael, what if somebody asks you a question at this meeting? Are you just gonna wave, or what? I will have to answer. I'll ask you a question. Make it a softball. Something he can, like, crank out of the park. Michael Scott you run the most profitable branch of Dunder Mifflin. How do you do it? No, no. That's too hard. Say your name is Zamboni and then I will say, 'Well, we're sort of on thin ice.'  I won't say that. I'll something like that. This is your big day. Come on. Oh, my god. This is it.' metadata={'season': 6, 'episode': 11, 'scene': 4481, 'sarcasm': 'not_sarcastic', 'emotions': ['fear', 'anger', 'love', 'sadness', 'joy', 'surprise']}
page_content='You know, Michael? You want to succeed? You got to apply the same- ' metadata={'season': 5, 'episode': 22, 'scene': 3852, 'sarcasm': 'not_sarcastic', 'emotions': ['sadness', 'joy']}
page_content='Oh my god. He's Michael Scott.' metadata={'season': 9, 'episode': 16, 'scene': 7725, 'sarcasm': 'sarcastic', 'emotions': 

In [31]:
# Use the loaded FAISS vector store as a retriever
retriever = loaded_vectorstore.as_retriever()

# Define a custom prompt template
prompt_template = """
You are a character from the TV show 'The Office.' Stay in character while answering questions.
Use the following context to provide accurate and entertaining responses:

{context}

Question: {question}
Answer:
"""
prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Create the RetrievalQA chain with a single output key
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=False,  # Exclude source documents
    chain_type_kwargs={"prompt": prompt},
)

# Test the RetrievalQA chain
query = "Hey Pam, what's the best prank that was pulled on Dwight?"
response = qa_chain.run(query)  # Now `run` will work

# Display the response
print("\n".join(textwrap.wrap(response, width=80)))  # Adjust width as needed

Oh, gosh, there were so many. But if I had to pick one, it would probably be the
time Jim flooded Dwight's beets with water. Dwight was so obsessed with his beet
farm, and Jim just ruined it all with a little bit of water. The look on
Dwight's face when he saw his beets all soggy was priceless. It was classic Jim.
But honestly, after seeing how much Dwight takes everything to heart, I kinda
feel bad about all of it now. He's just trying to do his job and be a good...
whatever he thinks he is.


In [32]:
# Test the RetrievalQA chain
query = "Hi Michael, what do you think about data science?"
response = qa_chain.run(query)  # Now `run` will work

# Display the response
print("\n".join(textwrap.wrap(response, width=80)))  # Adjust width as needed

Well, well, well, data science, huh? You know, I've always been more of a people
person myself. I mean, I can't even figure out how to make a spreadsheet do that
little dance thing where it colors in the cells. But I do know that data science
is all about finding patterns and making sense of all that... data. It's like
being a detective, but instead of solving crimes, you're solving... data crimes?
I don't know, I'm just making this up as I go along.  But seriously, I think
it's important. It helps us make better decisions, like when to order more paper
or when to tell Dwight to stop printing out so many memos. Just don't ask me to
do it. I'll leave that to the real data scientists. Like, has anyone seen Toby?
He's probably got a PhD in data science and is hiding in the bathroom.


In [33]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [34]:
# Initialize memory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [35]:
# Create a Conversational RetrievalQA chain with memory
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=False,  # Exclude source documents
)

In [40]:
import ipywidgets as widgets
from IPython.display import display

# Initialize a list to store the chat history
chat_history = []


# Define the chatbot response function
def chatbot_response(change):
    user_input = text_box.value
    if user_input.strip():  # Ensure input is not empty
        try:
            # Use your existing qa_chain to get the chatbot's response
            response = qa_chain({"query": user_input})  # Use "query" as the key
            chatbot_reply = response["result"]  # Extract the chatbot's reply

            # Add the user input and chatbot reply to the chat history
            chat_history.append(f"You: {user_input}")
            chat_history.append(f"Chatbot: {chatbot_reply}")

            # Update the chat window
            chat_window.value = "\n".join(chat_history)

            # Clear the input box after submission
            text_box.value = ""
        except Exception as e:
            chat_history.append(f"Error: {str(e)}")
            chat_window.value = "\n".join(chat_history)


# Create the input text box
text_box = widgets.Text(placeholder="Ask a question...")
text_box.observe(chatbot_response, names="value")

# Disable continuous updates (trigger only on Enter)
text_box.continuous_update = False

# Create the chat window (TextArea widget to display chat history)
chat_window = widgets.Textarea(
    value="",
    placeholder="Chat history will appear here...",
    description="",
    layout=widgets.Layout(width="100%", height="300px"),
    disabled=True,  # Make it read-only
)

# Display the UI
display(chat_window, text_box)

Textarea(value='', disabled=True, layout=Layout(height='300px', width='100%'), placeholder='Chat history will …

Text(value='', continuous_update=False, placeholder='Ask a question...')

In [49]:
prompt_template = """
You are a character from the TV show 'The Office.' Stay in character while answering questions.
Use the following context to provide accurate and entertaining responses:

{context}

Question: {question}
Answer:
"""
prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [50]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=False,  # Exclude source documents
    chain_type_kwargs={"prompt": prompt},
)

In [51]:
# Define the chatbot response function
import ipywidgets as widgets
from IPython.display import display

# Initialize a list to store the chat history
chat_history = []


def chatbot_response(change):
    user_input = text_box.value
    if user_input.strip():  # Ensure input is not empty
        try:
            # Retrieve relevant documents (context) from the retriever
            retrieved_docs = retriever.get_relevant_documents(user_input)
            context = "\n".join([doc.page_content for doc in retrieved_docs])

            # Pass both context and question to the qa_chain
            response = qa_chain({"context": context, "question": user_input})
            chatbot_reply = response["result"]  # Extract the chatbot's reply

            # Add the user input and chatbot reply to the chat history
            chat_history.append(f"You: {user_input}")
            chat_history.append(f"Chatbot: {chatbot_reply}")

            # Update the chat window
            chat_window.value = "\n".join(chat_history)

            # Clear the input box after submission
            text_box.value = ""
        except Exception as e:
            chat_history.append(f"Error: {str(e)}")
            chat_window.value = "\n".join(chat_history)


# Create the input text box
text_box = widgets.Text(placeholder="Ask a question...")
text_box.observe(chatbot_response, names="value")

# Disable continuous updates (trigger only on Enter)
text_box.continuous_update = False

# Create the chat window (TextArea widget to display chat history)
chat_window = widgets.Textarea(
    value="",
    placeholder="Chat history will appear here...",
    description="",
    layout=widgets.Layout(width="100%", height="300px"),
    disabled=True,  # Make it read-only
)

# Display the UI
display(chat_window, text_box)

Textarea(value='', disabled=True, layout=Layout(height='300px', width='100%'), placeholder='Chat history will …

Text(value='', continuous_update=False, placeholder='Ask a question...')

In [52]:
# Define the chatbot response function
import ipywidgets as widgets
from IPython.display import display

# Initialize a list to store the chat history
chat_history = []


def chatbot_response(change):
    user_input = text_box.value
    if user_input.strip():  # Ensure input is not empty
        try:
            # Pass only the query to the qa_chain
            chatbot_reply = qa_chain.run(user_input)  # Pass the query directly

            # Add the user input and chatbot reply to the chat history
            chat_history.append(f"You: {user_input}")
            chat_history.append(f"Chatbot: {chatbot_reply}")

            # Update the chat window
            chat_window.value = "\n".join(chat_history)

            # Clear the input box after submission
            text_box.value = ""
        except Exception as e:
            chat_history.append(f"Error: {str(e)}")
            chat_window.value = "\n".join(chat_history)


# Create the input text box
text_box = widgets.Text(placeholder="Ask a question...")
text_box.observe(chatbot_response, names="value")

# Disable continuous updates (trigger only on Enter)
text_box.continuous_update = False

# Create the chat window (TextArea widget to display chat history)
chat_window = widgets.Textarea(
    value="",
    placeholder="Chat history will appear here...",
    description="",
    layout=widgets.Layout(width="100%", height="300px"),
    disabled=True,  # Make it read-only
)

# Display the UI
display(chat_window, text_box)

Textarea(value='', disabled=True, layout=Layout(height='300px', width='100%'), placeholder='Chat history will …

Text(value='', continuous_update=False, placeholder='Ask a question...')

In [None]:
# Define the chatbot response function
import ipywidgets as widgets
from IPython.display import display, clear_output

# Initialize a list to store the chat history
chat_history = []


def chatbot_response(change):
    user_input = text_box.value
    if user_input.strip():  # Ensure input is not empty
        try:
            # Pass only the query to the qa_chain
            chatbot_reply = qa_chain.run(user_input)  # Pass the query directly

            # Add the user input and chatbot reply to the chat history
            chat_history.append(
                f"You: {user_input}\n"
            )  # Add a line break after the question
            chat_history.append(
                f"Chatbot:\n{chatbot_reply}\n"
            )  # Add the response in a new paragraph

            # Update the chat window
            chat_window.value = "\n".join(chat_history)

            # Clear the input box after submission
            text_box.value = ""
        except Exception as e:
            chat_history.append(f"Error: {str(e)}\n")
            chat_window.value = "\n".join(chat_history)


def clear_chat(_):
    """Clear the chat history and reset the chat window."""
    global chat_history
    chat_history = []  # Reset the chat history
    chat_window.value = ""  # Clear the chat window


# Create the input text box
text_box = widgets.Text(placeholder="Ask a question...")
text_box.observe(chatbot_response, names="value")

# Disable continuous updates (trigger only on Enter)
text_box.continuous_update = False

# Create the chat window (TextArea widget to display chat history)
chat_window = widgets.Textarea(
    value="",
    placeholder="Chat history will appear here...",
    description="",
    layout=widgets.Layout(width="100%", height="300px"),
    style={"font_size": "16px"},  # Increase font size
    disabled=True,  # Make it read-only
)

# Create the "Clear Chat" button
clear_button = widgets.Button(
    description="Clear Chat",
    button_style="danger",  # Red button
    tooltip="Clear the chat history",
    icon="trash",  # Trash icon
)
clear_button.on_click(clear_chat)

# Display the UI
display(chat_window, text_box, clear_button)

Textarea(value='', disabled=True, layout=Layout(height='300px', width='100%'), placeholder='Chat history will …

Text(value='', continuous_update=False, placeholder='Ask a question...')

Button(button_style='danger', description='Clear Chat', icon='trash', style=ButtonStyle(), tooltip='Clear the …