# A prototype system capable of identifying movies from an IMDb dataset based on natural language queries.

### Step 1: Pre-proceesing the dataset

In [173]:
import pandas as pd

In [None]:
# Read the Dataset
df = pd.read_csv('imdb.csv')
df.head() # retrive top 5 rows

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [175]:
# Total number of columns and rows in database
df.shape

(1000, 16)

In [176]:
# Type of each column
print('Datatype of each column:')
print(df.dtypes)

Datatype of each column:
Poster_Link       object
Series_Title      object
Released_Year     object
Certificate       object
Runtime           object
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross             object
dtype: object


In [177]:
# Findinf missing values in dataset
missing_data = df.isnull().sum()
missing_data = missing_data[missing_data > 0]
print('Columns with missing data and their count:')
missing_data

Columns with missing data and their count:


Certificate    101
Meta_score     157
Gross          169
dtype: int64

In [178]:
# Filling missing values
df['Certificate'].fillna(df['Certificate'].mode()[0], inplace=True)
df['Meta_score'].fillna(df['Meta_score'].median(), inplace=True)
df.dropna(subset=['Gross'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Certificate'].fillna(df['Certificate'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Meta_score'].fillna(df['Meta_score'].median(), inplace=True)


In [179]:
# Re-check the dataset for missing value
missing_data = df.isnull().sum()
missing_data = missing_data[missing_data > 0]
print('Columns with missing data and their count:')
missing_data

Columns with missing data and their count:


Series([], dtype: int64)

In [180]:
# Checking for duplication
duplicated_row = df[df.duplicated()]
print("Duplicated row")
duplicated_row

Duplicated row


Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross


In [None]:
# Importing Libraries
import os
from typing import List
from dotenv import load_dotenv

from langchain.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import Document
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from openai import AzureOpenAI

# To load from .env file
load_dotenv()
os.environ['CURL_CA_BUNDLE'] = ''


True

## Step 2: Qdrant and OpenAI setup using Langchain

In [None]:
# Creating metadata and full content in list form to store in vector database
documents = []

for idx, row in df.iterrows():
    metadata = {
        "title": row["Series_Title"],
        "genre": row["Genre"],
        "summary": row["Overview"],
        "actors": f"{row['Star1']}, {row['Star2']}, {row['Star3']}, {row['Star4']}"
    }

    page_content = (
        f"Title: {row['Series_Title']}. "
        f"Genre: {row['Genre']}. "
        f"Directed by {row['Director']}. "
        f"Starring: {metadata['actors']}. "
        f"Certificate: {row['Certificate']}. "
        f"Runtime: {row['Runtime']}. "
        f"IMDb Rating: {row['IMDB_Rating']}. "
        f"Metascore: {row['Meta_score']}. "
        f"Overview: {row['Overview']}. "
        f"Released in {row['Released_Year']}. "
        f"Total Votes: {row['No_of_Votes']}. "
        f"Gross Income: {row['Gross']}. "
        f"Poster Link: {row['Poster_Link']}."
    )

    documents.append(Document(page_content=page_content, metadata=metadata))

In [183]:
# Printing the document structure
documents

[Document(metadata={'title': 'The Shawshank Redemption', 'genre': 'Drama', 'summary': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.', 'actors': 'Tim Robbins, Morgan Freeman, Bob Gunton, William Sadler'}, page_content='Title: The Shawshank Redemption. Genre: Drama. Directed by Frank Darabont. Starring: Tim Robbins, Morgan Freeman, Bob Gunton, William Sadler. Certificate: A. Runtime: 142 min. IMDb Rating: 9.3. Metascore: 80.0. Overview: Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.. Released in 1994. Total Votes: 2343110. Gross Income: 28,341,469. Poster Link: https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_UX67_CR0,0,67,98_AL_.jpg.'),
 Document(metadata={'title': 'The Godfather', 'genre': 'Crime, Drama', 'summary': "An organized crime dynasty's aging patriarch transfers 

## Step 3: Vectorize(Embeding) and store in Qdrant 

In [None]:
# Setting up embedding model
from langchain_community.embeddings import AzureOpenAIEmbeddings # using azure embedding model here

embedding_model = AzureOpenAIEmbeddings(
    deployment="text-embedding-3-large", 
    azure_endpoint="https://ai-mbevacloud375524212247.cognitiveservices.azure.com/",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    openai_api_version="2024-02-01",
    chunk_size=512
)


In [None]:
# Creating in-memory Qdrant instance. This is not persisted to disk, all data will be lost once the script exits 
client = QdrantClient(":memory:")

client.create_collection(
    collection_name="demo_collection", # collection name
    vectors_config=VectorParams(size=3072, distance=Distance.COSINE), # vector dimension = 3072, similarlity metric = Cosine
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name="demo_collection",
    embedding=embedding_model,
)

In [None]:
# Vectorize & store documents into Qdrant vector store
vector_store.add_documents(documents) # add the list of document to qdrant
print(f"Stored {len(documents)} documents in Qdrant.")


Stored 831 documents in Qdrant.


## Step 4: Create Retrieval QA Chain with Memory

In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory # for memory retension

retriever = vector_store.as_retriever(search_kwargs={"k": 5}) # extrect top 5 results using cosine similarity 
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True) # for chat continution

qa_chain = ConversationalRetrievalChain.from_llm( # main block for question, retrieved docs and converting vector back to Natural Language
    llm=llm,
    retriever=retriever,
    memory=memory,
    verbose=True
)


## Step 6 : Interactive Movie Query Example

In [None]:
# Ask question
query = "Suggest a romantic drama with high IMDb rating and known actors."
response = qa_chain.invoke({"question": query}) 
print("Response:\n", response["answer"])



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Title: The Notebook. Genre: Drama, Romance. Directed by Nick Cassavetes. Starring: Gena Rowlands, James Garner, Rachel McAdams, Ryan Gosling. Certificate: A. Runtime: 123 min. IMDb Rating: 7.8. Metascore: 53.0. Overview: A poor yet passionate young man falls in love with a rich young woman, giving her a sense of freedom, but they are soon separated because of their social differences.. Released in 2004. Total Votes: 520284. Gross Income: 81,001,787. Poster Link: https://m.media-amazon.com/images/M/MV5BMTk3OTM5Njg5M15BMl5BanBnXkFtZTYwMzA0ODI3._V1_UX67_CR0,0,67,98_AL_.jpg.

Title: Titanic. Genre: Drama, Romance. Directed by James Cameron. Starring: Leonardo DiCaprio, Kate W

## Step 7: Follow-Up Example

In [None]:
# Ask follow up question/ new
follow_up = "Was it released after 2010?"
follow_up_response = qa_chain.run(follow_up)
print("Follow-Up:\n", follow_up_response)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: Suggest a romantic drama with high IMDb rating and known actors.
Assistant: I recommend "Eternal Sunshine of the Spotless Mind." It has an IMDb rating of 8.3 and stars well-known actors Jim Carrey and Kate Winslet. The film explores the complexities of love and memory in a unique way.
Follow Up Input: Was it released after 2010?
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Title: Eternal Sunshine of the Spotless Mind. G

## Step 8: Testing and Evalution

In [199]:
# using set of diverse natural language queries
query = "Movies directed by Christopher Nolan with a complex plot."
response = qa_chain.invoke({"question": query}) 
print("Response:\n", response["answer"])
print("--------------------------------------------------------")
query = "I want a thriller movie with a surprising twist."
response = qa_chain.invoke({"question": query}) 
print("Response:\n", response["answer"])
print("--------------------------------------------------------")
query = "Sci-fi movies from the 1990s with aliens"
response = qa_chain.invoke({"question": query}) 
print("Response:\n", response["answer"])
print("--------------------------------------------------------")
query = "Which animated movies are great for kids under 10?"
response = qa_chain.invoke({"question": query}) 
print("Response:\n", response["answer"])
print("--------------------------------------------------------")



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Title: Inception. Genre: Action, Adventure, Sci-Fi. Directed by Christopher Nolan. Starring: Leonardo DiCaprio, Joseph Gordon-Levitt, Elliot Page, Ken Watanabe. Certificate: UA. Runtime: 148 min. IMDb Rating: 8.8. Metascore: 74.0. Overview: A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O.. Released in 2010. Total Votes: 2067042. Gross Income: 292,576,195. Poster Link: https://m.media-amazon.com/images/M/MV5BMjAxMzY3NjcxNF5BMl5BanBnXkFtZTcwNTI5OTM0Mw@@._V1_UX67_CR0,0,67,98_AL_.jpg.

Title: The Dark Knight Rises. Genre: Action, Adventure. Directed by Christopher Nolan. St

## Limitation Observed
### 1. Model may occasionally misinterpret ambiguous queries.
### 2. The quality of answer is highly dependend on the prompt used.