In [1]:
from dotenv import load_dotenv
import json
import pandas as pd
import pprint

import chromadb

from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.response.notebook_utils import display_response
from llama_index.core.schema import MetadataMode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore


## Import Data

In [2]:
# Load the movie data, keep only rows that have a "fullplot" value, and
# discard the "plot_embedding" column.
dataset_df = (
    pd.read_csv("moviedata.csv")
    .dropna(subset=["fullplot"])
    .drop(columns=["plot_embedding"])
)
dataset_df.head(3)

Unnamed: 0,plot,runtime,cast,directors,fullplot,awards,languages,countries,poster,rated,metacritic,num_mflix_comments,writers,type,title,imdb,genres
0,Young Pauline is left a lot of money when her ...,199.0,"['Pearl White', 'Crane Wilbur', 'Paul Panzer',...","['Louis J. Gasnier', 'Donald MacKenzie']",Young Pauline is left a lot of money when her ...,"{'nominations': 0, 'text': '1 win.', 'wins': 1}",['English'],['USA'],https://m.media-amazon.com/images/M/MV5BMzgxOD...,,,0,"['Charles W. Goddard (screenplay)', 'Basil Dic...",movie,The Perils of Pauline,"{'id': 4465, 'rating': 7.6, 'votes': 744}",['Action']
1,A penniless young man tries to save an heiress...,22.0,"['Harold Lloyd', 'Mildred Davis', ""'Snub' Poll...","['Alfred J. Goulding', 'Hal Roach']",As a penniless man worries about how he will m...,"{'nominations': 1, 'text': '1 nomination.', 'w...",['English'],['USA'],https://m.media-amazon.com/images/M/MV5BNzE1OW...,TV-G,,0,['H.M. Walker (titles)'],movie,From Hand to Mouth,"{'id': 10146, 'rating': 7.0, 'votes': 639}","['Comedy', 'Short', 'Action']"
2,"Michael ""Beau"" Geste leaves England in disgrac...",101.0,"['Ronald Colman', 'Neil Hamilton', 'Ralph Forb...",['Herbert Brenon'],"Michael ""Beau"" Geste leaves England in disgrac...","{'nominations': 0, 'text': '1 win.', 'wins': 1}",['English'],['USA'],,,,0,"['Herbert Brenon (adaptation)', 'John Russell ...",movie,Beau Geste,"{'id': 16634, 'rating': 6.9, 'votes': 222}","['Action', 'Adventure', 'Drama']"


### Use a smaller dataset for testing

In [3]:
# Keep only the last 10 rows so the rest of the notebook runs on a small sample.
dataset_df = dataset_df.iloc[-10:]

### end test

## Prepare for Indexing

In [4]:
def df_to_json(df):
    """Convert a DataFrame into a list of per-row dictionaries.

    The frame is serialised with ``DataFrame.to_json(orient="records")`` and
    parsed back with ``json.loads``, so every value is a JSON-compatible
    Python object (for example, missing values come back as ``None``).

    Args:
        df: The DataFrame to convert. It is not modified.

    Returns:
        A list with one dict per row of ``df``, keyed by column name
        (in this notebook each dict is a movie).
    """
    # Serialise the frame that was passed in. The previous version read the
    # notebook-level `dataset_df` here and silently ignored the `df` argument.
    documents_json = df.to_json(orient="records")
    # json.loads turns the "string of a list" into an actual list of dicts.
    documents_list = json.loads(documents_json)

    return documents_list

documents_list = df_to_json(df=dataset_df)

## LlamaIndex

In [5]:
# Embedding model: text-embedding-3-small, requested with dimensions=256,
# registered as the global default.
embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=256)
Settings.embed_model = embed_model

# LLM built with the OpenAI() defaults, registered as the global default.
llm = OpenAI()
Settings.llm = llm

### add metadata

In [6]:
# NOTE: list-valued fields such as "countries" end up in "metadata" as strings, not as actual lists
# NOTE: metadata values must be one of (str, int, float, None)
# use json.dumps() to convert lists and dictionaries into strings

def create_llama_docs(documents_list):
    """Build one LlamaIndex ``Document`` per movie dictionary.

    The movie's "fullplot" becomes the document text and the movie's fields
    become the document metadata. The fields named in ``json_encoded_keys``
    are passed through ``json.dumps`` first so that lists and dictionaries
    are stored as strings.

    Args:
        documents_list: List of movie dicts. Each dict must contain
            "fullplot" and every key in ``json_encoded_keys``. The dicts are
            not modified.

    Returns:
        A list of ``Document`` objects in the same order as
        ``documents_list``.

    Raises:
        KeyError: If a movie lacks "fullplot" or one of the encoded keys.
    """
    json_encoded_keys = (
        "writers",
        "languages",
        "genres",
        "cast",
        "directors",
        "countries",
        "imdb",
        "awards",
    )
    llama_documents = []

    for movie in documents_list:
        # Work on a shallow copy. The previous version overwrote the caller's
        # dicts in place, so calling this function twice on the same list
        # JSON-encoded the already-encoded strings a second time.
        metadata = dict(movie)
        for key in json_encoded_keys:
            metadata[key] = json.dumps(movie[key])

        # Document with the plot as text; the excluded_* lists name the
        # metadata keys left out of the text rendered for the LLM and for the
        # embedding model respectively.
        llama_document = Document(
            text=movie["fullplot"],
            metadata=metadata,
            excluded_llm_metadata_keys=["fullplot", "metacritic"],
            excluded_embed_metadata_keys=["fullplot", "metacritic", "num_mflix_comments", "runtime", "rated"],
            metadata_template="{key}=>{value}",
            text_template="Metadata: {metadata_str}\n-----\nContent: {content}"
        )

        llama_documents.append(llama_document)

    return llama_documents

llama_documents = create_llama_docs(documents_list=documents_list)

In [7]:
# Text of the first document as rendered for the LLM (MetadataMode.LLM):
print(llama_documents[0].get_content(metadata_mode=MetadataMode.LLM))

Metadata: plot=>Jang is a homicide detective who likes to use violence with criminals, while Oh is a prosecutor who believes in the importance of evidence. After the murder of his younger half-brother ...
runtime=>124.0
cast=>"['Sang-Woo Kwon', 'Ji-tae Yu', 'Kil-kang Ahn', 'Seok-hyeon Jo']"
directors=>"['Seong-soo Kim']"
awards=>"{'nominations': 1, 'text': '1 nomination.', 'wins': 0}"
languages=>"['Korean']"
countries=>"['South Korea']"
poster=>https://m.media-amazon.com/images/M/MV5BZTBlNDgzNTEtN2MzZC00MjI2LTljMDgtNzc1ZGRiNmMwZTc2XkEyXkFqcGdeQXVyMTMxMTY0OTQ@._V1_SY1000_SX677_AL_.jpg
rated=>None
num_mflix_comments=>0
writers=>"['Ji-hoon Han (screenplay)']"
type=>movie
title=>Running Wild
imdb=>"{'id': 485552, 'rating': 6.7, 'votes': 537}"
genres=>"['Action', 'Crime', 'Thriller']"
-----
Content: Jang is a homicide detective who likes to use violence with criminals, while Oh is a prosecutor who believes in the importance of evidence. After the murder of his younger half-brother Do-young 

In [8]:
# Text of the first document as rendered for the embedding model (MetadataMode.EMBED):
print(llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED))

Metadata: plot=>Jang is a homicide detective who likes to use violence with criminals, while Oh is a prosecutor who believes in the importance of evidence. After the murder of his younger half-brother ...
cast=>"['Sang-Woo Kwon', 'Ji-tae Yu', 'Kil-kang Ahn', 'Seok-hyeon Jo']"
directors=>"['Seong-soo Kim']"
awards=>"{'nominations': 1, 'text': '1 nomination.', 'wins': 0}"
languages=>"['Korean']"
countries=>"['South Korea']"
poster=>https://m.media-amazon.com/images/M/MV5BZTBlNDgzNTEtN2MzZC00MjI2LTljMDgtNzc1ZGRiNmMwZTc2XkEyXkFqcGdeQXVyMTMxMTY0OTQ@._V1_SY1000_SX677_AL_.jpg
writers=>"['Ji-hoon Han (screenplay)']"
type=>movie
title=>Running Wild
imdb=>"{'id': 485552, 'rating': 6.7, 'votes': 537}"
genres=>"['Action', 'Crime', 'Thriller']"
-----
Content: Jang is a homicide detective who likes to use violence with criminals, while Oh is a prosecutor who believes in the importance of evidence. After the murder of his younger half-brother Do-young and Jin-woo join forces to a gangster boss to jus

## Embedding

In [9]:
 def create_nodes(llama_documents):
     """Split documents into nodes and attach an embedding to each node.

     Args:
         llama_documents: Documents to split with a default
             ``SentenceSplitter``.

     Returns:
         The list of nodes produced by the splitter, each with its
         ``embedding`` attribute set from the notebook-level ``embed_model``.
     """
     parser = SentenceSplitter()
     nodes = parser.get_nodes_from_documents(llama_documents)

     # Embed the EMBED-mode text of every node through the batch API rather
     # than issuing one embedding request per node.
     texts = [node.get_content(metadata_mode=MetadataMode.EMBED) for node in nodes]
     embeddings = embed_model.get_text_embedding_batch(texts)
     for node, node_embedding in zip(nodes, embeddings):
         node.embedding = node_embedding

     return nodes

nodes = create_nodes(llama_documents=llama_documents)

## Chroma

In [10]:
# Chroma DB collection name
COLLECTION_NAME = "MOVIE_RAG"

db = chromadb.PersistentClient(path="chroma_db")
print(f"Looking for the {COLLECTION_NAME} collection in the database..." )

if any(COLLECTION_NAME == col.name for col in db.list_collections()):
    # Collection already exists: rebuild the index on top of the stored vectors.
    print(f"{COLLECTION_NAME} collection WAS FOUND in Chroma DB")
    # NOTE(review): from here on COLLECTION_NAME holds the collection object
    # rather than its name. Later cells call COLLECTION_NAME.count(), so the
    # rebinding is kept as-is.
    COLLECTION_NAME = db.get_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=COLLECTION_NAME)
    print("Restoring vector store index from the collection...")
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
        store_nodes_override=True,
    )
    # The freshly built `nodes` are not inserted on this path.
    print(f"record count: {COLLECTION_NAME.count()}")
else:
    # Collection is missing: create it and index the prepared nodes into it.
    print(f"{COLLECTION_NAME} collection was NOT FOUND in Chroma DB, creating...")
    chroma_collection = db.create_collection(COLLECTION_NAME)
    print("Creating vector store...")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    Settings.embed_model = embed_model
    Settings.llm = OpenAI()
    print("Creating vector store index")
    VectorStoreIndex(
        nodes=nodes,
        storage_context=storage_context,
        store_nodes_override=True,
    )
    print(f"record count: {chroma_collection.count()}")

Looking for the MOVIE_RAG collection in the database...
MOVIE_RAG collection WAS FOUND in Chroma DB
Restoring vector store index from the collection...
record count: 10


### retrieve from Chroma DB and Query

In [11]:

index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
     

In [14]:
# Question to ask (uncomment exactly one).
query = "What is the runtime of Day of the Dead?"
# query = "What is the runtime of Dirty Harry?"
# query = "What is the runtime of Raiders of the Lost Ark?"

# Query engine created with similarity_top_k=3.
query_engine = index.as_query_engine(similarity_top_k=3)
response = query_engine.query(query)
display_response(response)


**`Final Response:`** 86.0

In [None]:
# pprint.pprint(response.source_nodes)

In [None]:
# NOTE(review): `STOP` is not defined anywhere in the visible notebook, so
# running this cell raises NameError. Presumably a deliberate barrier so that
# "Run All" halts before the "Add data" section — confirm.
STOP

## Add data

### before a new record is added

In [15]:
# `chroma_collection` is only bound when the collection was created in this
# session; otherwise the collection object is held in `COLLECTION_NAME`.
try:
    print(f"new DB record count: {chroma_collection.count()}")
except NameError:
    # Catch only the "name is not defined" case. A bare `except:` would also
    # swallow real failures and KeyboardInterrupt.
    print(f"established DB record count: {COLLECTION_NAME.count()}")

established DB record count: 10


In [17]:
# Question to ask (uncomment exactly one).
# query = "What is the runtime of Day of the Dead?"
query = "What is the runtime of Dirty Harry?"
# query = "What is the runtime of Raiders of the Lost Ark?"

# Query engine created with similarity_top_k=3.
query_engine = index.as_query_engine(similarity_top_k=3)
response = query_engine.query(query)
display_response(response)

**`Final Response:`** The runtime of "Dirty Harry" is not provided in the given context information.

### add record

In [35]:
# Reload the full movie data, keep only rows that have a "fullplot" value,
# and discard the "plot_embedding" column.
dataset_df = (
    pd.read_csv("moviedata.csv")
    .dropna(subset=["fullplot"])
    .drop(columns=["plot_embedding"])
)

In [36]:
# Keep only the rows for the one movie that will be added to the index.
# Alternative title to add instead: "Dirty Harry"
dataset_df = dataset_df.loc[dataset_df["title"].eq("Raiders of the Lost Ark")]

In [37]:
documents_list = df_to_json(df=dataset_df)

In [38]:
llama_documents = create_llama_docs(documents_list=documents_list)

In [39]:
nodes = create_nodes(llama_documents=llama_documents)

In [40]:
index.insert_nodes(nodes=nodes)

In [41]:
# `chroma_collection` is only bound when the collection was created in this
# session; otherwise the collection object is held in `COLLECTION_NAME`.
try:
    print(f"new DB record count: {chroma_collection.count()}")
except NameError:
    # Catch only the "name is not defined" case. A bare `except:` would also
    # swallow real failures and KeyboardInterrupt.
    print(f"established DB record count: {COLLECTION_NAME.count()}")

established DB record count: 13


In [42]:
# Question to ask (uncomment exactly one).
# query = "What is the runtime of Day of the Dead?"
# query = "What is the runtime of Dirty Harry?"
query = "What is the runtime of Raiders of the Lost Ark?"

# Query engine created with similarity_top_k=3.
query_engine = index.as_query_engine(similarity_top_k=3)
response = query_engine.query(query)
display_response(response)

**`Final Response:`** The runtime of Raiders of the Lost Ark is 115.0 minutes.

In [43]:
# NOTE(review): relies on COLLECTION_NAME having been rebound to the
# collection object (the "WAS FOUND" branch of the Chroma cell). If it is
# still the name string, this call fails.
COLLECTION_NAME.count()

13

In [44]:
# COLLECTION_NAME.peek()

{'ids': ['059ecf4a-c2ce-4bec-ad00-4a3f98342b98',
  '09e37f22-faef-42c8-9e2c-94386a3fa5bb',
  '26cf5bcb-a27b-4405-a38e-b2a0c8283b10',
  '2972f9a5-1583-4476-a4a5-a6dfa6aa1f91',
  '3519d5ec-0cb4-4059-9cbe-eee26207dfe0',
  '433d36b3-6987-48a5-a5f3-0e04f21b2f4f',
  '58b6059c-ea94-4532-89e3-7938e9f0059a',
  '783d16e1-6c89-4f6a-9f7e-874303f88f2d',
  '7e1ffacd-5098-4a2b-9db3-c6210f9a5117',
  'a994fbff-3cd5-47a7-8a70-b761881df679'],
 'embeddings': [[-0.05807780101895332,
   0.17979641258716583,
   -0.012287293560802937,
   -0.07037205249071121,
   -0.016855914145708084,
   -0.02785676158964634,
   0.014992306008934975,
   0.0707058310508728,
   0.0009839569684118032,
   -0.041917264461517334,
   0.005955895408987999,
   -0.033072080463171005,
   -0.12349877506494522,
   0.04706304520368576,
   -0.0018236235482618213,
   0.024435512721538544,
   -0.037188708782196045,
   -0.020708296447992325,
   -0.03437938913702965,
   0.048231277614831924,
   -0.06197190657258034,
   0.0967407077550888,
   0.

In [45]:
# Count the ids pasted from the peek() output shown above.
# NOTE(review): this gives 10 while count() reports 13 — presumably peek()
# returns only a limited number of records; confirm.
len(['059ecf4a-c2ce-4bec-ad00-4a3f98342b98',
  '09e37f22-faef-42c8-9e2c-94386a3fa5bb',
  '26cf5bcb-a27b-4405-a38e-b2a0c8283b10',
  '2972f9a5-1583-4476-a4a5-a6dfa6aa1f91',
  '3519d5ec-0cb4-4059-9cbe-eee26207dfe0',
  '433d36b3-6987-48a5-a5f3-0e04f21b2f4f',
  '58b6059c-ea94-4532-89e3-7938e9f0059a',
  '783d16e1-6c89-4f6a-9f7e-874303f88f2d',
  '7e1ffacd-5098-4a2b-9db3-c6210f9a5117',
  'a994fbff-3cd5-47a7-8a70-b761881df679'])

10