In [1]:
import pandas as pd
import ast

In [2]:
def parse_column(value):
    """Convert a CSV cell holding a Python-style list literal into a real list.

    Meant to be passed to ``pandas.read_csv`` as a converter. Only strings
    whose first non-whitespace character is ``[`` are parsed; every other
    value (plain text, empty cells, non-strings) is returned unchanged.

    Args:
        value: The raw cell value.

    Returns:
        The evaluated literal when parsing succeeds, otherwise ``value``
        itself, untouched.
    """
    if not isinstance(value, str):
        return value

    candidate = value.strip()
    if not candidate.startswith("["):
        return value

    try:
        return ast.literal_eval(candidate)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval may raise any of these on malformed input; one bad cell
        # should fall back to its raw text instead of aborting the whole load.
        return value


In [3]:
# Read the cleaned dataset into a DataFrame. Every list-valued column is routed
# through parse_column so that list literals are decoded into Python lists.
df_loaded = pd.read_csv(
    "./../cleaned_database/cleaned_final_dataset.csv",
    converters=dict.fromkeys(
        [
            "writers",
            "directors",
            "stars",
            "countries_origin",
            "filming_locations",
            "production_companies",
            "awards_content",
            "genres",
            "languages",
        ],
        parse_column,
    ),
)


In [4]:
# Re-key the frame by its "id" column: {id: {column_name: value, ...}}.
movie_data_dict= df_loaded.set_index("id").to_dict(orient="index")

In [5]:
# Preview the first five records (iterating a dict yields its keys).
first_5_keys = list(movie_data_dict)[:5]
for key in first_5_keys:
    print(
        f"Movie ID: {key}, Data: {movie_data_dict[key]}"
    )


Movie ID: tt0073195, Data: {'title': 'Jaws', 'year': 1975, 'duration': 124, 'MPA': 'PG', 'rating': 8.1, 'votes': '690K', 'meta_score': 87.0, 'description': "When a massive killer shark unleashes chaos on a beach community off Long Island, it's up to a local sheriff, a marine biologist, and an old seafarer to hunt the beast down.", 'Movie_Link': 'https://www.imdb.com/title/tt0073195', 'writers': ['Peter Benchley', 'Carl Gottlieb'], 'directors': ['Steven Spielberg'], 'stars': ['Roy Scheider', 'Robert Shaw', 'Richard Dreyfuss', 'Lorraine Gary', 'Murray Hamilton', 'Carl Gottlieb', 'Jeffrey Kramer', 'Susan Backlinie', 'Jonathan Filley', 'Ted Grossman'], 'budget': '$7,000,000 (estimated)', 'opening_weekend_gross': '$7,061,513', 'gross_worldwide': '$477,916,625', 'gross_us_canada': '$267,263,625', 'release_date': 1975.0, 'countries_origin': ['United States'], 'filming_locations': ["Water Street, Edgartown, Martha's Vineyard, Massachusetts, USA"], 'production_companies': ['Zanuck/Brown Product

In [6]:
pip install chromadb

Note: you may need to restart the kernel to use updated packages.


In [7]:
import chromadb

In [8]:
# Open a persistent Chroma client whose storage path is ./../chromadb_client
# (resolved relative to the notebook's working directory).
chroma_client = chromadb.PersistentClient(path="./../chromadb_client")

In [9]:
# get_or_create_collection keeps this cell re-runnable: create_collection fails
# once "best_movies_database" already exists in the persistent store.
collection = chroma_client.get_or_create_collection(name="best_movies_database")

In [13]:
# Destructive reset: removes the "best_movies_database" collection via the client.
# NOTE(review): this cell's execution count (In [13]) is out of notebook order. If the
# notebook is run top-to-bottom, the `collection` handle created just above presumably
# no longer refers to a live collection when the insert cells run — confirm whether
# this cell is meant to be executed only on demand.
chroma_client.delete_collection(name="best_movies_database")

In [10]:
def _join_field(value, separator=" , "):
    """Render one list-valued movie field as a single separator-joined string."""
    if value is None or (isinstance(value, float) and value != value):
        # Missing key or NaN cell: there is nothing to join.
        return ""
    if isinstance(value, str):
        # Cells that were not parsed into a list stay whole; str.join on a
        # string would splice the separator between its individual characters.
        return value
    if isinstance(value, (list, tuple)):
        return separator.join(str(item) for item in value)
    return str(value)


def insert_movies_to_chroma(movie_data_dict, batch_size=30000, target_collection=None):
    """Add every movie in ``movie_data_dict`` to a Chroma collection in batches.

    Each movie becomes one document: title, description and the joined
    list-valued fields separated by " | ". The full record is stored as
    metadata, with list-valued fields flattened to " , "-joined strings.

    Args:
        movie_data_dict: Mapping of movie id -> record dict. Every record must
            contain "title" and "description"; other keys are optional.
        batch_size: Maximum number of movies passed to a single ``add`` call.
        target_collection: Object exposing ``add(documents=, metadatas=, ids=)``.
            Defaults to the notebook-level ``collection``.

    Returns:
        None. Progress is printed after every batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a record has no "title" or "description".
    """
    if batch_size < 1:
        # A negative size would make range() empty and silently insert nothing.
        raise ValueError("batch_size must be a positive integer")
    if target_collection is None:
        target_collection = collection  # notebook-level handle

    separator = " , "
    movie_items = list(movie_data_dict.items())
    total_movies = len(movie_items)

    for i in range(0, total_movies, batch_size):
        batch = movie_items[i:i + batch_size]

        documents = []
        metadatas = []
        ids = []

        for movie_id, movie_data in batch:
            joint_stars_string = _join_field(movie_data.get("stars"), separator)
            joint_genres_string = _join_field(movie_data.get("genres"), separator)
            joint_productions_string = _join_field(movie_data.get("production_companies"), separator)
            joint_filming_locations_string = _join_field(movie_data.get("filming_locations"), separator)
            joint_countries_origin_string = _join_field(movie_data.get("countries_origin"), separator)
            joint_directors_string = _join_field(movie_data.get("directors"), separator)
            joint_languages_string = _join_field(movie_data.get("languages"), separator)

            document = f"{movie_data['title']} | {movie_data['description']} | {joint_stars_string} | {joint_genres_string} | {joint_productions_string} | {joint_filming_locations_string} | {joint_countries_origin_string} | {joint_directors_string} | {joint_languages_string}"
            metadata = {
                "title": movie_data.get("title"),
                "year": movie_data.get("year"),
                "duration": movie_data.get("duration"),
                "MPA": movie_data.get("MPA"),
                "rating": movie_data.get("rating"),
                "votes": movie_data.get("votes"),
                "meta_score": movie_data.get("meta_score"),
                "description": movie_data.get("description"),
                "Movie_Link": movie_data.get("Movie_Link"),
                "writers": _join_field(movie_data.get("writers"), separator),
                "directors": joint_directors_string,
                "stars": joint_stars_string,
                "budget": movie_data.get("budget"),
                "opening_weekend_gross": movie_data.get("opening_weekend_gross"),
                "gross_worldwide": movie_data.get("gross_worldwide"),
                "gross_us_canada": movie_data.get("gross_us_canada"),
                "release_date": movie_data.get("release_date"),
                "countries_origin": joint_countries_origin_string,
                "filming_locations": joint_filming_locations_string,
                "production_companies": joint_productions_string,
                "awards_content": _join_field(movie_data.get("awards_content"), separator),
                "genres": joint_genres_string,
                "languages": joint_languages_string
            }
            documents.append(document)
            metadatas.append(metadata)
            ids.append(str(movie_id))

        target_collection.add(documents=documents, metadatas=metadatas, ids=ids)

        print(f"Inserted batch {i // batch_size + 1} ({len(batch)} movies) into Chroma.")

    print(f"Completed: Inserted {total_movies} movies into Chroma in batches of {batch_size}.")


In [11]:
# Show the full record stored under a single movie id.
print(movie_data_dict['tt0073486'])

{'title': "One Flew Over the Cuckoo's Nest", 'year': 1975, 'duration': 133, 'MPA': 'R', 'rating': 8.7, 'votes': '1.1M', 'meta_score': 84.0, 'description': 'In the Fall of 1963, a Korean War veteran and criminal pleads insanity and is admitted to a mental institution, where he rallies up the scared patients against the tyrannical nurse.', 'Movie_Link': 'https://www.imdb.com/title/tt0073486', 'writers': ['Lawrence Hauben', 'Bo Goldman', 'Ken Kesey'], 'directors': ['Milos Forman'], 'stars': ['Jack Nicholson', 'Louise Fletcher', 'Michael Berryman', 'Peter Brocco', 'Dean R. Brooks', 'Alonzo Brown', 'Scatman Crothers', 'Mwako Cumbuka', 'Danny DeVito', 'William Duell'], 'budget': '$3,000,000 (estimated)', 'opening_weekend_gross': nan, 'gross_worldwide': '$109,115,366', 'gross_us_canada': '$108,981,275', 'release_date': 1975.0, 'countries_origin': ['United States'], 'filming_locations': ['Oregon State Mental Hospital - 2600 Center Street NE, Salem, Oregon, USA'], 'production_companies': ['Fant

In [12]:
# Index the whole dataset, 10000 movies per collection.add call instead of the
# function's default batch size.
insert_movies_to_chroma(movie_data_dict, batch_size=10000)

Inserted batch 1 (10000 movies) into Chroma.
Inserted batch 2 (10000 movies) into Chroma.
Inserted batch 3 (10000 movies) into Chroma.
Inserted batch 4 (10000 movies) into Chroma.
Inserted batch 5 (10000 movies) into Chroma.
Inserted batch 6 (10000 movies) into Chroma.
Inserted batch 7 (3249 movies) into Chroma.
Completed: Inserted 63249 movies into Chroma in batches of 10000.


In [18]:
# Assuming movie_data_dict is your dictionary
def insert_first_1000(movie_data_dict):
    """Insert at most the first 1000 entries of ``movie_data_dict`` into Chroma.

    Each selected key is printed as it is collected. The resulting
    sub-dictionary is then handed to ``insert_movies_to_chroma`` with that
    function's default batch size.
    """
    subset = {}

    # Slicing naturally copes with dictionaries holding fewer than 1000 items.
    for movie_id, record in list(movie_data_dict.items())[:1000]:
        print(movie_id)
        subset[movie_id] = record

    insert_movies_to_chroma(subset)

In [35]:
# Index only the leading slice of the dataset; every selected movie id is printed.
insert_first_1000(movie_data_dict)

tt0073195
tt0073629
tt0073486
tt0072890
tt0073692
tt0072081
tt0073026
tt0072653
tt0073812
tt0073802
tt0073317
tt0073075
tt0073312
tt0072951
tt0073190
tt0072926
tt0073018
tt0074539
tt0073053
tt0073440
tt0072750
tt0071853
tt0073015
tt0072665
tt0073580
tt0073043
tt0073540
tt0073778
tt0073076
tt0072443
tt0073155
tt0075040
tt0072976
tt0073424
tt0073114
tt0073650
tt0072684
tt0073768
tt0073453
tt0073631
tt0072856
tt0073198
tt0073747
tt0071650
tt0073341
tt0073705
tt0072752
tt0073582
tt0073349
tt0073636
tt0072705
tt0072730
tt0316900
tt0073115
tt0072735
tt0073092
tt0073559
tt0073172
tt0073470
tt0073594
tt0072973
tt0073597
tt0074682
tt0072930
tt0073906
tt0073501
tt0073240
tt0072912
tt0072732
tt0073019
tt0069930
tt0073600
tt0073707
tt0071411
tt0073298
tt0073496
tt0073691
tt0072820
tt0072848
tt0072709
tt0072869
tt0131526
tt0073637
tt0075140
tt0073722
tt0073203
tt0073870
tt0073324
tt0073605
tt0073000
tt0072761
tt0072798
tt0072764
tt0072184
tt0073615
tt0252487
tt0072895
tt0074800
tt0072933
tt0073282


In [27]:
def query_chroma_and_return_results(query_text, filter_criteria=None, result_size=1):
    """Run a text query against the notebook-level ``collection``.

    Args:
        query_text: Text to search for; sent as a one-element ``query_texts`` list.
        filter_criteria: Optional value forwarded as ``where``. It is left out
            of the call entirely when falsy.
        result_size: Forwarded as ``n_results``.

    Returns:
        Whatever ``collection.query`` returns, or ``None`` when the call raised.
        The error is printed rather than re-raised.
    """
    try:
        query_kwargs = {"query_texts": [query_text], "n_results": result_size}
        if filter_criteria:
            query_kwargs["where"] = filter_criteria
        return collection.query(**query_kwargs)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None



In [26]:
def print_first_result(results):
    """Print the top document (and its metadata) from a query result.

    Args:
        results: Mapping with "documents" and "metadatas" entries, each a list
            holding one inner list per query. ``None`` or an empty mapping is
            accepted and reported as "No results found." — the query helper in
            this notebook returns ``None`` when the query fails.

    Returns:
        None. Everything is written to stdout.
    """
    # `or []` also covers keys that are present but set to None.
    documents = (results.get("documents") or []) if results else []
    if not documents or not documents[0]:
        print("No results found.")
        return

    print("Closest Result:")
    print("Document:", documents[0][0])

    metadatas = results.get("metadatas") or []
    if metadatas and metadatas[0]:
        print("Metadata:", metadatas[0][0])
    else:
        print("No metadata found.")

In [28]:
# Try the index with a free-text query and print the single closest hit.
print_first_result(query_chroma_and_return_results("meow"))

Closest Result:
Document: The Cat's Meow | Semi-true story of the Hollywood murder that occurred at a star-studded gathering aboard William Randolph Hearst's yacht in 1924. | Kirsten Dunst , Cary Elwes , Edward Herrmann , Eddie Izzard , Joanna Lumley , Jennifer Tilly , Claudia Harrison , Victor Slezak , James Laurenson , Ronan Vibert | True Crime , Crime , Drama , Romance | Lionsgate , Dan Films , CP Medien AG | Kyparisi, Greece | United Kingdom , Germany , Canada , United States | Peter Bogdanovich | English
Metadata: {'MPA': 'PG-13', 'Movie_Link': 'https://www.imdb.com/title/tt0266391', 'awards_content': 'Awards , 1 win & 1 nomination total', 'budget': '$7,000,000 (estimated)', 'countries_origin': 'United Kingdom , Germany , Canada , United States', 'description': "Semi-true story of the Hollywood murder that occurred at a star-studded gathering aboard William Randolph Hearst's yacht in 1924.", 'directors': 'Peter Bogdanovich', 'duration': 114, 'filming_locations': 'Kyparisi, Greece'

In [15]:
# Dump a few stored entries (ids, embeddings, documents, ...) from the collection.
print("Sample Documents:")
print(collection.peek())

Sample Documents:
{'ids': ['tt0073195', 'tt0073629', 'tt0073486', 'tt0072890', 'tt0073692', 'tt0072081', 'tt0073026', 'tt0072653', 'tt0073812', 'tt0073802'], 'embeddings': array([[-0.078181  , -0.04806019, -0.01801634, ..., -0.02356488,
        -0.01733995, -0.01388508],
       [ 0.02058048, -0.03385413, -0.0162773 , ..., -0.02694552,
        -0.04269272,  0.04451457],
       [ 0.00431272, -0.05058424, -0.09625056, ...,  0.01264142,
        -0.04026107,  0.00609061],
       ...,
       [-0.04223008, -0.0137757 ,  0.01082292, ...,  0.02292746,
         0.06655598,  0.02441884],
       [-0.01347995, -0.05440292,  0.00664415, ...,  0.05125343,
        -0.00833141,  0.04367412],
       [-0.00418359, -0.08803425, -0.08158376, ..., -0.09466381,
        -0.10573022, -0.01495228]]), 'documents': ["Jaws | When a massive killer shark unleashes chaos on a beach community off Long Island, it's up to a local sheriff, a marine biologist, and an old seafarer to hunt the beast down. | Roy Scheider , R

In [19]:
import random

In [33]:
def query_chroma_and_check_accuracy(query_text):
    """Check whether the top hit for ``query_text`` has exactly that title.

    Args:
        query_text: Movie title used both as the query and as the expected title.

    Returns:
        True when the first result's "title" metadata equals ``query_text``.
        False otherwise, including when the query failed, returned no hits,
        or the hit carries no metadata.
    """
    results = query_chroma_and_return_results(query_text, None, 1)
    if not results:
        # The query helper reports errors by printing and returning None.
        return False

    metadatas = results.get("metadatas") or []
    top_hits = metadatas[0] if metadatas else []
    if not top_hits or not top_hits[0]:
        # An empty hit list would make the [0][0] lookup raise IndexError.
        return False

    resultant_first_title = top_hits[0].get("title")
    print(f"Resultant first title: {resultant_first_title}, Query Title: {query_text}")
    return resultant_first_title == query_text

In [34]:
def evaluate_search_accuracy(movie_data_dict, n_samples=20000, seed=None):
    """Estimate how often querying by exact title returns that title first.

    Draws ``n_samples`` movies without replacement, queries each one by its
    title via ``query_chroma_and_check_accuracy`` and prints the share of
    queries whose top hit carried the same title.

    Args:
        movie_data_dict: Mapping of movie id -> record dict with a "title" key.
        n_samples: Number of movies to sample; must be at least 1.
        seed: Optional seed for a private random generator so a run can be
            repeated. With the default ``None`` the shared ``random`` module
            state is used.

    Returns:
        None. The accuracy (or a "not enough data" notice) is printed.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        # Zero samples would otherwise end in a ZeroDivisionError below.
        raise ValueError("n_samples must be a positive integer")
    if len(movie_data_dict) < n_samples:
        print("Not enough data to sample from.")
        return

    sampler = random.Random(seed) if seed is not None else random
    sampled_movies = sampler.sample(list(movie_data_dict.items()), n_samples)

    correct_first_hits = 0
    for _movie_id, data in sampled_movies:
        if query_chroma_and_check_accuracy(data['title']):
            correct_first_hits += 1

    accuracy = correct_first_hits / n_samples
    print(f"Accuracy of search by name: {accuracy * 100:.2f}%")



In [35]:
# Example usage: score exact-title search on a random sample of 100 movies.
evaluate_search_accuracy(movie_data_dict, n_samples=100)


Resultant first title: Night of the Demons 2, Query Title: Night of the Demons 2
Resultant first title: Chinese Box, Query Title: Chinese Box
Resultant first title: Last Train Home, Query Title: The Last Train
Resultant first title: For Worse, Query Title: For Worse
Resultant first title: The Lower Depths, Query Title: The Lower Depths
Resultant first title: Stolen Kisses, Query Title: Stolen Kisses
Resultant first title: Dishkiyaoon, Query Title: Dishoom
Resultant first title: Castles in Spain, Query Title: Castles in Spain
Resultant first title: Grand Hotel, Query Title: Grand Exit
Resultant first title: Sands of the Desert, Query Title: Desert Dust
Resultant first title: A Boy of Flanders, Query Title: A Boy of Flanders
Resultant first title: Buck, Query Title: Buck
Resultant first title: Poem of the Sea, Query Title: The Sea of Grass
Resultant first title: Nan quan bei tui zhan yan wang, Query Title: Nan quan bei tui zhan yan wang
Resultant first title: The Gay Deceivers, Query Tit

In [36]:
# Repeat on another random sample of 100 movies; no seed is set, so each run
# draws different titles.
evaluate_search_accuracy(movie_data_dict, n_samples=100)

Resultant first title: The Heart of the Matter, Query Title: The Heart of the Matter
Resultant first title: Persecuted, Query Title: Persecution
Resultant first title: The Conspirator, Query Title: The Conspirator
Resultant first title: Serpico, Query Title: Frank Serpico
Resultant first title: The Lonely Lady, Query Title: The Lonely Woman
Resultant first title: Last Embrace, Query Title: Broken Embraces
Resultant first title: The Sailor from Gibraltar, Query Title: The Sailor from Gibraltar
Resultant first title: The Conspirators, Query Title: The Conspirators
Resultant first title: S21: The Khmer Rouge Killing Machine, Query Title: S21: The Khmer Rouge Killing Machine
Resultant first title: V for Vendetta, Query Title: V for Vendetta
Resultant first title: Seven Times Seven, Query Title: Seven
Resultant first title: Insurance Investigator, Query Title: Insurance Investigator
Resultant first title: A Haunted Turkish Bathhouse, Query Title: Ladies' Night in a Turkish Bath
Resultant fi

In [38]:
# Larger run: random sample of 1000 movies.
evaluate_search_accuracy(movie_data_dict, n_samples=1000)

Resultant first title: The Girl in the Pool, Query Title: The Pool
Resultant first title: The Little Shepherd of Kingdom Come, Query Title: The Little Shepherd of Kingdom Come
Resultant first title: The Cheerleaders, Query Title: But I'm a Cheerleader
Resultant first title: Rid i natt!, Query Title: Rid i natt!
Resultant first title: Love Is a Headache, Query Title: Love Is a Headache
Resultant first title: The Naked Gun 2½: The Smell of Fear, Query Title: The Naked Gun
Resultant first title: Caesar the Conqueror, Query Title: Hail, Caesar!
Resultant first title: A Fine Mess, Query Title: A Fine Mess
Resultant first title: The Paradine Case, Query Title: The Paradine Case
Resultant first title: One Way Passage, Query Title: One Way Passage
Resultant first title: Play Dirty, Query Title: Play Dirty
Resultant first title: Barren Lives, Query Title: El barrendero
Resultant first title: Armaguedon, Query Title: Armaguedon
Resultant first title: Dark Tide, Query Title: Night Tide
Resultant 