In [1]:
import pandas as pd
import ast

In [2]:
def parse_column(value):
    try:
        return ast.literal_eval(value) if isinstance(value, str) and value.startswith("[") else value
    except (ValueError, SyntaxError):
        return value


In [3]:
# Load the CSV back into a dictionary
df_loaded = pd.read_csv("./../cleaned_database/cleaned_final_dataset.csv", converters={
    "writers": parse_column,
    "directors": parse_column,
    "stars": parse_column,
    "countries_origin": parse_column,
    "filming_locations": parse_column,
    "production_companies": parse_column,
    "awards_content": parse_column,
    "genres": parse_column,
    "languages": parse_column
})


In [4]:
movie_data_dict= df_loaded.set_index("id").to_dict(orient="index")

In [5]:
first_5_keys = list(movie_data_dict.keys())[:5]
for key in first_5_keys:
    print(f"Movie ID: {key}, Data: {movie_data_dict[key]}")


Movie ID: tt0073195, Data: {'title': 'Jaws', 'year': 1975, 'duration': 124, 'MPA': 'PG', 'rating': 8.1, 'votes': '690K', 'meta_score': 87.0, 'description': "When a massive killer shark unleashes chaos on a beach community off Long Island, it's up to a local sheriff, a marine biologist, and an old seafarer to hunt the beast down.", 'Movie_Link': 'https://www.imdb.com/title/tt0073195', 'writers': ['Peter Benchley', 'Carl Gottlieb'], 'directors': ['Steven Spielberg'], 'stars': ['Roy Scheider', 'Robert Shaw', 'Richard Dreyfuss', 'Lorraine Gary', 'Murray Hamilton', 'Carl Gottlieb', 'Jeffrey Kramer', 'Susan Backlinie', 'Jonathan Filley', 'Ted Grossman'], 'budget': '$7,000,000 (estimated)', 'opening_weekend_gross': '$7,061,513', 'gross_worldwide': '$477,916,625', 'gross_us_canada': '$267,263,625', 'release_date': 1975.0, 'countries_origin': ['United States'], 'filming_locations': ["Water Street, Edgartown, Martha's Vineyard, Massachusetts, USA"], 'production_companies': ['Zanuck/Brown Product

In [6]:
pip install chromadb

Note: you may need to restart the kernel to use updated packages.


In [8]:
import chromadb

In [9]:
chroma_client = chromadb.PersistentClient(path="./../chromadb_client")

In [14]:
collection = chroma_client.create_collection(name="best_movies_database")

In [13]:
chroma_client.delete_collection(name="best_movies_database")

In [20]:
def insert_movies_to_chroma(movie_data_dict, batch_size=30000):
    separator = " , "
    movie_items = list(movie_data_dict.items())  
    total_movies = len(movie_items)

    for i in range(0, total_movies, batch_size):
        batch = movie_items[i:i + batch_size]  

        documents = []
        metadatas = []
        ids = []

        for movie_id, movie_data in batch:
            joint_stars_string = separator.join(movie_data.get("stars", []))
            joint_genres_string = separator.join(movie_data.get("genres", []))
            joint_productions_string = separator.join(movie_data.get("production_companies", []))
            joint_filming_locations_string = separator.join(movie_data.get("filming_locations", []))
            joint_countries_origin_string = separator.join(movie_data.get("countries_origin", []))
            joint_directors_string = separator.join(movie_data.get("directors", []))
            joint_languages_string = separator.join(movie_data.get("languages", []))

            document = f"{movie_data['title']} | {movie_data['description']} | {joint_stars_string} | {joint_genres_string} | {joint_productions_string} | {joint_filming_locations_string} | {joint_countries_origin_string} | {joint_directors_string} | {joint_languages_string}"
            metadata = {
                "title": movie_data.get("title"),
                "year": movie_data.get("year"),
                "duration": movie_data.get("duration"),
                "MPA": movie_data.get("MPA"),
                "rating": movie_data.get("rating"),
                "votes": movie_data.get("votes"),
                "meta_score": movie_data.get("meta_score"),
                "description": movie_data.get("description"),
                "Movie_Link": movie_data.get("Movie_Link"),
                "writers": separator.join(movie_data.get("writers", [])),
                "directors": joint_directors_string,
                "stars": joint_stars_string,
                "budget": movie_data.get("budget"),
                "opening_weekend_gross": movie_data.get("opening_weekend_gross"),
                "gross_worldwide": movie_data.get("gross_worldwide"),
                "gross_us_canada": movie_data.get("gross_us_canada"),
                "release_date": movie_data.get("release_date"),
                "countries_origin": joint_countries_origin_string,
                "filming_locations": joint_filming_locations_string,
                "production_companies": joint_productions_string,
                "awards_content": separator.join(movie_data.get("awards_content", [])),
                "genres": joint_genres_string,
                "languages": joint_languages_string
            }
            documents.append(document)
            metadatas.append(metadata)
            ids.append(str(movie_id))

        collection.add(documents=documents, metadatas=metadatas, ids=ids)

        print(f"Inserted batch {i // batch_size + 1} ({len(batch)} movies) into Chroma.")

    print(f"Completed: Inserted {total_movies} movies into Chroma in batches of {batch_size}.")


In [17]:
print(movie_data_dict['tt0073486'])

{'title': "One Flew Over the Cuckoo's Nest", 'year': 1975, 'duration': 133, 'MPA': 'R', 'rating': 8.7, 'votes': '1.1M', 'meta_score': 84.0, 'description': 'In the Fall of 1963, a Korean War veteran and criminal pleads insanity and is admitted to a mental institution, where he rallies up the scared patients against the tyrannical nurse.', 'Movie_Link': 'https://www.imdb.com/title/tt0073486', 'writers': ['Lawrence Hauben', 'Bo Goldman', 'Ken Kesey'], 'directors': ['Milos Forman'], 'stars': ['Jack Nicholson', 'Louise Fletcher', 'Michael Berryman', 'Peter Brocco', 'Dean R. Brooks', 'Alonzo Brown', 'Scatman Crothers', 'Mwako Cumbuka', 'Danny DeVito', 'William Duell'], 'budget': '$3,000,000 (estimated)', 'opening_weekend_gross': nan, 'gross_worldwide': '$109,115,366', 'gross_us_canada': '$108,981,275', 'release_date': 1975.0, 'countries_origin': ['United States'], 'filming_locations': ['Oregon State Mental Hospital - 2600 Center Street NE, Salem, Oregon, USA'], 'production_companies': ['Fant

In [None]:
insert_movies_to_chroma(movie_data_dict,1000)

Inserted batch 1 (1000 movies) into Chroma.
Inserted batch 2 (1000 movies) into Chroma.
Inserted batch 3 (1000 movies) into Chroma.
Inserted batch 4 (1000 movies) into Chroma.
Inserted batch 5 (1000 movies) into Chroma.
Inserted batch 6 (1000 movies) into Chroma.
Inserted batch 7 (1000 movies) into Chroma.
Inserted batch 8 (1000 movies) into Chroma.
Inserted batch 9 (1000 movies) into Chroma.
Inserted batch 10 (1000 movies) into Chroma.
Inserted batch 11 (1000 movies) into Chroma.
Inserted batch 12 (1000 movies) into Chroma.
Inserted batch 13 (1000 movies) into Chroma.
Inserted batch 14 (1000 movies) into Chroma.
Inserted batch 15 (1000 movies) into Chroma.
Inserted batch 16 (1000 movies) into Chroma.
Inserted batch 17 (1000 movies) into Chroma.
Inserted batch 18 (1000 movies) into Chroma.
Inserted batch 19 (1000 movies) into Chroma.
Inserted batch 20 (1000 movies) into Chroma.
Inserted batch 21 (1000 movies) into Chroma.
Inserted batch 22 (1000 movies) into Chroma.
Inserted batch 23 (

In [18]:
# Assuming movie_data_dict is your dictionary
def insert_first_1000(movie_data_dict):
    """Inserts the first 1000 items from the dictionary."""

    first_1000 = {}
    keys = list(movie_data_dict.keys())

    for i in range(min(1000, len(keys))): #prevent error if the dictionary has less than 1000 items
        print(keys[i])
        first_1000[keys[i]] = movie_data_dict[keys[i]]
        
    insert_movies_to_chroma(first_1000)

In [35]:
insert_first_1000(movie_data_dict)

tt0073195
tt0073629
tt0073486
tt0072890
tt0073692
tt0072081
tt0073026
tt0072653
tt0073812
tt0073802
tt0073317
tt0073075
tt0073312
tt0072951
tt0073190
tt0072926
tt0073018
tt0074539
tt0073053
tt0073440
tt0072750
tt0071853
tt0073015
tt0072665
tt0073580
tt0073043
tt0073540
tt0073778
tt0073076
tt0072443
tt0073155
tt0075040
tt0072976
tt0073424
tt0073114
tt0073650
tt0072684
tt0073768
tt0073453
tt0073631
tt0072856
tt0073198
tt0073747
tt0071650
tt0073341
tt0073705
tt0072752
tt0073582
tt0073349
tt0073636
tt0072705
tt0072730
tt0316900
tt0073115
tt0072735
tt0073092
tt0073559
tt0073172
tt0073470
tt0073594
tt0072973
tt0073597
tt0074682
tt0072930
tt0073906
tt0073501
tt0073240
tt0072912
tt0072732
tt0073019
tt0069930
tt0073600
tt0073707
tt0071411
tt0073298
tt0073496
tt0073691
tt0072820
tt0072848
tt0072709
tt0072869
tt0131526
tt0073637
tt0075140
tt0073722
tt0073203
tt0073870
tt0073324
tt0073605
tt0073000
tt0072761
tt0072798
tt0072764
tt0072184
tt0073615
tt0252487
tt0072895
tt0074800
tt0072933
tt0073282


In [22]:
def query_chroma_and_print_first(query_text, filter_criteria=None):
    
    try:
        if filter_criteria:
            results = collection.query(
                query_texts=[query_text],
                n_results=1,  # Get only the closest result
                where=filter_criteria,
            )
        else:
            results = collection.query(
                query_texts=[query_text],
                n_results=1,  # Get only the closest result
            )

        if results["documents"] and results["documents"][0]:
            print("Closest Result:")
            print("Document:", results["documents"][0][0])
            if results["metadatas"] and results["metadatas"][0]:
                print("Metadata:", results["metadatas"][0][0]) 
            else:
                print("No metadata found.")

        else:
            print("No results found.")

    except Exception as e:
        print(f"An error occurred: {e}")



In [37]:
print("Sample Documents:")
print(collection.peek())

Sample Documents:
{'ids': ['tt0073195', 'tt0073629', 'tt0073486', 'tt0072890', 'tt0073692', 'tt0072081', 'tt0073026', 'tt0072653', 'tt0073812', 'tt0073802'], 'embeddings': array([[-0.04489762, -0.09763828,  0.01215111, ..., -0.02198221,
        -0.04531582,  0.03434723],
       [ 0.02926173, -0.10318424,  0.00296242, ..., -0.01841092,
        -0.00857215,  0.04419202],
       [ 0.01926873, -0.08429298, -0.06182687, ..., -0.00049526,
        -0.02620757,  0.03210481],
       ...,
       [ 0.00117432, -0.0658962 ,  0.01176655, ...,  0.02915817,
         0.04564163,  0.04265287],
       [ 0.03512478, -0.0992896 ,  0.03040159, ...,  0.01723399,
        -0.00963105,  0.03532014],
       [-0.02633684, -0.14905138, -0.08450468, ..., -0.09616607,
        -0.10238259, -0.01046537]], shape=(10, 384)), 'documents': ['Jaws | When a massive killer shark unleashes chaos on a beach community off Long Island, it\'s up to a local sheriff, a marine biologist, and an old seafarer to hunt the beast down. 

In [23]:
# Example usage:
query_chroma_and_print_first("movies about friendships")


Closest Result:
Document: Your Friends and Neighbors | Unhappy couples fall apart and hop into other beds with other people. | Amy Brenneman , Aaron Eckhart , Ben Stiller , Nastassja Kinski , Catherine Keener , Jason Patric , Josh Dotson , Lola Glaudini , Jeffrey Kushon | Dark Comedy , Comedy , Drama , Romance | Polygram Filmed Entertainment , Propaganda Films , Fleece | Los Angeles, California, USA | United States | Neil LaBute | English
Metadata: {'MPA': 'R', 'Movie_Link': 'https://www.imdb.com/title/tt0119517', 'awards_content': 'Awards , 2 wins & 4 nominations total', 'budget': '$5,000,000 (estimated)', 'countries_origin': 'United States', 'description': 'Unhappy couples fall apart and hop into other beds with other people.', 'directors': 'Neil LaBute', 'duration': 100, 'filming_locations': 'Los Angeles, California, USA', 'genres': 'Dark Comedy , Comedy , Drama , Romance', 'gross_us_canada': '$4,714,658', 'gross_worldwide': '$4,714,658', 'languages': 'English', 'meta_score': 70.0, 

In [24]:
# Example with metadata filter (replace with your actual metadata fields):
query_chroma_and_print_first(
    "comedy movies with high ratings", filter_criteria={ "rating": {"$gt": 4.5}}
)



Closest Result:
Document: Thanks for Sharing | A romantic comedy that brings together three disparate characters who are learning to face a challenging and often confusing world as they struggle together against a common demon: sex addiction. | Mark Ruffalo , Tim Robbins , Gwyneth Paltrow , Josh Gad , Joely Richardson , Patrick Fugit , P!nk , Carol Kane , Emily Meade , Isiah Whitlock Jr. | Comedy , Drama , Romance | Class 5 Films , Olympus Pictures | New York City, New York, USA | United States | Stuart Blumberg | English
Metadata: {'MPA': 'R', 'Movie_Link': 'https://www.imdb.com/title/tt1932718', 'awards_content': 'Awards , 2 wins & 2 nominations total', 'countries_origin': 'United States', 'description': 'A romantic comedy that brings together three disparate characters who are learning to face a challenging and often confusing world as they struggle together against a common demon: sex addiction.', 'directors': 'Stuart Blumberg', 'duration': 112, 'filming_locations': 'New York City,

In [31]:
# Example with metadata filter (replace with your actual metadata fields):
query_chroma_and_print_first(
    "This Time Next Year"
)


Closest Result:
Document: Class of 1999 II: The Substitute | The substitute teacher of the future is a robot who answers spit-wads with bullets. He'll clean up the gang situation at the school, too, if a government agent doesn't catch him first for an experimental robot army. | Sasha Mitchell , Caitlin Dulany , Nick Cassavetes , Gregory West , Rick Hill , Jack Knight , Diego Serrano , Bernie Pock , Denney Pierce , Loring Pickering | Action , Horror , Sci-Fi , Thriller | Cinetel Films | California, USA | United States | Spiro Razatos | English
Metadata: {'MPA': 'R', 'Movie_Link': 'https://www.imdb.com/title/tt0109442', 'awards_content': 'n/a', 'countries_origin': 'United States', 'description': "The substitute teacher of the future is a robot who answers spit-wads with bullets. He'll clean up the gang situation at the school, too, if a government agent doesn't catch him first for an experimental robot army.", 'directors': 'Spiro Razatos', 'duration': 87, 'filming_locations': 'California

In [32]:
def query_top_movies(query_text, filter_criteria=None):
    try:
        # Ensure collection is defined
        if 'collection' not in globals():
            raise ValueError("Collection is not initialized. Please define the collection before querying.")

        query_params = {
            "query_texts": [query_text],
            "n_results": 5  # Get top 5 results
        }
        if filter_criteria:
            query_params["where"] = filter_criteria

        results = collection.query(**query_params)

        # Safely retrieve documents and metadata
        documents = results.get("documents", [])
        metadatas = results.get("metadatas", [])

        if documents and documents[0]:
            print("Top 5 Matching Movies:")
            for i in range(min(5, len(documents[0]))):  # Ensures it doesn't break if <5 results
                print(f"\nResult {i+1}:")
                print("Movie:", documents[0][i])
                if metadatas and metadatas[0]:
                    print("Metadata:", metadatas[0][i])
                else:
                    print("No metadata found.")
        else:
            print("No results found.")

    except KeyError as e:
        print(f"KeyError: Missing key in results - {e}")
    except TypeError as e:
        print(f"TypeError: Unexpected data format - {e}")
    except Exception as e:
        print(f"An error occurred: {e}")


In [35]:
# Example with metadata filter (replace with your actual metadata fields):
query_top_movies(
    "iron man"
)


Top 5 Matching Movies:

Result 1:
Movie: Iron Man 2 | With the world now aware of his identity as Iron Man, Tony Stark must contend with both his declining health and a vengeful mad man with ties to his father's legacy. | Robert Downey Jr. , Mickey Rourke , Gwyneth Paltrow , Don Cheadle , Scarlett Johansson , Sam Rockwell , Samuel L. Jackson , Clark Gregg , John Slattery , Garry Shandling | Sci-Fi Epic , Superhero , Action , Sci-Fi | Paramount Pictures , Marvel Entertainment , Marvel Studios | SpaceX - Rocket Road, Hawthorne, California, USA (Hammer Industries Factory) | United States | Jon Favreau | English , French , Russian
Metadata: {'MPA': 'PG-13', 'Movie_Link': 'https://www.imdb.com/title/tt1228705', 'awards_content': 'Nominated for 1 Oscar , 7 wins & 45 nominations total', 'budget': '$200,000,000 (estimated)', 'countries_origin': 'United States', 'description': "With the world now aware of his identity as Iron Man, Tony Stark must contend with both his declining health and a ven