In [1]:
import pandas as pd

# Replace 'your_file.csv' with the name of your actual CSV file
df = pd.read_csv(r'C:\Users\Mansi\Downloads\final_dataset.csv')

# Display the first few rows of the DataFrame
print(df.head())


          id                            title  year duration MPA  rating  \
0  tt0073195                             Jaws  1975    2h 4m  PG     8.1   
1  tt0073629    The Rocky Horror Picture Show  1975   1h 40m   R     7.4   
2  tt0073486  One Flew Over the Cuckoo's Nest  1975   2h 13m   R     8.7   
3  tt0072890                Dog Day Afternoon  1975    2h 5m   R     8.0   
4  tt0073692                          Shampoo  1975   1h 50m   R     6.4   

  votes  meta_score                                        description  \
0  690K        87.0  When a massive killer shark unleashes chaos on...   
1  174K        65.0  A newly-engaged couple have a breakdown in an ...   
2  1.1M        84.0  In the Fall of 1963, a Korean War veteran and ...   
3  281K        86.0  Three amateur robbers plan to hold up a Brookl...   
4   15K        65.0  On Election Day, 1968, irresponsible hairdress...   

                             Movie_Link  ... opening_weekend_gross  \
0  https://www.imdb.com/titl

In [2]:
# 1. Convert all string columns to lowercase
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].str.lower()

# 2. Convert duration from hours and minutes to total minutes
def convert_duration_to_minutes(duration):
    if pd.isna(duration):  # Check if the value is NaN
        return 0  # Return 0 minutes if the duration is NaN
    total_minutes = 0
    parts = duration.split()
    for part in parts:
        if 'h' in part:
            total_minutes += int(part[:-1]) * 60  # Convert hours to minutes
        elif 'm' in part:
            total_minutes += int(part[:-1])
    return total_minutes

df['duration'] = df['duration'].apply(convert_duration_to_minutes)

# 3. Fill null values based on column data type
for col in df.columns:
    if df[col].dtype in ['int64', 'float64']:  # Numeric columns
        df[col].fillna(0, inplace=True)
    else:
        df[col].fillna('n/a', inplace=True)

output_file_path = r'C:\Users\Mansi\Downloads\cleaned_final_dataset.csv'

# Save the cleaned DataFrame to a new CSV file
df.to_csv(output_file_path, index=False)

print(f"Cleaned data saved successfully to {output_file_path}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('n/a', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)


Cleaned data saved successfully to C:\Users\Mansi\Downloads\cleaned_final_dataset.csv


In [3]:
def parse_list(cell):
    if pd.isnull(cell) or cell == "":
        return []
    return [item.strip() for item in cell.split(", ")]

In [4]:
# Dictionary to hold movie data
movie_data_dict = {}

for _, row in df.iterrows():
    id = row["id"]
    movie_data_dict[id] = {
        "title": row["title"],
        "year": row["year"],
        "duration": row["duration"],
        "MPA": row["MPA"],
        "rating": row["rating"],
        "votes": row["votes"],
        "meta_score": row["meta_score"],
        "description": row["description"],
        "Movie_Link": row["Movie_Link"],
        "writers": parse_list(row["writers"]),
        "directors": parse_list(row["directors"]),
        "stars": parse_list(row["stars"]),
        "budget": row["budget"],
        "opening_weekend_gross": row["opening_weekend_gross"],
        "gross_worldwide": row["gross_worldwide"],
        "gross_us_canada": row["gross_us_canada"],
        "release_date": row["release_date"],
        "countries_origin": parse_list(row["countries_origin"]),
        "filming_locations": parse_list(row["filming_locations"]),
        "production_companies": parse_list(row["production_companies"]),
        "awards_content": parse_list(row["awards_content"]),
        "genres": parse_list(row["genres"]),
        "languages": parse_list(row["languages"])
    }

first_5_keys = list(movie_data_dict.keys())[:5]  
for key in first_5_keys:
    print(f"Movie ID: {key}, Data: {movie_data_dict[key]}")


Movie ID: tt0073195, Data: {'title': 'jaws', 'year': 1975, 'duration': 124, 'MPA': 'pg', 'rating': 8.1, 'votes': '690k', 'meta_score': 87.0, 'description': "when a massive killer shark unleashes chaos on a beach community off long island, it's up to a local sheriff, a marine biologist, and an old seafarer to hunt the beast down.", 'Movie_Link': 'https://www.imdb.com/title/tt0073195', 'writers': ["['peter benchley'", "'carl gottlieb']"], 'directors': ["['steven spielberg']"], 'stars': ["['roy scheider'", "'robert shaw'", "'richard dreyfuss'", "'lorraine gary'", "'murray hamilton'", "'carl gottlieb'", "'jeffrey kramer'", "'susan backlinie'", "'jonathan filley'", "'ted grossman']"], 'budget': '$7,000,000 (estimated)', 'opening_weekend_gross': '$7,061,513', 'gross_worldwide': '$477,916,625', 'gross_us_canada': '$267,263,625', 'release_date': 1975.0, 'countries_origin': ["['united states']"], 'filming_locations': ['["water street', 'edgartown', "martha's vineyard", 'massachusetts', 'usa"]']

In [5]:
pip install chromadb

Note: you may need to restart the kernel to use updated packages.


In [6]:
import chromadb
chroma_client = chromadb.PersistentClient(path="./../chromadb_client")

In [8]:
collection = chroma_client.create_collection(name="movies_database")

In [None]:
def insert_movies_to_chroma(movie_data_dict):
    documents = []
    metadatas = []
    ids = []
    separator = " , "


    for movie_id, movie_data in movie_data_dict.items():
        
        joint_actors_string = separator.join(movie_data.get("actors", []))
        joint_roles_string = separator.join(movie_data.get("roles", []))
        joint_crew_string = separator.join(movie_data.get("crew", []))
        joint_genres_string = separator.join(movie_data.get("genres", []))
        joint_countries_string = separator.join(movie_data.get("countries", []))
        joint_languages_string = separator.join(movie_data.get("languages", []))
        joint_studios_string = separator.join(movie_data.get("studios", []))
        joint_themes_string = separator.join(movie_data.get("themes", []))

        document = f"{movie_data['name']} | {movie_data['description']} | {joint_actors_string} | {joint_roles_string} | {joint_crew_string} | {joint_genres_string} | {joint_countries_string} | {joint_languages_string} | {joint_studios_string} | {joint_themes_string}"
        metadata = {
            "date": movie_data.get("date"),
            "tagline": movie_data.get("tagline"),
            "minute": movie_data.get("minute"),
            "rating": movie_data.get("rating"),
            "actors": joint_actors_string,
            "roles": joint_roles_string,
            "crew": joint_crew_string,
            "work": separator.join(movie_data.get("work", [])),
            "countries": joint_countries_string,
            "genres": joint_genres_string,
            "languages": joint_languages_string,
            "types": separator.join(movie_data.get("types", [])),
            "posters": separator.join(movie_data.get("posters", [])),
            "studios": joint_studios_string,
            "themes": joint_themes_string
        }
        documents.append(document)
        metadatas.append(metadata)
        ids.append(str(movie_id))

    collection.add(documents=documents, metadatas=metadatas, ids=ids)


In [11]:
def insert_movies_to_chroma(movie_data_dict):
    documents = []
    metadatas = []
    ids = []
    separator = " , "

    for id, movie_data in movie_data_dict.items():
        
        joint_stars_string = separator.join(movie_data.get("stars", []))
        joint_genres_string = separator.join(movie_data.get("genres", []))
        joint_production_companies_string = separator.join(movie_data.get("production_companies", []))
        joint_filming_locations_string = separator.join(movie_data.get("filming_locations", []))
        joint_countries_string = separator.join(movie_data.get("countries_origin", []))
        joint_directors_string = separator.join(movie_data.get("directors", []))
        joint_languages_string = separator.join(movie_data.get("languages", []))

        document = f"{movie_data['title']} | {movie_data['description']} | {joint_stars_string} | {joint_genres_string} | {joint_production_companies_string} | {joint_filming_locations_string} | {joint_countries_string} | {joint_directors_string} | {joint_languages_string}"
        
        metadata = {
            "year": movie_data.get("year"),
            "duration": movie_data.get("duration"),
            "MPA": movie_data.get("MPA"),
            "rating": movie_data.get("rating"),
            "votes": movie_data.get("votes"),
            "description": movie_data.get("description"),
            "Movie_Link": movie_data.get("Movie_Link"),
            "writers": separator.join(movie_data.get("writers", [])),
            "directors": joint_directors_string,
            "stars": joint_stars_string,
            "budget": movie_data.get("budget"),
            "release_date": movie_data.get("release_date"),
            "countries": joint_countries_string,
            "production_companies": joint_production_companies_string,
            "filming_locations": joint_filming_locations_string,
            "awards_content": separator.join(movie_data.get("awards_content", [])),
            "genres": joint_genres_string,
            "languages": joint_languages_string
        }

        documents.append(document)
        metadatas.append(metadata)
        ids.append(str(id))

    collection.add(documents=documents, metadatas=metadatas, ids=ids)


In [13]:
print(movie_data_dict['tt0073486'])

{'title': "one flew over the cuckoo's nest", 'year': 1975, 'duration': 133, 'MPA': 'r', 'rating': 8.7, 'votes': '1.1m', 'meta_score': 84.0, 'description': 'in the fall of 1963, a korean war veteran and criminal pleads insanity and is admitted to a mental institution, where he rallies up the scared patients against the tyrannical nurse.', 'Movie_Link': 'https://www.imdb.com/title/tt0073486', 'writers': ["['lawrence hauben'", "'bo goldman'", "'ken kesey']"], 'directors': ["['milos forman']"], 'stars': ["['jack nicholson'", "'louise fletcher'", "'michael berryman'", "'peter brocco'", "'dean r. brooks'", "'alonzo brown'", "'scatman crothers'", "'mwako cumbuka'", "'danny devito'", "'william duell']"], 'budget': '$3,000,000 (estimated)', 'opening_weekend_gross': 'n/a', 'gross_worldwide': '$109,115,366', 'gross_us_canada': '$108,981,275', 'release_date': 1975.0, 'countries_origin': ["['united states']"], 'filming_locations': ["['oregon state mental hospital - 2600 center street ne", 'salem', 

In [14]:
# Assuming movie_data_dict is your dictionary
def insert_first_1000(movie_data_dict):
    """Inserts the first 1000 items from the dictionary."""

    first_1000 = {}
    keys = list(movie_data_dict.keys())

    for i in range(min(1000, len(keys))): #prevent error if the dictionary has less than 1000 items
        print(keys[i])
        first_1000[keys[i]] = movie_data_dict[keys[i]]
        
    insert_movies_to_chroma(first_1000)

In [None]:
# Assuming movie_data_dict is your dictionary
def insert_first_10000(movie_data_dict):
    """Inserts the first 10000 items from the dictionary."""

    first_10000 = {}
    keys = list(movie_data_dict.keys())

    for i in range(min(10000, len(keys))): #prevent error if the dictionary has less than 100 items
        print(keys[i])
        first_10000[keys[i]] = movie_data_dict[keys[i]]
        
    insert_movies_to_chroma(first_10000)

In [15]:
insert_first_1000(movie_data_dict)

tt0073195
tt0073629
tt0073486
tt0072890
tt0073692
tt0072081
tt0073026
tt0072653
tt0073812
tt0073802
tt0073317
tt0073075
tt0073312
tt0072951
tt0073190
tt0072926
tt0073018
tt0074539
tt0073053
tt0073440
tt0072750
tt0071853
tt0073015
tt0072665
tt0073580
tt0073043
tt0073540
tt0073778
tt0073076
tt0072443
tt0073155
tt0075040
tt0072976
tt0073424
tt0073114
tt0073650
tt0072684
tt0073768
tt0073453
tt0073631
tt0072856
tt0073198
tt0073747
tt0071650
tt0073341
tt0073705
tt0072752
tt0073582
tt0073349
tt0073636
tt0072705
tt0072730
tt0316900
tt0073115
tt0072735
tt0073092
tt0073559
tt0073172
tt0073470
tt0073594
tt0072973
tt0073597
tt0074682
tt0072930
tt0073906
tt0073501
tt0073240
tt0072912
tt0072732
tt0073019
tt0069930
tt0073600
tt0073707
tt0071411
tt0073298
tt0073496
tt0073691
tt0072820
tt0072848
tt0072709
tt0072869
tt0131526
tt0073637
tt0075140
tt0073722
tt0073203
tt0073870
tt0073324
tt0073605
tt0073000
tt0072761
tt0072798
tt0072764
tt0072184
tt0073615
tt0252487
tt0072895
tt0074800
tt0072933
tt0073282


In [16]:
def query_chroma_and_print_first(query_text, filter_criteria=None):
    
    try:
        if filter_criteria:
            results = collection.query(
                query_texts=[query_text],
                n_results=1,  # Get only the closest result
                where=filter_criteria,
            )
        else:
            results = collection.query(
                query_texts=[query_text],
                n_results=1,  # Get only the closest result
            )

        if results["documents"] and results["documents"][0]:
            print("Closest Result:")
            print("Document:", results["documents"][0][0])
            if results["metadatas"] and results["metadatas"][0]:
                print("Metadata:", results["metadatas"][0][0]) 
            else:
                print("No metadata found.")

        else:
            print("No results found.")

    except Exception as e:
        print(f"An error occurred: {e}")



In [17]:
print("Sample Documents:")
print(collection.peek())

Sample Documents:
{'ids': ['tt0073195', 'tt0073629', 'tt0073486', 'tt0072890', 'tt0073692', 'tt0072081', 'tt0073026', 'tt0072653', 'tt0073812', 'tt0073802'], 'embeddings': array([[-0.06480973, -0.08592383,  0.00845911, ..., -0.02114604,
        -0.0443121 ,  0.02547549],
       [ 0.00794492, -0.08030885,  0.00203439, ..., -0.02394695,
        -0.03665593,  0.0350316 ],
       [ 0.01283432, -0.07501377, -0.06246576, ..., -0.00391022,
        -0.03194121,  0.0334334 ],
       ...,
       [-0.0164005 , -0.03777725,  0.01653393, ...,  0.0224338 ,
         0.04523428,  0.02987161],
       [ 0.01115306, -0.09731013,  0.01129823, ...,  0.00955473,
        -0.0241531 ,  0.03476324],
       [-0.01630044, -0.12866592, -0.07489571, ..., -0.11093146,
        -0.10047136, -0.01320679]], shape=(10, 384)), 'documents': ['jaws | when a massive killer shark unleashes chaos on a beach community off long island, it\'s up to a local sheriff, a marine biologist, and an old seafarer to hunt the beast down. 

In [18]:
# Example usage:
query_chroma_and_print_first("movies about friendships")


Closest Result:
Document: at long last love | four socialite old friends unexpectedly clash, and switch partners during a party and attempt to make each other jealous. | ['burt reynolds' , 'cybill shepherd' , 'madeline kahn' , 'duilio del prete' , 'eileen brennan' , 'john hillerman' , 'mildred natwick' , 'quinn k. redeker' , 'j. edward mckinley' , 'john stephenson'] | ['jukebox musical' , 'romantic comedy' , 'comedy' , 'musical' , 'romance'] | ['copa del oro' , 'twentieth century fox'] | ['los angeles , california , usa'] | ['united states'] | ['peter bogdanovich'] | ['english']
Metadata: {'MPA': 'g', 'Movie_Link': 'https://www.imdb.com/title/tt0072665', 'awards_content': 'n/a', 'budget': '$6,000,000 (estimated)', 'countries': "['united states']", 'description': 'four socialite old friends unexpectedly clash, and switch partners during a party and attempt to make each other jealous.', 'directors': "['peter bogdanovich']", 'duration': 118, 'filming_locations': "['los angeles , californi

In [19]:
# Example with metadata filter (replace with your actual metadata fields):
query_chroma_and_print_first(
    "comedy movies with high ratings", filter_criteria={ "rating": {"$gt": 4.5}}
)



Closest Result:
Document: a genius, two partners and a dupe | three rogues set out to rob $300,000 from an indian-hating cavalry major. | ['terence hill' , 'miou-miou' , 'robert charlebois' , 'patrick mcgoohan' , 'raimund harmstorf' , 'piero vida' , 'rik battaglia' , 'mario valgoi' , 'mario brega' , 'jean martin'] | ['parody' , 'spaghetti western' , 'comedy' , 'western'] | ['rafran cinematografica' , 'amlf' , 'rialto film'] | ['monument valley , arizona , usa'] | ['italy' , 'france' , 'west germany'] | ['damiano damiani'] | ['italian']
Metadata: {'MPA': 'n/a', 'Movie_Link': 'https://www.imdb.com/title/tt0073036', 'awards_content': 'awards , 1 win total', 'budget': 'itl3,500,000,000 (estimated)', 'countries': "['italy' , 'france' , 'west germany']", 'description': 'three rogues set out to rob $300,000 from an indian-hating cavalry major.', 'directors': "['damiano damiani']", 'duration': 126, 'filming_locations': "['monument valley , arizona , usa']", 'genres': "['parody' , 'spaghetti we