In [2]:
import kagglehub
import pandas as pd
import os


In [8]:
dataset_path = kagglehub.dataset_download("raedaddala/top-500-600-movies-of-each-year-from-1960-to-2024")

csv_filename = os.path.join(dataset_path, "final_dataset.csv")
df = pd.read_csv(csv_filename)

print("Loaded dataset from:", csv_filename)
print(df.head())


Loaded dataset from: C:\Users\Mansi\.cache\kagglehub\datasets\raedaddala\top-500-600-movies-of-each-year-from-1960-to-2024\versions\3\final_dataset.csv
          id                            title  year duration MPA  rating  \
0  tt0073195                             Jaws  1975    2h 4m  PG     8.1   
1  tt0073629    The Rocky Horror Picture Show  1975   1h 40m   R     7.4   
2  tt0073486  One Flew Over the Cuckoo's Nest  1975   2h 13m   R     8.7   
3  tt0072890                Dog Day Afternoon  1975    2h 5m   R     8.0   
4  tt0073692                          Shampoo  1975   1h 50m   R     6.4   

  votes  meta_score                                        description  \
0  690K        87.0  When a massive killer shark unleashes chaos on...   
1  174K        65.0  A newly-engaged couple have a breakdown in an ...   
2  1.1M        84.0  In the Fall of 1963, a Korean War veteran and ...   
3  281K        86.0  Three amateur robbers plan to hold up a Brookl...   
4   15K        65.0  

In [9]:

def convert_duration_to_minutes(duration):
    if pd.isna(duration):  # Check if the value is NaN
        return 0  # Return 0 minutes if the duration is NaN
    total_minutes = 0
    parts = duration.split()
    for part in parts:
        if 'h' in part:
            total_minutes += int(part[:-1]) * 60  # Convert hours to minutes
        elif 'm' in part:
            total_minutes += int(part[:-1])
    return total_minutes

df['duration'] = df['duration'].apply(convert_duration_to_minutes)

for col in df.columns:
    if df[col].dtype in ['int64', 'float64']:  # Numeric columns
        df[col].fillna(0, inplace=True)
    else:
        df[col].fillna('n/a', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('n/a', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)


In [10]:
import ast
import pandas as pd

def parse_list(cell):
    if pd.isnull(cell) or cell == "":
        return []
    try:
        return ast.literal_eval(cell)  
    except (ValueError, SyntaxError):
        return cell.split(", ")  


In [11]:
# Dictionary to hold movie data
movie_data_dict = {}

for _, row in df.iterrows():
    id = row["id"]
    movie_data_dict[id] = {
        "title": row["title"],
        "year": row["year"],
        "duration": row["duration"],
        "MPA": row["MPA"],
        "rating": row["rating"],
        "votes": row["votes"],
        "meta_score": row["meta_score"],
        "description": row["description"],
        "Movie_Link": row["Movie_Link"],
        "writers": parse_list(row["writers"]),
        "directors": parse_list(row["directors"]),
        "stars": parse_list(row["stars"]),
        "budget": row["budget"],
        "opening_weekend_gross": row["opening_weekend_gross"],
        "gross_worldwide": row["gross_worldwide"],
        "gross_us_canada": row["gross_us_canada"],
        "release_date": row["release_date"],
        "countries_origin": parse_list(row["countries_origin"]),
        "filming_locations": parse_list(row["filming_locations"]),
        "production_companies": parse_list(row["production_companies"]),
        "awards_content": parse_list(row["awards_content"]),
        "genres": parse_list(row["genres"]),
        "languages": parse_list(row["languages"])
    }

first_5_keys = list(movie_data_dict.keys())[:5]  
for key in first_5_keys:
    print(f"Movie ID: {key}, Data: {movie_data_dict[key]}")


Movie ID: tt0073195, Data: {'title': 'Jaws', 'year': 1975, 'duration': 124, 'MPA': 'PG', 'rating': 8.1, 'votes': '690K', 'meta_score': 87.0, 'description': "When a massive killer shark unleashes chaos on a beach community off Long Island, it's up to a local sheriff, a marine biologist, and an old seafarer to hunt the beast down.", 'Movie_Link': 'https://www.imdb.com/title/tt0073195', 'writers': ['Peter Benchley', 'Carl Gottlieb'], 'directors': ['Steven Spielberg'], 'stars': ['Roy Scheider', 'Robert Shaw', 'Richard Dreyfuss', 'Lorraine Gary', 'Murray Hamilton', 'Carl Gottlieb', 'Jeffrey Kramer', 'Susan Backlinie', 'Jonathan Filley', 'Ted Grossman'], 'budget': '$7,000,000 (estimated)', 'opening_weekend_gross': '$7,061,513', 'gross_worldwide': '$477,916,625', 'gross_us_canada': '$267,263,625', 'release_date': 1975.0, 'countries_origin': ['United States'], 'filming_locations': ["Water Street, Edgartown, Martha's Vineyard, Massachusetts, USA"], 'production_companies': ['Zanuck/Brown Product

In [12]:
df_cleaned = pd.DataFrame.from_dict(movie_data_dict, orient="index")
df_cleaned.to_csv("./../cleaned_database/cleaned_final_dataset.csv", index_label="id")
print("Dictionary saved to ./../cleaned_database/cleaned_final_dataset.csv")


Dictionary saved to ./../cleaned_database/cleaned_final_dataset.csv
