In [2]:
import kagglehub
import pandas as pd
import os


In [3]:
dataset_path = kagglehub.dataset_download("raedaddala/top-500-600-movies-of-each-year-from-1960-to-2024")

csv_filename = os.path.join(dataset_path, "final_dataset.csv")
df = pd.read_csv(csv_filename)

print("Loaded dataset from:", csv_filename)
print(df.head())


Loaded dataset from: C:\Users\RohanDalvi\.cache\kagglehub\datasets\raedaddala\top-500-600-movies-of-each-year-from-1960-to-2024\versions\3\final_dataset.csv
          id                            title  year duration MPA  rating  \
0  tt0073195                             Jaws  1975    2h 4m  PG     8.1   
1  tt0073629    The Rocky Horror Picture Show  1975   1h 40m   R     7.4   
2  tt0073486  One Flew Over the Cuckoo's Nest  1975   2h 13m   R     8.7   
3  tt0072890                Dog Day Afternoon  1975    2h 5m   R     8.0   
4  tt0073692                          Shampoo  1975   1h 50m   R     6.4   

  votes  meta_score                                        description  \
0  690K        87.0  When a massive killer shark unleashes chaos on...   
1  174K        65.0  A newly-engaged couple have a breakdown in an ...   
2  1.1M        84.0  In the Fall of 1963, a Korean War veteran and ...   
3  281K        86.0  Three amateur robbers plan to hold up a Brookl...   
4   15K        6

In [4]:
def convert_duration_to_minutes(duration):
    if pd.isna(duration):
        return 0  
    if isinstance(duration, int):
        return duration  

    total_minutes = 0
    parts = duration.split()
    for part in parts:
        if 'h' in part:
            total_minutes += int(part[:-1]) * 60 
        elif 'm' in part:
            total_minutes += int(part[:-1])  
    return total_minutes

df['duration'] = df['duration'].apply(convert_duration_to_minutes)




In [5]:
for col in df.columns:
    if df[col].dtype in ['int', 'int64', 'float64']: 
        df[col].fillna(0, inplace=True)
    else:
        df[col].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)


In [6]:
import ast
import pandas as pd

In [7]:
def parse_list(cell):
    if pd.isnull(cell) or cell == "":
        return []
    try:
        return ast.literal_eval(cell)  
    except (ValueError, SyntaxError):
        return cell.split(", ")  


In [None]:
import re
import ast

def get_cleaned_name_string(name_list):
    cleaned_names = []
    if isinstance(name_list, str):
        if name_list != "":
            name_list = ast.literal_eval(name_list)
    if not isinstance(name_list, list):
        name_list = []
    for name in name_list:
        cleaned_name = re.sub(r'[^a-zA-Z]', '', name).lower()
        cleaned_names.append(cleaned_name)
    return ' '.join(cleaned_names)


In [40]:
def get_cleaned_locations(locations):
    cleaned_locations = []
    if isinstance(locations, str):
        if locations != "":
            locations = ast.literal_eval(locations)
    if not isinstance(locations, list):
        locations = []
    for location in locations:
        cleaned_location = desc_cleaning(location)
        cleaned_locations.append(cleaned_location)
    return ' '.join(cleaned_locations)

In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RohanDalvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RohanDalvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
def desc_cleaning(desc):
    desc = desc.lower()
    desc = re.sub(r'[^a-z\s]', '', desc)
    words = word_tokenize(desc)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [11]:
def name_cleaning(name):
    name = name.lower()
    words = str.split(name)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [None]:
def get_cleaned_doc(row):
    new_name = name_cleaning(row['title'])
    new_desc = desc_cleaning(row['description'])
    new_stars = get_cleaned_name_string(row['stars'])
    new_directors = get_cleaned_name_string(row['directors'])
    new_genres = get_cleaned_locations(row['genres'])
    new_production = get_cleaned_locations(row['production_companies'])
    new_filming_locations = get_cleaned_locations(row['filming_locations'])
    new_language = get_cleaned_name_string(row['languages'])
    new_countries = get_cleaned_name_string(row['countries_origin'])
    doc_list = [new_name, new_desc, new_stars, new_directors, new_genres, new_production, new_filming_locations, new_language, new_countries]
    return str.join(" ", doc_list)


In [42]:
movie_data_dict = {}

# First iteration to store cleaned document data
for _, row in df.iterrows():
    id = row["id"]
    movie_data_dict[id] = {
        "docs": get_cleaned_doc(row)
    }

# Second iteration to add other movie data
for _, row in df.iterrows():
    id = row["id"]
    if id in movie_data_dict:
        movie_data_dict[id].update({
            "title": row["title"],
            "year": row["year"],
            "duration": row["duration"],
            "MPA": row["MPA"],
            "rating": row["rating"],
            "votes": row["votes"],
            "meta_score": row["meta_score"],
            "description": row["description"],
            "Movie_Link": row["Movie_Link"],
            "writers": parse_list(row["writers"]),
            "directors": parse_list(row["directors"]),
            "stars": parse_list(row["stars"]),
            "budget": row["budget"],
            "opening_weekend_gross": row["opening_weekend_gross"],
            "gross_worldwide": row["gross_worldwide"],
            "gross_us_canada": row["gross_us_canada"],
            "release_date": row["release_date"],
            "countries_origin": parse_list(row["countries_origin"]),
            "filming_locations": parse_list(row["filming_locations"]),
            "production_companies": parse_list(row["production_companies"]),
            "awards_content": parse_list(row["awards_content"]),
            "genres": parse_list(row["genres"]),
            "languages": parse_list(row["languages"])
        })

# Print first 5 movie IDs and their data
first_5_keys = list(movie_data_dict.keys())[:5]
for key in first_5_keys:
    print(f"Movie ID: {key}, Data: {movie_data_dict[key]}")


Movie ID: tt0073195, Data: {'docs': 'jaws massive killer shark unleashes chaos beach community long island local sheriff marine biologist old seafarer hunt beast royscheider robertshaw richarddreyfuss lorrainegary murrayhamilton carlgottlieb jeffreykramer susanbacklinie jonathanfilley tedgrossman stevenspielberg monsterhorror seaadventure survival adventure drama horror thriller zanuckbrownproductions universalpictures water street edgartown marthas vineyard massachusetts usa english unitedstates', 'title': 'Jaws', 'year': 1975, 'duration': 124, 'MPA': 'PG', 'rating': 8.1, 'votes': '690K', 'meta_score': 87.0, 'description': "When a massive killer shark unleashes chaos on a beach community off Long Island, it's up to a local sheriff, a marine biologist, and an old seafarer to hunt the beast down.", 'Movie_Link': 'https://www.imdb.com/title/tt0073195', 'writers': ['Peter Benchley', 'Carl Gottlieb'], 'directors': ['Steven Spielberg'], 'stars': ['Roy Scheider', 'Robert Shaw', 'Richard Drey

In [44]:
df_cleaned = pd.DataFrame.from_dict(movie_data_dict, orient="index")
df_cleaned.to_csv("./../cleaned_database/cleaned_final_dataset3.csv", index_label="id")
print("Dictionary saved to ./../cleaned_database/cleaned_final_dataset3.csv")


Dictionary saved to ./../cleaned_database/cleaned_final_dataset3.csv
