# Movie Dataset Preparation

# Setting Up Necessary Things

In [1]:
# Jupyter Notebook Magic Command - Auto Reloading
%reload_ext autoreload
%autoreload 2

# Jupyter Notebook Magic Command - Inline Plotting
%matplotlib inline

In [2]:
# Ignore All Warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including any raised by pandas in later cells.
import warnings
warnings.filterwarnings("ignore")

# Necessary Imports

In [55]:
# General
import os
from IPython.display import display

# Data
import pandas as pd

# String
import ast

# Data Cleaning

In [33]:
# Get All CSV Files Path
def get_csv_files_path(folder):
    """Map each CSV file directly inside ``folder`` to its path.

    Args:
        folder: Directory to scan (sub-directories are not searched). A
            trailing path separator is optional.

    Returns:
        dict: ``{name: path}`` where ``name`` is the file name with its
        trailing ``.csv`` removed and ``path`` is ``folder`` joined with the
        file name. Entries are ordered alphabetically by file name.
    """
    suffix = ".csv"
    file_details = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the mapping
    # (and anything built by iterating over it) identical on every run.
    for file in sorted(os.listdir(folder)):
        if not file.endswith(suffix):
            continue
        # Strip only the trailing extension, not every ".csv" in the name.
        name = file[:-len(suffix)]
        # os.path.join adds a separator only when `folder` does not end in one.
        file_details[name] = os.path.join(folder, file)

    return file_details

In [34]:
# Load a Raw CSV File Into a Cleaned DataFrame
def get_dataframe(path):
    """Read one raw movie CSV and return it cleaned.

    The metadata columns listed below are dropped, then every row that still
    has a missing value is removed, then rows repeating an earlier
    ``movie_name`` and, after that, rows repeating an earlier ``description``
    are removed (the first occurrence is kept). The index is not reset.
    """
    unwanted_columns = [
        "movie_id", "year", "certificate", "runtime", "rating", "director",
        "director_id", "star", "star_id", "votes", "gross(in $)",
    ]
    return (
        pd.read_csv(path)
        .drop(unwanted_columns, axis=1)
        .dropna(axis=0, how="any")
        .drop_duplicates(subset="movie_name", keep="first")
        .drop_duplicates(subset="description", keep="first")
    )

In [35]:
# Get DataFrame Information
def get_dataframe_info(df):
    """Print and display a quick quality summary of a movie dataframe.

    Shows, in order: the shape, the per-column null counts, the number of
    repeated ``movie_name`` and ``description`` values, and the ``describe``
    summary of the object-typed columns. Returns nothing.
    """
    print("Shape of the dataframe: ", df.shape)

    print("Null Values:")
    null_counts = df.isna().sum()
    display(null_counts)

    duplicate_names = df["movie_name"].duplicated().sum()
    print("Duplicate Movie Name Count: ", duplicate_names)
    duplicate_descriptions = df["description"].duplicated().sum()
    print("Duplicate Description Count: ", duplicate_descriptions)

    print("DataFrame Details:")
    object_summary = df.describe(include="object")
    display(object_summary)

In [42]:
# Get All DataFrames
def get_all_dataframes(folder="../../data/raw/movie/"):
    """Load and clean every CSV file found in ``folder``.

    Args:
        folder: Directory holding the raw CSV files. Defaults to the raw
            movie data directory, so existing no-argument calls behave as
            before.

    Returns:
        dict: cleaned DataFrames (as produced by ``get_dataframe``) keyed by
        the names ``get_csv_files_path`` assigns to the files.
    """
    csv_file_paths = get_csv_files_path(folder)
    return {name: get_dataframe(path) for name, path in csv_file_paths.items()}

# Load every raw CSV once; the result is a dict of cleaned DataFrames keyed by CSV name
all_movie_df = get_all_dataframes()

In [44]:
# Get Details of Action Movies
get_dataframe_info(all_movie_df["action"])

Shape of the dataframe:  (36637, 3)
Null Values:


movie_name     0
genre          0
description    0
dtype: int64

Duplicate Movie Name Count:  0
Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,movie_name,genre,description
count,36637,36637,36637
unique,36637,358,36637
top,Black Panther: Wakanda Forever,Action,The people of Wakanda fight to protect their h...
freq,1,7078,1


## Combine All The DataFrames

In [47]:
# Get Combined Dataframes
def get_combined_dataframes(df_dict):
    """Stack every DataFrame in ``df_dict`` row-wise into one DataFrame.

    Frames are concatenated in the mapping's iteration order; the keys are
    not used and each frame keeps its own index labels.
    """
    frames = [frame for frame in df_dict.values()]
    return pd.concat(frames, axis=0)

# Stack all per-file dataframes into a single dataframe
movie_full_df = get_combined_dataframes(all_movie_df)

In [50]:
# Clean Dataframe
def clean_dataframe(df):
    """Return ``df`` without rows that repeat an earlier name or description.

    Duplicates are removed on ``movie_name`` first and then on
    ``description``; the first occurrence is kept each time. The input frame
    is not modified and the index is not reset.
    """
    deduplicated = df
    for key_column in ("movie_name", "description"):
        deduplicated = deduplicated.drop_duplicates(subset=key_column, keep="first")
    return deduplicated

In [51]:
# Each per-file dataframe was de-duplicated on its own; repeat the de-duplication
# on the combined dataframe to drop rows that appear in more than one file
movie_df = clean_dataframe(movie_full_df)

In [64]:
# Preview the first rows of the combined, de-duplicated dataframe
movie_df.head()

Unnamed: 0,movie_name,genre,description
0,Black Panther: Wakanda Forever,"Action, Adventure, Drama",The people of Wakanda fight to protect their h...
1,Avatar: The Way of Water,"Action, Adventure, Fantasy",Jake Sully lives with his newfound family form...
2,Plane,"Action, Thriller",A pilot finds himself caught in a war zone aft...
3,Everything Everywhere All at Once,"Action, Adventure, Comedy",A middle-aged Chinese immigrant is swept up in...
4,Fast X,"Action, Crime, Mystery",Dom Toretto and his family are targeted by the...


In [53]:
# Re-check shape, null counts and duplicate counts on the combined dataframe
get_dataframe_info(movie_df)

Shape of the dataframe:  (164378, 3)
Null Values:


movie_name     0
genre          0
description    0
dtype: int64

Duplicate Movie Name Count:  0
Duplicate Description Count:  0
DataFrame Details:


Unnamed: 0,movie_name,genre,description
count,164378,164378,164378
unique,164378,1590,164378
top,Black Panther: Wakanda Forever,Horror,The people of Wakanda fight to protect their h...
freq,1,9890,1


## Primary Genre and Type

In [76]:
# Create Primary Genre Column From Genre
# "genre" holds a comma-separated list; the primary genre is the text before
# the first comma. The vectorized .str accessor replaces a per-row .iloc loop,
# and .tolist() keeps `primary_genre` a plain list as before.
primary_genre = movie_df["genre"].str.split(",", n=1).str[0].tolist()

len(primary_genre), primary_genre[:5]

(164378, ['Action', 'Action', 'Action', 'Action', 'Action'])

In [77]:
# Insert the primary genre column into the dataframe
movie_df["primary_genre"] = primary_genre

In [80]:
# Tag every row with the entertainment type "movie"
movie_df["type"] = "movie"

In [82]:
# Rename columns in place: "movie_name" -> "title", "genre" -> "genres".
# errors='raise' makes this fail with a KeyError if either source column is
# missing, e.g. when this cell is run a second time on the same dataframe.
movie_df.rename(columns={"movie_name": "title",
                   "genre": "genres"},
          inplace=True, errors='raise')

In [83]:
# Preview the dataframe with the renamed and newly added columns
movie_df.head()

Unnamed: 0,title,genres,description,primary_genre,type
0,Black Panther: Wakanda Forever,"Action, Adventure, Drama",The people of Wakanda fight to protect their h...,Action,movie
1,Avatar: The Way of Water,"Action, Adventure, Fantasy",Jake Sully lives with his newfound family form...,Action,movie
2,Plane,"Action, Thriller",A pilot finds himself caught in a war zone aft...,Action,movie
3,Everything Everywhere All at Once,"Action, Adventure, Comedy",A middle-aged Chinese immigrant is swept up in...,Action,movie
4,Fast X,"Action, Crime, Mystery",Dom Toretto and his family are targeted by the...,Action,movie


# Dataframe to CSV

In [92]:
# Save a dataframe as a CSV file (skipped when the file already exists)
def write_dataframe_to_csv(path, dataframe):
    """Save ``dataframe`` to ``path`` as CSV unless a file is already there.

    Args:
        path: Destination file path. Missing parent directories are created.
        dataframe: DataFrame to write; its index is not written.

    An existing file at ``path`` is never overwritten; a message is printed
    instead. Returns nothing.
    """
    if os.path.exists(path):
        print(f"The file already exists ...! [Find the file in the location '{path}']")
        return

    # to_csv does not create directories, so make sure the target one exists.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    dataframe.to_csv(path, index = False)
    print("Dataframe saved successfully: ", path)

In [94]:
# Destination of the processed movie dataset (a relative path)
movie_path = "../../data/processed/movie/movie.csv"

# Write the final dataframe; nothing is written if the file already exists
write_dataframe_to_csv(movie_path, movie_df)

Dataframe saved successfully:  ../../data/processed/movie/movie.csv
