In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import os 
import json
import requests

In [2]:
# Load the data
hulu_data_csv = Path("../Input_csv/hulu_data.csv")

# Load the data into a DataFrame
hulu_df = pd.read_csv(hulu_data_csv)

# Rename columns 
hulu_df = hulu_df.rename(columns={'title':'Title', 'type':'Type', 'genres':'Genres', 'releaseYear':'Release Year', 
                                    'imdbId':'IMDB ID', 'imdbAverageRating':'IMDB Average Rating', 
                                    'imdbNumVotes':'IMDB Votes', 'availableCountries':'Available Countries'})
hulu_df.head()


Unnamed: 0,Title,Type,Genres,Release Year,IMDB ID,IMDB Average Rating,IMDB Votes,Available Countries
0,Ariel,movie,"Comedy, Crime, Romance",1988.0,tt0094675,7.4,8874.0,JP
1,Shadows in Paradise,movie,"Comedy, Drama, Music",1986.0,tt0092149,7.5,7657.0,JP
2,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2338524.0,JP
3,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,tt0119116,7.6,520103.0,JP
4,My Life Without Me,movie,"Drama, Romance",2003.0,tt0314412,7.4,26108.0,JP


In [3]:
# Create a new DataFrame with the columns needed for the analysis. Drop columns. 
hulu_df = hulu_df[['Title', 'Type', 'Genres', 'Release Year', 'IMDB Average Rating', 'Available Countries']]

hulu_df['Release Year'] = hulu_df['Release Year'].astype(str)
hulu_df['Release Year'] = hulu_df['Release Year'].str.split('.').str[0]
hulu_df.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,Ariel,movie,"Comedy, Crime, Romance",1988,7.4,JP
1,Shadows in Paradise,movie,"Comedy, Drama, Music",1986,7.5,JP
2,Forrest Gump,movie,"Drama, Romance",1994,8.8,JP
3,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997,7.6,JP
4,My Life Without Me,movie,"Drama, Romance",2003,7.4,JP


In [4]:
# Check for NaN values
print("NaN count per column:")
print(hulu_df.isnull().sum())

# Get rows with NaN
rows_with_nan = hulu_df[hulu_df.isnull().any(axis=1)]
print("\nRows with NaN values:")
print(rows_with_nan)

NaN count per column:
Title                   669
Type                      0
Genres                  346
Release Year              0
IMDB Average Rating    1381
Available Countries       0
dtype: int64

Rows with NaN values:
                                        Title   Type  \
859                 Go! Anpanman: Ruby's Wish  movie   
947                              Kokumon Gate  movie   
951                                 Kiss Cam!  movie   
952                                     Genge  movie   
953    Stolen Identity the Final Hacking Game  movie   
...                                       ...    ...   
10153                                     NaN     tv   
10154                                     NaN     tv   
10155                                     NaN     tv   
10158                                     NaN     tv   
10159                                     NaN     tv   

                             Genres Release Year  IMDB Average Rating  \
859    Adventure, Animation,

In [5]:
#drop the nan values to clean data
hulu_df_dropna = hulu_df.dropna()
hulu_df_dropna.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,Ariel,movie,"Comedy, Crime, Romance",1988,7.4,JP
1,Shadows in Paradise,movie,"Comedy, Drama, Music",1986,7.5,JP
2,Forrest Gump,movie,"Drama, Romance",1994,8.8,JP
3,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997,7.6,JP
4,My Life Without Me,movie,"Drama, Romance",2003,7.4,JP


In [6]:
hulu_df_dropna = hulu_df_dropna[['Title', 'Type', 'Genres', 'Release Year', 'IMDB Average Rating', 'Available Countries']]

hulu_df_dropna['Release Year'] = hulu_df_dropna['Release Year'].astype(str)
hulu_df_dropna['Release Year'] = hulu_df_dropna['Release Year'].str.split('.').str[0]

hulu_df_dropna.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,Ariel,movie,"Comedy, Crime, Romance",1988,7.4,JP
1,Shadows in Paradise,movie,"Comedy, Drama, Music",1986,7.5,JP
2,Forrest Gump,movie,"Drama, Romance",1994,8.8,JP
3,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997,7.6,JP
4,My Life Without Me,movie,"Drama, Romance",2003,7.4,JP


In [7]:
# Save cleaned data to a new CSV file
hulu_df_dropna.to_csv('../Output_csv/hulu_cleaned.csv', index=False)