In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import os 
import json
import requests

In [2]:
# Location of the raw Netflix export, relative to this notebook
netflix_data_csv = Path("../Input_csv/Netflix_data.csv")

# Map the raw camelCase CSV headers onto display-friendly column names
column_labels = {
    'title': 'Title',
    'type': 'Type',
    'genres': 'Genres',
    'releaseYear': 'Release Year',
    'imdbId': 'IMDB ID',
    'imdbAverageRating': 'IMDB Average Rating',
    'imdbNumVotes': 'IMDB Votes',
    'availableCountries': 'Available Countries',
}

# Read the CSV into a DataFrame and apply the friendlier headers in one chain
netflix_df = pd.read_csv(netflix_data_csv).rename(columns=column_labels)
netflix_df.head()


Unnamed: 0,Title,Type,Genres,Release Year,IMDB ID,IMDB Average Rating,IMDB Votes,Available Countries
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,tt0119116,7.6,520154.0,"AT, CH, DE"
1,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003.0,tt0266697,8.2,1231202.0,"AE, AL, AO, AT, AU, AZ, BG, BH, BY, CI, CM, CZ..."
2,Jarhead,movie,"Biography, Drama, War",2005.0,tt0418763,7.0,213096.0,"AD, AE, AG, AO, BH, BM, BS, BZ, CI, CM, CO, CR..."
3,Unforgiven,movie,"Drama, Western",1992.0,tt0105695,8.2,447439.0,"AU, BA, BG, CZ, HR, HU, MD, ME, MK, NZ, PL, RO..."
4,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004.0,tt0338013,8.3,1116956.0,"AD, AE, AG, AL, AO, AR, AU, AZ, BA, BB, BE, BG..."


In [3]:
# Keep only the columns needed for the analysis. `.copy()` makes the result an
# independent frame, so the column assignment below never writes to a slice of
# the frame it was selected from.
netflix_df = netflix_df[['Title', 'Type', 'Genres', 'Release Year', 'IMDB Average Rating', 'Available Countries']].copy()

# Render the year without its decimal part (e.g. 1997.0 -> "1997").
# astype(str) turns a missing year into the literal text "nan", which
# isnull()/dropna() no longer recognise as missing. Masking with the original
# not-null flags puts NaN back on those rows so later cleaning can see them.
release_year = netflix_df['Release Year']
year_text = release_year.astype(str).str.split('.').str[0]
netflix_df['Release Year'] = year_text.where(release_year.notna())
netflix_df.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997,7.6,"AT, CH, DE"
1,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003,8.2,"AE, AL, AO, AT, AU, AZ, BG, BH, BY, CI, CM, CZ..."
2,Jarhead,movie,"Biography, Drama, War",2005,7.0,"AD, AE, AG, AO, BH, BM, BS, BZ, CI, CM, CO, CR..."
3,Unforgiven,movie,"Drama, Western",1992,8.2,"AU, BA, BG, CZ, HR, HU, MD, ME, MK, NZ, PL, RO..."
4,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004,8.3,"AD, AE, AG, AL, AO, AR, AU, AZ, BA, BB, BE, BG..."


In [4]:
# Tally the missing values in each column
nan_counts = netflix_df.isna().sum()
print("NaN count per column:")
print(nan_counts)

# Flag every row holding at least one missing value, then pull those rows out
has_missing_value = netflix_df.isna().any(axis=1)
rows_with_nan = netflix_df[has_missing_value]
print("\nRows with NaN values:")
print(rows_with_nan)

NaN count per column:
Title                   616
Type                      0
Genres                  331
Release Year              0
IMDB Average Rating    1661
Available Countries       0
dtype: int64

Rows with NaN values:
                                                   Title   Type  \
2208                           Go! Anpanman: Ruby's Wish  movie   
2404                                        Thunder Monk  movie   
2508                               Breathe without Water  movie   
2520                                El Arte De La Guerra  movie   
2542   I Love You, East Garrison (A Video Essay by Ha...  movie   
...                                                  ...    ...   
20595                                                NaN     tv   
20596                                                NaN     tv   
20598                                                NaN     tv   
20599                                                NaN     tv   
20600                                

In [5]:
# Clean the data by discarding every row that contains at least one NaN
netflix_df_dropna = netflix_df.dropna(axis=0, how="any")
netflix_df_dropna.head(5)

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997,7.6,"AT, CH, DE"
1,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003,8.2,"AE, AL, AO, AT, AU, AZ, BG, BH, BY, CI, CM, CZ..."
2,Jarhead,movie,"Biography, Drama, War",2005,7.0,"AD, AE, AG, AO, BH, BM, BS, BZ, CI, CM, CO, CR..."
3,Unforgiven,movie,"Drama, Western",1992,8.2,"AU, BA, BG, CZ, HR, HU, MD, ME, MK, NZ, PL, RO..."
4,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004,8.3,"AD, AE, AG, AL, AO, AR, AU, AZ, BA, BB, BE, BG..."


In [6]:
# Re-select the analysis columns from the NaN-free frame and store the release
# year as text with any decimal part removed. The result is built in a single
# expression with .assign(), which returns a new frame, instead of assigning a
# column onto a frame obtained by column selection (a pattern pandas may flag
# with SettingWithCopyWarning).
analysis_columns = ['Title', 'Type', 'Genres', 'Release Year', 'IMDB Average Rating', 'Available Countries']
release_year_text = netflix_df_dropna['Release Year'].astype(str).str.split('.').str[0]
netflix_df_dropna = netflix_df_dropna[analysis_columns].assign(**{'Release Year': release_year_text})

netflix_df_dropna.head()


Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997,7.6,"AT, CH, DE"
1,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003,8.2,"AE, AL, AO, AT, AU, AZ, BG, BH, BY, CI, CM, CZ..."
2,Jarhead,movie,"Biography, Drama, War",2005,7.0,"AD, AE, AG, AO, BH, BM, BS, BZ, CI, CM, CO, CR..."
3,Unforgiven,movie,"Drama, Western",1992,8.2,"AU, BA, BG, CZ, HR, HU, MD, ME, MK, NZ, PL, RO..."
4,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004,8.3,"AD, AE, AG, AL, AO, AR, AU, AZ, BA, BB, BE, BG..."


In [7]:
# Save the cleaned data as a new CSV file.
cleaned_csv_path = Path("../Output_csv/netflix_cleaned.csv")

# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame index out of the written file.
netflix_df_dropna.to_csv(cleaned_csv_path, index=False)