In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import os 
import json
import requests

In [2]:
# Load the data
amazon_prime_data_csv = Path("../Input_csv/amazon_prime_data.csv")

# Load the data into a DataFrame
amazon_prime_df = pd.read_csv(amazon_prime_data_csv)

# Rename columns 
amazon_prime_df = amazon_prime_df.rename(columns={'title':'Title', 'type':'Type', 'genres':'Genres', 'releaseYear':'Release Year', 
                                    'imdbId':'IMDB ID', 'imdbAverageRating':'IMDB Average Rating', 
                                    'imdbNumVotes':'IMDB Votes', 'availableCountries':'Available Countries'})
amazon_prime_df.head()


Unnamed: 0,Title,Type,Genres,Release Year,IMDB ID,IMDB Average Rating,IMDB Votes,Available Countries
0,Blondie,movie,"Comedy, Family",1938.0,tt0029927,6.9,891.0,ZA
1,Four Rooms,movie,Comedy,1995.0,tt0113101,6.7,113117.0,"AT, DE"
2,Judgment Night,movie,"Action, Crime, Drama",1993.0,tt0107286,6.6,19462.0,"AR, BO, BR, CL, CO, CR, EC, GT, HN, MX, PA, PE..."
3,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2338879.0,"AD, CU, FR, GB, GF, GG, GI, IE, IN, JP, MC, PF..."
4,Citizen Kane,movie,"Drama, Mystery",1941.0,tt0033467,8.3,476390.0,"AD, CA, ES, IN, JP"


In [3]:
# Create a new DataFrame with the columns needed for the analysis. Drop columns. 
amazon_prime_df = amazon_prime_df[['Title', 'Type', 'Genres', 'Release Year', 'IMDB Average Rating', 'Available Countries']]

amazon_prime_df['Release Year'] = amazon_prime_df['Release Year'].astype(str)
amazon_prime_df['Release Year'] = amazon_prime_df['Release Year'].str.split('.').str[0]
amazon_prime_df.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,Blondie,movie,"Comedy, Family",1938,6.9,ZA
1,Four Rooms,movie,Comedy,1995,6.7,"AT, DE"
2,Judgment Night,movie,"Action, Crime, Drama",1993,6.6,"AR, BO, BR, CL, CO, CR, EC, GT, HN, MX, PA, PE..."
3,Forrest Gump,movie,"Drama, Romance",1994,8.8,"AD, CU, FR, GB, GF, GG, GI, IE, IN, JP, MC, PF..."
4,Citizen Kane,movie,"Drama, Mystery",1941,8.3,"AD, CA, ES, IN, JP"


In [4]:
# Check for NaN values
print("NaN count per column:")
print(amazon_prime_df.isnull().sum())

# Get rows with NaN
rows_with_nan = amazon_prime_df[amazon_prime_df.isnull().any(axis=1)]
print("\nRows with NaN values:")
print(rows_with_nan)

NaN count per column:
Title                  1672
Type                      0
Genres                 2080
Release Year              0
IMDB Average Rating    8639
Available Countries       0
dtype: int64

Rows with NaN values:
                                       Title   Type         Genres  \
1293                                  Rounds  movie          Short   
3230                                   Turbo  movie    Documentary   
3231                                Realtime  movie    Documentary   
3812   Crusty Demons 14: A Bloodthirsty Saga  movie         Action   
4270        The Wiggles: Live: Hot Potatoes!  movie  Music, Family   
...                                      ...    ...            ...   
69391                                    NaN     tv    Documentary   
69392                         Inside Oranje.     tv    Documentary   
69394                                    NaN     tv            NaN   
69395                                    NaN     tv            NaN   
6939

In [5]:
#drop the nan values to clean data
amazon_prime_df_dropna = amazon_prime_df.dropna()
amazon_prime_df_dropna.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,Blondie,movie,"Comedy, Family",1938,6.9,ZA
1,Four Rooms,movie,Comedy,1995,6.7,"AT, DE"
2,Judgment Night,movie,"Action, Crime, Drama",1993,6.6,"AR, BO, BR, CL, CO, CR, EC, GT, HN, MX, PA, PE..."
3,Forrest Gump,movie,"Drama, Romance",1994,8.8,"AD, CU, FR, GB, GF, GG, GI, IE, IN, JP, MC, PF..."
4,Citizen Kane,movie,"Drama, Mystery",1941,8.3,"AD, CA, ES, IN, JP"


In [6]:
amazon_prime_df_dropna = amazon_prime_df_dropna[['Title', 'Type', 'Genres', 'Release Year', 'IMDB Average Rating', 'Available Countries']]

amazon_prime_df_dropna['Release Year'] = amazon_prime_df_dropna['Release Year'].astype(str)
amazon_prime_df_dropna['Release Year'] = amazon_prime_df_dropna['Release Year'].str.split('.').str[0]

amazon_prime_df_dropna.head()


Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,Blondie,movie,"Comedy, Family",1938,6.9,ZA
1,Four Rooms,movie,Comedy,1995,6.7,"AT, DE"
2,Judgment Night,movie,"Action, Crime, Drama",1993,6.6,"AR, BO, BR, CL, CO, CR, EC, GT, HN, MX, PA, PE..."
3,Forrest Gump,movie,"Drama, Romance",1994,8.8,"AD, CU, FR, GB, GF, GG, GI, IE, IN, JP, MC, PF..."
4,Citizen Kane,movie,"Drama, Mystery",1941,8.3,"AD, CA, ES, IN, JP"


In [7]:
# save as new CSV file
amazon_prime_df_dropna.to_csv("../Output_csv/amazon_cleaned.csv", index=False)