In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import os 
import json
import requests

In [2]:
# Path to the raw input file, relative to this notebook
apple_data_csv = Path("../Input_csv/apple_data.csv")

# Read the CSV into a DataFrame and, in the same chain, replace the
# camelCase source headers with human-readable column labels
apple_df = pd.read_csv(apple_data_csv).rename(
    columns={
        'title': 'Title',
        'type': 'Type',
        'genres': 'Genres',
        'releaseYear': 'Release Year',
        'imdbId': 'IMDB ID',
        'imdbAverageRating': 'IMDB Average Rating',
        'imdbNumVotes': 'IMDB Votes',
        'availableCountries': 'Available Countries',
    }
)
apple_df.head()


Unnamed: 0,Title,Type,Genres,Release Year,IMDB ID,IMDB Average Rating,IMDB Votes,Available Countries
0,Four Rooms,movie,Comedy,1995.0,tt0113101,6.7,113097.0,DE
1,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2338524.0,"AU, CA, US"
2,American Beauty,movie,Drama,1999.0,tt0169547,8.3,1235685.0,"AT, CH, DE, FR, GB, IE, IT"
3,Citizen Kane,movie,"Drama, Mystery",1941.0,tt0033467,8.3,476316.0,"DE, GB"
4,Metropolis,movie,"Drama, Sci-Fi",1927.0,tt0017136,8.3,191750.0,GB


In [3]:
# Create a new DataFrame with only the columns needed for the analysis.
# The explicit .copy() gives apple_df its own data, so the column assignment
# below is not made on a slice of the wider frame (avoids SettingWithCopyWarning).
apple_df = apple_df[['Title', 'Type', 'Genres', 'Release Year', 'IMDB Average Rating', 'Available Countries']].copy()

# Render the year as text without its float suffix ("1995.0" -> "1995").
# astype(str) turns a missing year into the literal text "nan", which
# isnull()/dropna() no longer recognise as missing. Mask those entries back
# to NaN using the null mask taken before the conversion.
release_year = apple_df['Release Year']
apple_df['Release Year'] = release_year.astype(str).str.split('.').str[0].where(release_year.notna())
apple_df.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,Four Rooms,movie,Comedy,1995,6.7,DE
1,Forrest Gump,movie,"Drama, Romance",1994,8.8,"AU, CA, US"
2,American Beauty,movie,Drama,1999,8.3,"AT, CH, DE, FR, GB, IE, IT"
3,Citizen Kane,movie,"Drama, Mystery",1941,8.3,"DE, GB"
4,Metropolis,movie,"Drama, Sci-Fi",1927,8.3,GB


In [4]:
# Build the cell-level missing-value mask once and reuse it for both reports
null_mask = apple_df.isnull()

# Number of missing values in each column
print("NaN count per column:")
print(null_mask.sum())

# Every row that has at least one missing value
rows_with_nan = apple_df.loc[null_mask.any(axis=1)]
print("\nRows with NaN values:")
print(rows_with_nan)

NaN count per column:
Title                   560
Type                      0
Genres                  655
Release Year              0
IMDB Average Rating    1882
Available Countries       0
dtype: int64

Rows with NaN values:
                                 Title   Type                        Genres  \
1973                       Anthappuram  movie                           NaN   
2923                       Siste trikk  movie                           NaN   
2990             Breathe without Water  movie                           NaN   
2996              El Arte De La Guerra  movie                           NaN   
3006                        The Oracle  movie                 Comedy, Drama   
...                                ...    ...                           ...   
17923                              NaN     tv                           NaN   
17926                              NaN     tv  Comedy, Drama, Reality, Talk   
17927  The Family Business New Orleans     tv          Action, 

In [5]:
# Clean the data: discard every row that has a missing value in any column.
# The arguments spell out dropna's defaults (row-wise, drop on any NaN).
apple_df_dropna = apple_df.dropna(axis='index', how='any')
apple_df_dropna.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,Four Rooms,movie,Comedy,1995,6.7,DE
1,Forrest Gump,movie,"Drama, Romance",1994,8.8,"AU, CA, US"
2,American Beauty,movie,Drama,1999,8.3,"AT, CH, DE, FR, GB, IE, IT"
3,Citizen Kane,movie,"Drama, Mystery",1941,8.3,"DE, GB"
4,Metropolis,movie,"Drama, Sci-Fi",1927,8.3,GB


In [6]:
# Re-apply the analysis column selection to the NaN-free frame.
# .copy() makes apple_df_dropna an independent DataFrame, so the assignment
# below is not performed on a slice of another frame (avoids SettingWithCopyWarning).
apple_df_dropna = apple_df_dropna[['Title', 'Type', 'Genres', 'Release Year', 'IMDB Average Rating', 'Available Countries']].copy()

# Normalise 'Release Year' to text without a float suffix ("1995.0" -> "1995"),
# done in one assignment instead of two passes over the column.
apple_df_dropna['Release Year'] = apple_df_dropna['Release Year'].astype(str).str.split('.').str[0]

apple_df_dropna.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,Four Rooms,movie,Comedy,1995,6.7,DE
1,Forrest Gump,movie,"Drama, Romance",1994,8.8,"AU, CA, US"
2,American Beauty,movie,Drama,1999,8.3,"AT, CH, DE, FR, GB, IE, IT"
3,Citizen Kane,movie,"Drama, Mystery",1941,8.3,"DE, GB"
4,Metropolis,movie,"Drama, Sci-Fi",1927,8.3,GB


In [7]:
# Save the cleaned data as a new CSV file (row index omitted).
# to_csv does not create missing folders, so make sure the output directory
# exists first; this keeps a fresh checkout runnable from top to bottom.
apple_cleaned_csv = Path("../Output_csv/apple_cleaned.csv")
apple_cleaned_csv.parent.mkdir(parents=True, exist_ok=True)
apple_df_dropna.to_csv(apple_cleaned_csv, index=False)