In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import os 
import json
import requests

In [2]:
# Load the data
hbo_data_csv = Path("../csv_folder/HBO_data.csv")

# Load the data into a DataFrame
hbo_df = pd.read_csv(hbo_data_csv)

# Rename columns 
hbo_df = hbo_df.rename(columns={'title':'Title', 'type':'Type', 'genres':'Genres', 'releaseYear':'Release Year', 
                                    'imdbId':'IMDB ID', 'imdbAverageRating':'IMDB Average Rating', 
                                    'imdbNumVotes':'IMDB Votes', 'availableCountries':'Available Countries'})
hbo_df.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB ID,IMDB Average Rating,IMDB Votes,Available Countries
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,tt0119116,7.6,519964.0,"DK, FI, NO, SE"
1,Jarhead,movie,"Biography, Drama, War",2005.0,tt0418763,7.0,213029.0,"HK, ID, MY, PH, SG, TH, TW"
2,Unforgiven,movie,"Drama, Western",1992.0,tt0105695,8.2,447253.0,"AG, AR, BB, BE, BO, BR, BS, BZ, CL, CO, CR, DO..."
3,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004.0,tt0338013,8.3,1116278.0,"AG, AR, BB, BO, BS, BZ, CL, CO, CR, DO, EC, GT..."
4,A History of Violence,movie,"Action, Crime, Drama",2005.0,tt0399146,7.4,260655.0,"AG, AR, BB, BO, BR, BS, BZ, CL, CO, CR, DO, EC..."


In [3]:
# Create a new DataFrame with the columns needed for the analysis. Drop columns. 
hbo_df = hbo_df[['Title', 'Type', 'Genres', 'Release Year', 'IMDB Average Rating', 'Available Countries']]

# Convert 'Release Year' to only the year and remove the decimal
hbo_df['Release Year'] = hbo_df['Release Year'].astype(str)
hbo_df['Release Year'] = hbo_df['Release Year'].str.split('.').str[0]
hbo_df.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997,7.6,"DK, FI, NO, SE"
1,Jarhead,movie,"Biography, Drama, War",2005,7.0,"HK, ID, MY, PH, SG, TH, TW"
2,Unforgiven,movie,"Drama, Western",1992,8.2,"AG, AR, BB, BE, BO, BR, BS, BZ, CL, CO, CR, DO..."
3,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004,8.3,"AG, AR, BB, BO, BS, BZ, CL, CO, CR, DO, EC, GT..."
4,A History of Violence,movie,"Action, Crime, Drama",2005,7.4,"AG, AR, BB, BO, BR, BS, BZ, CL, CO, CR, DO, EC..."


In [4]:
# Check for NaN values
print("NaN count per column:")
print(hbo_df.isnull().sum())

# Get rows with NaN
rows_with_nan = hbo_df[hbo_df.isnull().any(axis=1)]
print("\nRows with NaN values:")
print(rows_with_nan)

NaN count per column:
Title                  605
Type                     0
Genres                 285
Release Year             0
IMDB Average Rating    939
Available Countries      0
dtype: int64

Rows with NaN values:
                                    Title   Type                Genres  \
1483                          Siste trikk  movie                   NaN   
1531                Breathe without Water  movie                   NaN   
1595                                    X  movie      Thriller, Horror   
2455                    Bita e os Animais  movie             Animation   
2462              Hello Ladies: The Movie  movie      Comedy, TV Movie   
...                                   ...    ...                   ...   
9419                                  NaN     tv                 Crime   
9421                                  NaN     tv           Documentary   
9422                                  NaN     tv                   NaN   
9423  Tony Robinson's Marvellous Machine

In [5]:
#drop the nan values to clean data
hbo_df_dropna = hbo_df.dropna()
hbo_df_dropna.head()

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997,7.6,"DK, FI, NO, SE"
1,Jarhead,movie,"Biography, Drama, War",2005,7.0,"HK, ID, MY, PH, SG, TH, TW"
2,Unforgiven,movie,"Drama, Western",1992,8.2,"AG, AR, BB, BE, BO, BR, BS, BZ, CL, CO, CR, DO..."
3,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004,8.3,"AG, AR, BB, BO, BS, BZ, CL, CO, CR, DO, EC, GT..."
4,A History of Violence,movie,"Action, Crime, Drama",2005,7.4,"AG, AR, BB, BO, BR, BS, BZ, CL, CO, CR, DO, EC..."


In [6]:
hbo_df_dropna = hbo_df_dropna[['Title', 'Type', 'Genres', 'Release Year', 'IMDB Average Rating', 'Available Countries']]

hbo_df_dropna['Release Year'] = hbo_df_dropna['Release Year'].astype(str)
hbo_df_dropna['Release Year'] = hbo_df_dropna['Release Year'].str.split('.').str[0]

hbo_df_dropna.head()   

Unnamed: 0,Title,Type,Genres,Release Year,IMDB Average Rating,Available Countries
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997,7.6,"DK, FI, NO, SE"
1,Jarhead,movie,"Biography, Drama, War",2005,7.0,"HK, ID, MY, PH, SG, TH, TW"
2,Unforgiven,movie,"Drama, Western",1992,8.2,"AG, AR, BB, BE, BO, BR, BS, BZ, CL, CO, CR, DO..."
3,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004,8.3,"AG, AR, BB, BO, BS, BZ, CL, CO, CR, DO, EC, GT..."
4,A History of Violence,movie,"Action, Crime, Drama",2005,7.4,"AG, AR, BB, BO, BR, BS, BZ, CL, CO, CR, DO, EC..."


In [7]:
# Save this as a new csv file
hbo_df.to_csv("../hbo_data/hbo_cleaned.csv", index=False)