In [None]:
# Data Importation and Basic Cleaning

This notebook's purpose is to download several files from IMDB's movie dataset and filter them down to the subset of movies requested by the stakeholder. Additionally, basic data cleaning (removal of duplicates, etc.) will be performed.

The following files are to be downloaded and filtered:
* [Akas](https://datasets.imdbws.com/title.akas.tsv.gz)
* [Ratings](https://datasets.imdbws.com/title.ratings.tsv.gz)
* [Basics](https://datasets.imdbws.com/title.basics.tsv.gz)
* [Crew](https://datasets.imdbws.com/title.crew.tsv.gz)
* [Principals](https://datasets.imdbws.com/title.principals.tsv.gz)
* [Names](https://datasets.imdbws.com/name.basics.tsv.gz)

The data dictionary can be found [here](https://www.imdb.com/interfaces/).

Datasets for extraction can be found [here](https://datasets.imdbws.com/)


## Library Importation, Folder Creation, and Function Implementation

This section imports the required libraries (such as pandas), creates any needed folders, and implements any functions that will be useful later on.

In [None]:
#Importing numpy and pandas for basic data manipulation
import numpy as np
import pandas as pd

#Importing os to connect with operating system
import os

In [None]:
#Setting pandas options to max column and row displays
pd.set_option('display.max_columns', None) #Used for displaying columns
pd.set_option('display.max_rows', None) #Used for displaying rows

In [None]:
#Making data folder if one does not already exist
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

## Creating and Cleaning Databases

The following will be applied to all databases:

* Exclude any movie with missing values for genre or runtime
* Include only full-length movies (titleType = "movie").
* Include only fictional movies (not from documentary genre)
* Include only movies that were released 2000 - 2021 (include 2000 and 2021)
* Include only movies that were released in the United States
* Replace all "\N" values with np.nan

### Creating and Cleaning Akas

akas_url: https://datasets.imdbws.com/title.akas.tsv.gz

In [None]:
#Reading the tab-separated title.akas file straight from its URL into a dataframe
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

akas_df = pd.read_csv(filepath_or_buffer=akas_url, delimiter="\t", low_memory=False)
akas_df.head(n=5)

In [None]:
#Tallying rows per region code to see how the United States is abbreviated
#(i.e. "US" versus "USA")
region_counts = akas_df["region"].value_counts()
region_counts

<center>The abbreviation for movies from the USA is "US".</center>

In [None]:
#Keeping only the rows whose region code is "US"
akas_filter = akas_df["region"].eq("US")
akas_df = akas_df.loc[akas_filter]

#Confirming that "US" is the only region left
akas_df["region"].value_counts()

In [None]:
#Swapping every "\N" placeholder string for a real missing value (np.nan)
akas_df = akas_df.replace(to_replace={"\\N": np.nan})
akas_df.head(n=5)

In [None]:
#Counting rows that are exact duplicates of an earlier row
akas_duplicate_mask = akas_df.duplicated()
akas_duplicate_mask.sum()

In [None]:
#Counting missing values in each column
akas_missing_mask = akas_df.isna()
akas_missing_mask.sum()

In [None]:
#Expressing the per-column missing counts as a percentage of all rows
akas_missing_counts = akas_df.isna().sum()
akas_missing_counts / len(akas_df) * 100

### Creating and Cleaning Ratings

ratings_url: https://datasets.imdbws.com/title.ratings.tsv.gz

In [None]:
#Reading the tab-separated title.ratings file straight from its URL into a dataframe
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

ratings_df = pd.read_csv(filepath_or_buffer=ratings_url, delimiter="\t", low_memory=False)
ratings_df.head(n=5)

In [None]:
#Keeping only ratings whose tconst appears among the US-region titleIds in akas_df
#NOTE(review): this keeps ratings for every title with a US akas row; it is not
#restricted to the movies that survive the basics filters applied later - confirm
#whether ratings should also be limited to basics_df
ratings_in_US_filter = ratings_df["tconst"].isin(akas_df["titleId"])
ratings_df = ratings_df.loc[ratings_in_US_filter]

ratings_df.head(n=5)

In [None]:
#Swapping every "\N" placeholder string for a real missing value (np.nan)
ratings_df = ratings_df.replace(to_replace={"\\N": np.nan})
ratings_df.head(n=5)

In [None]:
#Counting rows that are exact duplicates of an earlier row
ratings_duplicate_mask = ratings_df.duplicated()
ratings_duplicate_mask.sum()

In [None]:
#Counting missing values in each column
ratings_missing_mask = ratings_df.isna()
ratings_missing_mask.sum()

### Creating and Cleaning Basics

basics_url: https://datasets.imdbws.com/title.basics.tsv.gz

In [None]:
#Reading the tab-separated title.basics file straight from its URL into a dataframe
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"

basics_df = pd.read_csv(filepath_or_buffer=basics_url, delimiter="\t", low_memory=False)
basics_df.head(n=5)

In [None]:
#Keeping only titles whose tconst appears among the US-region titleIds in akas_df
movies_in_US_filter = basics_df["tconst"].isin(akas_df["titleId"])
basics_df = basics_df.loc[movies_in_US_filter]

basics_df.head(n=5)

In [None]:
#Swapping every "\N" placeholder string for a real missing value (np.nan)
basics_df = basics_df.replace(to_replace={"\\N": np.nan})
basics_df.head(n=5)

In [None]:
#Keeping only the rows whose titleType is exactly "movie"
isMovie = basics_df["titleType"].eq("movie")
basics_df = basics_df.loc[isMovie]

basics_df.head(n=5)

In [None]:
#Printing the summary (dtype, non-null count) of the "genres" column
genres_column = basics_df["genres"]
genres_column.info()

In [None]:
#Flagging titles whose genres string mentions a documentary, in either capitalisation;
#na = False makes rows with missing genres count as "not a documentary" here
is_documentary = basics_df["genres"].str.contains("documentary", na = False)
is_Documentary = basics_df["genres"].str.contains("Documentary", na = False)

#Dropping every row flagged by either check
basics_df = basics_df[~(is_documentary | is_Documentary)]

basics_df.head(n=5)

In [None]:
#Casting startYear to float (not int: "\N" years were replaced with np.nan earlier,
#and NaN cannot be stored in a plain int column).
#.assign returns a new dataframe instead of writing a column into the frame produced
#by the earlier boolean-mask filters, which avoids pandas' SettingWithCopyWarning
basics_df = basics_df.assign(startYear = basics_df["startYear"].astype(float))

#Filters for movies from 2000-2021 inclusive
#(despite the names, the first keeps startYear >= 2000 and the second keeps startYear <= 2021;
#rows with a missing startYear fail both comparisons and are dropped)
isOlderThan2000 = basics_df["startYear"] >= 2000
isYoungerThan2022 = basics_df["startYear"] <= 2021

basics_df = basics_df[isOlderThan2000 & isYoungerThan2022]

#Checking to make sure filters work
basics_df.describe()

In [None]:
#Casting runtimeMinutes to float (not int: missing runtimes are np.nan at this point,
#and NaN cannot be stored in a plain int column).
#.assign returns a new dataframe instead of writing a column into the frame produced
#by the preceding boolean-mask filter, which avoids pandas' SettingWithCopyWarning
basics_df = basics_df.assign(runtimeMinutes = basics_df["runtimeMinutes"].astype(float))

#Removing rows with NA values in runtimeMinutes or genres
basics_df = basics_df.dropna(subset = ["runtimeMinutes", "genres"])

#Preliminary check for missing values
basics_df.isna().sum()

In [None]:
#Counting rows that are exact duplicates of an earlier row
basics_duplicate_mask = basics_df.duplicated()
basics_duplicate_mask.sum()

### Creating and Cleaning Crew

crew_url: https://datasets.imdbws.com/title.crew.tsv.gz

In [None]:
#Reading the tab-separated title.crew file straight from its URL into a dataframe
crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"

crew_df = pd.read_csv(filepath_or_buffer=crew_url, delimiter="\t", low_memory=False)
crew_df.head(n=5)

In [None]:
#Keeping only crew rows whose tconst is still present in the filtered basics_df
movies_in_basics_filter = crew_df["tconst"].isin(basics_df["tconst"])
crew_df = crew_df.loc[movies_in_basics_filter]

crew_df.head(n=5)

In [None]:
#Swapping every "\N" placeholder string for a real missing value (np.nan)
crew_df = crew_df.replace(to_replace={"\\N": np.nan})
crew_df.head(n=5)

#### Creating unique pairs of Directors and Writers

In [None]:
#Turning the comma-separated directors and writers strings into lists,
#stored in two new columns next to the originals
crew_df = crew_df.assign(
    directors_split=crew_df["directors"].str.split(","),
    writers_split=crew_df["writers"].str.split(","),
)
crew_df.head(n=5)

In [None]:
#Dropping the original string columns now that the list versions exist
crew_df = crew_df.drop(["directors", "writers"], axis=1)
crew_df.head(n=5)

In [None]:
#Expanding the director lists and then the writer lists into one row per list element,
#so that each resulting row holds a single director/writer combination for a title
crew_df = crew_df.explode("directors_split").explode("writers_split")
crew_df.head(n=5)

In [None]:
#Giving the exploded columns singular names, since each cell now holds a single value
crew_column_names = {
    "directors_split": "director",
    "writers_split": "writer",
}
crew_df = crew_df.rename(columns=crew_column_names)
crew_df.head(n=5)

In [None]:
#Collecting the distinct values found in the director column
director_column = crew_df["director"]
unique_director = director_column.unique()
unique_director

In [None]:
#Collecting the distinct values found in the writer column
writer_column = crew_df["writer"]
unique_writer = writer_column.unique()
unique_writer

### Creating and Cleaning Principals

principals_url: https://datasets.imdbws.com/title.principals.tsv.gz

In [None]:
#Reading the tab-separated title.principals file straight from its URL into a dataframe
principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"

principals_df = pd.read_csv(filepath_or_buffer=principals_url, delimiter="\t", low_memory=False)

principals_df.head(n=5)

In [None]:
#Keeping only principals rows whose tconst is still present in the filtered basics_df
movies_in_basics_filter = principals_df["tconst"].isin(basics_df["tconst"])
principals_df = principals_df.loc[movies_in_basics_filter]

principals_df.head(n=5)

In [None]:
#Swapping every "\N" placeholder string for a real missing value (np.nan)
principals_df = principals_df.replace(to_replace={"\\N": np.nan})
principals_df.head(n=5)

### Creating and Cleaning Names

names_url: https://datasets.imdbws.com/name.basics.tsv.gz

In [None]:
#Reading the tab-separated name.basics file straight from its URL into a dataframe
names_url = "https://datasets.imdbws.com/name.basics.tsv.gz"

names_df = pd.read_csv(filepath_or_buffer=names_url, delimiter="\t", low_memory=False)

names_df.head(n=5)

In [None]:
#Keeping only the people referenced by the filtered data: anyone whose nconst shows up
#in principals_df, among the unique directors, or among the unique writers
names_in_principals_filter = names_df["nconst"].isin(principals_df["nconst"])
names_in_directors_filter = names_df["nconst"].isin(unique_director)
names_in_writers_filter = names_df["nconst"].isin(unique_writer)

#A person is kept when at least one of the three checks matches
names_in_any_filter = (
    names_in_principals_filter
    | names_in_directors_filter
    | names_in_writers_filter
)
names_df = names_df[names_in_any_filter]
names_df.head(n=5)

In [None]:
#Swapping every "\N" placeholder string for a real missing value (np.nan)
names_df = names_df.replace(to_replace={"\\N": np.nan})
names_df.head(n=5)

## Deliverables

In order to showcase what has been done:

* A summary of how many movies remain in each dataset and the datatypes for each feature will be provided
* Each pandas dataframe will be saved as a compressed csv file in the "Data/" folder

### Showcasing how many movies remain in each dataset w/ features