## Imports

In [1]:
# Imports
import os
import pandas as pd
import numpy as np

## List Data Folder Contents

In [2]:
# List the files available in the 'Data' folder.
# os.listdir returns entries in arbitrary order, so sort for a stable listing.
sorted(os.listdir("Data/"))

['.ipynb_checkpoints',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz',
 'title_akas.csv.gz']

## Load Datasets

In [3]:
# Read the tab-separated 'basics' table and preview its first rows
basics_path = 'Data/title.basics.tsv.gz'
basics = pd.read_csv(basics_path, sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [4]:
# Inspect column dtypes and memory usage of the raw 'basics' table
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10064703 entries, 0 to 10064702
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 691.1+ MB


In [5]:
# Read the pre-filtered 'akas' table and preview its first rows
akas_path = 'Data/title_akas.csv.gz'
akas = pd.read_csv(akas_path, low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


## Filtering/Cleaning Steps:
- Title Basics:
 - Keep only US movies (Use AKAs table, see "Filtering one dataframe based on another" section below)
 - Replace "\N" with np.nan
 - Eliminate movies that are null for runtimeMinutes
 - Eliminate movies that are null for genres
 - Keep only titleType == movie
 - Convert the startYear column to float data type.
 - Filter the dataframe using startYear. Keep years between 2000-2021 (Including 2000 and 2021)
 - Eliminate movies that include "Documentary" in the genre (see tip below).

In [6]:
# Replace the "\N" missing-value marker with np.nan.
# Rebind the result rather than using inplace=True: the cell then reads as a
# plain "new frame from old frame" step and can be chained with other methods.
basics = basics.replace({'\\N': np.nan})
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [7]:
# Cast 'startYear' to float; missing years were set to NaN in an earlier cell
start_year_as_float = basics['startYear'].astype(float)
basics['startYear'] = start_year_as_float

In [8]:
# Re-check dtypes to confirm 'startYear' is now float64
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10064703 entries, 0 to 10064702
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tconst          object 
 1   titleType       object 
 2   primaryTitle    object 
 3   originalTitle   object 
 4   isAdult         object 
 5   startYear       float64
 6   endYear         object 
 7   runtimeMinutes  object 
 8   genres          object 
dtypes: float64(1), object(8)
memory usage: 691.1+ MB


In [9]:
# Build a boolean mask: True where the title's id appears in the filtered AKAs table
akas_title_ids = akas['titleId']
keepers = basics['tconst'].isin(akas_title_ids)
keepers

0            True
1            True
2           False
3           False
4            True
            ...  
10064698    False
10064699    False
10064700    False
10064701    False
10064702    False
Name: tconst, Length: 10064703, dtype: bool

In [10]:
# Apply the mask so 'basics' keeps only titles present in the filtered AKAs dataframe (US)
us_basics = basics[keepers]
basics = us_basics
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894.0,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894.0,,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10064564,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963.0,,58,Family
10064593,tt9916620,movie,The Copeland Case,The Copeland Case,0,,,,Drama
10064631,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,,,,"Drama,Short"
10064654,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019.0,,,Short


In [11]:
# Count how many titles fall under each 'titleType' category
title_types = basics['titleType']
title_types.value_counts()

short           444163
movie           299471
tvEpisode       245282
video           170050
tvSeries        104032
tvMovie          40926
tvSpecial        22131
tvMiniSeries     17847
videoGame        17110
tvShort           4599
Name: titleType, dtype: int64

In [18]:
# Restrict 'basics' to rows whose titleType is exactly "movie"
is_movie = basics['titleType'] == "movie"
basics = basics[is_movie]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897.0,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907.0,,90,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908.0,,,Drama
...,...,...,...,...,...,...,...,...,...
10064384,tt9916188,movie,Minotaur,Minotaur,0,,,,Thriller
10064385,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"
10064469,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"
10064501,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019.0,,,"Adventure,History,War"


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45.0,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897.0,,100.0,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,,70.0,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907.0,,90.0,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908.0,,,Drama


In [13]:
# Filter dataframe using 'startYear' column
