Import Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Load Data

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
title_basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [3]:
basics_url2="https://datasets.imdbws.com/title.akas.tsv.gz"
title_akas = pd.read_csv(basics_url2, sep='\t', low_memory=False)

In [4]:
basics_url3="https://datasets.imdbws.com/title.ratings.tsv.gz"
title_ratings = pd.read_csv(basics_url3, sep='\t', low_memory=False)

Filtering/Cleaning Data

In [5]:
title_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
title_basics.duplicated().sum()

0

In [7]:
title_basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [8]:
title_basics.dropna(subset=['primaryTitle', 'originalTitle', 'genres'], inplace=True)
title_basics.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

Replace "\N" with np.nan

In [9]:
title_basics['endYear'] = title_basics['endYear'].replace({'\\N':np.nan})
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9566812,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,,\N,"Action,Drama,Family"
9566813,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,\N,"Action,Drama,Family"
9566814,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,,\N,"Action,Drama,Family"
9566815,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [10]:
title_basics['runtimeMinutes'] = title_basics['runtimeMinutes'].replace({'\\N':np.nan})
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9566812,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,,,"Action,Drama,Family"
9566813,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,,"Action,Drama,Family"
9566814,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,,,"Action,Drama,Family"
9566815,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [11]:
title_basics['genres'] = title_basics['genres'].replace({'\\N':np.nan})
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9566812,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,,,"Action,Drama,Family"
9566813,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,,"Action,Drama,Family"
9566814,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,,,"Action,Drama,Family"
9566815,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [12]:
title_basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            0
originalTitle           0
isAdult                 0
startYear               0
endYear           9464993
runtimeMinutes    6785215
genres             434196
dtype: int64

In [13]:
title_basics = title_basics.drop(columns=['endYear'])
title_basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            0
originalTitle           0
isAdult                 0
startYear               0
runtimeMinutes    6785215
genres             434196
dtype: int64

In [14]:
title_basics.dropna(subset=['runtimeMinutes'], inplace=True)
title_basics.isna().sum()

tconst                0
titleType             0
primaryTitle          0
originalTitle         0
isAdult               0
startYear             0
runtimeMinutes        0
genres            75331
dtype: int64

In [15]:
title_basics.dropna(subset=['genres'], inplace=True)
title_basics.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
runtimeMinutes    0
genres            0
dtype: int64

Eliminate movies that include "Documentary" in genre

In [16]:
#Eliminate movies that include "Documentary" in genre
is_documentary = title_basics['genres'].str.contains('documentary',case=False)
title_basics= title_basics[~is_documentary]

In [17]:
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,1,Short
...,...,...,...,...,...,...,...,...
9566740,tt9916692,tvMovie,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0,2015,66,Drama
9566773,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,43,"Family,Game-Show,Reality-TV"
9566808,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0,2014,11,"Adventure,Animation,Comedy"
9566815,tt9916856,short,The Wind,The Wind,0,2015,27,Short


Keep only US movies

In [18]:
keepers =title_basics['tconst'].isin(title_akas['titleId'])
keepers

1           True
2           True
3           True
4           True
5           True
           ...  
9566740     True
9566773     True
9566808    False
9566815     True
9566816    False
Name: tconst, Length: 2309811, dtype: bool

In [19]:
title_basics = title_basics[keepers]
title_basics


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,1,Short
...,...,...,...,...,...,...,...,...
9566703,tt9916610,short,Private Eye,Private Eye,0,2019,22,"Drama,Mystery,Sci-Fi"
9566705,tt9916616,short,Terror,Terror,0,\N,13,"Drama,Short"
9566740,tt9916692,tvMovie,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0,2015,66,Drama
9566773,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,43,"Family,Game-Show,Reality-TV"


keep only titleType==Movie

In [20]:
#Eliminate movies that include "short" in genre
is_short = title_basics['titleType'].str.contains('short',case=False)
title_basics= title_basics[~is_short]

#Eliminate movies that include "tvEpisode	" in genre
is_tvEpisode = title_basics['titleType'].str.contains('tvEpisode',case=False)
title_basics= title_basics[~is_tvEpisode]

#Eliminate movies that include "tvSeries" in genre
is_tvSeries = title_basics['titleType'].str.contains('tvSeries',case=False)
title_basics= title_basics[~is_tvSeries]

#Eliminate movies that include "tvMovie" in genre
is_tvMovie = title_basics['titleType'].str.contains('tvMovie',case=False)
title_basics= title_basics[~is_tvMovie]

#Eliminate movies that include "video" in genre
is_video = title_basics['titleType'].str.contains('video',case=False)
title_basics= title_basics[~is_video]

In [21]:
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,120,"Adventure,Fantasy"
930,tt0000941,movie,Locura de amor,Locura de amor,0,1909,45,Drama
...,...,...,...,...,...,...,...,...
9566490,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,51,Drama
9566499,tt9916190,movie,Safeguard,Safeguard,0,2020,95,"Action,Adventure,Thriller"
9566538,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,84,Thriller
9566583,tt9916362,movie,Coven,Akelarre,0,2020,92,"Drama,History"


In [22]:
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 311228 entries, 8 to 9566667
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          311228 non-null  object
 1   titleType       311228 non-null  object
 2   primaryTitle    311228 non-null  object
 3   originalTitle   311228 non-null  object
 4   isAdult         311228 non-null  object
 5   startYear       311228 non-null  object
 6   runtimeMinutes  311228 non-null  object
 7   genres          311228 non-null  object
dtypes: object(8)
memory usage: 21.4+ MB


In [23]:
## Save current dataframe to file.
title_basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

OSError: Cannot save file into a non-existent directory: 'Data'

In [None]:
# Open saved file and preview again
title_basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
title_basics.head()
