Import Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Load Data

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
title_basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [3]:
basics_url2="https://datasets.imdbws.com/title.akas.tsv.gz"
title_akas = pd.read_csv(basics_url2, sep='\t', low_memory=False)

In [4]:
basics_url3="https://datasets.imdbws.com/title.ratings.tsv.gz"
title_ratings = pd.read_csv(basics_url3, sep='\t', low_memory=False)

Filtering/Cleaning Data

In [5]:
title_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
title_basics.duplicated().sum()

0

In [7]:
title_basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [8]:
title_basics.dropna(subset=['primaryTitle', 'originalTitle', 'genres'], inplace=True)
title_basics.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

Replace "\N" with np.nan

In [20]:
title_basics.replace({'\\N':np.nan},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  title_basics.replace({'\\N':np.nan},inplace=True)


In [12]:
title_basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            0
originalTitle           0
isAdult                 0
startYear               0
endYear           9496807
runtimeMinutes    6795237
genres             434061
dtype: int64

In [13]:
title_basics = title_basics.drop(columns=['endYear'])
title_basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            0
originalTitle           0
isAdult                 0
startYear               0
runtimeMinutes    6795237
genres             434061
dtype: int64

In [14]:
title_basics.dropna(subset=['runtimeMinutes'], inplace=True)
title_basics.isna().sum()

tconst                0
titleType             0
primaryTitle          0
originalTitle         0
isAdult               0
startYear             0
runtimeMinutes        0
genres            75479
dtype: int64

In [15]:
title_basics.dropna(subset=['genres'], inplace=True)
title_basics.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
runtimeMinutes    0
genres            0
dtype: int64

Eliminate movies that include "Documentary" in genre

In [16]:
#Eliminate movies that include "Documentary" in genre
is_documentary = title_basics['genres'].str.contains('documentary',case=False)
title_basics= title_basics[~is_documentary]

In [17]:
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,1,Short
...,...,...,...,...,...,...,...,...
9599071,tt9916692,tvMovie,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0,2015,66,Drama
9599104,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,43,"Family,Game-Show,Reality-TV"
9599139,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0,2014,11,"Adventure,Animation,Comedy"
9599146,tt9916856,short,The Wind,The Wind,0,2015,27,Short


Keep only US movies

In [18]:
keepers =title_basics['tconst'].isin(title_akas['titleId'])
keepers

1           True
2           True
3           True
4           True
5           True
           ...  
9599071     True
9599104     True
9599139    False
9599146     True
9599147    False
Name: tconst, Length: 2330866, dtype: bool

In [19]:
title_basics = title_basics[keepers]
title_basics


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,1,Short
...,...,...,...,...,...,...,...,...
9599034,tt9916610,short,Private Eye,Private Eye,0,2019,22,"Drama,Mystery,Sci-Fi"
9599036,tt9916616,short,Terror,Terror,0,\N,13,"Drama,Short"
9599071,tt9916692,tvMovie,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0,2015,66,Drama
9599104,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,43,"Family,Game-Show,Reality-TV"


keep only titleType==Movie

In [21]:
#Eliminate movies that include "short" in genre
is_short = title_basics['titleType'].str.contains('short',case=False)
title_basics= title_basics[~is_short]

#Eliminate movies that include "tvEpisode	" in genre
is_tvEpisode = title_basics['titleType'].str.contains('tvEpisode',case=False)
title_basics= title_basics[~is_tvEpisode]

#Eliminate movies that include "tvSeries" in genre
is_tvSeries = title_basics['titleType'].str.contains('tvSeries',case=False)
title_basics= title_basics[~is_tvSeries]

#Eliminate movies that include "tvMovie" in genre
is_tvMovie = title_basics['titleType'].str.contains('tvMovie',case=False)
title_basics= title_basics[~is_tvMovie]

#Eliminate movies that include "video" in genre
is_video = title_basics['titleType'].str.contains('video',case=False)
title_basics= title_basics[~is_video]

In [22]:
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,120,"Adventure,Fantasy"
930,tt0000941,movie,Locura de amor,Locura de amor,0,1909,45,Drama
...,...,...,...,...,...,...,...,...
9598821,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,51,Drama
9598830,tt9916190,movie,Safeguard,Safeguard,0,2020,95,"Action,Adventure,Thriller"
9598869,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,84,Thriller
9598914,tt9916362,movie,Coven,Akelarre,0,2020,92,"Drama,History"


In [28]:
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311878 entries, 0 to 311877
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          311878 non-null  object 
 1   titleType       311878 non-null  object 
 2   primaryTitle    311878 non-null  object 
 3   originalTitle   311878 non-null  object 
 4   isAdult         311878 non-null  int64  
 5   startYear       307161 non-null  float64
 6   runtimeMinutes  311878 non-null  int64  
 7   genres          311878 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 19.0+ MB


keep startYear 2000-2022

In [31]:
title_basics['startYear'] = title_basics['startYear'].astype(float)
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311878 entries, 0 to 311877
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          311878 non-null  object 
 1   titleType       311878 non-null  object 
 2   primaryTitle    311878 non-null  object 
 3   originalTitle   311878 non-null  object 
 4   isAdult         311878 non-null  int64  
 5   startYear       307161 non-null  float64
 6   runtimeMinutes  311878 non-null  int64  
 7   genres          311878 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 19.0+ MB


In [55]:
keepers =title_basics['startYear'] >= 2000
keepers

0         False
1         False
2         False
3         False
4         False
          ...  
311873     True
311874     True
311875     True
311876     True
311877     True
Name: startYear, Length: 311878, dtype: bool

In [56]:
title_basics = title_basics[keepers]
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
18892,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,118,"Comedy,Fantasy,Romance"
38136,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,70,Drama
42536,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,122,Drama
48914,tt0079644,movie,November 1828,November 1828,0,2001.0,140,"Drama,War"
54156,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...
311873,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,51,Drama
311874,tt9916190,movie,Safeguard,Safeguard,0,2020.0,95,"Action,Adventure,Thriller"
311875,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020.0,84,Thriller
311876,tt9916362,movie,Coven,Akelarre,0,2020.0,92,"Drama,History"


In [57]:
## Save current dataframe to file.
title_basics.to_csv("title_basics.csv.gz",compression='gzip',index=False)