In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Download location of the IMDb title.basics file (gzip-compressed TSV, per its extension).
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"

In [4]:
# Download location of the IMDb title.akas file (gzip-compressed TSV, per its extension).
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

In [5]:
# Download location of the IMDb title.ratings file (gzip-compressed TSV, per its extension).
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [6]:
# Fetch title.basics from basics_url and parse it as tab-separated text.
# low_memory=False turns off chunked parsing, which avoids mixed-type
# inference within a single column.
basics = pd.read_csv(
    basics_url,
    delimiter='\t',
    low_memory=False,
)

In [7]:
# Fetch title.akas from akas_url and parse it as tab-separated text.
# low_memory=False turns off chunked parsing, which avoids mixed-type
# inference within a single column.
akas = pd.read_csv(
    akas_url,
    delimiter='\t',
    low_memory=False,
)

In [8]:
# Fetch title.ratings from ratings_url and parse it as tab-separated text.
# low_memory=False turns off chunked parsing, which avoids mixed-type
# inference within a single column.
ratings = pd.read_csv(
    ratings_url,
    delimiter='\t',
    low_memory=False,
)

Filtering and Cleaning Title Basics:

In [9]:
# The raw file marks missing values with the literal string '\N';
# turn those into real NaN so dropna()/isna() recognise them.
basics.replace(
    to_replace={'\\N': np.nan},
    inplace=True,
)

In [10]:
# Preview the first rows of basics.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [11]:
# Print the row count, column dtypes and memory footprint of basics.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9294674 entries, 0 to 9294673
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 638.2+ MB


In [12]:
# Keep only rows where both runtimeMinutes and genres are present.
basics = basics[basics[['runtimeMinutes', 'genres']].notna().all(axis=1)]

In [13]:
# Preview the boolean mask that marks rows whose titleType is 'movie'.
basics['titleType'].eq('movie')

0          False
1          False
2          False
3          False
4          False
           ...  
9294624     True
9294630    False
9294665    False
9294672    False
9294673    False
Name: titleType, Length: 2435459, dtype: bool

In [14]:
# Keep only the rows whose titleType is exactly 'movie', then display the result.
basics = basics[basics['titleType'].eq('movie')]
basics


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
1172,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,0,1910,,58,"Adventure,Drama"
...,...,...,...,...,...,...,...,...,...
9294439,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
9294523,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
9294564,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
9294591,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary


In [15]:
# Flag every title whose genres string mentions "documentary" (any letter case)
# and drop those rows.
is_documentary = basics['genres'].str.contains(pat='documentary', case=False)
basics = basics.loc[~is_documentary]

In [16]:
# Preview startYear with surrounding whitespace stripped. The result is only
# displayed, not assigned, so `basics` itself is left unchanged by this cell.
basics['startYear'].str.strip()

8          1894
570        1906
587        1907
672        1908
1172       1910
           ... 
9294346    2019
9294355    2020
9294394    2020
9294439    2020
9294523    2019
Name: startYear, Length: 281379, dtype: object

In [17]:
# Discard titles that have no startYear.
basics = basics[basics['startYear'].notna()]

In [18]:
# Convert startYear from string to float so it can be compared numerically.
# `basics` is the product of earlier row filtering, so assigning a column on it
# directly can raise SettingWithCopyWarning; assign() returns a new frame with
# the converted column (same column order) and sidesteps that.
basics = basics.assign(startYear=basics['startYear'].astype(float))

In [19]:
# Inspect the column dtypes (startYear is expected to be float64 after the cast).
basics.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult            object
startYear         float64
endYear            object
runtimeMinutes     object
genres             object
dtype: object

In [20]:
# Display the frame (pandas truncates the middle rows) to eyeball the converted startYear values.
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907.0,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908.0,,120,"Adventure,Fantasy"
1172,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,0,1910.0,,58,"Adventure,Drama"
...,...,...,...,...,...,...,...,...,...
9294346,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9294355,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"
9294394,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020.0,,84,Thriller
9294439,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"


In [21]:
# Keep titles whose startYear is in [2000, 2022): 2000 inclusive, 2022 exclusive.
basics = basics.loc[
    basics['startYear'].ge(2000) & basics['startYear'].lt(2022)
]

In [22]:
# Count titles per startYear to check which years remain after the range filter.
basics['startYear'].value_counts()

2018.0    9593
2017.0    9413
2019.0    9312
2016.0    8998
2015.0    8554
2014.0    8148
2021.0    8032
2013.0    7772
2020.0    7491
2012.0    7275
2011.0    6743
2010.0    6349
2009.0    5962
2008.0    5194
2007.0    4606
2006.0    4371
2005.0    3891
2004.0    3511
2003.0    3215
2002.0    2974
2001.0    2848
2000.0    2717
Name: startYear, dtype: int64

In [23]:
# Mark the basics rows that have at least one US entry in akas.
# The US rows are selected explicitly here so the mask is correct whether or not
# akas has already been narrowed to region 'US'; matching against every region
# would let almost every title through.
us_title_ids = akas.loc[akas['region'] == 'US', 'titleId']
keepers = basics['tconst'].isin(us_title_ids)

In [24]:
# Apply the keepers mask to basics and display what is left.
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77934,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86770,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9294346,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9294355,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"
9294394,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020.0,,84,Thriller
9294439,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"


In [25]:
# The raw file marks missing values with the literal string '\N';
# turn those into real NaN so dropna()/isna() recognise them.
akas.replace(
    to_replace={'\\N': np.nan},
    inplace=True,
)

In [26]:
# Preview the first rows of akas.
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,,,0
1,tt0000001,2,Carmencita,DE,,,,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,,,0
3,tt0000001,4,Καρμενσίτα,GR,,,,0
4,tt0000001,5,Карменсита,RU,,,,0


In [27]:
# Print the row count, column dtypes and memory footprint of akas.
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33484260 entries, 0 to 33484259
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   titleId          object 
 1   ordering         int64  
 2   title            object 
 3   region           object 
 4   language         object 
 5   types            float64
 6   attributes       float64
 7   isOriginalTitle  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 2.0+ GB


In [28]:
# Preview the boolean mask that marks akas rows whose region is 'US'.
akas['region'].eq('US')

0           False
1           False
2           False
3           False
4           False
            ...  
33484255    False
33484256    False
33484257    False
33484258    False
33484259    False
Name: region, Length: 33484260, dtype: bool

In [29]:
# Keep only the akas rows whose region is exactly 'US', then display the result.
akas = akas[akas['region'].eq('US')]
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,,,0
14,tt0000002,7,The Clown and His Dogs,US,,,,0
33,tt0000005,10,Blacksmith Scene,US,,,,0
36,tt0000005,1,Blacksmithing Scene,US,,,,0
41,tt0000005,6,Blacksmith Scene #1,US,,,,0
...,...,...,...,...,...,...,...,...
33483932,tt9916702,1,Loving London: The Playground,US,,,,0
33483969,tt9916720,10,The Demonic Nun,US,,,,0
33483971,tt9916720,12,The Nun 2,US,,,,0
33483988,tt9916756,1,Pretty Pretty Black Girl,US,,,,0


In [30]:
# The raw file marks missing values with the literal string '\N';
# turn those into real NaN so dropna()/isna() recognise them.
ratings.replace(
    to_replace={'\\N': np.nan},
    inplace=True,
)

In [31]:
# Preview the first rows of ratings.
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1920
1,tt0000002,5.8,260
2,tt0000003,6.5,1726
3,tt0000004,5.6,173
4,tt0000005,6.2,2542


In [32]:
# Print the row count, non-null counts, column dtypes and memory footprint of ratings.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1237493 entries, 0 to 1237492
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1237493 non-null  object 
 1   averageRating  1237493 non-null  float64
 2   numVotes       1237493 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.3+ MB


In [33]:
# Mark the ratings rows whose tconst appears among the titleId values in akas,
# then display the mask. Duplicated ids in akas are collapsed first; membership
# is unaffected.
keeper = ratings['tconst'].isin(akas['titleId'].unique())
keeper

0           True
1           True
2          False
3          False
4           True
           ...  
1237488    False
1237489     True
1237490    False
1237491    False
1237492    False
Name: tconst, Length: 1237493, dtype: bool

In [34]:
# Apply the keeper mask to ratings and display what is left.
ratings = ratings.loc[keeper]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1920
1,tt0000002,5.8,260
4,tt0000005,6.2,2542
5,tt0000006,5.1,175
6,tt0000007,5.4,796
...,...,...,...
1237467,tt9916204,8.2,245
1237473,tt9916348,8.5,17
1237474,tt9916362,6.4,4914
1237478,tt9916428,3.8,14


In [36]:
# Make sure the output folder exists (exist_ok=True: no error when it is already
# there), then list its current contents. `os` is imported in the notebook's
# top import cell.
os.makedirs('Data/', exist_ok=True)
os.listdir('Data/')

[]

In [37]:
# Write the filtered basics frame to a gzip-compressed CSV, leaving out the index column.
basics.to_csv(
    'Data/title_basics.csv.gz',
    index=False,
    compression='gzip',
)

In [38]:
# Read the saved file back in and preview it to confirm the round trip.
basics = pd.read_csv(
    'Data/title_basics.csv.gz',
    low_memory=False,
)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [39]:
# Write the filtered akas frame to a gzip-compressed CSV, leaving out the index column.
akas.to_csv(
    'Data/title_akas.csv.gz',
    index=False,
    compression='gzip',
)

In [40]:
# Read the saved file back in and preview it to confirm the round trip.
# NOTE(review): isOriginalTitle is displayed as 0.0 after this reload but as 0
# before saving -- presumably re-inferred as float; confirm if the dtype matters.
akas = pd.read_csv(
    'Data/title_akas.csv.gz',
    low_memory=False,
)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,,0.0
2,tt0000005,10,Blacksmith Scene,US,,,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,,,0.0


In [43]:
# Write the filtered ratings frame to a gzip-compressed CSV, leaving out the index column.
ratings.to_csv(
    'Data/title_ratings.csv.gz',
    index=False,
    compression='gzip',
)

In [44]:
# Read the saved file back in and preview it to confirm the round trip.
ratings = pd.read_csv(
    'Data/title_ratings.csv.gz',
    low_memory=False,
)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1920
1,tt0000002,5.8,260
2,tt0000005,6.2,2542
3,tt0000006,5.1,175
4,tt0000007,5.4,796
