In [1]:
import pandas as pd
import numpy as np

## Downloading the Data

In [2]:
# Source location of the IMDb title.basics dump (gzip-compressed TSV).
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"

In [3]:
# Source location of the IMDb title.akas dump (gzip-compressed TSV).
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

In [4]:
# Source location of the IMDb title.ratings dump (gzip-compressed TSV).
ratings_url = (
    "https://datasets.imdbws.com/title.ratings.tsv.gz"
)

## Importing the Data

In [5]:
# Load the tab-separated basics table straight from the remote gzip file.
# low_memory=False makes pandas infer each column's dtype from the whole file.
basics = pd.read_csv(
    basics_url,
    sep='\t',
    low_memory=False,
)

In [6]:
# Load the tab-separated akas (alternative titles) table from the remote gzip file.
# low_memory=False makes pandas infer each column's dtype from the whole file.
akas = pd.read_csv(
    akas_url,
    sep='\t',
    low_memory=False,
)

In [7]:
# Load the tab-separated ratings table from the remote gzip file.
#
# The URL is read from its own constant instead of from `ratings_url`: this cell
# rebinds `ratings_url` to the resulting DataFrame, so reading the URL out of
# that same name made the cell fail when executed a second time (read_csv would
# be handed a DataFrame). The later cells refer to the ratings DataFrame as
# `ratings_url`, so that name is kept for the loaded table.
RATINGS_SOURCE_URL = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings_url = pd.read_csv(RATINGS_SOURCE_URL, sep='\t', low_memory=False)

## Filtering the Data

### Filter the AKAs Data

In [8]:
# Preview the first five rows of the raw akas table.
akas.head(n=5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [9]:
# IMDb marks missing values with the literal string \N; turn them into NaN.
# Done in place on the existing frame.
akas.replace(to_replace='\\N', value=np.nan, inplace=True)

In [10]:
# Number of rows per region code, most frequent first.
akas.loc[:, 'region'].value_counts()

DE    4504949
FR    4503713
JP    4502259
IN    4446794
ES    4420110
       ...   
JE          2
TV          1
NU          1
PW          1
NR          1
Name: region, Length: 247, dtype: int64

In [11]:
# Keep only the rows whose region is the United States.
akas = akas.loc[akas['region'].eq('US')]

In [12]:
# Confirm that US is now the only region left.
akas.loc[:, 'region'].value_counts()

US    1472351
Name: region, dtype: int64

In [13]:
# Preview the first five rows of the US-only akas table.
akas.head(n=5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


### Filter the Ratings Data

In [14]:
# Preview the first five rows of the ratings table
# (`ratings_url` holds the loaded DataFrame at this point, not the URL).
ratings_url.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
2,tt0000003,6.5,1894
3,tt0000004,5.5,178
4,tt0000005,6.2,2678


In [15]:
# Turn any IMDb \N missing-value markers into NaN, in place.
ratings_url.replace(to_replace='\\N', value=np.nan, inplace=True)

In [16]:
# Count missing values per column of the ratings table.
ratings_url.isnull().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [17]:
# Boolean mask over the ratings rows: True where the title id also appears
# in the US-filtered akas table.
keepers = ratings_url['tconst'].isin(
    akas['titleId']
)
keepers



0           True
1           True
2          False
3          False
4           True
           ...  
1360551    False
1360552    False
1360553    False
1360554    False
1360555    False
Name: tconst, Length: 1360556, dtype: bool

In [18]:
# Apply the mask so only ratings for titles present in the US akas remain.
ratings_url = ratings_url.loc[keepers]
ratings_url

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
4,tt0000005,6.2,2678
5,tt0000006,5.0,183
6,tt0000007,5.4,839
...,...,...,...
1360518,tt9916200,8.1,238
1360519,tt9916204,8.2,274
1360526,tt9916348,8.3,18
1360527,tt9916362,6.4,5573


### Filter the basics

In [19]:
# Preview the first five rows of the raw basics table.
basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [20]:
# IMDb marks missing values with the literal string \N; turn them into NaN.
# Done in place on the existing frame.
basics.replace(to_replace='\\N', value=np.nan, inplace=True)

In [21]:
# Count missing values per column of the basics table.
basics.isnull().sum()

tconst                   0
titleType                0
primaryTitle            11
originalTitle           11
isAdult                  1
startYear          1370949
endYear           10137108
runtimeMinutes     7159618
genres              458543
dtype: int64

In [22]:
# Discard every title that lacks a runtime or a genre.
required_columns = ['runtimeMinutes', 'genres']
basics = basics.dropna(axis='index', how='any', subset=required_columns)

In [23]:
# List the distinct title types that are still present.
basics.loc[:, 'titleType'].unique()

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'video', 'tvSpecial', 'videoGame'], dtype=object)

In [24]:
# Keep full-length movies only (titleType exactly 'movie').
basics = basics.loc[basics['titleType'].eq('movie')]

In [25]:
# Keep titles whose start year lies in 2000-2022 (both ends inclusive).
#
# The range test is done on a numeric view of the column rather than by
# comparing strings: lexicographic comparison only gives the right answer while
# every value is a 4-digit string, and it breaks if the column is ever parsed
# as numbers. Values that cannot be parsed (including missing ones) become NaN
# and are therefore excluded. The stored startYear column itself is left
# untouched so downstream cells see the same values as before.
start_year_numeric = pd.to_numeric(basics['startYear'], errors='coerce')
basics = basics[start_year_numeric.between(2000, 2022)]

In [26]:
# Number of remaining titles per start year, most frequent first.
basics.loc[:, 'startYear'].value_counts()

2018    14444
2017    14431
2019    14189
2016    14016
2015    13537
2022    13331
2014    13175
2021    12559
2013    12425
2012    11688
2020    11666
2011    10810
2010    10250
2009     9404
2008     8207
2007     7000
2006     6568
2005     5875
2004     5237
2003     4634
2002     4156
2001     3900
2000     3667
Name: startYear, dtype: int64

In [27]:
# Number of titles per genre combination before documentaries are removed.
basics.loc[:, 'genres'].value_counts()

Documentary                  53789
Drama                        36265
Comedy                       13487
Comedy,Drama                  6486
Horror                        5929
                             ...  
Adult,Mystery                    1
Biography,Music,Mystery          1
Action,Animation,History         1
Comedy,Reality-TV,Romance        1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 1172, dtype: int64

In [28]:
# Flag every title whose genre string mentions "Documentary" (case-insensitive),
# then keep only the titles that are not flagged.
docu = basics['genres'].str.contains(
    'Documentary',
    case=False,
)
basics = basics.loc[~docu]

In [29]:
# Number of titles per genre combination after documentaries are removed.
basics.loc[:, 'genres'].value_counts()

Drama                        36265
Comedy                       13487
Comedy,Drama                  6486
Horror                        5929
Drama,Romance                 4346
                             ...  
Animation,Biography,Sport        1
Adventure,History,Music          1
Adventure,History,War            1
Adventure,Romance,Sport          1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 954, dtype: int64

In [30]:
# Preview the first five rows of the filtered basics table.
basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67485,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
80548,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror


In [31]:
# Boolean mask over the basics rows: True where the title id also appears
# in the US-filtered akas table.
keep = basics['tconst'].isin(
    akas['titleId']
)
keep
                

34800        True
61111        True
67485        True
67663        True
80548        True
            ...  
10250616     True
10250655    False
10250700     True
10250784    False
10250874    False
Name: tconst, Length: 148584, dtype: bool

In [32]:
# Apply the mask so only titles present in the US akas remain.
basics = basics.loc[keep]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67485,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
80548,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror
...,...,...,...,...,...,...,...,...,...
10250073,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
10250467,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019,,97,"Comedy,Drama,Fantasy"
10250607,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
10250616,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


## Save Files in the Repository

In [41]:
import os

# Create the output folder; exist_ok=True means no error if it is already there.
os.makedirs('Data/', exist_ok=True)

# Listing the folder confirms it exists and shows its current contents.
os.listdir('Data/')

[]

In [42]:
# Write the filtered basics table as a gzip-compressed CSV, without the index.
basics.to_csv(
    "Data/title_basics.csv.gz",
    index=False,
    compression='gzip',
)


In [43]:
# Write the US-only akas table as a gzip-compressed CSV, without the index.
akas.to_csv(
    "Data/title_akas.csv.gz",
    index=False,
    compression='gzip',
)

In [44]:
# Write the filtered ratings table as a gzip-compressed CSV, without the index.
ratings_url.to_csv(
    "Data/title_ratings_url.csv.gz",
    index=False,
    compression='gzip',
)

In [45]:
# Read the saved basics file back to verify that it round-trips.
basics = pd.read_csv(
    "Data/title_basics.csv.gz",
    low_memory=False,
)
basics.head(n=5)



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
4,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror


In [46]:
# Read the saved akas file back to verify that it round-trips.
akas = pd.read_csv(
    "Data/title_akas.csv.gz",
    low_memory=False,
)
akas.head(n=5)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [47]:
# Read the saved ratings file back to verify that it round-trips.
ratings_url = pd.read_csv(
    "Data/title_ratings_url.csv.gz",
    low_memory=False,
)
ratings_url.head(n=5)


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
2,tt0000005,6.2,2678
3,tt0000006,5.0,183
4,tt0000007,5.4,839
