# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Locations of the three IMDb dumps used below (gzip-compressed, tab-separated).
basics_url, ratings_url, akas_url = (
    'https://datasets.imdbws.com/title.basics.tsv.gz',
    'https://datasets.imdbws.com/title.ratings.tsv.gz',
    'https://datasets.imdbws.com/title.akas.tsv.gz',
)

In [3]:
# The three files are read with identical parser settings: tab-separated,
# with low_memory=False so pandas does not infer dtypes chunk by chunk.
tsv_read_options = dict(sep='\t', low_memory=False)
basics = pd.read_csv(basics_url, **tsv_read_options)
ratings = pd.read_csv(ratings_url, **tsv_read_options)
akas = pd.read_csv(akas_url, **tsv_read_options)

# Preprocessing

## Basics

In [4]:
# Preview the first rows of the raw title.basics table.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
# Column dtypes and memory footprint of the raw title.basics table.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9566817 entries, 0 to 9566816
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 656.9+ MB


In [6]:
# Convert the '\N' placeholder strings to real NaN, then discard every title
# that lacks a runtime, a genre list, or a start year.
basics = (
    basics
    .replace({'\\N': np.nan})
    .dropna(subset=['runtimeMinutes', 'genres', 'startYear'])
)

In [7]:
# Sanity check: no missing runtimes should remain after the dropna above.
basics['runtimeMinutes'].isna().to_numpy().any()

False

In [8]:
# Sanity check: no missing genre lists should remain.
basics['genres'].isna().to_numpy().any()

False

In [9]:
# Sanity check: no missing start years should remain.
basics['startYear'].isna().to_numpy().any()

False

In [10]:
# Whole-frame check: is there a NaN in any column at all
# (including columns that were not part of the dropna subset)?
basics.isna().to_numpy().any()

True

In [11]:
# Keep only rows whose titleType is exactly 'movie'. An equality test (rather
# than a substring search) cannot accidentally match other type names and
# yields a clean boolean mask even if a titleType value is missing.
is_movie = basics['titleType'] == 'movie'
# Take an explicit copy: the next statement assigns a column, and doing that on
# the result of a boolean selection triggers pandas' SettingWithCopyWarning.
basics = basics[is_movie].copy()

# startYear was read as strings; convert to float so it can be range-compared.
basics['startYear'] = basics['startYear'].astype(float)
# Keep titles with 2000 <= startYear < 2022.
basics = basics[(basics['startYear'] >= 2000) & (basics['startYear'] < 2022)]

# Drop anything whose genre list mentions "documentary" (case-insensitive).
is_documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics[~is_documentary]

In [12]:
# Mark the titles that have a US entry in akas.
# NOTE(review): at this point in the notebook akas has not been narrowed to
# region == 'US' yet (that happens in the AKAS section further down), so
# testing against akas['titleId'] directly would accept a title with an aka in
# any region. Restricting to US ids here is assumed to be the intent — confirm.
keepers = basics['tconst'].isin(akas.loc[akas['region'] == 'US', 'titleId'])

In [13]:
# Apply the boolean mask computed in the previous cell.
basics = basics.loc[keepers]

In [14]:
# Display the filtered basics table (pandas truncates the rendering).
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9566490,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9566499,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"
9566538,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020.0,,84,Thriller
9566583,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"


## Ratings

In [15]:
# Preview the first rows of the raw title.ratings table.
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1946
1,tt0000002,5.8,263
2,tt0000003,6.5,1773
3,tt0000004,5.6,179
4,tt0000005,6.2,2580


In [16]:
# Column dtypes, non-null counts and memory footprint of the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1273172 entries, 0 to 1273171
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1273172 non-null  object 
 1   averageRating  1273172 non-null  float64
 2   numVotes       1273172 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.1+ MB


In [17]:
# replace() returns a new frame; without the assignment the call has no effect.
ratings = ratings.replace({'\\N': np.nan})
# Keep only ratings for titles that have a US entry in akas.
# NOTE(review): akas is not yet narrowed to region == 'US' when this cell runs
# (the AKAS section comes later), so the US restriction is applied here
# explicitly. US-only is assumed to be the intent — confirm.
keepers2 = ratings['tconst'].isin(akas.loc[akas['region'] == 'US', 'titleId'])
ratings = ratings[keepers2]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1946
1,tt0000002,5.8,263
2,tt0000003,6.5,1773
3,tt0000004,5.6,179
4,tt0000005,6.2,2580
...,...,...,...
1273154,tt9916460,9.4,18
1273156,tt9916538,8.3,6
1273157,tt9916544,6.9,61
1273167,tt9916730,8.1,9


## AKAS

In [18]:
# Preview the first rows of the raw title.akas table.
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [19]:
# Column dtypes and memory footprint of the akas table.
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34716118 entries, 0 to 34716117
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [21]:
# replace() returns a new frame, so the result must be kept; otherwise the
# '\N' placeholders stay as strings and dropna() below never sees them.
akas = akas.replace({'\\N': np.nan})
# Reassign instead of inplace=True so the cell reads as a plain pipeline.
akas = akas.dropna(subset=['region'])
# Exact match on the region code; a substring search could also accept any
# longer code that merely contains "US".
in_US = akas['region'] == 'US'
akas = akas[in_US]

In [None]:
# Display the akas table as it stands after the US filter cell above.
akas