# Import

In [2]:
import pandas as pd
import numpy as np
import os, time,json
import tmdbsimple as tmdb 
from tqdm.notebook import tqdm_notebook
import matplotlib.pyplot as plt
import seaborn as sns

# Downloading the Files

In [1]:
# Install tmdbsimple (only need to run once)
!pip install tmdbsimple



In [3]:
# Locations of the three IMDb tables used in this notebook; the URLs differ
# only in the table name, so they are built from one template.
basics_url, akas_url, ratings_url = (
    f'https://datasets.imdbws.com/title.{table}.tsv.gz'
    for table in ('basics', 'akas', 'ratings')
)

# Loading the TSVs with pandas

In [5]:
# Read the gzipped, tab-separated title.basics table straight from its URL
# (read_table uses a tab separator by default).
basics = pd.read_table(basics_url, low_memory=False)
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9490669,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
9490670,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
9490671,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
9490672,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [6]:
# Read the gzipped, tab-separated title.akas table straight from its URL
# (read_table uses a tab separator by default).
akas = pd.read_table(akas_url, low_memory=False)
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0
...,...,...,...,...,...,...,...,...
34388480,tt9916852,5,Episódio #3.20,PT,pt,\N,\N,0
34388481,tt9916852,6,Episodio #3.20,IT,it,\N,\N,0
34388482,tt9916852,7,एपिसोड #3.20,IN,hi,\N,\N,0
34388483,tt9916856,1,The Wind,DE,\N,imdbDisplay,\N,0


In [7]:
# Read the gzipped, tab-separated title.ratings table straight from its URL
# (read_table uses a tab separator by default).
ratings = pd.read_table(ratings_url, low_memory=False)
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1930
1,tt0000002,5.8,261
2,tt0000003,6.5,1750
3,tt0000004,5.6,176
4,tt0000005,6.2,2562
...,...,...,...
1262150,tt9916682,6.4,6
1262151,tt9916690,7.8,7
1262152,tt9916730,8.0,8
1262153,tt9916766,7.0,21


# Filtering/Cleaning Steps:

## Replace "\N" with np.nan


In [8]:
# The raw files use the literal string "\N" as a placeholder for missing
# values; swap it for a real NaN in each table.
basics = basics.replace(to_replace={'\\N': np.nan})
akas = akas.replace(to_replace={'\\N': np.nan})
ratings = ratings.replace(to_replace={'\\N': np.nan})

In [9]:
# Count the missing (NaN) values in each column of basics
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1273306
endYear           9390003
runtimeMinutes    6743241
genres             433537
dtype: int64

In [10]:
# Count the rows of basics that are exact duplicates of an earlier row
basics.duplicated().sum()

0

## Eliminate movies that are null for runtimeMinutes or genres

In [11]:
# Drop every row that is missing a runtime or a genre
basics = basics.dropna(axis=0, how='any', subset=['runtimeMinutes', 'genres'])

In [12]:
# Re-count the missing values per column after dropping rows without a runtime or genre
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          145049
endYear           2624702
runtimeMinutes          0
genres                  0
dtype: int64

## Keep only titleType == "movie"

In [13]:
# Count how many rows there are of each titleType
basics['titleType'].value_counts()

tvEpisode       1311304
short            586872
movie            374333
video            177858
tvMovie           90384
tvSeries          88510
tvSpecial         17360
tvMiniSeries      16648
tvShort            8589
videoGame           312
Name: titleType, dtype: int64

In [14]:
# Restrict the table to rows whose titleType is exactly 'movie'
basics = basics[basics['titleType'].eq('movie')]
# Confirm that 'movie' is the only remaining titleType
basics['titleType'].value_counts()

movie    374333
Name: titleType, dtype: int64

## Keep startYear 2000-2022

In [15]:
# Count the number of movies per startYear
basics['startYear'].value_counts()

2017    14260
2018    14208
2019    13904
2016    13884
2015    13394
        ...  
1899        1
1904        1
1897        1
1896        1
1894        1
Name: startYear, Length: 127, dtype: int64

In [16]:
# Inspect the dtype of startYear before trying to filter on it numerically
basics['startYear'].dtype

dtype('O')

In [17]:
# startYear must be converted to integer before filtering to 2000-2022;
# rows with a missing startYear are dropped first
basics = basics.dropna(subset=['startYear'])

In [18]:
# Convert startYear to integer so it can be compared numerically.
# .assign() returns a new frame with the converted column instead of writing
# a column into the filtered slice, which avoids pandas' SettingWithCopyWarning.
basics = basics.assign(startYear=basics['startYear'].astype(int))
# Confirm the new dtype
basics['startYear'].dtype

dtype('int32')

In [19]:
# Keep only movies released from 2000 through 2022 (both ends inclusive)
basics = basics[basics['startYear'].between(2000, 2022)]
# Confirm that only years 2000-2022 remain
basics['startYear'].value_counts()

2017    14260
2018    14208
2019    13904
2016    13884
2015    13394
2014    13021
2013    12323
2021    12046
2022    11833
2012    11581
2020    11393
2011    10731
2010    10157
2009     9313
2008     8109
2007     6921
2006     6462
2005     5785
2004     5161
2003     4549
2002     4106
2001     3825
2000     3611
Name: startYear, dtype: int64

## Eliminate movies whose genres include "Documentary"

In [20]:
# Flag every movie whose genres string mentions "documentary" (any casing),
# then keep only the movies that are not flagged.
is_documentary = basics['genres'].str.contains(pat='documentary', case=False)
basics = basics.loc[~is_documentary]

## Keep only US movies

In [22]:
# Keep only the alternate-title rows whose region is the US
akas = akas.loc[akas['region'].eq('US')]
# Confirm that 'US' is the only remaining region
akas['region'].value_counts()

US    1400595
Name: region, dtype: int64

In [23]:
# Keep only the basics rows whose tconst appears as a titleId in the
# US-only akas dataframe.
keepers = basics['tconst'].isin(akas['titleId'])
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
...,...,...,...,...,...,...,...,...,...
9489811,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9490207,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9490347,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9490356,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [24]:
# Filter the ratings table down to titles that appear in the US-only akas dataframe
keepers1 =ratings['tconst'].isin(akas['titleId'])
ratings = ratings[keepers1]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1930
1,tt0000002,5.8,261
4,tt0000005,6.2,2562
5,tt0000006,5.1,176
6,tt0000007,5.4,805
...,...,...,...
1262129,tt9916200,8.2,220
1262130,tt9916204,8.2,251
1262136,tt9916348,8.5,17
1262137,tt9916362,6.4,5087


In [25]:
# Summarize the filtered ratings frame: row count, non-null counts and dtypes
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 482688 entries, 0 to 1262141
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         482688 non-null  object 
 1   averageRating  482688 non-null  float64
 2   numVotes       482688 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.7+ MB


In [26]:
# Summarize the US-only akas frame: row count, non-null counts and dtypes
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1400595 entries, 5 to 34388229
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1400595 non-null  object
 1   ordering         1400595 non-null  int64 
 2   title            1400595 non-null  object
 3   region           1400595 non-null  object
 4   language         3758 non-null     object
 5   types            968632 non-null   object
 6   attributes       45613 non-null    object
 7   isOriginalTitle  1399250 non-null  object
dtypes: int64(1), object(7)
memory usage: 96.2+ MB


In [27]:
# Summarize the filtered basics frame: row count, non-null counts and dtypes
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84348 entries, 34803 to 9490440
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          84348 non-null  object
 1   titleType       84348 non-null  object
 2   primaryTitle    84348 non-null  object
 3   originalTitle   84348 non-null  object
 4   isAdult         84348 non-null  object
 5   startYear       84348 non-null  int32 
 6   endYear         0 non-null      object
 7   runtimeMinutes  84348 non-null  object
 8   genres          84348 non-null  object
dtypes: int32(1), object(8)
memory usage: 6.1+ MB


In [28]:
# Display the filtered basics frame
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
...,...,...,...,...,...,...,...,...,...
9489811,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9490207,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9490347,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9490356,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [29]:
# Display the US-only akas frame
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
34388012,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0
34388082,tt9916620,1,The Copeland Case,US,,,,0
34388170,tt9916702,1,Loving London: The Playground,US,,,,0
34388213,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


In [30]:
# Display the filtered ratings frame
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1930
1,tt0000002,5.8,261
4,tt0000005,6.2,2562
5,tt0000006,5.1,176
6,tt0000007,5.4,805
...,...,...,...
1262129,tt9916200,8.2,220
1262130,tt9916204,8.2,251
1262136,tt9916348,8.5,17
1262137,tt9916362,6.4,5087


## Saving the Files in Your Repository

In [31]:
# Create the output folder; `os` is already imported in the imports cell at
# the top of the notebook, so it is not re-imported here.
# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs('Data/',exist_ok=True) 
# Confirm the folder was created by listing its contents
os.listdir("Data/")

[]

In [33]:
# Write each filtered table to a gzip-compressed CSV, leaving out the index
basics.to_csv("Data/title_basics.csv.gz", index=False, compression='gzip')
akas.to_csv("Data/title_akas.csv.gz", index=False, compression='gzip')
ratings.to_csv("Data/title_ratings.csv.gz", index=False, compression='gzip')

In [34]:
# Read the saved basics file back in and preview its first five rows
basics = pd.read_csv("Data/title_basics.csv.gz", compression='gzip', low_memory=False)
basics.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [35]:
# Read the saved akas file back in and preview its first five rows
akas = pd.read_csv("Data/title_akas.csv.gz", compression='gzip', low_memory=False)
akas.head(5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [36]:
# Read the saved ratings file back into `ratings`, the same way the basics and
# akas files are reloaded into their own names, so that `ratings` does not keep
# pointing at the pre-save in-memory frame.
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
# Backward-compatible alias: this cell originally stored the frame as `rating`
rating = ratings
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1930
1,tt0000002,5.8,261
2,tt0000005,6.2,2562
3,tt0000006,5.1,176
4,tt0000007,5.4,805
