In [123]:
# importing libraries
import pandas as pd
import numpy as np

## Filtering the data

In [124]:
# getting the data
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'

In [125]:
# converting the urls to dataframes
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [126]:
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [127]:
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

In [128]:
# replacing \N characters in the dataframe to numpy nan values
basics.replace({'\\N':np.nan}, inplace = True)
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9329527 entries, 0 to 9329526
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 640.6+ MB


In [129]:
# looking at the data
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [130]:
# dropping rows that have a missing value in genres or runtimeMinutes
basics.dropna(subset=['genres', 'runtimeMinutes'], inplace=True)

In [131]:
# converting the startYear column to floats
basics['startYear'] = basics['startYear'].astype('float')

In [132]:
# filtering out to movies after 2000 and before 2022 inclusive
basics = basics.loc[(basics['startYear'] >= 2000) & (basics['startYear'] <= 2022) & (basics['titleType'] == 'movie')]

In [133]:
# filtering out documentaries
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [134]:
# replacing \N characters in the dataframe to numpy nan values
akas.replace({'\\N':np.nan}, inplace = True)

In [135]:
# filtering to only include the US region
akas = akas.loc[akas['region'] == 'US']

In [136]:
# creating a filter for the basics dataframe using the akas dataframe to look
# for id's that appear in both. Since akas is already filtered to only be
# US this works to help filter basics the same way
keepers = basics['tconst'].isin(akas['titleId'])

In [137]:
basics = basics[keepers]

In [138]:
# replacing \N characters in the dataframe to numpy nan values
ratings.replace({'\\N':np.nan}, inplace = True)

In [139]:
# creating a filter for the ratings database using the akas database
keepers2 = ratings['tconst'].isin(akas['titleId'])

In [140]:
ratings = ratings[keepers2]

## File management and downloading

In [141]:
# Creating a file for the data
import os
os.makedirs('Data/',exist_ok=True) 
os.listdir("Data/")

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

In [142]:
# downloading the dataframes
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [143]:
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [144]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [145]:
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1920
1,tt0000002,5.8,260
2,tt0000005,6.2,2541
3,tt0000006,5.1,175
4,tt0000007,5.4,797


In [146]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [147]:
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


## Final dataframes

In [148]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83172 entries, 0 to 83171
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          83172 non-null  object 
 1   titleType       83172 non-null  object 
 2   primaryTitle    83172 non-null  object 
 3   originalTitle   83172 non-null  object 
 4   isAdult         83172 non-null  int64  
 5   startYear       83172 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  83172 non-null  int64  
 8   genres          83172 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 5.7+ MB


In [149]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 472849 entries, 0 to 472848
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         472849 non-null  object 
 1   averageRating  472849 non-null  float64
 2   numVotes       472849 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 10.8+ MB


In [150]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1363068 entries, 0 to 1363067
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1363068 non-null  object 
 1   ordering         1363068 non-null  int64  
 2   title            1363068 non-null  object 
 3   region           1363068 non-null  object 
 4   language         3706 non-null     object 
 5   types            965311 non-null   object 
 6   attributes       45246 non-null    object 
 7   isOriginalTitle  1361693 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 83.2+ MB
