In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Download locations of the three IMDb dataset files read below.
# Each one is a gzip-compressed, tab-separated file.
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

# Basics

In [3]:
# Load the title.basics table straight from its URL (tab-separated file).
basics = pd.read_csv(
    basics_url,
    sep='\t',
    low_memory=False,
)

In [4]:
# Preview the first rows of the freshly loaded table
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
## Swap the "\N" placeholder strings for real missing values (np.nan)
basics = basics.replace({'\\N': np.nan})

In [6]:
## Discard every title that is missing runtimeMinutes or genres
basics = basics.dropna(subset=['runtimeMinutes', 'genres'], how='any')

In [7]:
# Count the missing values that remain in each column
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           65094
endYear           2447308
runtimeMinutes          0
genres                  0
dtype: int64

In [8]:
## Keep only the rows whose titleType is 'movie'
is_movie = basics['titleType'].eq('movie')
basics = basics.loc[is_movie]

In [9]:
## Keep only titles whose startYear is between 2000 and 2021 (inclusive).
# startYear is still stored as text at this point, so convert it to numbers
# for the range test instead of comparing strings lexicographically (which
# would let a malformed value such as '20000' slip through). Values that are
# missing or not numeric become NaN and fail the range test, so those rows
# are dropped as well.
start_year = pd.to_numeric(basics['startYear'], errors='coerce')
basics = basics[start_year.between(2000, 2021)]

In [10]:
## Drop every movie whose genres mention "Documentary" (case-insensitive match)
is_documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics.loc[~is_documentary]

# Akas

In [11]:
# Load the title.akas table straight from its URL (tab-separated file).
akas = pd.read_csv(
    akas_url,
    sep='\t',
    low_memory=False,
)

In [12]:
# Preview the first rows of the freshly loaded table
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [13]:
# Swap the "\N" placeholder strings for real missing values (np.nan)
akas = akas.replace({'\\N': np.nan})

In [14]:
# Keep only the rows whose region is 'US'
is_us_region = akas['region'].eq('US')
akas = akas.loc[is_us_region]

In [15]:
# Boolean mask: True for each basics row whose tconst also occurs as a
# titleId in akas.
akas_title_ids = akas['titleId']
keepers = basics['tconst'].isin(akas_title_ids)

In [16]:
# Keep only the rows flagged by the keepers mask, then show the result
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34793,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61096,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67642,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86772,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93908,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
...,...,...,...,...,...,...,...,...,...
9361236,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9361632,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9361772,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9361781,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


# Ratings

In [17]:
# Load the title.ratings table straight from its URL (tab-separated file).
ratings = pd.read_csv(
    ratings_url,
    sep='\t',
    low_memory=False,
)

In [18]:
# Preview the first rows of the freshly loaded table
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1922
1,tt0000002,5.8,259
2,tt0000003,6.5,1734
3,tt0000004,5.6,174
4,tt0000005,6.2,2545


In [19]:
# Swap any "\N" placeholder strings for real missing values (np.nan)
ratings = ratings.replace({'\\N': np.nan})

In [20]:
# Keep only the ratings whose tconst also occurs as a titleId in akas
keepers = ratings['tconst'].isin(akas['titleId'])
ratings = ratings.loc[keepers]

# Summary

In [21]:
# Summarise columns, non-null counts and dtypes of the filtered table
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79863 entries, 34793 to 9361865
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          79863 non-null  object
 1   titleType       79863 non-null  object
 2   primaryTitle    79863 non-null  object
 3   originalTitle   79863 non-null  object
 4   isAdult         79863 non-null  object
 5   startYear       79863 non-null  object
 6   endYear         0 non-null      object
 7   runtimeMinutes  79863 non-null  object
 8   genres          79863 non-null  object
dtypes: object(9)
memory usage: 6.1+ MB


In [22]:
# endYear has no non-null values left (see the info() summary), so remove it
basics = basics.drop(columns=['endYear'])

In [23]:
# Re-check the summary now that endYear has been dropped
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79863 entries, 34793 to 9361865
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          79863 non-null  object
 1   titleType       79863 non-null  object
 2   primaryTitle    79863 non-null  object
 3   originalTitle   79863 non-null  object
 4   isAdult         79863 non-null  object
 5   startYear       79863 non-null  object
 6   runtimeMinutes  79863 non-null  object
 7   genres          79863 non-null  object
dtypes: object(8)
memory usage: 5.5+ MB


In [24]:
# Summarise columns, non-null counts and dtypes of the filtered ratings
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474250 entries, 0 to 1246144
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         474250 non-null  object 
 1   averageRating  474250 non-null  float64
 2   numVotes       474250 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.5+ MB


In [25]:
# Summarise columns, non-null counts and dtypes of the filtered akas
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1366189 entries, 5 to 33780853
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1366189 non-null  object
 1   ordering         1366189 non-null  int64 
 2   title            1366189 non-null  object
 3   region           1366189 non-null  object
 4   language         3721 non-null     object
 5   types            965998 non-null   object
 6   attributes       45291 non-null    object
 7   isOriginalTitle  1364814 non-null  object
dtypes: int64(1), object(7)
memory usage: 93.8+ MB


# Data Save

In [17]:
# Make sure the output folder exists before anything is saved into it;
# exist_ok=True means no error is raised if the folder is already there.
# (`os` is imported in the import cell at the top of the notebook.)
os.makedirs('Data/',exist_ok=True)
# Confirm the folder was created by listing its contents
os.listdir("Data/")

[]

In [26]:
## Write the filtered basics table to a gzip-compressed CSV, without the index
basics.to_csv(
    "Data/title_basics.csv.gz",
    compression='gzip',
    index=False,
)

In [27]:
# Read the saved file back in and preview the first rows
basics = pd.read_csv(
    "Data/title_basics.csv.gz",
    low_memory=False,
)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,126,Drama


In [28]:
# Summarise the reloaded table (columns, non-null counts, dtypes)
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79863 entries, 0 to 79862
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          79863 non-null  object
 1   titleType       79863 non-null  object
 2   primaryTitle    79863 non-null  object
 3   originalTitle   79863 non-null  object
 4   isAdult         79863 non-null  int64 
 5   startYear       79863 non-null  int64 
 6   runtimeMinutes  79863 non-null  int64 
 7   genres          79863 non-null  object
dtypes: int64(3), object(5)
memory usage: 4.9+ MB


In [29]:
# Write the filtered ratings table to a gzip-compressed CSV, without the index
ratings.to_csv(
    "Data/title_ratings.csv.gz",
    compression='gzip',
    index=False,
)

In [30]:
# Write the filtered akas table to a gzip-compressed CSV, without the index
akas.to_csv(
    "Data/title_akas.csv.gz",
    compression='gzip',
    index=False,
)