In [6]:
import pandas as pd
import numpy as np

In [2]:
rate_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'

# AKAS Cleaning

In [3]:
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

In [7]:
#Replace "\N" with np.nan
akas_copy = akas.copy()
#akas.replace({'"\\N"','np.nan'},inplace=True)
akas = akas.replace({'\\N':np.nan})

In [8]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [10]:
#keep only US movies.
akas= akas[(akas['region'] == 'US')]
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [11]:
akas.isna().sum()

titleId                  0
ordering                 0
title                    0
region                   0
language           1400132
types               434145
attributes         1358221
isOriginalTitle       1345
dtype: int64

# Rating Cleaning

In [12]:
rate = pd.read_csv(rate_url, sep='\t', low_memory=False)

In [13]:
rate.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1936
1,tt0000002,5.8,262
2,tt0000003,6.5,1760
3,tt0000004,5.6,177
4,tt0000005,6.2,2570


In [14]:
#check for duplicates
rate.duplicated().sum()

0

In [15]:
#Check for missing values
rate.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [16]:
#Check for unique values
rate.nunique()

tconst           1265785
averageRating         91
numVotes           20978
dtype: int64

In [17]:
#Replace "\N" with np.nan (if any)
rate = rate.replace({'\\N':np.nan})

In [18]:
#Keep only US movies
keepers = rate['tconst'].isin(akas['titleId'])
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
1265780    False
1265781    False
1265782    False
1265783    False
1265784    False
Name: tconst, Length: 1265785, dtype: bool

In [31]:
rate_us = keepers.to_frame()

# Basic Cleaning

In [21]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [22]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [23]:
#check for duplicates
basics.duplicated().sum()

0

In [24]:
#Replace "\N" with np.nan
basics = basics.replace({'\\N':np.nan})

In [25]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [26]:
#Eliminate movies that are null for runtimeMinutes
basics.dropna(subset=['runtimeMinutes'], inplace=True)
basics.dropna(subset=['genres'], inplace=True)
basics.dropna(subset=['startYear'], inplace=True)

In [27]:
basics['runtimeMinutes'].isnull().value_counts()

False    2532052
Name: runtimeMinutes, dtype: int64

In [28]:
basics['runtimeMinutes'].to_frame()

Unnamed: 0,runtimeMinutes
0,1
1,5
2,4
3,12
4,1
...,...
9516871,49
9516877,43
9516912,11
9516919,27


In [29]:
#Check for missing values
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear               0
endYear           2484371
runtimeMinutes          0
genres                  0
dtype: int64

In [30]:
#Check for unique values
basics.nunique()

tconst            2532052
titleType              10
primaryTitle      1751598
originalTitle     1769777
isAdult                 2
startYear             146
endYear                92
runtimeMinutes        865
genres               2212
dtype: int64

In [31]:
#Check Data types
basics.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object

In [32]:
#keep only titleType==Movie
basics['titleType'].value_counts()

tvEpisode       1183998
short            581121
movie            368484
video            177581
tvMovie           90153
tvSeries          87852
tvSpecial         17435
tvMiniSeries      16590
tvShort            8525
videoGame           313
Name: titleType, dtype: int64

In [33]:
basics = basics[(basics['titleType'] == 'movie')]

In [35]:
#Eliminate movies that include "Documentary" in genre 
basics.drop(basics[basics['genres'].str.contains('Documentary')].index, inplace = True)

In [36]:
#keep startYear 2000-2022
basics['startYear'] = basics['startYear'].astype('int')

In [37]:
basics = basics[(basics['startYear'] >= 2000)]

In [38]:
basics['startYear'].value_counts()

2018    9619
2017    9428
2019    9354
2016    9017
2015    8584
2022    8503
2021    8175
2014    8163
2013    7788
2020    7548
2012    7305
2011    6772
2010    6364
2009    5973
2008    5212
2007    4628
2006    4384
2005    3897
2004    3531
2003    3222
2002    2987
2001    2850
2000    2730
2023     731
2024       2
2025       2
Name: startYear, dtype: int64

# Info 

In [39]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146769 entries, 34803 to 9516771
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          146769 non-null  object
 1   titleType       146769 non-null  object
 2   primaryTitle    146769 non-null  object
 3   originalTitle   146769 non-null  object
 4   isAdult         146769 non-null  object
 5   startYear       146769 non-null  int32 
 6   endYear         0 non-null       object
 7   runtimeMinutes  146769 non-null  object
 8   genres          146769 non-null  object
dtypes: int32(1), object(8)
memory usage: 10.6+ MB


In [40]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1403897 entries, 5 to 34495702
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1403897 non-null  object
 1   ordering         1403897 non-null  int64 
 2   title            1403897 non-null  object
 3   region           1403897 non-null  object
 4   language         3765 non-null     object
 5   types            969752 non-null   object
 6   attributes       45676 non-null    object
 7   isOriginalTitle  1402552 non-null  object
dtypes: int64(1), object(7)
memory usage: 96.4+ MB


In [41]:
rate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1265785 entries, 0 to 1265784
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1265785 non-null  object 
 1   averageRating  1265785 non-null  float64
 2   numVotes       1265785 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.0+ MB


# Download

In [42]:
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

[]

In [43]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [45]:
rate.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [46]:
akas.to_csv("Data/title_aka.csv.gz",compression='gzip',index=False)