In [4]:
#Import lIBRARIES
import pandas as pd
import numpy as np
import os as os

In [5]:
# example making new folder with os

os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['basics_clean.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title.akas.csv.gz',
 'title.ratings.csv.gz',
 'title_basics.csv.gz',
 'tmdb_api_results_2001.json']

# Basics Data Load & Clean

In [6]:
##Load Basics set and assign to DF

basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [7]:
#Assign local file to DF
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [8]:
#Basics preprocessing

#Replace "\N" with np.nan
basics.replace({'\\N':np.nan}, inplace=True) 
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [9]:
#Eliminate movies that are null for runtimeMinutes
basics.dropna(subset=['runtimeMinutes','genres'], inplace=True)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [10]:
basics['genres'].isna().sum()

0

In [11]:
#keep only titleType==Movie
movie_filter = basics['titleType']=='movie'
movie_filter.head()

basics[movie_filter].head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
1172,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,0,1910,,58,"Adventure,Drama"
1273,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50,"Biography,Drama,Family"


In [12]:
basics['runtimeMinutes'].dropna(inplace=True)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [13]:
#keep startYear 2000-2022
start_filt = basics['startYear'] >= '2000'
start_filt.head()

0    False
1    False
2    False
3    False
4    False
Name: startYear, dtype: bool

In [14]:
end_filt = basics['startYear']<='2022'
end_filt.head()

0    True
1    True
2    True
3    True
4    True
Name: startYear, dtype: bool

In [15]:
basics = basics.loc[start_filt & end_filt, :]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,133,Documentary
33805,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,,20,Short
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
39547,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021,,6,Short
43551,tt0044326,short,Abstronic,Abstronic,0,2021,,6,Short


In [16]:
#Eliminate movies that include  "Documentary" in genre (see tip below)
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33805,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,,20,Short
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
39547,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021,,6,Short
43551,tt0044326,short,Abstronic,Abstronic,0,2021,,6,Short
44093,tt0044879,short,Mandala,Mandala,0,2021,,3,Short


In [17]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1417432 entries, 33805 to 8982588
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   tconst          1417432 non-null  object
 1   titleType       1417432 non-null  object
 2   primaryTitle    1417432 non-null  object
 3   originalTitle   1417432 non-null  object
 4   isAdult         1417432 non-null  object
 5   startYear       1417432 non-null  object
 6   endYear         21080 non-null    object
 7   runtimeMinutes  1417432 non-null  object
 8   genres          1417432 non-null  object
dtypes: object(9)
memory usage: 108.1+ MB


# AKAs Data Load & Clean

In [18]:
##Load AKAs data set and save locally 
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

akas.to_csv("Data/title.akas.csv.gz",compression='gzip',index=False)

In [19]:
#Assign local file to DF
akas = pd.read_csv("Data/title.akas.csv.gz", low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [20]:
#AKA Preprocessing
akas.replace({'\\N':np.nan}, inplace=True) 

In [21]:
#AKA - keep only US entries. I keep breaking the main df, I'm creating a dummy
#df that I will move back once done.
dfa= akas
dfa.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [22]:
#Apply filter to retain only movies in US region
region_filt = dfa['region']=='US'
region_filt.head()

0    False
1    False
2    False
3    False
4    False
Name: region, dtype: bool

In [23]:
#Check that filtering worked...it did.  ONLY US region films are showing.
dfa[region_filt].head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [24]:
dfa = dfa.loc[region_filt, :]
dfa.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [25]:
dfa['region'].value_counts()

US    1326857
Name: region, dtype: int64

In [26]:
#Transfer dummy set back to actual df
akas = dfa
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [27]:
#Save new cleansed df to local csv file from earlier 
akas.to_csv("Data/title.akas.csv.gz",compression='gzip',index=False)

In [28]:
#Display df info as requested
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1326857 entries, 5 to 32244076
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1326857 non-null  object
 1   ordering         1326857 non-null  int64 
 2   title            1326857 non-null  object
 3   region           1326857 non-null  object
 4   language         3570 non-null     object
 5   types            1025868 non-null  object
 6   attributes       44141 non-null    object
 7   isOriginalTitle  1325482 non-null  object
dtypes: int64(1), object(7)
memory usage: 91.1+ MB


# Ratings Data Load & Clean

In [29]:
##Load Ratings data set and assign to DF
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

ratings.to_csv("Data/title.ratings.csv.gz",compression='gzip',index=False)

In [30]:
#Assign local file to DF
ratings = pd.read_csv("Data/title.ratings.csv.gz", low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1886
1,tt0000002,5.9,250
2,tt0000003,6.5,1673
3,tt0000004,5.8,163
4,tt0000005,6.2,2493


In [31]:
#RATINGS Preprocessing
ratings.replace({'\\N':np.nan}) 

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1886
1,tt0000002,5.9,250
2,tt0000003,6.5,1673
3,tt0000004,5.8,163
4,tt0000005,6.2,2493
...,...,...,...
1251875,tt9916690,6.5,6
1251876,tt9916720,5.3,223
1251877,tt9916730,8.4,6
1251878,tt9916766,6.7,20


In [32]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251880 entries, 0 to 1251879
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1251880 non-null  object 
 1   averageRating  1251880 non-null  float64
 2   numVotes       1251880 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.7+ MB


In [33]:
ratings.to_csv("Data/title.ratings.csv.gz",compression='gzip',index=False)

# Filtering Dataframes Upon One Another

In [34]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

33805       True
34805       True
39547       True
43551       True
44093       True
           ...  
8982525     True
8982545    False
8982580    False
8982587    False
8982588    False
Name: tconst, Length: 1417432, dtype: bool

In [35]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33805,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,,20,Short
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
39547,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021,,6,Short
43551,tt0044326,short,Abstronic,Abstronic,0,2021,,6,Short
44093,tt0044879,short,Mandala,Mandala,0,2021,,3,Short
...,...,...,...,...,...,...,...,...,...
8982281,tt9916214,short,Drown the Clown,Drown the Clown,0,2019,,8,"Drama,Short"
8982301,tt9916254,video,Big Tit Cream Pie 32,Big Tit Cream Pie 32,1,2015,,226,Adult
8982347,tt9916348,video,Ancient World Exposed,Ancient World Exposed,0,2019,,67,History
8982354,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [36]:
basics.to_csv("Data/basics_clean.csv.gz",compression='gzip',index=False)