# Movie Database with SQL 
Author: Kim Hazed Delfino


## Imports 

In [1]:
import pandas as pd
import numpy as np

## Load Dataset

In [2]:
# Load files
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

### Preprossing - Title Basics

In [3]:
# Drop missing values from runtimeMinutes and genre
runtime_filter = basics['runtimeMinutes'] != '\\N'

genre_filter = basics['genres'] != '\\N'



In [4]:
# Create new filtered df 
basics_rtime_genre_filtered = basics[runtime_filter & genre_filter]

In [5]:
# Filter titleType and startYear
type_filter = basics_rtime_genre_filtered['titleType'] == 'movie'

years_filter = basics_rtime_genre_filtered['startYear'] != '\\N'

In [6]:
# Create filtered df 
basics_df = basics_rtime_genre_filtered[type_filter & years_filter]

In [7]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376707 entries, 8 to 9890178
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          376707 non-null  object
 1   titleType       376707 non-null  object
 2   primaryTitle    376707 non-null  object
 3   originalTitle   376707 non-null  object
 4   isAdult         376707 non-null  object
 5   startYear       376707 non-null  object
 6   endYear         376707 non-null  object
 7   runtimeMinutes  376707 non-null  object
 8   genres          376707 non-null  object
dtypes: object(9)
memory usage: 28.7+ MB


In [8]:
# Convert startYear value into int dtype
basics_df['startYear'] = basics_df['startYear'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics_df['startYear'] = basics_df['startYear'].astype(int)


In [9]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376707 entries, 8 to 9890178
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          376707 non-null  object
 1   titleType       376707 non-null  object
 2   primaryTitle    376707 non-null  object
 3   originalTitle   376707 non-null  object
 4   isAdult         376707 non-null  object
 5   startYear       376707 non-null  int32 
 6   endYear         376707 non-null  object
 7   runtimeMinutes  376707 non-null  object
 8   genres          376707 non-null  object
dtypes: int32(1), object(8)
memory usage: 27.3+ MB


In [10]:
max(basics_df['startYear'])

2029

In [11]:
# Filter movies with only startYear 2000 to 2022
year_filter_2000 = basics_df['startYear'] >= 2000
year_filter_2021 = basics_df['startYear'] < 2022
basics_20_to_21 = basics_df[year_filter_2000 & year_filter_2021]

In [12]:
basics_20_to_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210703 entries, 13082 to 9890178
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          210703 non-null  object
 1   titleType       210703 non-null  object
 2   primaryTitle    210703 non-null  object
 3   originalTitle   210703 non-null  object
 4   isAdult         210703 non-null  object
 5   startYear       210703 non-null  int32 
 6   endYear         210703 non-null  object
 7   runtimeMinutes  210703 non-null  object
 8   genres          210703 non-null  object
dtypes: int32(1), object(8)
memory usage: 15.3+ MB


In [13]:
# Replace '\N' values with np.nan
basics_20_to_21.replace({'\\N':np.nan},inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics_20_to_21.replace({'\\N':np.nan},inplace=True)


In [14]:
# Check df
basics_20_to_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210703 entries, 13082 to 9890178
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          210703 non-null  object 
 1   titleType       210703 non-null  object 
 2   primaryTitle    210703 non-null  object 
 3   originalTitle   210703 non-null  object 
 4   isAdult         210703 non-null  object 
 5   startYear       210703 non-null  int32  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  210703 non-null  object 
 8   genres          210703 non-null  object 
dtypes: float64(1), int32(1), object(7)
memory usage: 15.3+ MB


In [15]:
basics_20_to_21.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,94,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
42384,tt0043139,movie,Life of a Beijing Policeman,Wo zhe yi bei zi,0,2013,,120,"Drama,History"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67668,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama


In [16]:
# Filter out Documentary in our df
is_documentary = basics_20_to_21['genres'].str.contains('documentary',case=False)
basics_20_21_filtered = basics_20_to_21[~is_documentary]



In [17]:
basics_20_21_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138578 entries, 34803 to 9890078
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          138578 non-null  object 
 1   titleType       138578 non-null  object 
 2   primaryTitle    138578 non-null  object 
 3   originalTitle   138578 non-null  object 
 4   isAdult         138578 non-null  object 
 5   startYear       138578 non-null  int32  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  138578 non-null  object 
 8   genres          138578 non-null  object 
dtypes: float64(1), int32(1), object(7)
memory usage: 10.0+ MB


### Preprocessing - AKAs

In [18]:
# Check df 
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36053786 entries, 0 to 36053785
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [19]:
# Keep only US region
us_akas = akas[akas['region']== 'US']

In [20]:
# Replace '\\N' with np.nan
us_akas.replace({'\\N':np.nan},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_akas.replace({'\\N':np.nan},inplace=True)


In [21]:
# Double check
us_akas.sample(5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
23116028,tt2496622,1,Hibernation,US,,,,0
17317756,tt17077914,1,I'm Worth It,US,,,,0
2299219,tt0564267,1,"Now You See It, Now You Don't",US,,,,0
25542465,tt2977642,1,Big House Blues,US,,,,0
17807943,tt17940400,2,Fifty Times Rock,US,,imdbDisplay,,0


In [22]:
# Filter only US using AKAs dataset 
us_filter = basics_20_21_filtered['tconst'].isin(us_akas['titleId'])


In [23]:
# Create filtered df
us_basics_df = basics_20_21_filtered[us_filter]

In [24]:
# Doublec check df 
us_basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81782 entries, 34803 to 9889994
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81782 non-null  object 
 1   titleType       81782 non-null  object 
 2   primaryTitle    81782 non-null  object 
 3   originalTitle   81782 non-null  object 
 4   isAdult         81782 non-null  object 
 5   startYear       81782 non-null  int32  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  81782 non-null  object 
 8   genres          81782 non-null  object 
dtypes: float64(1), int32(1), object(7)
memory usage: 5.9+ MB


In [25]:
us_basics_df.sample(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
6000369,tt2376354,movie,Del Shores: Sordid Confessions,Del Shores: Sordid Confessions,0,2012,,90,Comedy
6867208,tt3189862,movie,Mission NinetyTwo,NinetyTwo,0,2015,,88,"Action,Adventure,Thriller"
192364,tt0200171,movie,Stanley's Gig,Stanley's Gig,0,2000,,93,"Drama,Music"
5385152,tt2111274,movie,Bleed 4 Me,Bleed 4 Me,0,2011,,89,Horror
8204518,tt6223806,movie,Under the Tree,Undir trénu,0,2017,,89,"Comedy,Drama,Thriller"


### Preprocessing - Ratings

In [26]:
# Check df
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1316828 entries, 0 to 1316827
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1316828 non-null  object 
 1   averageRating  1316828 non-null  float64
 2   numVotes       1316828 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.1+ MB


In [27]:
ratings.sample(5)

Unnamed: 0,tconst,averageRating,numVotes
935732,tt26919883,5.0,6
300985,tt0580694,9.2,14
1268726,tt8578528,8.3,7
413613,tt0867040,8.2,80
952310,tt2874436,6.4,7


In [28]:
# Filter only US region
us_rating_filter =  ratings['tconst'].isin(us_akas['titleId'])

In [29]:
# Create filtered df 
us_ratings_df = ratings[us_rating_filter]

# Replace '\\N' with np.nan
us_ratings_df.replace({'\\N':np.nan},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_ratings_df.replace({'\\N':np.nan},inplace=True)


In [30]:
# Double check df
us_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500349 entries, 0 to 1316803
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         500349 non-null  object 
 1   averageRating  500349 non-null  float64
 2   numVotes       500349 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB


In [31]:
us_ratings_df.sample(5)

Unnamed: 0,tconst,averageRating,numVotes
76591,tt0103933,4.5,209
274419,tt0522865,6.7,164
359869,tt0712777,6.6,26
844908,tt2105660,7.7,2186
15367,tt0032392,6.0,33


In [32]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")


[]

In [33]:
## Save dataframe to file.
us_basics_df.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

us_ratings_df.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

us_akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

