# Movie Database with SQL 
Author: Kim Hazed Delfino


## Imports 

In [196]:
import pandas as pd
import numpy as np

## Load Dataset

In [213]:
# Load files
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

### Preprossing - Title Basics

In [229]:
# Drop missing values from runtimeMinutes and genre
runtime_filter = basics['runtimeMinutes'] != '\\N'

genre_filter = basics['genres'] != '\\N'



In [230]:
# Create new filtered df 
movie_rtime_genre_filtered = basics[runtime_filter & genre_filter]

In [232]:
# Filter titleType and startYear
type_filter = movie_rtime_genre_filtered['titleType'] == 'movie'

years_filter = movie_rtime_genre_filtered['startYear'] != '\\N'

In [239]:
# Create filtered df 
movies_df = movie_rtime_genre_filtered[type_filter & years_filter]

In [240]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376423 entries, 8 to 9875688
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          376423 non-null  object
 1   titleType       376423 non-null  object
 2   primaryTitle    376423 non-null  object
 3   originalTitle   376423 non-null  object
 4   isAdult         376423 non-null  object
 5   startYear       376423 non-null  object
 6   endYear         376423 non-null  object
 7   runtimeMinutes  376423 non-null  object
 8   genres          376423 non-null  object
dtypes: object(9)
memory usage: 28.7+ MB


In [243]:
# Convert startYear value into int dtype
movies_df['startYear'] = movies_df['startYear'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['startYear'] = movies_df['startYear'].astype(int)


In [244]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376423 entries, 8 to 9875688
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          376423 non-null  object
 1   titleType       376423 non-null  object
 2   primaryTitle    376423 non-null  object
 3   originalTitle   376423 non-null  object
 4   isAdult         376423 non-null  object
 5   startYear       376423 non-null  int32 
 6   endYear         376423 non-null  object
 7   runtimeMinutes  376423 non-null  object
 8   genres          376423 non-null  object
dtypes: int32(1), object(8)
memory usage: 27.3+ MB


In [246]:
max(movies_df['startYear'])

2029

In [247]:
# Filter movies with only startYear 2000 to 2022
year_filter_2000 = movies_df['startYear'] >= 2000
year_filter_2022 = movies_df['startYear'] < 2023
movies_20_to_22 = movies_df[year_filter_2000 & year_filter_2022]

In [248]:
movies_20_to_22.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223494 entries, 13082 to 9875688
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          223494 non-null  object
 1   titleType       223494 non-null  object
 2   primaryTitle    223494 non-null  object
 3   originalTitle   223494 non-null  object
 4   isAdult         223494 non-null  object
 5   startYear       223494 non-null  int32 
 6   endYear         223494 non-null  object
 7   runtimeMinutes  223494 non-null  object
 8   genres          223494 non-null  object
dtypes: int32(1), object(8)
memory usage: 16.2+ MB


In [249]:
# Replace '\N' values with np.nan
movies_20_to_22.replace({'\\N':np.nan},inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_20_to_22.replace({'\\N':np.nan},inplace=True)


In [252]:
# Check df
movies_20_to_22.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223494 entries, 13082 to 9875688
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          223494 non-null  object 
 1   titleType       223494 non-null  object 
 2   primaryTitle    223494 non-null  object 
 3   originalTitle   223494 non-null  object 
 4   isAdult         223494 non-null  object 
 5   startYear       223494 non-null  int32  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  223494 non-null  object 
 8   genres          223494 non-null  object 
dtypes: float64(1), int32(1), object(7)
memory usage: 16.2+ MB


In [251]:
movies_20_to_22.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,94,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67668,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
76058,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,0,2022,,46,Documentary


In [257]:
# Filter out Documentary in our df
is_documentary = movies_20_to_22['genres'].str.contains('documentary',case=False)
movies_20_22_filtered = movies_20_to_22[~is_documentary]



In [258]:
movies_20_22_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147604 entries, 34803 to 9875588
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          147604 non-null  object 
 1   titleType       147604 non-null  object 
 2   primaryTitle    147604 non-null  object 
 3   originalTitle   147604 non-null  object 
 4   isAdult         147604 non-null  object 
 5   startYear       147604 non-null  int32  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  147604 non-null  object 
 8   genres          147604 non-null  object 
dtypes: float64(1), int32(1), object(7)
memory usage: 10.7+ MB


### Preprocessing - AKAs

In [278]:
# Check df 
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1438843 entries, 5 to 35996146
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1438843 non-null  object
 1   ordering         1438843 non-null  int64 
 2   title            1438843 non-null  object
 3   region           1438843 non-null  object
 4   language         1438843 non-null  object
 5   types            1438843 non-null  object
 6   attributes       1438843 non-null  object
 7   isOriginalTitle  1438843 non-null  object
dtypes: int64(1), object(7)
memory usage: 98.8+ MB


In [279]:
# Keep only US region
akas_us = akas[akas['region']== 'US']

In [281]:
# Replace '\\N' with np.nan
akas_us.replace({'\\N':np.nan},inplace=True)

In [282]:
# Double check
akas_us.sample(5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
24789755,tt27526101,1,The Things People Want,US,,,,0
27949328,tt4777018,3,The Mystery of Casa Matusita II: The Five Guests,US,,,,0
10851852,tt13405164,2,Suspended Homicide,US,,imdbDisplay,,0
31020331,tt7010566,1,Dirty Hollywood,US,,imdbDisplay,,0
12167008,tt14039086,17,Run & Gun,US,,imdbDisplay,,0


In [283]:
# Filter only US using AKAs dataset 
us_filter = movies_20_22_filtered['tconst'].isin(akas_us['titleId'])


In [284]:
# Create filtered df
us_movies_20_22_df = movies_20_22_filtered[us_filter]

In [285]:
# Doublec check df 
us_movies_20_22_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86744 entries, 34803 to 9875504
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86744 non-null  object 
 1   titleType       86744 non-null  object 
 2   primaryTitle    86744 non-null  object 
 3   originalTitle   86744 non-null  object 
 4   isAdult         86744 non-null  object 
 5   startYear       86744 non-null  int32  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86744 non-null  object 
 8   genres          86744 non-null  object 
dtypes: float64(1), int32(1), object(7)
memory usage: 6.3+ MB


In [286]:
us_movies_20_22_df.sample(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
7986514,tt5764222,movie,Killer 2,Siu nin do sou,0,2003,,74,Crime
4699810,tt1776081,movie,Baker,Baker,0,2011,,127,"Action,Drama"
369452,tt0385703,movie,The Weakness of the Bolshevik,La flaqueza del bolchevique,0,2003,,95,"Adventure,Drama,Romance"
171838,tt0178043,movie,Stranger Than Fiction,Stranger Than Fiction,0,2000,,90,"Comedy,Thriller"
9121921,tt8278152,movie,Taboo Confessions: Sharing Family with Friends,Taboo Confessions: Sharing Family with Friends,1,2016,,69,Adult


### Preprocessing - Ratings

In [287]:
# Check df
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1314729 entries, 0 to 1314728
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1314729 non-null  object 
 1   averageRating  1314729 non-null  float64
 2   numVotes       1314729 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.1+ MB


In [299]:
ratings.sample(5)

Unnamed: 0,tconst,averageRating,numVotes
655519,tt14105790,8.9,18
800794,tt1874641,4.1,15
530979,tt11512156,8.0,7
32101,tt0052512,4.3,154
702185,tt1515742,6.8,9


In [300]:
# Filter only US region
us_rating_filter =  ratings['tconst'].isin(akas_us['titleId'])

In [304]:
# Create filtered df 
us_ratings_df = ratings[us_rating_filter]

# Replace '\\N' with np.nan
us_ratings_df.replace({'\\N':np.nan},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_ratings_df.replace({'\\N':np.nan},inplace=True)


In [305]:
# Double check df
us_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 499798 entries, 0 to 1314704
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         499798 non-null  object 
 1   averageRating  499798 non-null  float64
 2   numVotes       499798 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB


In [306]:
us_ratings_df.sample(5)

Unnamed: 0,tconst,averageRating,numVotes
69150,tt0095200,4.5,17
28561,tt0048383,5.6,21
718654,tt15526620,7.5,19
376444,tt0758050,5.1,19
179423,tt0300935,5.7,60


In [307]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")


[]

In [309]:
## Save dataframe to file.
us_movies_20_22_df.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

us_ratings_df.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

akas_us.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

