# Movie Database with SQL 
Author: Kim Hazed Delfino


## Imports 

In [312]:
import pandas as pd
import numpy as np

## Load Dataset

In [313]:
# Load files
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

### Preprossing - Title Basics

In [314]:
# Drop missing values from runtimeMinutes and genre
runtime_filter = basics['runtimeMinutes'] != '\\N'

genre_filter = basics['genres'] != '\\N'



In [315]:
# Create new filtered df 
movie_rtime_genre_filtered = basics[runtime_filter & genre_filter]

In [316]:
# Filter titleType and startYear
type_filter = movie_rtime_genre_filtered['titleType'] == 'movie'

years_filter = movie_rtime_genre_filtered['startYear'] != '\\N'

In [317]:
# Create filtered df 
movies_df = movie_rtime_genre_filtered[type_filter & years_filter]

In [318]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376423 entries, 8 to 9875688
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          376423 non-null  object
 1   titleType       376423 non-null  object
 2   primaryTitle    376423 non-null  object
 3   originalTitle   376423 non-null  object
 4   isAdult         376423 non-null  object
 5   startYear       376423 non-null  object
 6   endYear         376423 non-null  object
 7   runtimeMinutes  376423 non-null  object
 8   genres          376423 non-null  object
dtypes: object(9)
memory usage: 28.7+ MB


In [319]:
# Convert startYear value into int dtype
movies_df['startYear'] = movies_df['startYear'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['startYear'] = movies_df['startYear'].astype(int)


In [320]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376423 entries, 8 to 9875688
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          376423 non-null  object
 1   titleType       376423 non-null  object
 2   primaryTitle    376423 non-null  object
 3   originalTitle   376423 non-null  object
 4   isAdult         376423 non-null  object
 5   startYear       376423 non-null  int32 
 6   endYear         376423 non-null  object
 7   runtimeMinutes  376423 non-null  object
 8   genres          376423 non-null  object
dtypes: int32(1), object(8)
memory usage: 27.3+ MB


In [321]:
max(movies_df['startYear'])

2029

In [322]:
# Filter movies with only startYear 2000 to 2022
year_filter_2000 = movies_df['startYear'] >= 2000
year_filter_2021 = movies_df['startYear'] < 2022
movies_20_to_21 = movies_df[year_filter_2000 & year_filter_2021]

In [323]:
movies_20_to_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210633 entries, 13082 to 9875688
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          210633 non-null  object
 1   titleType       210633 non-null  object
 2   primaryTitle    210633 non-null  object
 3   originalTitle   210633 non-null  object
 4   isAdult         210633 non-null  object
 5   startYear       210633 non-null  int32 
 6   endYear         210633 non-null  object
 7   runtimeMinutes  210633 non-null  object
 8   genres          210633 non-null  object
dtypes: int32(1), object(8)
memory usage: 15.3+ MB


In [324]:
# Replace '\N' values with np.nan
movies_20_to_21.replace({'\\N':np.nan},inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_20_to_21.replace({'\\N':np.nan},inplace=True)


In [325]:
# Check df
movies_20_to_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210633 entries, 13082 to 9875688
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          210633 non-null  object 
 1   titleType       210633 non-null  object 
 2   primaryTitle    210633 non-null  object 
 3   originalTitle   210633 non-null  object 
 4   isAdult         210633 non-null  object 
 5   startYear       210633 non-null  int32  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  210633 non-null  object 
 8   genres          210633 non-null  object 
dtypes: float64(1), int32(1), object(7)
memory usage: 15.3+ MB


In [326]:
movies_20_to_21.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,94,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67668,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86800,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [327]:
# Filter out Documentary in our df
is_documentary = movies_20_to_21['genres'].str.contains('fictional',case=False)
movies_20_21_filtered = movies_20_to_21[~is_documentary]



In [328]:
movies_20_21_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210633 entries, 13082 to 9875688
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          210633 non-null  object 
 1   titleType       210633 non-null  object 
 2   primaryTitle    210633 non-null  object 
 3   originalTitle   210633 non-null  object 
 4   isAdult         210633 non-null  object 
 5   startYear       210633 non-null  int32  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  210633 non-null  object 
 8   genres          210633 non-null  object 
dtypes: float64(1), int32(1), object(7)
memory usage: 15.3+ MB


### Preprocessing - AKAs

In [329]:
# Check df 
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35996402 entries, 0 to 35996401
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [330]:
# Keep only US region
us_akas = akas[akas['region']== 'US']

In [331]:
# Replace '\\N' with np.nan
us_akas.replace({'\\N':np.nan},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_akas.replace({'\\N':np.nan},inplace=True)


In [332]:
# Double check
us_akas.sample(5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
3299779,tt0968539,1,Peepo on Trial,US,,,,0
887930,tt0105395,16,Shootfighter: Fight to the Death,US,,imdbDisplay,,0
27408168,tt4397732,2,Talent You Should Know,US,,imdbDisplay,,0
29946489,tt6239150,2,Possessed by Love,US,,imdbDisplay,,0
14700297,tt15244244,2,Blixxie: Ice Cream,US,,imdbDisplay,,0


In [333]:
# Filter only US using AKAs dataset 
us_filter = movies_20_21_filtered['tconst'].isin(us_akas['titleId'])


In [334]:
# Create filtered df
us_movies_20_21_df = movies_20_21_filtered[us_filter]

In [335]:
# Doublec check df 
us_movies_20_21_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114218 entries, 34803 to 9875504
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          114218 non-null  object 
 1   titleType       114218 non-null  object 
 2   primaryTitle    114218 non-null  object 
 3   originalTitle   114218 non-null  object 
 4   isAdult         114218 non-null  object 
 5   startYear       114218 non-null  int32  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  114218 non-null  object 
 8   genres          114218 non-null  object 
dtypes: float64(1), int32(1), object(7)
memory usage: 8.3+ MB


In [336]:
us_movies_20_21_df.sample(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
2489401,tt12768268,movie,How far a family movie can get,Até Onde Pode Chegar um Filme de Família,0,2019,,75,"Biography,Documentary"
403836,tt0421059,movie,Gambling,Gambling,0,2004,,90,"Drama,Mystery"
1108579,tt10253466,movie,Badland,Badland,0,2019,,117,"Drama,Western"
8892448,tt7778420,movie,Close Friends,Close Friends,0,2018,,70,"Comedy,Drama,Romance"
8187295,tt6217648,movie,House of Paper,House of Paper,0,2017,,99,Drama


### Preprocessing - Ratings

In [337]:
# Check df
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1314729 entries, 0 to 1314728
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1314729 non-null  object 
 1   averageRating  1314729 non-null  float64
 2   numVotes       1314729 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.1+ MB


In [338]:
ratings.sample(5)

Unnamed: 0,tconst,averageRating,numVotes
375504,tt0755441,7.2,6
293782,tt0566449,6.8,121
34216,tt0054912,5.5,593
618019,tt13314962,7.5,9
761225,tt17016944,7.3,9


In [339]:
# Filter only US region
us_rating_filter =  ratings['tconst'].isin(us_akas['titleId'])

In [340]:
# Create filtered df 
us_ratings_df = ratings[us_rating_filter]

# Replace '\\N' with np.nan
us_ratings_df.replace({'\\N':np.nan},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_ratings_df.replace({'\\N':np.nan},inplace=True)


In [341]:
# Double check df
us_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 499798 entries, 0 to 1314704
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         499798 non-null  object 
 1   averageRating  499798 non-null  float64
 2   numVotes       499798 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB


In [342]:
us_ratings_df.sample(5)

Unnamed: 0,tconst,averageRating,numVotes
1033364,tt4136020,5.2,15
1038401,tt4212436,7.4,6
785042,tt1809249,6.3,20
928426,tt2644714,4.5,3020
20472,tt0038535,4.2,84


In [343]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")


['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

In [344]:
## Save dataframe to file.
us_movies_20_21_df.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

us_ratings_df.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

us_akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

