# Movie Database with SQL 
Author: Kim Hazed Delfino


## Imports 

In [197]:
import pandas as pd
import numpy as np

## Load Dataset

In [198]:
# Load files
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

### Preprossing - Title Basics

In [199]:
# Drop missing values from runtimeMinutes and genre
runtime_filter = basics['runtimeMinutes'] != '\\N'

genre_filter = basics['genres'] != '\\N'



In [200]:
# Create new filtered df 
basics_rtime_genre_filtered = basics[runtime_filter & genre_filter]

In [201]:
# Filter titleType and startYear
type_filter = basics_rtime_genre_filtered['titleType'] == 'movie'

years_filter = basics_rtime_genre_filtered['startYear'] != '\\N'

In [202]:
# Create filtered df 
basics_df = basics_rtime_genre_filtered[type_filter & years_filter]

In [203]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 377300 entries, 8 to 9926829
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          377300 non-null  object
 1   titleType       377300 non-null  object
 2   primaryTitle    377300 non-null  object
 3   originalTitle   377300 non-null  object
 4   isAdult         377300 non-null  object
 5   startYear       377300 non-null  object
 6   endYear         377300 non-null  object
 7   runtimeMinutes  377300 non-null  object
 8   genres          377300 non-null  object
dtypes: object(9)
memory usage: 28.8+ MB


In [204]:
# Convert startYear value into int dtype
basics_df['startYear'] = basics_df['startYear'].astype(int)

In [205]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 377300 entries, 8 to 9926829
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          377300 non-null  object
 1   titleType       377300 non-null  object
 2   primaryTitle    377300 non-null  object
 3   originalTitle   377300 non-null  object
 4   isAdult         377300 non-null  object
 5   startYear       377300 non-null  int64 
 6   endYear         377300 non-null  object
 7   runtimeMinutes  377300 non-null  object
 8   genres          377300 non-null  object
dtypes: int64(1), object(8)
memory usage: 28.8+ MB


In [206]:
max(basics_df['startYear'])

2029

In [207]:
# Filter movies with only startYear 2000 to 2022
year_filter_2000 = basics_df['startYear'] >= 2000
year_filter_2021 = basics_df['startYear'] < 2022
basics_20_to_21 = basics_df[year_filter_2000 & year_filter_2021]

In [208]:
basics_20_to_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210788 entries, 13082 to 9926829
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          210788 non-null  object
 1   titleType       210788 non-null  object
 2   primaryTitle    210788 non-null  object
 3   originalTitle   210788 non-null  object
 4   isAdult         210788 non-null  object
 5   startYear       210788 non-null  int64 
 6   endYear         210788 non-null  object
 7   runtimeMinutes  210788 non-null  object
 8   genres          210788 non-null  object
dtypes: int64(1), object(8)
memory usage: 16.1+ MB


In [209]:
# Replace '\N' values with np.nan
basics_20_to_21.replace({'\\N':np.nan},inplace=True)


In [210]:
# Check df
basics_20_to_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210788 entries, 13082 to 9926829
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          210788 non-null  object 
 1   titleType       210788 non-null  object 
 2   primaryTitle    210788 non-null  object 
 3   originalTitle   210788 non-null  object 
 4   isAdult         210788 non-null  object 
 5   startYear       210788 non-null  int64  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  210788 non-null  object 
 8   genres          210788 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 16.1+ MB


In [211]:
basics_20_to_21.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,94,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
42384,tt0043139,movie,Life of a Beijing Policeman,Wo zhe yi bei zi,0,2013,,120,"Drama,History"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67668,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama


In [212]:
# Filter out Documentary in our df
is_documentary = basics_20_to_21['genres'].str.contains('documentary',case=False)
basics_20_21_filtered = basics_20_to_21[~is_documentary]



In [213]:
basics_20_21_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138613 entries, 34803 to 9926729
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          138613 non-null  object 
 1   titleType       138613 non-null  object 
 2   primaryTitle    138613 non-null  object 
 3   originalTitle   138613 non-null  object 
 4   isAdult         138613 non-null  object 
 5   startYear       138613 non-null  int64  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  138613 non-null  object 
 8   genres          138613 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 10.6+ MB


### Preprocessing - AKAs

In [214]:
# Check df 
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36216823 entries, 0 to 36216822
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


In [215]:
# Keep only US region
us_akas = akas[akas['region']== 'US']

In [216]:
# Replace '\\N' with np.nan
us_akas.replace({'\\N':np.nan},inplace=True)

In [217]:
# Double check
us_akas.sample(5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
15306122,tt1552972,1,2000 Army-Navy Game,US,,imdbDisplay,,0
17542844,tt1749038,1,The Mummenschantz Conundrum,US,,,,0
9075516,tt12578410,1,The Cantor's Last Cantata,US,,imdbDisplay,,0
1645669,tt0313853,1,Cannon Bootcamp,US,,imdbDisplay,,0
4502501,tt10504328,1,Clubby Cub,US,,imdbDisplay,,0


In [218]:
# Filter only US using AKAs dataset 
us_filter = basics_20_21_filtered['tconst'].isin(us_akas['titleId'])


In [219]:
# Create filtered df
us_basics_df = basics_20_21_filtered[us_filter]

In [220]:
# Doublec check df 
us_basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81762 entries, 34803 to 9926645
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81762 non-null  object 
 1   titleType       81762 non-null  object 
 2   primaryTitle    81762 non-null  object 
 3   originalTitle   81762 non-null  object 
 4   isAdult         81762 non-null  object 
 5   startYear       81762 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  81762 non-null  object 
 8   genres          81762 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 6.2+ MB


In [221]:
us_basics_df.sample(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
6016378,tt2382246,movie,Merry Christmas,Merry Christmas,0,2013,,83,"Comedy,Drama"
3047038,tt1380151,movie,Blood Honey,Blood Honey,0,2017,,95,"Drama,Horror,Mystery"
6439708,tt27047471,movie,Best Little Fishing Secret in the Gulf: Venice...,Best Little Fishing Secret in the Gulf: Venice...,0,2020,,77,Sport
792837,tt0818620,movie,The Sacrifice,The Sacrifice,0,2005,,95,Horror
229419,tt0239509,movie,Julie Johnson,Julie Johnson,0,2001,,93,"Drama,Romance"


### Preprocessing - Ratings

In [222]:
# Check df
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319129 entries, 0 to 1319128
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1319129 non-null  object 
 1   averageRating  1319129 non-null  float64
 2   numVotes       1319129 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.2+ MB


In [223]:
ratings.sample(5)

Unnamed: 0,tconst,averageRating,numVotes
997521,tt3506278,6.8,109
893027,tt2328942,7.3,369
659022,tt1416427,7.8,7
985049,tt3329402,7.7,924
719070,tt1551229,5.6,6


In [224]:
# Filter only US region
us_rating_filter =  ratings['tconst'].isin(us_akas['titleId'])

In [225]:
# Create filtered df 
us_ratings_df = ratings[us_rating_filter]

# Replace '\\N' with np.nan
us_ratings_df.replace({'\\N':np.nan},inplace=True)

In [226]:
# Double check df
us_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 501086 entries, 0 to 1319104
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         501086 non-null  object 
 1   averageRating  501086 non-null  float64
 2   numVotes       501086 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB


In [227]:
us_ratings_df.sample(5)

Unnamed: 0,tconst,averageRating,numVotes
261778,tt0495820,6.0,120
104591,tt0148605,4.4,22
623333,tt1340674,8.8,7
233222,tt0424433,5.1,42
180437,tt0302973,4.5,12


In [228]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")


['tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'tmdb_api_results_2001.json',
 'title_basics.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_ratings.csv.gz']

In [229]:
## Save dataframe to file.
us_basics_df.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

us_ratings_df.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

us_akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)



# API Calls 

## Imports

In [230]:
import os, time, json
import tmdbsimple as tmdb
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'tmdb_api_results_2001.json',
 'title_basics.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_ratings.csv.gz']

## Credentials

In [231]:
with open("/Users/kim/.secret/tmdb_api.json", "r") as f:
    login = json.load(f)

# Display loaded dict 
login.keys()

dict_keys(['client-id', 'api-key'])

In [232]:
# Set API Key 
tmdb.API_KEY = login['client-id'] # api key stored as client-id and api_token as api-key

## Custom Function 

In [233]:
def write_json(new_data, filename):     
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [234]:
def get_movie_with_rating(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # Save the .info .realeases dict
    info = movie.info()

    release = movie.releases()
    # Loop through countries in releases
    for c in release['countries']:
        # Store if c == US 
        if c['iso_3166_1'] == 'US':
            info['certification'] = c['certification']

    return info

In [235]:
# Testing custom func
avenger_info = get_movie_with_rating("tt0848228")
avenger_info

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 108.524,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [236]:
the_notebook_info = get_movie_with_rating("tt0332280")
the_notebook_info

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 53.152,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/5ThIuO93vsk47oexKTSdfKEr7EC.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 26

In [237]:
# Load basics csv
basics_df = pd.read_csv("/Users/kim/Documents/GitHub/repo/Movie_database/Data/title_basics.csv.gz")

In [238]:
# Set target 
YEARS_TO_GET  = [2000,2001]


In [239]:
# Capture errors
errors = []

In [240]:
# Loop through the data
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position = 0):
    JSON_FILE = f"{FOLDER}tmdb_api_results_{YEAR}.json"
    is_file = os.path.isfile(JSON_FILE)
    if not is_file:
        with open(JSON_FILE,'w') as f:
            json.dump([{"imdb_id":0}],f)

        df = basics_df.loc[basics_df['startYear']== YEAR].copy()
        movie_ids = df['tconst'].copy()
        # load existing data
        previous_df = pd.read_json(JSON_FILE)

        movie_id_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

        for movie_id in tqdm_notebook(movie_id_to_get,
                                      desc= f'Movies from {YEAR}',
                                      position =1,
                                      leave=True):
            try:
                # Store data for the movie id 
                temp = get_movie_with_rating(movie_id)
                write_json(temp,JSON_FILE)
                time.sleep(.02)
                
            except Exception as e:
                errors.append([movie_id,e])
        final_year_df = pd.read_json(JSON_FILE)
        final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression='gzip',
                     index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

In [241]:
print(f" total errors: {len(errors)}")

 total errors: 0


In [242]:
# load the final datas
tmdb_2000 = pd.read_csv("/Users/kim/Documents/GitHub/repo/Movie_database/Data/final_tmdb_data_2000.csv.gz", sep=',',lineterminator='\n', low_memory=False)
tmdb_2001 = pd.read_csv("/Users/kim/Documents/GitHub/repo/Movie_database/Data/final_tmdb_data_2001.csv.gz", sep=',',lineterminator='\n',low_memory=False)


In [243]:
# Combine both years into one dataframe 
combined_df = pd.concat([tmdb_2000, tmdb_2001])

## Exploratory Data Analysis

In [244]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2576 entries, 0 to 1335
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                2576 non-null   object 
 1   adult                  2574 non-null   float64
 2   backdrop_path          1408 non-null   object 
 3   belongs_to_collection  208 non-null    object 
 4   budget                 2574 non-null   float64
 5   genres                 2574 non-null   object 
 6   homepage               172 non-null    object 
 7   id                     2574 non-null   float64
 8   original_language      2574 non-null   object 
 9   original_title         2574 non-null   object 
 10  overview               2524 non-null   object 
 11  popularity             2574 non-null   float64
 12  poster_path            2316 non-null   object 
 13  production_companies   2574 non-null   object 
 14  production_countries   2574 non-null   object 
 15  rele

In [245]:
combined_df.head()


Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
4,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,"{'id': 1131062, 'name': 'Wong Kar-Wai’s Love T...",150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,...,14204632.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.112,2221.0,PG


### Questions to answer

- How many movies had at least some valid financial info? (budget OR revenue > 0)

In [246]:
# Create filter
has_budget = combined_df["budget"] > 0
has_revenue = combined_df["revenue"] > 0


In [247]:
# Filter with financial movies
with_financial_filtered = combined_df[has_budget | has_revenue]
with_financial_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638 entries, 1 to 1312
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                638 non-null    object 
 1   adult                  638 non-null    float64
 2   backdrop_path          541 non-null    object 
 3   belongs_to_collection  110 non-null    object 
 4   budget                 638 non-null    float64
 5   genres                 638 non-null    object 
 6   homepage               76 non-null     object 
 7   id                     638 non-null    float64
 8   original_language      638 non-null    object 
 9   original_title         638 non-null    object 
 10  overview               636 non-null    object 
 11  popularity             638 non-null    float64
 12  poster_path            619 non-null    object 
 13  production_companies   638 non-null    object 
 14  production_countries   638 non-null    object 
 15  relea

- How many movies are there in each of the certification categories? 

In [248]:
# counter per categories
with_financial_filtered['certification'].value_counts()

R        232
PG-13    131
PG        35
NR        17
G         15
Name: certification, dtype: int64

- What is the average revenue per certification catergory? 

In [249]:
# get avg rev per certification
for cert in with_financial_filtered['certification'].unique():
    certification_filtered = with_financial_filtered[with_financial_filtered['certification']== cert]
    avg_rev = certification_filtered['revenue'].mean()
    print(f"Ave revenue for {cert} is ${avg_rev}")

Ave revenue for nan is $nan
Ave revenue for PG is $110679080.08571428
Ave revenue for R is $32465051.612068966
Ave revenue for G is $117364760.8
Ave revenue for PG-13 is $99287858.32824427
Ave revenue for NR is $9588674.352941176


- What is the ave budget per cetification? 

In [250]:
# get avg rev per certification
for cert in with_financial_filtered['certification'].unique():
    certification_filtered = with_financial_filtered[with_financial_filtered['certification']== cert]
    avg_rev = certification_filtered['budget'].mean()
    print(f"Ave budget for {cert} is ${avg_rev}")

Ave budget for nan is $nan
Ave budget for PG is $44828492.4
Ave budget for R is $19484070.905172415
Ave budget for G is $38133333.333333336
Ave budget for PG-13 is $42993571.96183206
Ave budget for NR is $6302358.470588235


In [251]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2576 entries, 0 to 1335
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                2576 non-null   object 
 1   adult                  2574 non-null   float64
 2   backdrop_path          1408 non-null   object 
 3   belongs_to_collection  208 non-null    object 
 4   budget                 2574 non-null   float64
 5   genres                 2574 non-null   object 
 6   homepage               172 non-null    object 
 7   id                     2574 non-null   float64
 8   original_language      2574 non-null   object 
 9   original_title         2574 non-null   object 
 10  overview               2524 non-null   object 
 11  popularity             2574 non-null   float64
 12  poster_path            2316 non-null   object 
 13  production_companies   2574 non-null   object 
 14  production_countries   2574 non-null   object 
 15  rele

In [252]:
# Convert df back to csv
combined_df.to_csv(f"tmdb_results_combined.csv.gz", compression='gzip',
                     index=False)

### Normalize Tables

In [253]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81762 entries, 0 to 81761
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81762 non-null  object 
 1   titleType       81762 non-null  object 
 2   primaryTitle    81762 non-null  object 
 3   originalTitle   81762 non-null  object 
 4   isAdult         81762 non-null  int64  
 5   startYear       81762 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  81762 non-null  int64  
 8   genres          81762 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.6+ MB


- We will only need Movie ID(tconst), Primary Title, Start Year, Runtime (in Minutes) and Genres from Title Basics table so we can drop the rest 


In [254]:
# Drop Unnecessary columns
basics_df.drop(columns=["titleType","isAdult","endYear","originalTitle"], inplace=True)

In [255]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81762 entries, 0 to 81761
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          81762 non-null  object
 1   primaryTitle    81762 non-null  object
 2   startYear       81762 non-null  int64 
 3   runtimeMinutes  81762 non-null  int64 
 4   genres          81762 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.1+ MB


In [256]:
basics_df.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance"
1,tt0043139,Life of a Beijing Policeman,2013,120,"Drama,History"
2,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020,70,Drama
3,tt0069049,The Other Side of the Wind,2018,122,Drama
4,tt0088751,The Naked Monster,2005,100,"Comedy,Horror,Sci-Fi"


In [257]:
# Split genre str 
basics_df['genres_split'] = basics_df['genres'].str.split(',')

In [258]:
# Separate genres using explode
exploded_genres = basics_df.explode('genres_split')
exploded_genres

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,genres_split
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",Comedy
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",Fantasy
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",Romance
1,tt0043139,Life of a Beijing Policeman,2013,120,"Drama,History",Drama
1,tt0043139,Life of a Beijing Policeman,2013,120,"Drama,History",History
...,...,...,...,...,...,...
81760,tt9916190,Safeguard,2020,95,"Action,Adventure,Thriller",Action
81760,tt9916190,Safeguard,2020,95,"Action,Adventure,Thriller",Adventure
81760,tt9916190,Safeguard,2020,95,"Action,Adventure,Thriller",Thriller
81761,tt9916362,Coven,2020,92,"Drama,History",Drama


In [259]:
# Create genre IDs for each unique genre 
unique_genre = sorted(exploded_genres['genres_split'].unique())
unique_genre[:3]

['Action', 'Adult', 'Adventure']

In [260]:
# Set Int value for each unique ID 
genre_ID = range(len(unique_genre))
genre_ID

range(0, 25)

In [261]:
# Create mapper 
id_map = dict(zip(unique_genre,genre_ID))
id_map

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Sport': 20,
 'Talk-Show': 21,
 'Thriller': 22,
 'War': 23,
 'Western': 24}

In [262]:
# Create new table called title_genres
title_genres = exploded_genres[['tconst','genres_split']].copy()
title_genres.head()

Unnamed: 0,tconst,genres_split
0,tt0035423,Comedy
0,tt0035423,Fantasy
0,tt0035423,Romance
1,tt0043139,Drama
1,tt0043139,History


In [263]:
# Replace genres column with genre_id
title_genres['genre_id'] = title_genres['genres_split'].replace(id_map)
title_genres.drop(columns='genres_split',inplace=True)
title_genres.head()

Unnamed: 0,tconst,genre_id
0,tt0035423,5
0,tt0035423,9
0,tt0035423,18
1,tt0043139,7
1,tt0043139,11


In [344]:
# Create genres table with genre
genres= pd.DataFrame({'genre_name':id_map.keys(),
                      'genre_id': id_map.values()})

genres.head()

Unnamed: 0,genre_name,genre_id
0,Action,0
1,Adult,1
2,Adventure,2
3,Animation,3
4,Biography,4


In [283]:
# Rename basics_df to title_basics
title_basics = basics_df
title_basics.drop(columns=["genres","genres_split"],inplace=True)
title_basics.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes
0,tt0035423,Kate & Leopold,2001,118
1,tt0043139,Life of a Beijing Policeman,2013,120
2,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020,70
3,tt0069049,The Other Side of the Wind,2018,122
4,tt0088751,The Naked Monster,2005,100


In [266]:
# Rename us_ratings_df to title_ratings
title_ratings = us_ratings_df
title_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1980
1,tt0000002,5.8,265
4,tt0000005,6.2,2622
5,tt0000006,5.1,182
6,tt0000007,5.4,821


- We only need Movie ID, Revenue, Budget, Certification(MPAA Rating) from our API results so we can drop the rest.

In [267]:
# Drop unnecessary columns
cols = ["imdb_id", "revenue","budget",'certification']
API_Results = combined_df[cols]
API_Results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2576 entries, 0 to 1335
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   imdb_id        2576 non-null   object 
 1   revenue        2574 non-null   float64
 2   budget         2574 non-null   float64
 3   certification  814 non-null    object 
dtypes: float64(2), object(2)
memory usage: 100.6+ KB


In [319]:
# Rename API_results to tmdb_data
tmdb_data = API_Results
tmdb_data.drop(index=0,inplace=True)

tmdb_data.head()

Unnamed: 0,imdb_id,revenue,budget,certification
1,tt0113026,0.0,10000000.0,
2,tt0113092,0.0,0.0,
3,tt0116391,0.0,0.0,
4,tt0118694,14204632.0,150000.0,PG
5,tt0118852,0.0,0.0,R


## Creating MySQL Database

### Imports

In [269]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import pymysql
pymysql.install_as_MySQLdb()
from urllib.parse import quote_plus as urlquote

### Credentials

In [270]:
with open("/Users/kim/.secret/mysql_credentials.json", "r") as f:
    login = json.load(f)

# Display loaded dict 
login.keys()

dict_keys(['user', 'password'])

In [271]:
# Assign credentials
username = login['user']
password = login['password']

In [278]:
# Set Connection and Engine
connection = f"mysql+pymysql://{username}:{urlquote(password)}@localhost/movies"
engine = create_engine(connection)

In [279]:
# Check if database exists
if database_exists(connection):
    print("It exist")
else:
    create_database(connection)
    print('Database created')

Database created


In [332]:
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81762 entries, 0 to 81761
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          81762 non-null  object
 1   primaryTitle    81762 non-null  object
 2   startYear       81762 non-null  int64 
 3   runtimeMinutes  81762 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.5+ MB


In [333]:
from sqlalchemy.types import *
## Calculate max string lengths for object columns
key_len = title_basics['tconst'].fillna('').map(len).max()
title_len = title_basics['primaryTitle'].fillna('').map(len).max()
## Create a schema dictonary using Sqlalchemy datatype objects
df_schema = {
    "tconst": String(key_len+1), 
    "primaryTitle": Text(title_len+1),
    'startYear':INT(),
    'runtimeMinutes':Integer()}



In [334]:
# Save to sql with dtype and index=False
title_basics.to_sql('title_basics',engine,dtype=df_schema,if_exists='replace',index=False)



81762

In [335]:
engine.execute('ALTER TABLE title_basics ADD PRIMARY KEY (`tconst`);')



<sqlalchemy.engine.cursor.LegacyCursorResult at 0x2fbdac370>

In [336]:
query = '''SELECT *
            FROM title_basics
            LIMIT 5;'''
pd.read_sql(query,engine)

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes
0,tt0035423,Kate & Leopold,2001,118
1,tt0043139,Life of a Beijing Policeman,2013,120
2,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020,70
3,tt0069049,The Other Side of the Wind,2018,122
4,tt0088751,The Naked Monster,2005,100


In [286]:
title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 501086 entries, 0 to 1319104
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         501086 non-null  object 
 1   averageRating  501086 non-null  float64
 2   numVotes       501086 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB


In [297]:
## Calculate max string lengths for object columns
key_len = title_ratings['tconst'].fillna('').map(len).max()
## Create a schema dictonary using Sqlalchemy datatype objects
ratings_schema = {
    "tconst": String(key_len+1), 
    "averageRating":Float(),
    "numVotes":INT()}

In [298]:
# Save to sql with dtype and index=False
title_ratings.to_sql('title_ratings',engine,dtype=ratings_schema,if_exists='replace',index=False)

501086

In [299]:
engine.execute('ALTER TABLE title_ratings ADD PRIMARY KEY (`tconst`);')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x2b9e28220>

In [337]:
query = '''SELECT *
            FROM title_ratings
            LIMIT 5;'''
pd.read_sql(query,engine)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1980
1,tt0000002,5.8,265
2,tt0000005,6.2,2622
3,tt0000006,5.1,182
4,tt0000007,5.4,821


In [321]:
tmdb_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2574 entries, 1 to 1335
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   imdb_id        2574 non-null   object 
 1   revenue        2574 non-null   float64
 2   budget         2574 non-null   float64
 3   certification  814 non-null    object 
dtypes: float64(2), object(2)
memory usage: 100.5+ KB


In [322]:
tmdb_data.head()

Unnamed: 0,imdb_id,revenue,budget,certification
1,tt0113026,0.0,10000000.0,
2,tt0113092,0.0,0.0,
3,tt0116391,0.0,0.0,
4,tt0118694,14204632.0,150000.0,PG
5,tt0118852,0.0,0.0,R


In [323]:
## Calculate max string lengths for object columns
key_len = tmdb_data['imdb_id'].fillna('').map(len).max()
cert_len = tmdb_data['certification'].fillna('').map(len).max()
## Create a schema dictonary using Sqlalchemy datatype objects
tmdb_schema = {
    "imdb_id": String(key_len+1), 
    "revenue":Float(),
    "budget":Float(),
    "certification":String(cert_len+1)}

In [324]:
# Save to sql with dtype and index=False
tmdb_data.to_sql('tmdb_data',engine,dtype=tmdb_schema,if_exists='replace',index=False)

2574

In [325]:
engine.execute('ALTER TABLE tmdb_data ADD PRIMARY KEY (`imdb_id`);')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x37feb4bb0>

In [338]:
query = '''SELECT *
            FROM tmdb_data
            LIMIT 5;'''
pd.read_sql(query,engine)

Unnamed: 0,imdb_id,revenue,budget,certification
0,tt0035423,76019000.0,48000000.0,PG-13
1,tt0113026,0.0,10000000.0,
2,tt0113092,0.0,0.0,
3,tt0114447,0.0,0.0,
4,tt0116391,0.0,0.0,


In [345]:
genres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   genre_name  25 non-null     object
 1   genre_id    25 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 528.0+ bytes


In [346]:
## Calculate max string lengths for object columns
genre_len = genres['genre_name'].fillna('').map(len).max()
## Create a schema dictonary using Sqlalchemy datatype objects
genre_schema = {
    "genre_name": String(key_len+1), 
    "genre_id":INT()}

In [347]:
# Save to sql with dtype and index=False
genres.to_sql('genres',engine,dtype=genre_schema,if_exists='replace',index=False)

25

In [348]:
engine.execute('ALTER TABLE genres ADD PRIMARY KEY (`genre_id`);')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x407dd1e20>

In [349]:
title_genres.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 153208 entries, 0 to 81761
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   tconst    153208 non-null  object
 1   genre_id  153208 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.5+ MB


In [353]:
## Calculate max string lengths for object columns
tconst_len = title_genres['tconst'].fillna('').map(len).max()
## Create a schema dictonary using Sqlalchemy datatype objects
title_genre_schema = {
    "tconst": String(tconst_len+1), 
    "genre_id":INT()}

In [356]:
# Save to sql with dtype and index=False
title_genres.to_sql('title_genres',engine,dtype=title_genre_schema,if_exists='replace',index=False)

153208

In [359]:
query = '''SHOW TABLES'''
pd.read_sql(query,engine)

Unnamed: 0,Tables_in_movies
0,genres
1,title_basics
2,title_genres
3,title_ratings
4,tmdb_data
