# IMDB Project Part 4
- *David Atkins*

### Setup

In [31]:
import json
import sqlalchemy
sqlalchemy.__version__
# imports
import pandas as pd
import numpy as np
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from sqlalchemy.types import *
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import tmdbsimple as tmdb
pd.set_option('display.max_columns',50)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# In case my password uses special characters
from urllib.parse import quote_plus 

In [2]:
with open('/Users/gondr/.secret/mysql_cred.json') as cred:
    login = json.load(cred)
login.keys()

dict_keys(['username', 'password'])

In [5]:
# Creating the sqlalchemy engine and connection
username = login['username']
password = login['password']
# password = quote_plus("Myp@ssword!") # If I have special chars in password
db_name = "movies"
connection = f"mysql+pymysql://{username}:{password}@localhost/{db_name}"
engine = create_engine(connection)
conn = engine.connect()

In [24]:
# Preemptively changing the setting for FOREIGN_KEY_CHECKS
q = """SET @@FOREIGN_KEY_CHECKS=0"""
conn.execute(q)
# Confirming
q = """SELECT @@FOREIGN_KEY_CHECKS"""
pd.read_sql(q, conn)

Unnamed: 0,@@FOREIGN_KEY_CHECKS
0,0


#### Collecting the Data in one DF
I am using data from 2000 through 2008, the recession

In [17]:
temp_files = sorted(glob.glob("Data/final_tmdb_data*.csv"))
temp_files

['Data\\final_tmdb_data_2000.csv',
 'Data\\final_tmdb_data_2003.csv',
 'Data\\final_tmdb_data_2004.csv',
 'Data\\final_tmdb_data_2005.csv',
 'Data\\final_tmdb_data_2006.csv',
 'Data\\final_tmdb_data_2007.csv',
 'Data\\final_tmdb_data_2008.csv']

In [15]:
# Use read_csv in a list comprehension and combine with concat to load all files
t1 = pd.concat([pd.read_csv(f) for f in temp_files] )
t1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11831 entries, 0 to 2180
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                11831 non-null  object 
 1   adult                  11824 non-null  float64
 2   backdrop_path          6603 non-null   object 
 3   belongs_to_collection  913 non-null    object 
 4   budget                 11824 non-null  float64
 5   genres                 11824 non-null  object 
 6   homepage               1873 non-null   object 
 7   id                     11824 non-null  float64
 8   original_language      11824 non-null  object 
 9   original_title         11824 non-null  object 
 10  overview               11545 non-null  object 
 11  popularity             11824 non-null  float64
 12  poster_path            10474 non-null  object 
 13  production_companies   11824 non-null  object 
 14  production_countries   11824 non-null  object 
 15  rel

In [11]:
# Use glob to get all filepaths that match the pattern (*=wildcard)
t2_files = sorted(glob.glob("Data/final_tmdb_data*.csv.gz"))
t2_files

['Data\\final_tmdb_data_2001.csv.gz', 'Data\\final_tmdb_data_2002.csv.gz']

In [14]:
# Use read_csv in a list comprehension and combine with concat to load all files
t2 = pd.concat([pd.read_csv(f) for f in t2_files] )
t2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2646 entries, 0 to 1292
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                2646 non-null   object 
 1   adult                  2644 non-null   float64
 2   backdrop_path          1517 non-null   object 
 3   belongs_to_collection  215 non-null    object 
 4   budget                 2644 non-null   float64
 5   genres                 2644 non-null   object 
 6   homepage               213 non-null    object 
 7   id                     2644 non-null   float64
 8   original_language      2644 non-null   object 
 9   original_title         2644 non-null   object 
 10  overview               2586 non-null   object 
 11  popularity             2644 non-null   float64
 12  poster_path            2405 non-null   object 
 13  production_companies   2644 non-null   object 
 14  production_countries   2644 non-null   object 
 15  rele

In [13]:
tempdf = [t1,t2]
df = pd.concat(tempdf)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14477 entries, 0 to 1292
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                14477 non-null  object 
 1   adult                  14468 non-null  float64
 2   backdrop_path          8120 non-null   object 
 3   belongs_to_collection  1128 non-null   object 
 4   budget                 14468 non-null  float64
 5   genres                 14468 non-null  object 
 6   homepage               2086 non-null   object 
 7   id                     14468 non-null  float64
 8   original_language      14468 non-null  object 
 9   original_title         14468 non-null  object 
 10  overview               14131 non-null  object 
 11  popularity             14468 non-null  float64
 12  poster_path            12879 non-null  object 
 13  production_companies   14468 non-null  object 
 14  production_countries   14468 non-null  object 
 15  rel

In [16]:
fname_out = "Data/tmdb_results_2000_through_2008.csv.gz"
df.to_csv(fname_out, index=False)

#### Updating MySQL Data

In [26]:
mov = pd.read_csv('Data/tmdb_results_2000_through_2008.csv.gz')
mov.info()
mov.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14477 entries, 0 to 14476
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                14477 non-null  object 
 1   adult                  14468 non-null  float64
 2   backdrop_path          8120 non-null   object 
 3   belongs_to_collection  1128 non-null   object 
 4   budget                 14468 non-null  float64
 5   genres                 14468 non-null  object 
 6   homepage               2086 non-null   object 
 7   id                     14468 non-null  float64
 8   original_language      14468 non-null  object 
 9   original_title         14468 non-null  object 
 10  overview               14131 non-null  object 
 11  popularity             14468 non-null  float64
 12  poster_path            12879 non-null  object 
 13  production_companies   14468 non-null  object 
 14  production_countries   14468 non-null  object 
 15  re

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,,,,,,,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,Two rural teens sing and dance their way throu...,2.787,/hfO64mXz3DgUxkBVU7no2UWRP7x.jpg,"[{'id': 51207, 'logo_path': None, 'name': 'Sul...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-22,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,Earth is in a state of constant war and two co...,1.862,/h9bWO13nWRGZJo4XVPiElXyrRMU.jpg,"[{'id': 7405, 'logo_path': '/rfnws0uY8rsNAsrLb...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-11-15,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.45,10.0,


In [27]:
mov = mov.drop(columns=['adult',
                        'backdrop_path',
                        'belongs_to_collection',
                        'genres',
                        'homepage',
                        'id',
                        'original_language',
                        'original_title', 
                        'overview', 
                        'popularity',
                        'poster_path',
                        'production_companies',
                        'production_countries', 
                        'release_date',
                        'runtime',
                        'spoken_languages',
                        'status', 
                        'tagline',
                        'title', 
                        'video',
                        'vote_average',
                        'vote_count'])

mov.head()

Unnamed: 0,imdb_id,budget,revenue,certification
0,0,,,
1,tt0113026,10000000.0,0.0,
2,tt0113092,0.0,0.0,
3,tt0116391,0.0,0.0,
4,tt0118694,150000.0,14204632.0,PG


In [33]:
id_max_len = mov['imdb_id'].fillna('').map(len).max()
cert_max_len = mov['certification'].fillna('').map(len).max()

print(f'max imdb_id string length: {id_max_len}')
print(f'max certification string length: {cert_max_len}')

max imdb_id string length: 10
max certification string length: 31


In [34]:
tmdb_schema = {
    'imdb_id':CHAR(id_max_len+1),
    'revenue':FLOAT(),
    'budget':FLOAT(),
    'certification':VARCHAR(cert_max_len+1)}
tmdb_schema

{'imdb_id': CHAR(length=11),
 'revenue': FLOAT(),
 'budget': FLOAT(),
 'certification': VARCHAR(length=32)}

In [None]:
#inserting data
mov.to_sql("tmdb_data",conn, dtype=tmdb_schema, index=False, if_exists='replace')

In [None]:
### More checking here once the data inserts

### Stakeholder Questions
##### 1. Does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?