# Part 4

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

### Question
The stakeholder's first question is: does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?

In [2]:
# imports
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from urllib.parse import quote_plus as urlquote
# Create connection string using credentials following this format

In [3]:
## Checking what data we already in our Data folder using os.listdir
import os
FOLDER = 'Data/'
file_list = sorted(os.listdir(FOLDER))
file_list

['.ipynb_checkpoints',
 'TMDB API Results.csv',
 'Title Basics.csv',
 'Title Ratings.csv',
 'Untitled.ipynb',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'final_tmdb_data_2002.csv.gz',
 'final_tmdb_data_2003.csv.gz',
 'final_tmdb_data_2004.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_api_results_2002.json',
 'tmdb_api_results_2003.json',
 'tmdb_api_results_2004.json',
 'tmdb_results_combined.csv.gz']

In [4]:
## let's check the filepath 
file_list[2]

'Title Basics.csv'

In [5]:
## add the folder plus filename
FOLDER+ file_list[2]

'Data/Title Basics.csv'

In [6]:
import glob
## Make a filepath query
q = FOLDER+"Title *.csv"
print(q)
file_list = sorted(glob.glob(q,recursive=True))
file_list

Data/Title *.csv


['Data\\Title Basics.csv', 'Data\\Title Ratings.csv']

In [7]:
## use a list comprehension to load in all files into 1 dataframe
df = pd.concat([pd.read_csv(f,lineterminator='\n') for f in file_list])
df

Unnamed: 0,imdb id,primaryTitle,startYear,runtimeMinutes,genres\r,averageRating,numVotes\r
0,tt0035423,Kate & Leopold,2001.0,118.0,"Comedy,Fantasy,Romance\r",,
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70.0,Drama\r,,
2,tt0068865,Lives of Performers,2016.0,90.0,Drama\r,,
3,tt0069049,The Other Side of the Wind,2018.0,122.0,Drama\r,,
4,tt0082328,Embodiment of Evil,2008.0,94.0,Horror\r,,
...,...,...,...,...,...,...,...
513261,tt9916200,,,,,8.1,238.0
513262,tt9916204,,,,,8.2,275.0
513263,tt9916348,,,,,8.3,18.0
513264,tt9916362,,,,,6.4,5600.0


In [8]:
df = df.drop(columns=['averageRating','numVotes\r','runtimeMinutes'])
df

Unnamed: 0,imdb id,primaryTitle,startYear,genres\r
0,tt0035423,Kate & Leopold,2001.0,"Comedy,Fantasy,Romance\r"
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,Drama\r
2,tt0068865,Lives of Performers,2016.0,Drama\r
3,tt0069049,The Other Side of the Wind,2018.0,Drama\r
4,tt0082328,Embodiment of Evil,2008.0,Horror\r
...,...,...,...,...
513261,tt9916200,,,
513262,tt9916204,,,
513263,tt9916348,,,
513264,tt9916362,,,


In [9]:
#df["movie id"] = df['imdb id'].astype(str)
#df

In [10]:
#df = df.dropna(subset=['movie id'])
#df

In [11]:
# remove nan from concattenated column
df['imdb id'].fillna('', inplace=True)
df

Unnamed: 0,imdb id,primaryTitle,startYear,genres\r
0,tt0035423,Kate & Leopold,2001.0,"Comedy,Fantasy,Romance\r"
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,Drama\r
2,tt0068865,Lives of Performers,2016.0,Drama\r
3,tt0069049,The Other Side of the Wind,2018.0,Drama\r
4,tt0082328,Embodiment of Evil,2008.0,Horror\r
...,...,...,...,...
513261,tt9916200,,,
513262,tt9916204,,,
513263,tt9916348,,,
513264,tt9916362,,,


In [12]:
# remove ids that are 0
df = df.loc[ df['imdb id']!='0']
df

Unnamed: 0,imdb id,primaryTitle,startYear,genres\r
0,tt0035423,Kate & Leopold,2001.0,"Comedy,Fantasy,Romance\r"
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,Drama\r
2,tt0068865,Lives of Performers,2016.0,Drama\r
3,tt0069049,The Other Side of the Wind,2018.0,Drama\r
4,tt0082328,Embodiment of Evil,2008.0,Horror\r
...,...,...,...,...
513261,tt9916200,,,
513262,tt9916204,,,
513263,tt9916348,,,
513264,tt9916362,,,


In [13]:
df = df.reset_index(drop=True)
df

Unnamed: 0,imdb id,primaryTitle,startYear,genres\r
0,tt0035423,Kate & Leopold,2001.0,"Comedy,Fantasy,Romance\r"
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,Drama\r
2,tt0068865,Lives of Performers,2016.0,Drama\r
3,tt0069049,The Other Side of the Wind,2018.0,Drama\r
4,tt0082328,Embodiment of Evil,2008.0,Horror\r
...,...,...,...,...
600671,tt9916200,,,
600672,tt9916204,,,
600673,tt9916348,,,
600674,tt9916362,,,


In [14]:
## saving the combined csv to disk
df.to_csv(FOLDER+'combined_tmdb_data.csv.gz',compression='gzip',index=False)

df = pd.read_csv(FOLDER+'combined_tmdb_data.csv.gz',lineterminator='\n')
df

Unnamed: 0,imdb id,primaryTitle,startYear,genres\r\r
0,tt0035423,Kate & Leopold,2001.0,"Comedy,Fantasy,Romance\r\r"
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,Drama\r\r
2,tt0068865,Lives of Performers,2016.0,Drama\r\r
3,tt0069049,The Other Side of the Wind,2018.0,Drama\r\r
4,tt0082328,Embodiment of Evil,2008.0,Horror\r\r
...,...,...,...,...
600671,tt9916200,,,\r
600672,tt9916204,,,\r
600673,tt9916348,,,\r
600674,tt9916362,,,\r


In [16]:
import json
with open('/Users/miran/.secret/mysql.json') as f:
    login = json.load(f)
login.keys()

dict_keys(['username', 'password'])

In [17]:
connection = f"mysql+pymysql://{login['username']}:{urlquote(login['password'])}@localhost/tmdb"

In [18]:
# create sqlite engine for a database called schools
engine = create_engine(connection)
# create connection to engine
conn = engine.connect()

In [19]:
database_exists(connection)

True

In [20]:
## Check if database exists, if not, create it
if database_exists(connection):
    print('It exists!')
else:
    create_database(connection)
    print('Database created!')

It exists!


In [22]:
tmdb.to_sql('tmdb', engine, if_exists = 'replace')

NameError: name 'tmdb' is not defined

In [None]:
q = """SELECT * FROM tmdb;"""
pd.read_sql(q, engine)

In [None]:
q = '''SHOW TABLES'''
pd.read_sql(q,engine)