# Part 4

For part 4 of the project, you will be using your MySQL database from part 3 to answer meaningful questions for your stakeholder. They want you to use your hypothesis testing and statistics knowledge to answer 3 questions about what makes a successful movie.

# Import

In [65]:
import pandas as pd
import numpy as np
import os, time,json
import tmdbsimple as tmdb 
from tqdm.notebook import tqdm_notebook
import matplotlib.pyplot as plt
import seaborn as sns
import json
from scipy import stats
import scipy


# Using Your API Credentials

In [2]:
# Load API Credentials

with open('/Users/Ernesto/.secret/tmdb_api.json') as f:
    login = json.load(f)

In [3]:
# display keys
login.keys()

dict_keys(['client-id', 'api-key'])

In [4]:
tmdb.API_KEY = login['api-key']

# Designate a folder

In [9]:
# create a new folder
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'df_2000_2001.csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

# Defining Our Function

In [48]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [49]:
def get_movie_with_rating(movie_id):
    # get the movie object for the currend id
    movie = tmdb.Movies(movie_id)
    #save the .info .releases dictionaries
    info = movie.info()
    
    releases = movie.releases()
    # loop thrpoght countries in releases
    for c in releases['countries']:
        # if the country abbreviation == US
        if c['iso_3166_1'] =='US':
            # save a certificatiion key in info with the certification
            info['certification'] = c['certification']
    return info

# Requested a extraction of movies in the last 10 years, 2010-2019 (pre-pandemic)

In [37]:
YEARS_TO_GET = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]


In [38]:
## Define an errors list
errors = [ ]

In [39]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [51]:
# OUTER and INNER loop

# Start of OUTER loop ( Iterate through years 2000 and 2001)

for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):
    #Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # Check if JSON file exists
    file_exists = os.path.isfile(JSON_FILE)
    # If it does not exist: create it
    if file_exists == False:
        # save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
            
    #Define/filter the IDs to call( From Basics dataset filter out movie ids for the year (2000 & 2001)
    #Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # saving movie ids to list
    movie_ids = df['tconst'].copy()
    # Load existing data from json into a dataframe called "previous_df"
    
    previous_df = pd.read_json(JSON_FILE)
    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
    
    # Start of INNER LOOP
    # Iterate through the list of Movies IDs and make the calls
        #Get index and movie id from list
    
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve then data for the movie id
            temp = get_movie_with_rating(movie_id)  
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])
            
    # Save your results as csv.gz file
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/10 [00:00<?, ?it/s]

Movies from 2010:   0%|          | 0/3795 [00:00<?, ?it/s]

Movies from 2011:   0%|          | 0/4170 [00:00<?, ?it/s]

Movies from 2012:   0%|          | 0/4460 [00:00<?, ?it/s]

Movies from 2013:   0%|          | 0/4661 [00:00<?, ?it/s]

Movies from 2014:   0%|          | 0/4827 [00:00<?, ?it/s]

Movies from 2015:   0%|          | 0/4962 [00:00<?, ?it/s]

Movies from 2016:   0%|          | 0/5168 [00:00<?, ?it/s]

Movies from 2017:   0%|          | 0/5546 [00:00<?, ?it/s]

Movies from 2018:   0%|          | 0/5664 [00:00<?, ?it/s]

Movies from 2019:   0%|          | 0/5758 [00:00<?, ?it/s]

In [52]:
# Print a message reporting back the number of movie ids that caused an error.
print(f"- Total errors: {len(errors)}")

- Total errors: 161792


## Using Glob to Load Many Files

In [96]:
import glob
q = "Data/final_tmdb_data*.csv.gz"
tmdb_files = glob.glob(q)
# Showing 
tmdb_files



['Data\\final_tmdb_data_2000.csv.gz',
 'Data\\final_tmdb_data_2001.csv.gz',
 'Data\\final_tmdb_data_2010.csv.gz',
 'Data\\final_tmdb_data_2011.csv.gz',
 'Data\\final_tmdb_data_2012.csv.gz',
 'Data\\final_tmdb_data_2013.csv.gz',
 'Data\\final_tmdb_data_2014.csv.gz',
 'Data\\final_tmdb_data_2015.csv.gz',
 'Data\\final_tmdb_data_2016.csv.gz',
 'Data\\final_tmdb_data_2017.csv.gz',
 'Data\\final_tmdb_data_2018.csv.gz',
 'Data\\final_tmdb_data_2019.csv.gz',
 'Data\\final_tmdb_data_combined.csv.gz']

## Combining Many Files

### For-Loop Way

In [97]:
## Loading all files as df and appending to a list
df_list = []
for file in tmdb_files:
    temp_df = pd.read_csv(file, index_col=0)
    df_list.append(temp_df)
    
## Concatenating the list of dfs into 1 combined
df_combined = pd.concat(df_list)
df_combined



Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,,,,,,,,,,,...,,,,,,,,,,
tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,Two rural teens sing and dance their way throu...,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.500,22.0,
tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,Earth is in a state of constant war and two co...,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.100,8.0,
tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,"After falling prey to underworld, four friends...",...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.000,1.0,
tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,"Hong Kong, 1962: Chow Mo-Wan and Su Li-Zhen mo...",...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.100,2070.0,PG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.0,,,,0.0,[],,599083.0,ml,ഓട്ടം,"Set in Trivandrum, the story of Ottam unfolds ...",...,0.0,120.0,"[{'english_name': 'Malayalam', 'iso_639_1': 'm...",Released,,Ottam,0.0,0.000,0.0,unknow
0.0,,/htoReVItKtolso5VJqFqJDBaOwm.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,635903.0,ml,മനോഹരം,Manoharan is a poster artist struggling to fin...,...,0.0,122.0,"[{'english_name': 'Malayalam', 'iso_639_1': 'm...",Released,,Manoharam,0.0,6.136,11.0,unknow
0.0,,/z6YpkQiWLnRKl5yQD9dSiPDFWN.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,711261.0,en,No Apology,When a group of women struggle to deal with th...,...,0.0,102.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,No Apology,0.0,8.000,1.0,unknow
0.0,,/tjHUeULyw2vtS1DXFp0OHfhUzRX.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,598553.0,ca,La vida sense la Sara Amat,"Pep, a 13-year-old boy, is in love with a girl...",...,0.0,74.0,"[{'english_name': 'Catalan', 'iso_639_1': 'ca'...",Released,,Life without Sara Amat,0.0,7.714,7.0,unknow


# Save 

In [98]:
## Saving the final combined dataframe
final_fname ='Data/final_tmdb_data_combined.csv.gz'
df_combined.to_csv(final_fname, compression='gzip', index=False)



In [101]:
df_combined = pd.read_csv(final_fname)
df_combined



Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,,,,,,,,,,,...,,,,,,,,,,
1,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,Two rural teens sing and dance their way throu...,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.500,22.0,
2,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,Earth is in a state of constant war and two co...,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.100,8.0,
3,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,"After falling prey to underworld, four friends...",...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.000,1.0,
4,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,"Hong Kong, 1962: Chow Mo-Wan and Su Li-Zhen mo...",...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.100,2070.0,PG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79869,,,,0.0,[],,599083.0,ml,ഓട്ടം,"Set in Trivandrum, the story of Ottam unfolds ...",...,0.0,120.0,"[{'english_name': 'Malayalam', 'iso_639_1': 'm...",Released,,Ottam,0.0,0.000,0.0,unknow
79870,,/htoReVItKtolso5VJqFqJDBaOwm.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,635903.0,ml,മനോഹരം,Manoharan is a poster artist struggling to fin...,...,0.0,122.0,"[{'english_name': 'Malayalam', 'iso_639_1': 'm...",Released,,Manoharam,0.0,6.136,11.0,unknow
79871,,/z6YpkQiWLnRKl5yQD9dSiPDFWN.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,711261.0,en,No Apology,When a group of women struggle to deal with th...,...,0.0,102.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,No Apology,0.0,8.000,1.0,unknow
79872,,/tjHUeULyw2vtS1DXFp0OHfhUzRX.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,598553.0,ca,La vida sense la Sara Amat,"Pep, a 13-year-old boy, is in love with a girl...",...,0.0,74.0,"[{'english_name': 'Catalan', 'iso_639_1': 'ca'...",Released,,Life without Sara Amat,0.0,7.714,7.0,unknow


### Explore data 

In [102]:
# check missing values
df_combined.isna().sum()

adult                    39943
backdrop_path            28298
belongs_to_collection    74672
budget                      12
genres                      12
homepage                 59488
id                          12
original_language           12
original_title              12
overview                  1788
popularity                  12
poster_path               6310
production_companies        12
production_countries        12
release_date              1120
revenue                     12
runtime                     12
spoken_languages            12
status                      12
tagline                  50646
title                       12
video                       12
vote_average                12
vote_count                  12
certification            30874
dtype: int64

In [103]:
# check duplicates 
df_combined.duplicated().sum()

11

In [104]:
# drop duplicates
df_combined = df_combined.drop_duplicates()
# check again 
df_combined.duplicated().sum()

0

# Hypothesis Testing

## The stakeholder's first question is: does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?

In [105]:
# checking missing values in certification column
nan_certification = df_combined['certification'].isna().sum()
print(f'there are {nan_certification} movies with missing certification')

there are 30863 movies with missing certification


In [106]:
# replace nan per unknow
df_combined['certification'] = df_combined['certification'].fillna('unknow')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined['certification'] = df_combined['certification'].fillna('unknow')


In [107]:
# check missing values again
nan_certification = df_combined['certification'].isna().sum()
print(f'there are {nan_certification} movies with missing certification')

there are 0 movies with missing certification


In [108]:
df_combined['certification'].value_counts()

unknow     61726
R           7103
NR          4656
PG-13       3859
PG          1710
G            584
NC-17        210
Unrated       10
10             2
-              1
R              1
PG-13          1
Name: certification, dtype: int64

In [109]:
# there are 2 R and PG-13, let's check what is the problem 
df_combined['certification'].unique()

array(['unknow', 'PG', 'R', 'G', 'NR', 'PG-13', 'NC-17', 'Unrated', '-',
       'R ', 'PG-13 ', '10'], dtype=object)

In [110]:
# replace - to unknow and remove white space
df_combined['certification'] = df_combined['certification'].replace({'-': 'unknow', 'PG-13 ': 'PG-13', 'R ' : 'R'})
df_combined['certification'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined['certification'] = df_combined['certification'].replace({'-': 'unknow', 'PG-13 ': 'PG-13', 'R ' : 'R'})


unknow     61727
R           7104
NR          4656
PG-13       3860
PG          1710
G            584
NC-17        210
Unrated       10
10             2
Name: certification, dtype: int64

In [111]:
# checking missing values in revenue column
nan_revenue = df_combined['revenue'].isna().sum()
print(f'there are {nan_revenue} movies with missing revenue')

there are 1 movies with missing revenue


In [112]:
# drop row with missing values 
df_combined = df_combined.dropna(subset=['revenue'])

In [113]:
# checking missing values in revenue column
nan_revenue = df_combined['revenue'].isna().sum()
print(f'there are {nan_revenue} movies with missing revenue')

there are 0 movies with missing certification


In [87]:
df_combined['revenue'].value_counts()

0.0           35147
10000.0          25
100000.0         13
2000000.0        12
500.0             9
              ...  
10703234.0        1
885000.0          1
62000.0           1
14353.0           1
42200.0           1
Name: revenue, Length: 4481, dtype: int64

### 1. State the Hypothesis & Null Hypothesis

 H0 (Null Hypothesis): no difference in revenue amount between all MPAA rating.
 
 
 HA(Alternative Hypothesis): there is a significant difference in revenue amount between ratings.

### 2. Determine the correct test to perform.

Type of Data? numeric

How many groups/samples?  more than two

Therefore, which test is appropriate? ANOVA

In [114]:
rating_names = df_combined['certification'].unique()
rating_names

array(['unknow', 'PG', 'R', 'G', 'NR', 'PG-13', 'NC-17', 'Unrated', '10'],
      dtype=object)

In [115]:
## Create a dictionary with each group as key and funded_amount as values
groups = {}
for rating in rating_names:
    temp = df_combined.loc[df_combined['certification'] == rating, 'revenue']
    groups[rating] = temp

In [116]:
## check one of the sectors in the dict
groups['PG']

4         12854953.0
19        36754634.0
22        36609995.0
27       354248063.0
28        35134820.0
            ...     
79538            0.0
79672            0.0
79791            0.0
79813            0.0
79852            0.0
Name: revenue, Length: 1710, dtype: float64

### 3. Testing Assumptions

#### No significant outliers

In [119]:
pip install autoviz

^C
Note: you may need to restart the kernel to use updated packages.


In [120]:
from autoviz.AutoViz_Class import AutoViz_Class

ModuleNotFoundError: No module named 'autoviz'

In [None]:
AV = AutoViz_Class()
%matplotlib inline
_ = AV.AutoViz(df_combined)