# Part 4 Hypothesis Testing
-Loveida Lucero

**Imports**

In [1]:
# Import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, json, time
from tqdm.notebook import tqdm_notebook
import tmdbsimple as tmdb

# Load TMDB API Key and add to tmdbsimple

In [2]:
# Read the TMDB credentials from the current user's home directory
# (~/.secret/tmdb_api.json) rather than from an absolute path that only
# exists on one machine.
with open(os.path.join(os.path.expanduser('~'), '.secret', 'tmdb_api.json'), 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key'])

In [3]:
# Hand the key loaded from the credentials file to tmdbsimple through its
# module-level API_KEY attribute.
tmdb.API_KEY =  login['api-key']

# Define Functions

In [4]:
def write_json(new_data, filename): 
    """Add new_data to the JSON array stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    The whole file is read, updated in memory and written back in place.

    Args:
        new_data: A single record (appended as one element) or a list of
            records (each element is added individually).
        filename: Path of an existing JSON file whose top-level value is a
            list. The file is opened in 'r+' mode, so it is not created
            when missing.

    Raises:
        FileNotFoundError: If filename does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
        AttributeError: If the stored top-level value has no ``append``
            (for example when it is a dict).
    """
    with open(filename, 'r+') as file:
        # Load the records that are already stored.
        file_data = json.load(file)
        # A list of records is merged element by element; anything else is
        # stored as a single new element.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind and overwrite the file from the beginning.
        file.seek(0)
        json.dump(file_data, file)
        # Drop whatever is left of the old contents: if the new text is
        # shorter than the old one (e.g. the file was pretty-printed),
        # stale trailing bytes would otherwise corrupt the JSON.
        file.truncate()

def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB info and attach its US certification.

    Args:
        movie_id: Identifier handed to ``tmdb.Movies``.

    Returns:
        The dict returned by ``.info()``. When the ``.releases()`` data has
        a country entry whose ``iso_3166_1`` is ``'US'``, that entry's
        certification is stored under the ``'certification'`` key (the last
        US entry wins if there are several). Without a US entry this
        function does not add the key.
    """
    tmdb_movie = tmdb.Movies(movie_id)

    # Query both endpoints: general info first, then release data
    details = tmdb_movie.info()
    release_data = tmdb_movie.releases()

    # Certifications of every US entry, in the order the API lists them
    us_certifications = [
        entry['certification']
        for entry in release_data['countries']
        if entry['iso_3166_1'] == 'US'
    ]
    if us_certifications:
        details['certification'] = us_certifications[-1]
    return details

In [5]:
# Load in the dataframe from project part 1 as basics:
basics = pd.read_csv('Data/basics.csv')
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# looks like an index that was written into the CSV - confirm, and consider
# dropping it if it is not needed.
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
86974,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
86975,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
86976,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
86977,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


# TMDB API Calls

## 2000-2013

In [6]:
# Collects one [movie_id, exception] pair for every movie whose retrieval or
# file write fails in the download loop below.
errors = [ ]

In [7]:
# Release years to request from TMDB in this batch.
# NOTE(review): 2001, 2002 and 2010 are not in this list - presumably they
# were retrieved elsewhere; confirm.
YEARS_TO_GET = [2000,2003,2004,2005,2006,2007,2008,2009,2011,2012,2013]

In [8]:
# Folder that holds every per-year result file. It does not depend on the
# year, so it is created once before the loop.
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)

# Outer loop: one pass per release year
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):

    # JSON file that accumulates the API results for this year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)
    if not file_exists:
        print(f"Creating {JSON_FILE} for API results for {YEAR}.")
        # Seed the new file with a one-record list holding only "imdb_id",
        # so the frame read back below already has an "imdb_id" column.
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
    else:
        print(f'The file {JSON_FILE} already exists.')

    # Ids (tconst) of the movies whose startYear is the current year
    df = basics.loc[ basics['startYear']==YEAR].copy()
    movie_ids = df['tconst']

    # Results already stored for this year
    previous_df = pd.read_json(JSON_FILE)

    # Skip every id that is already present in the JSON file
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # Inner loop: one API lookup per remaining movie, with a progress bar
    for movie_id in tqdm_notebook(movie_ids_to_get, f"Movies from {YEAR}"):
        try:
            temp = get_movie_with_rating(movie_id)
            # Store the record in the year's JSON file right away
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
        except Exception as e:
            # Best effort: remember the id and the exception, then move on
            # to the next movie without writing anything for this one.
            errors.append([movie_id, e])

    # Export everything collected for the year as a compressed CSV
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

    # errors is shared by all years, so this is a running total rather than
    # the count for the current year alone.
    print(f"- Total errors: {len(errors)}")

YEARS:   0%|          | 0/11 [00:00<?, ?it/s]

Creating Data/tmdb_api_results_2000.json for API results for 2000.


Movies from 2000:   0%|          | 0/1457 [00:00<?, ?it/s]

- Total errors: 201
Creating Data/tmdb_api_results_2003.json for API results for 2003.


Movies from 2003:   0%|          | 0/1687 [00:00<?, ?it/s]

- Total errors: 542
Creating Data/tmdb_api_results_2004.json for API results for 2004.


Movies from 2004:   0%|          | 0/1904 [00:00<?, ?it/s]

- Total errors: 940
Creating Data/tmdb_api_results_2005.json for API results for 2005.


Movies from 2005:   0%|          | 0/2185 [00:00<?, ?it/s]

- Total errors: 1442
Creating Data/tmdb_api_results_2006.json for API results for 2006.


Movies from 2006:   0%|          | 0/2440 [00:00<?, ?it/s]

- Total errors: 1964
Creating Data/tmdb_api_results_2007.json for API results for 2007.


Movies from 2007:   0%|          | 0/2579 [00:00<?, ?it/s]

- Total errors: 2599
Creating Data/tmdb_api_results_2008.json for API results for 2008.


Movies from 2008:   0%|          | 0/2912 [00:00<?, ?it/s]

- Total errors: 3325
Creating Data/tmdb_api_results_2009.json for API results for 2009.


Movies from 2009:   0%|          | 0/3560 [00:00<?, ?it/s]

- Total errors: 4380
Creating Data/tmdb_api_results_2011.json for API results for 2011.


Movies from 2011:   0%|          | 0/4229 [00:00<?, ?it/s]

- Total errors: 5577
Creating Data/tmdb_api_results_2012.json for API results for 2012.


Movies from 2012:   0%|          | 0/4522 [00:00<?, ?it/s]

- Total errors: 7598
Creating Data/tmdb_api_results_2013.json for API results for 2013.


Movies from 2013:   0%|          | 0/4714 [00:00<?, ?it/s]

- Total errors: 12312


## 2014-2022

In [9]:
# Start a fresh error list for this batch of years; the download loop below
# appends one [movie_id, exception] pair per failed movie.
errors = [ ]

In [10]:
# Release years to request from TMDB in the second batch
YEARS_TO_GET2 = [2014,2015,2016,2017,2018,2019,2020,2021,2022]

In [11]:
# Folder that holds every per-year result file. It does not depend on the
# year, so it is created once before the loop.
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)

# Outer loop: one pass per release year
for YEAR in tqdm_notebook(YEARS_TO_GET2,desc='YEARS',position=0):

    # JSON file that accumulates the API results for this year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)
    if not file_exists:
        print(f"Creating {JSON_FILE} for API results for {YEAR}.")
        # Seed the new file with a one-record list holding only "imdb_id",
        # so the frame read back below already has an "imdb_id" column.
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
    else:
        print(f'The file {JSON_FILE} already exists.')

    # Ids (tconst) of the movies whose startYear is the current year
    df = basics.loc[ basics['startYear']==YEAR].copy()
    movie_ids = df['tconst']

    # Results already stored for this year
    previous_df = pd.read_json(JSON_FILE)

    # Skip every id that is already present in the JSON file
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # Inner loop: one API lookup per remaining movie, with a progress bar
    for movie_id in tqdm_notebook(movie_ids_to_get, f"Movies from {YEAR}"):
        try:
            temp = get_movie_with_rating(movie_id)
            # Store the record in the year's JSON file right away
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
        except Exception as e:
            # Best effort: remember the id and the exception, then move on
            # to the next movie without writing anything for this one.
            errors.append([movie_id, e])

    # Export everything collected for the year as a compressed CSV
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

    # errors is shared by all years, so this is a running total rather than
    # the count for the current year alone.
    print(f"- Total errors: {len(errors)}")

YEARS:   0%|          | 0/9 [00:00<?, ?it/s]

Creating Data/tmdb_api_results_2014.json for API results for 2014.


Movies from 2014:   0%|          | 0/4917 [00:00<?, ?it/s]

- Total errors: 1143
Creating Data/tmdb_api_results_2015.json for API results for 2015.


Movies from 2015:   0%|          | 0/5056 [00:00<?, ?it/s]

- Total errors: 2349
Creating Data/tmdb_api_results_2016.json for API results for 2016.


Movies from 2016:   0%|          | 0/5256 [00:00<?, ?it/s]

- Total errors: 3516
Creating Data/tmdb_api_results_2017.json for API results for 2017.


Movies from 2017:   0%|          | 0/5643 [00:00<?, ?it/s]

- Total errors: 4693
Creating Data/tmdb_api_results_2018.json for API results for 2018.


Movies from 2018:   0%|          | 0/5785 [00:00<?, ?it/s]

- Total errors: 5781
Creating Data/tmdb_api_results_2019.json for API results for 2019.


Movies from 2019:   0%|          | 0/5877 [00:00<?, ?it/s]

- Total errors: 6738
Creating Data/tmdb_api_results_2020.json for API results for 2020.


Movies from 2020:   0%|          | 0/5010 [00:00<?, ?it/s]

- Total errors: 7785
Creating Data/tmdb_api_results_2021.json for API results for 2021.


Movies from 2021:   0%|          | 0/5163 [00:00<?, ?it/s]

- Total errors: 8896
Creating Data/tmdb_api_results_2022.json for API results for 2022.


Movies from 2022:   0%|          | 0/5073 [00:00<?, ?it/s]

ValueError: Expected object or value

- All years completed except 2022, which errored out. 2022 will be run separately.

## 2022

In [12]:
# Only 2022 is requested here: the previous run stopped with a ValueError
# while processing that year.
YEARS_TO_GET3 = [2022]

In [13]:
# Folder that holds every per-year result file. It does not depend on the
# year, so it is created once before the loop.
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)

# Outer loop: one pass per release year
for YEAR in tqdm_notebook(YEARS_TO_GET3,desc='YEARS',position=0):

    # JSON file that accumulates the API results for this year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)
    if not file_exists:
        print(f"Creating {JSON_FILE} for API results for {YEAR}.")
        # Seed the new file with a one-record list holding only "imdb_id",
        # so the frame read back below already has an "imdb_id" column.
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
    else:
        print(f'The file {JSON_FILE} already exists.')

    # Ids (tconst) of the movies whose startYear is the current year
    df = basics.loc[ basics['startYear']==YEAR].copy()
    movie_ids = df['tconst']

    # Results already stored for this year
    previous_df = pd.read_json(JSON_FILE)

    # Skip every id that is already present in the JSON file
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # Inner loop: one API lookup per remaining movie, with a progress bar
    for movie_id in tqdm_notebook(movie_ids_to_get, f"Movies from {YEAR}"):
        try:
            temp = get_movie_with_rating(movie_id)
            # Store the record in the year's JSON file right away
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
        except Exception as e:
            # Best effort: remember the id and the exception, then move on
            # to the next movie without writing anything for this one.
            errors.append([movie_id, e])

    # Export everything collected for the year as a compressed CSV
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

    # errors is not reset before this cell, so the total printed here also
    # counts the failures recorded during the 2014-2021 run.
    print(f"- Total errors: {len(errors)}")

YEARS:   0%|          | 0/1 [00:00<?, ?it/s]

Creating Data/tmdb_api_results_2022.json for API results for 2022.


Movies from 2022:   0%|          | 0/5073 [00:00<?, ?it/s]

- Total errors: 13396


- All API calls for 2000-2022 completed.