# Part 4: Apply hypothesis testing to explore what makes a movie successful.

In [1]:
## Libraries.
import numpy as np
import pandas as pd
import tmdbsimple as tmdb 
import matplotlib.pyplot as plt
import seaborn as sns
import os, time, json
os.makedirs('Data', exist_ok=True)
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

In [2]:
# Function to get the certification.
def get_movie_certification(movie_id):
    """Fetch a movie's TMDB details and attach its US certification.

    Parameters
    ----------
    movie_id : str or int
        Identifier passed straight to ``tmdb.Movies`` (this notebook passes
        ids of the form ``'tt0848228'``).

    Returns
    -------
    dict
        The dict returned by ``movie.info()``. When the release list has an
        entry whose ``iso_3166_1`` is ``"US"``, its certification is stored
        under ``'certification'`` and, for backward compatibility with data
        saved by earlier runs, under the misspelled key ``'certifcation'``.
        When no US entry exists neither key is added.
    """
    movie = tmdb.Movies(movie_id)
    info = movie.info()
    releases = movie.releases()

    for country in releases['countries']:
        if country['iso_3166_1'] == "US":
            # If several US entries exist, the last one wins (unchanged
            # from the original behaviour).
            info['certification'] = country['certification']
            # Legacy misspelled key: previously saved files and any code
            # reading them use this spelling, so keep writing it too.
            info['certifcation'] = country['certification']
    return info

In [3]:
# Function to append new results to an existing .json file.

def write_json(new_data, filename):
    """Append ``new_data`` to the JSON array stored in ``filename``.

    The file must already exist and contain valid JSON (it is opened in
    ``'r+'`` mode). Its content is loaded, updated in memory, and written
    back over the old content.

    Parameters
    ----------
    new_data : list or object
        If both ``new_data`` and the file's content are lists, the file's
        list is extended with the items of ``new_data``; otherwise
        ``new_data`` is appended as a single element.
    filename : str
        Path of the JSON file to update.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    AttributeError
        If the file's top-level JSON value has no ``append`` method
        (for example, it is an object rather than an array).
    """
    with open(filename, 'r+') as file:
        # Load the existing content first.
        file_data = json.load(file)
        # Extend when merging list into list, otherwise append one item.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind and overwrite from the start of the file.
        file.seek(0)
        json.dump(file_data, file)
        # Drop any leftover bytes from the previous content; without this,
        # a shorter serialisation (e.g. the old file was pretty-printed)
        # would leave trailing garbage and corrupt the JSON.
        file.truncate()

In [4]:
# Load API credentials from the current user's home directory so the
# notebook is not tied to one machine's absolute path.
CREDENTIALS_PATH = os.path.expanduser('~/.secret/tmdb_api.json')   #use your path here!
with open(CREDENTIALS_PATH) as f:
    login = json.load(f)
# NOTE(review): this builds a YelpAPI client from the key stored in the
# TMDB credentials file; nothing in the visible cells uses `yelp_api`.
# It is kept so any later cell that references the name still runs --
# confirm whether it can be removed.
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)

In [5]:
# Register the API key from the loaded `login` dict with the tmdbsimple
# module so that subsequent `tmdb` requests are authenticated.
tmdb.API_KEY =  login['api-key']

In [6]:
# Checking the connection with 'The Avengers': request the movie by id,
# fetch its info dict, and display the 'budget' field as the cell output.
avengers_movie = tmdb.Movies('tt0848228')
avengers_info = avengers_movie.info()
avengers_info['budget']

220000000

In [7]:
# Checking the connection with 'The Notebook': request the movie by id,
# fetch its info dict, and display the 'budget' field as the cell output.
notebook_movie = tmdb.Movies('tt0332280')
notebook_info = notebook_movie.info()
notebook_info['budget']

29000000

In [8]:
# Specify folder for saving data (the trailing slash matters: later cells
# build paths by concatenating FOLDER directly with a file name).
FOLDER = "Data/"
# Create the folder if it is missing, then display its current contents.
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['tmdb_api_results_2010.json',
 'final_tmdb_data_2018.csv.gz',
 'final_tmdb_data_2014.csv.gz',
 'tmbd_data.csv.gz',
 'final_tmdb_data_2016.csv.gz',
 'tmdb_api_results_2011.json',
 'genres.csv.gz',
 'tmdb_api_results_2016.json',
 'tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2012.csv.gz',
 'tmdb_api_results_2001.json',
 'final_tmdb_data_2010.csv.gz',
 'title_basics.csv.gz',
 'tmdb_api_results_2017.json',
 'title_genres.csv.gz',
 'title_aka.csv.gz',
 'tmdb_api_results_2018.json',
 'final_tmdb_data_2019.csv.gz',
 'tmdb_api_results_2014.json',
 'final_tmdb_data_2015.csv.gz',
 'tmdb_api_results_2015.json',
 'final_tmdb_data_2017.csv.gz',
 'tmdb_api_results_2019.json',
 'final_tmdb_data_2001.csv.gz',
 '.ipynb_checkpoints',
 'final_tmdb_data_2013.csv.gz',
 'tmdb_api_results_2012.json',
 'tmdb_results_combined.csv.gz',
 'tmdb_api_results_2013.json',
 'final_tmdb_data_2011.csv.gz',
 'title_ratings.csv.gz']

In [9]:
# Load the title basics table; its 'startYear' and 'tconst' columns drive
# the per-year download loop below.
basics_df = pd.read_csv('Data/title_basics.csv.gz', low_memory = False)

In [10]:
# Create the inputs required by the download loop:
# - YEARS_TO_GET: the years to process, 2010 through 2019 (`range`
#   excludes the stop value 2020).
# - errors: collects [movie_id, exception] pairs for requests that fail.
YEARS_TO_GET = range(2010, 2020)
errors = [ ]

In [11]:
# OUTER loop: one pass per requested year, with a top-level progress bar.
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    # Per-year JSON file that accumulates the raw API results.
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # Record whether the file was already present before this pass.
    file_exists = os.path.isfile(JSON_FILE)
    if not file_exists:
        # Seed a new file with a one-element list holding a placeholder
        # record, so that reading it back yields an "imdb_id" column.
        with open(JSON_FILE,'w') as file:
            json.dump([{'imdb_id':0}], file)
    else:
        print(f'{YEAR} {JSON_FILE} already exists.')

    # Titles whose startYear equals the current year, and their ids.
    df = basics_df.loc[basics_df['startYear'] == YEAR].copy()
    movie_ids = df['tconst'].copy()

    # Ids already stored in the JSON file are excluded from the work list.
    previous_df = pd.read_json(JSON_FILE)
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # Years whose JSON file already existed are skipped entirely.
    # NOTE(review): because of this guard a partially downloaded year is
    # never resumed and its CSV is not (re)written -- confirm this is the
    # intended caching behaviour.
    if file_exists:
        continue

    # INNER loop: fetch each remaining movie and persist it immediately.
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Request the movie's data, including its certification.
            temp = get_movie_certification(movie_id)
            # Add the result to the year's JSON file.
            write_json(temp, JSON_FILE)
            # Pause 20 ms between requests to go easy on the server.
            time.sleep(0.02)
        except Exception as err:
            # Record the failure and carry on with the next movie.
            errors.append([movie_id, err])

    # Convert the completed year's JSON results to a gzip-compressed CSV.
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

print(f"- Total errors: {len(errors)}")

YEARS:   0%|          | 0/10 [00:00<?, ?it/s]

2010 Data/tmdb_api_results_2010.json already exists.
2011 Data/tmdb_api_results_2011.json already exists.
2012 Data/tmdb_api_results_2012.json already exists.
2013 Data/tmdb_api_results_2013.json already exists.
2014 Data/tmdb_api_results_2014.json already exists.
2015 Data/tmdb_api_results_2015.json already exists.
2016 Data/tmdb_api_results_2016.json already exists.
2017 Data/tmdb_api_results_2017.json already exists.
2018 Data/tmdb_api_results_2018.json already exists.
2019 Data/tmdb_api_results_2019.json already exists.
- Total errors: 0


In [12]:
def _load_tmdb_year(year, folder='Data/'):
    """Read the saved TMDB results for one year into a DataFrame.

    Parameters
    ----------
    year : int
        Year whose ``final_tmdb_data_{year}.csv.gz`` file should be read.
    folder : str, default 'Data/'
        Folder prefix (including the trailing slash) that holds the file.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the year's CSV file.
    """
    # The recorded run of this cell failed with "ParserError: ... Buffer
    # overflow caught - possible malformed input file". Treating only "\n"
    # as the row terminator is the usual remedy when text fields contain
    # stray carriage returns -- TODO confirm against the data.
    return pd.read_csv(f'{folder}final_tmdb_data_{year}.csv.gz',
                       low_memory=False,
                       lineterminator='\n')


# One DataFrame per year, 2010 through 2019, bound to the same names the
# original cell defined.
(movies_2010, movies_2011, movies_2012, movies_2013, movies_2014,
 movies_2015, movies_2016, movies_2017, movies_2018, movies_2019) = (
    _load_tmdb_year(year) for year in range(2010, 2020)
)

ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.
