# API Calls

This notebook's purpose is to make calls to The Movie DataBase via TMDB's API in order to create yearly datasets from 2000-2022 inclusive.

## Library Importation, Folder Creation, Function Implementation

In [1]:
#run this if you don't already have tmdbsimple on your computer
#!pip install tmdbsimple

In [2]:
#Basic numpy and pandas for data manipulation
import numpy as np
import pandas as pd

#Importing os, json to interface with operating system
#importing time to measure time
import os, time,json

#importing tmdbsimple to interface with TMDB
import tmdbsimple as tmdb

#Importing tqdm notebook for progress bars
from tqdm.notebook import tqdm_notebook

#Importing glob for making large dataset later on
import glob

In [3]:
#Checking what folders already exist
os.listdir(".")

['API-Calls.ipynb',
 'Hypothesis-Testing.ipynb',
 'LICENSE',
 'IMDB-Movies-To-SQL.ipynb',
 'README.md',
 'Data-Importation-And-Cleaning.ipynb',
 '.gitattributes',
 '.ipynb_checkpoints',
 '.git',
 'Data',
 'Visualizations.ipynb']

In [4]:
#Checking what folders exist in "Data"
os.listdir("Data/")

['title_names.csv.gz',
 'tmdb_data',
 'title_basics.csv.gz',
 'title_crew.csv.gz',
 '.ipynb_checkpoints',
 'title_akas.csv.gz',
 'title_principals.csv.gz',
 'title_ratings.csv.gz']

In [5]:
#Creating new folder for tmdb data in Data flder
tmdb_folder = "Data/tmdb_data/"
os.makedirs(tmdb_folder, exist_ok=True)
os.listdir(tmdb_folder)

['final_tmdb_data_2006.csv.gz',
 'tmdb_api_results_2010.json',
 'final_tmdb_data_2018.csv.gz',
 'final_tmdb_data_2014.csv.gz',
 'tmdb_api_results_2006.json',
 'final_tmdb_data_2022.csv.gz',
 'final_tmdb_data_2008.csv.gz',
 'final_tmdb_data_2016.csv.gz',
 'final_tmdb_data_2004.csv.gz',
 'tmdb_api_results_2007.json',
 'final_tmdb_data_2020.csv.gz',
 'tmdb_api_results_2011.json',
 'tmdb_api_results_2020.json',
 'tmdb_api_results_2016.json',
 'tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2012.csv.gz',
 'tmdb_api_results_2001.json',
 'final_tmdb_data_2010.csv.gz',
 'final_tmdb_data_2002.csv.gz',
 'tmdb_api_results_2017.json',
 'tmdb_api_results_2021.json',
 'tmdb_api_results_2022.json',
 'tmdb_api_results_2018.json',
 'tmdb_api_results_2002.json',
 'final_tmdb_data_2019.csv.gz',
 'final_tmdb_data_2007.csv.gz',
 'tmdb_api_results_2014.json',
 'final_tmdb_data_2015.csv.gz',
 'tmdb_api_results_2015.json',
 'final_tmdb_data_2021.csv.gz',
 'tmdb_api_results_2003

In [3]:
#Setting pandas options to max column and row displays
pd.set_option('display.max_columns', None) #Used for displaying columns
pd.set_option('display.max_rows', None) #Used for displaying rows

### Functions

Various functions for assisting in fetching data, taken from multiple sources (as credited).

#### Function for getting a movie with rating

Copied from [Celia Oakley's Github](github.com/celiao/tmdbsimple)

In [None]:
def get_movie_with_rating(movie_id):
    """adapted from github.com/celiao/tmdbsimple"""
    #Get the movie object associated with movie_id
    movie = tmdb.Movies(movie_id)
    
    #save the .info and .releases dictionaries
    info = movie.info()
    releases = movie.releases()
    
    #Loop through the countries in releases
    for country in releases["countries"]:
        if country["iso_3166_1"] == "US":
            info["certification"] = country["certification"]
    return info

#### Function for writing json files

Adapted from [geeksforgeeks.org](https://www.geeksforgeeks.org/append-to-json-file-using-python/)

In [None]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    """  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

## Extracting data from TMDB

In [None]:
basics = pd.read_csv("Data/title_basics.csv.gz")
basics.head()

In [None]:
with open('/Users/joseph/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

In [None]:
tmdb.API_KEY =  login['api-key']

In [None]:
#Testing ids to make sure everything is working
test_ids = ["tt0848228", "tt0332280"]
results = []
errors = []
for movie_id in test_ids:
    
    try:
        movie_info = get_movie_with_rating(movie_id)
        results.append(movie_info)
        
    except Exception as e: 
        errors.append([movie_id, e])
    
pd.DataFrame(results)

In [None]:
#Creating list of years to search through
YEARS_TO_GET = list(range(2000, 2023))
errors = [ ]
YEARS_TO_GET

In [None]:
for YEAR in tqdm_notebook(YEARS_TO_GET, desc = "YEARS", position=0):
    #Defining the JSON file to store results for year
    JSON_FILE = f'{tmdb_folder}tmdb_api_results_{YEAR}.json'
    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)
    # If it does not exist: create it
    if file_exists == False:
        # save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE, "w") as f:
            json.dump([{"imdb_id":0}],f)

    #Saving new year as the current df
    df = basics.loc[ basics["startYear"] == YEAR].copy()
    # saving movie ids to list
    movie_ids = df["tconst"].copy()

    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)

    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df["imdb_id"])]

    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve then data for the movie id
            temp = get_movie_with_rating(movie_id)  
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{tmdb_folder}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

In [None]:
print(f"- Total errors: {len(errors)}")

## Concatenating Data
Pulling all tmdb data into one large .csv.gz file

In [6]:
#Creating query to feed to glob to get all tmdb files
q = "Data/tmdb_data/*.csv.gz"
all_tmdb_files = sorted(glob.glob(q))
all_tmdb_files

['Data/tmdb_data/final_tmdb_data_2000.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2001.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2002.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2003.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2004.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2005.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2006.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2007.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2008.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2009.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2010.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2011.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2012.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2013.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2014.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2015.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2016.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2017.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2018.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2019.csv.gz',
 'Data/tmdb_data/final_tmdb_data_2020.csv.gz',
 'Data/tmdb_d

In [7]:
# Loading all tmdb files as df's and appending to a list
df_list = []
for file in all_tmdb_files:
    temp_df = pd.read_csv(file, lineterminator = "\n", index_col=0)
    df_list.append(temp_df)

In [8]:
df_list[0].head()

Unnamed: 0_level_0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
0,,,,,,,,,,,,,,,,,,,,,,,,,
tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,Two rural teens sing and dance their way throu...,2.519,/hfO64mXz3DgUxkBVU7no2UWRP7x.jpg,"[{'id': 51207, 'logo_path': None, 'name': 'Sul...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-22,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,Earth is in a state of constant war and two co...,1.592,/h9bWO13nWRGZJo4XVPiElXyrRMU.jpg,"[{'id': 7405, 'logo_path': '/rfnws0uY8rsNAsrLb...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-11-15,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,"After falling prey to underworld, four friends...",0.655,/yB5wRu4uyXXwZA3PEj8cITu0xt3.jpg,[],"[{'iso_3166_1': 'IN', 'name': 'India'}]",2000-04-14,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,Two neighbors become intimate after discoverin...,21.766,/iYypPT4bhqXfq1b6EnmxvRt6b2Y.jpg,"[{'id': 539, 'logo_path': '/iPLtePguIzOPNtAWfT...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",2000-09-29,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.119,2204.0,PG


Attempts to concat dataframe together in one step failing. Will manually join all files together via for loop.

In [None]:
# Concatenating the list of dfs into 1 combined df
tmdb_df = df_list[0]
for i in range(1, len(df_list)):
    print(i)
    tmdb_df = pd.concat([tmdb_df, df_list[i]])
tmdb_df = pd.concat(df_list)
tmdb_df

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


In [None]:
#Checking for duplicates
tmdb_df.duplicated().sum()

In [17]:
tmdb_df = tmdb_df.drop_duplicates()
tmdb_df.duplicated().sum()

0

In [18]:
#Preliminary check for missing values
tmdb_df.isna().sum()

adult                        1
backdrop_path            22291
belongs_to_collection    58600
budget                       1
genres                       1
homepage                 47721
id                           1
original_language            1
original_title               1
overview                  1337
popularity                   1
poster_path               4904
production_companies         1
production_countries         1
release_date               953
revenue                      1
runtime                      1
spoken_languages             1
status                       1
tagline                  38870
title                        1
video                        1
vote_average                 1
vote_count                   1
certification            47627
dtype: int64

In [19]:
tmdb_df.isna().sum()/len(tmdb_df) * 100

adult                     0.001592
backdrop_path            35.490137
belongs_to_collection    93.298731
budget                    0.001592
genres                    0.001592
homepage                 75.977965
id                        0.001592
original_language         0.001592
original_title            0.001592
overview                  2.128676
popularity                0.001592
poster_path               7.807798
production_companies      0.001592
production_countries      0.001592
release_date              1.517298
revenue                   0.001592
runtime                   0.001592
spoken_languages          0.001592
status                    0.001592
tagline                  61.886035
title                     0.001592
video                     0.001592
vote_average              0.001592
vote_count                0.001592
certification            75.828305
dtype: float64

In [20]:
# Saving concatenated data frame
tmdb_df.to_csv("Data/tmdb_data.csv.gz", compression = "gzip", index = False)