In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os, json, math, time
from tqdm.notebook import tqdm_notebook

In [32]:
# Folder that holds every data file read or written by this notebook
FOLDER = 'Movies_files/'
# Create the folder on first run; do nothing if it is already there
os.makedirs(FOLDER, exist_ok=True)
# Show what is currently stored in the folder
os.listdir(FOLDER)

['final_tmdb_data_2001.csv',
 'final_tmdb_data_2001.csv.gz',
 'Title_Akas.csv',
 'Title_Akas.csv.gz',
 'Title_Basics.csv',
 'Title_Basics.csv.gz',
 'Title_Ratings.csv',
 'Title_Ratings.csv.gz',
 'tmdb_api_results_2001.json']

In [33]:
# Years iterated by the outer loop; each year gets its own results file
YEARS_TO_GET = [2000, 2001]

In [34]:
# Outer loop over the requested years.
# NOTE(review): only the JSON_FILE assignment is inside this loop body. The
# cells that follow are separate top-level code, so they run once after the
# loop has finished and only ever see the final YEAR / JSON_FILE values.
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    # JSON file that stores the API results for this year
    JSON_FILE = f"{FOLDER}tmdb_api_results_{YEAR}.json"

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Checking if file already exists

In [35]:
# True when the results file for the current year is already on disk
file_exists = os.path.isfile(JSON_FILE)
file_exists

True

In [36]:
# No results file yet for this year: create one.
if not file_exists:
    # Seed the file with a one-element list holding a placeholder record
    # whose only key is "imdb_id".
    with open(JSON_FILE, "w") as f:
        json.dump([{"imdb_id":0}], f)

Defining and filtering the IDs to call

In [37]:
#Loading the dataframe from project part 1 as basics
# NOTE(review): this path is relative to the working directory, not FOLDER,
# although the FOLDER listing also contains a 'Title_Basics.csv.gz' --
# confirm which copy is meant to be read.
basics = pd.read_csv('Title_Basics.csv.gz')

In [38]:
# Preview the first two rows to check the columns that were loaded
basics.head(2)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama


In [39]:
# Rows of basics whose startYear equals the current YEAR (independent copy)
df = basics[basics["startYear"].eq(YEAR)].copy()

# The tconst identifiers of those rows, kept as a pandas Series
movie_ids = df["tconst"].copy()

In [40]:
#Load in any existing API results with pd.read_json
#Check to see if any of the movie_ids to get are already in the JSON file.
#Filter out only movies that are missing from the JSON file to use in the loop

In [41]:
# Everything already stored in the year's results file (including the
# placeholder record written when the file was first created)
previous_df = pd.read_json(JSON_FILE)
previous_df

Unnamed: 0,imdb_id
0,0


In [42]:
# Drop every id that already appears in the "imdb_id" column of the stored
# results, leaving only the ids that still have to be fetched
movie_ids_to_get = movie_ids.loc[~movie_ids.isin(previous_df["imdb_id"])]
movie_ids_to_get

0        tt0035423
11       tt0114447
18       tt0118589
19       tt0118652
24       tt0119004
           ...    
77049    tt9071078
77496    tt9212730
77527    tt9228234
78290    tt9555974
78343    tt9578462
Name: tconst, Length: 1517, dtype: object

In [43]:
def movie_with_rating(movie_id):
    """Return the info dict for one movie, with the US certification added.

    Looks the movie up through ``tmdb.Movies(movie_id)``, takes the dict
    returned by ``.info()`` and, if ``.releases()["countries"]`` contains an
    entry whose ``"iso_3166_1"`` is ``"US"``, stores that entry's
    ``"certification"`` under the ``"certification"`` key. When several US
    entries exist the last one wins; when none exist the key is not added.

    NOTE(review): ``tmdb`` is not imported in the cells shown here; it has to
    be in the notebook namespace before this function is called.

    Parameters
    ----------
    movie_id
        Identifier passed straight to ``tmdb.Movies``.

    Returns
    -------
    dict
        The ``.info()`` result, possibly extended with ``"certification"``.
    """
    movie = tmdb.Movies(movie_id)
    # Same call order as before: info first, then releases
    movie_info = movie.info()
    releases = movie.releases()

    # Certifications of every US release entry, in their original order
    us_certifications = [
        entry["certification"]
        for entry in releases["countries"]
        if entry["iso_3166_1"] == "US"
    ]
    if us_certifications:
        # Keep the last US entry, matching an overwrite-in-loop
        movie_info["certification"] = us_certifications[-1]
    return movie_info

INNER LOOP:

Now we have defined the "movie_ids_to_get". It includes the ids from our dataframe in the year we are seeking, and it excludes any that we have already made calls for.

We will use this list for our inner loop of API calls.

In [44]:
#USing a function to append new results to the existing JSON file

def write_json(new_data, filename):
    """Append ``new_data`` to the JSON list stored in ``filename``.

    The file is opened for reading and writing, its current content is
    loaded, the new data is added and the whole structure is written back
    from the start of the file.

    Parameters
    ----------
    new_data
        Data to add. If both ``new_data`` and the stored content are lists,
        the stored list is extended with the items of ``new_data``;
        otherwise ``new_data`` is appended as a single item.
    filename
        Path of an existing file containing valid JSON.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist ("r+" never creates the file).
    json.JSONDecodeError
        If the existing content is not valid JSON.
    """
    with open(filename, "r+") as file:
        # Load what is already stored
        file_data = json.load(file)
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind so the dump overwrites the old content from the first byte
        file.seek(0)
        json.dump(file_data, file)
        # Cut off leftover bytes when the new text is shorter than the old
        # (e.g. the file was pretty-printed); otherwise the JSON is corrupt
        file.truncate()

OUTER LOOP

Looping over the movie IDs, using the function defined above to make the API calls and add the certification to the .info results

In [45]:
# Fetch every movie id that is still missing from the year's results file.
# Failed calls are skipped (best effort) but recorded, so a run in which
# nothing succeeds is no longer silent.
failed_calls = []  # (movie_id, "ErrorType: message") for each call that raised
for movie_id in tqdm_notebook(movie_ids_to_get,
                              desc=f'Movies from {YEAR}',
                              position=1,
                              leave=True):
    try:
        # Retrieve the data for this id with the pre-made function
        temp = movie_with_rating(movie_id)
        # Append the result to the existing file with the pre-made function
        write_json(temp, JSON_FILE)
        # 20 ms pause between successful calls to avoid overwhelming the server
        time.sleep(0.02)
    except Exception as e:
        # Keep going with the next id, but remember what went wrong
        failed_calls.append((movie_id, f"{type(e).__name__}: {e}"))
        continue

# Summarise the failures instead of hiding them
if failed_calls:
    print(f"{len(failed_calls)} of {len(movie_ids_to_get)} calls failed; "
          f"first failure: {failed_calls[0]}")
        

Movies from 2001:   0%|          | 0/1517 [00:00<?, ?it/s]

Saving the year's result as a csv.gz file

In [46]:
# Read back everything stored in the year's JSON results file; this includes
# the placeholder record the file was seeded with
final_year_df = pd.read_json(JSON_FILE)
# Write it out as a gzip-compressed CSV without the index column
final_year_df.to_csv(
    f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz",
    compression="gzip",
    index=False,
)

In [47]:
# Display the frame that was just saved
final_year_df

Unnamed: 0,imdb_id
0,0
