In [35]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Additional Imports
import os, json, math, time
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

In [36]:
# Check for / create the data folder (`os` is already imported in the top cell).
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
# os.listdir returns entries in arbitrary order; sort so the displayed listing
# is stable from run to run.
sorted(os.listdir(FOLDER))

['basics_clean.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title.akas.csv.gz',
 'title.ratings.csv.gz',
 'title_basics.csv.gz',
 'tmdb_api_results_2001.json']

In [37]:
# Load the TMDB API key from a credentials file kept in the user's home directory.
# expanduser('~') resolves the home directory at run time, so the notebook is not
# tied to one particular user's absolute path.
with open(os.path.expanduser('~/.secret/tmdb_api.json'), 'r') as f:
    login = json.load(f)
## Display only the keys of the loaded dict (never the secret value itself)
login.keys()

dict_keys(['api-key'])

In [38]:
# Hand the key loaded from the credentials file to tmdbsimple by assigning it
# to the module-level `tmdb.API_KEY` attribute.
tmdb.API_KEY =  login['api-key']

In [39]:
# Example request: fetch the details for TMDB id 603
# ('The Matrix', per the recorded output of this cell).
movie = tmdb.Movies(603)

# info() returns the movie details (a dict in the recorded output);
# the bare expression on the last line displays it.
info = movie.info()
info

{'adult': False,
 'backdrop_path': '/y9wuhlrqSHvhTLNVNwKMKe6HZzY.jpg',
 'belongs_to_collection': {'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg',
  'backdrop_path': '/bRm2DEgUiYciDw3myHuYFInD7la.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 59.281,
 'poster_path': '/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/tpFpsqbleCzEE2p5EgvUq6ozfCA.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': 'US'},
  {'id': 372,
   'logo_path': None,
   'name': 'Groucho II Film

In [40]:
# From Sample Solution function
def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB details and attach its US certification.

    Parameters
    ----------
    movie_id : int or str
        Identifier passed straight to ``tmdb.Movies``. This notebook calls it
        with both a numeric id (603) and an IMDb id string ('tt0035423').

    Returns
    -------
    dict
        The dict returned by ``movie.info()`` with a ``'certification'`` key
        added: the certification of the US entry in
        ``movie.releases()['countries']``, or ``None`` when there is no US entry.

    Raises
    ------
    Any exception raised by the underlying ``tmdb`` calls is propagated unchanged.
    """
    ## Get movie and release dates
    movie = tmdb.Movies(movie_id)
    ## Construct output dict
    movie_info = movie.info()
    releases = movie.releases()
    for c in releases['countries']:
        # If several US entries exist, the last one wins.
        if c['iso_3166_1'] == 'US':
            movie_info['certification'] = c['certification']
    # Guarantee the key exists even when no US release was listed, so every
    # returned record has the same fields.
    movie_info.setdefault('certification', None)
    return movie_info

In [41]:
# Try the helper with an IMDb id string ('tt...') rather than a numeric TMDB id;
# the recorded output shows the lookup resolving to TMDB id 11232.
get_movie_with_rating('tt0035423')

{'adult': False,
 'backdrop_path': '/ab5yL8zgRotrICzGbEl10z24N71.jpg',
 'belongs_to_collection': None,
 'budget': 48000000,
 'genres': [{'id': 35, 'name': 'Comedy'},
  {'id': 14, 'name': 'Fantasy'},
  {'id': 10749, 'name': 'Romance'}],
 'homepage': '',
 'id': 11232,
 'imdb_id': 'tt0035423',
 'original_language': 'en',
 'original_title': 'Kate & Leopold',
 'overview': "When her scientist ex-boyfriend discovers a portal to travel through time -- and brings back a 19th-century nobleman named Leopold to prove it -- a skeptical Kate reluctantly takes responsibility for showing Leopold the 21st century. The more time Kate spends with Leopold, the harder she falls for him. But if he doesn't return to his own time, his absence will forever alter history.",
 'popularity': 21.64,
 'poster_path': '/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg',
 'production_companies': [{'id': 85,
   'logo_path': None,
   'name': 'Konrad Pictures',
   'origin_country': ''},
  {'id': 14,
   'logo_path': '/m6AHu84oZQxvq7n1rsvMN

In [42]:
# Years to request from the TMDB API; the download loop compares each value
# against the `startYear` column of `basics`.
YEARS_TO_GET = [2000,2001]

In [43]:
# Load in the dataframe from project part 1 as basics.
# The path is built from FOLDER ("Data/") so it matches the directory created
# earlier exactly; a lowercase 'data/' only resolves on case-insensitive filesystems.
basics = pd.read_csv(f'{FOLDER}basics_clean.csv.gz')

In [44]:
# Preview the first rows to check that the dataframe loaded correctly
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,,20,Short
1,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021,,6,Short
2,tt0044326,short,Abstronic,Abstronic,0,2021,,6,Short
3,tt0044879,short,Mandala,Mandala,0,2021,,3,Short
4,tt0052437,short,School,Szkola,0,2011,,7,"Animation,Short"


In [45]:
def write_json(new_data, filename):
    """Add ``new_data`` to the JSON list stored in ``filename`` and rewrite the file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Parameters
    ----------
    new_data : list or any JSON-serialisable object
        A list is merged element by element (``extend``); anything else is
        added as a single item (``append``).
    filename : str
        Path to an existing file whose content is a JSON list.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist (the file is opened in ``'r+'`` mode).
    AttributeError
        If the stored JSON value is not a list (it has no ``append``).
    """
    with open(filename, 'r+') as file:
        # Load the existing content first.
        file_data = json.load(file)
        # Extend when both sides are lists, otherwise append one item.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind and overwrite the file from the start.
        file.seek(0)
        json.dump(file_data, file)
        # Drop leftover bytes if the new text is shorter than the old content
        # (e.g. the file was previously pretty-printed); otherwise the file
        # would end in stale data and no longer be valid JSON.
        file.truncate()

In [46]:
# From Sample Solution function
# NOTE(review): this re-defines the function already defined in an earlier cell
# and shadows it on a top-to-bottom run; keep one definition or keep them in sync.
def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB details and attach its US certification.

    Parameters
    ----------
    movie_id : int or str
        Identifier passed straight to ``tmdb.Movies``. This notebook calls it
        with both a numeric id (603) and an IMDb id string ('tt0035423').

    Returns
    -------
    dict
        The dict returned by ``movie.info()`` with a ``'certification'`` key
        added: the certification of the US entry in
        ``movie.releases()['countries']``, or ``None`` when there is no US entry.

    Raises
    ------
    Any exception raised by the underlying ``tmdb`` calls is propagated unchanged.
    """
    ## Get movie and release dates
    movie = tmdb.Movies(movie_id)
    ## Construct output dict
    movie_info = movie.info()
    releases = movie.releases()
    for c in releases['countries']:
        # If several US entries exist, the last one wins.
        if c['iso_3166_1'] == 'US':
            movie_info['certification'] = c['certification']
    # Guarantee the key exists even when no US release was listed, so every
    # returned record has the same fields.
    movie_info.setdefault('certification', None)
    return movie_info

In [47]:
# Download TMDB data for every requested year, one JSON results file per year.
# Ids already present in a year's JSON file are skipped, so the cell can be
# re-run to resume an interrupted download.
# Failed lookups are collected here instead of being silently discarded.
errors = []

# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    # Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # If the file does not exist yet, create it holding a single placeholder
    # record so it can be read back as a frame with an "imdb_id" column.
    if not os.path.isfile(JSON_FILE):
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # Titles from `basics` whose startYear is the current year
    df = basics.loc[basics['startYear'] == YEAR].copy()
    # Saving movie ids
    movie_ids = df['tconst'].copy()
    # Keep only the ids that are not already stored in the JSON file
    previous_df = pd.read_json(JSON_FILE)
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # Start of INNER loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve the data for the movie id
        try:
            temp = get_movie_with_rating(movie_id)  # This uses your pre-made function
            # Append/extend results to existing file using a pre-made function
            write_json(temp, JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)

        # If it fails, remember which id failed and why, then move on to the
        # next id (best-effort download; nothing is written for this id).
        except Exception as e:
            errors.append([movie_id, e])
            continue

print(f"- Total errors: {len(errors)}")

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/5549 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/5070 [00:00<?, ?it/s]

In [49]:
# Export each year's accumulated API results to a gzip-compressed CSV.
# Looping over YEARS_TO_GET, rather than reusing the YEAR / JSON_FILE values left
# over from the download loop, saves every requested year instead of only the last.
# NOTE: the {'imdb_id': 0} placeholder record written when a JSON file is first
# created is exported as well.
for YEAR in YEARS_TO_GET:
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

In [50]:
# Preview the exported results. In the recorded output the first row is the
# {'imdb_id': 0} placeholder written when the JSON file was created.
final_year_df.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0135240,1.0,,,0.0,[],,506214.0,en,Coming In America,...,0.0,72.0,[],Released,,Coming In America,0.0,0.0,0.0,
2,tt0138432,0.0,,,0.0,"[{'id': 28, 'name': 'Action'}]",,221255.0,en,The Falkland Man,...,0.0,77.0,[],Released,,The Falkland Man,0.0,6.0,1.0,R
3,tt0153812,0.0,/dlg0MqB1PuebsWdtPVQwcEZcVLu.jpg,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 10770,...",,239513.0,en,The Perfect Wife,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Perfect Wife,0.0,4.4,7.0,
4,tt0156976,0.0,,,0.0,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 18,...",,214690.0,en,Chasing Destiny,...,0.0,94.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Love takes you places you never expected,Chasing Destiny,0.0,5.3,4.0,PG-13
