In [None]:
import requests
import pandas as pd
import json
from tqdm import tqdm

In [None]:
# Load the cleaned MPAA export; the untouched frame keeps its own name so the
# CSV does not have to be re-read if the working copy is modified later.
all_mpaa_info = pd.read_csv('../data/clean_mpaa_data.csv')

# Discard the 'Unnamed: 0' column (presumably a saved index -- TODO confirm).
mpaa_info = all_mpaa_info.drop(['Unnamed: 0'], axis='columns')
mpaa_info.info()

In [None]:
# Adjust title formats to align with TMDB format.
# Titles that END in ', The' or ', A' have the article moved to the front
# ('Matrix, The' -> 'The Matrix'); every other title is copied unchanged.
#
# The pattern is anchored with ^...$ so that only a trailing article is moved.
# An unanchored replace of ', A' would also strip text from the middle of a
# title (e.g. 'Life, Animated' -> 'Lifenimated').
tmdb_df = pd.DataFrame({'mpaa_title': mpaa_info['title']})
tmdb_df['tmdb_title'] = tmdb_df['mpaa_title'].str.replace(
    r'^(.*), (The|A)$',  # group 1: title body, group 2: trailing article
    r'\2 \1',            # article first, then the title body
    regex=True,          # explicit: the pandas default for `regex` changed in 2.0
)

In [None]:
# Summarize the columns, dtypes and non-null counts of the title lookup frame.
tmdb_df.info()

In [None]:
# Read the TMDB key from a local JSON file so it is never written into the notebook.
with open('../data/mpaa_keys.json') as key_file:
    credentials = json.loads(key_file.read())

# The key is stored under the 'tmdb_key' entry of that file.
api_key = credentials['tmdb_key']

In [None]:
# TMDB v3 movie-search URL; the cells below call it with `api_key` and `query` parameters.
endpoint = 'https://api.themoviedb.org/3/search/movie'

#### Test the process with a small dataset

In [None]:
# Work on the first 20 titles only. The explicit .copy() makes tmdb_test an
# independent frame, so adding columns to it later neither raises
# SettingWithCopyWarning nor risks touching tmdb_df.
tmdb_test = tmdb_df.iloc[:20].copy()
tmdb_test

In [None]:
# Query TMDB once per test title, keeping the raw matches and how many came back.
results = []
counts = []

for title in tmdb_test['tmdb_title']:

    params = {
        'api_key' : api_key,
        'query' : title
    }

    # A timeout keeps one stalled connection from hanging the whole cell.
    response = requests.get(endpoint,
                            params=params,
                            timeout=30)

    # Surface HTTP failures (bad key, rate limiting, ...) as an HTTPError
    # instead of an opaque KeyError on 'results' further down.
    response.raise_for_status()

    # Decode the JSON body once and reuse it for both lists.
    matches = response.json()['results']
    results.append(matches)
    counts.append(len(matches))

# assign() builds a new frame, so this is safe even though tmdb_test was
# sliced out of tmdb_df (no SettingWithCopyWarning).
tmdb_test = tmdb_test.assign(results=results, res_count=counts)

In [None]:
# Inspect the test frame, which now carries the `results` and `res_count` columns.
tmdb_test

#### Iterate through all of the titles obtained from the MPAA site

In [None]:
# Query TMDB once per title in tmdb_df. `results` collects the raw list of
# matches for each title and `counts` the number of matches, both in row order.
results = []
counts = []

for title in tqdm(tmdb_df['tmdb_title']):

    params = {
        'api_key' : api_key,
        'query' : title
    }

    # A timeout keeps one stalled connection from hanging this long-running cell.
    response = requests.get(endpoint,
                            params=params,
                            timeout=30)

    # Surface HTTP failures (bad key, rate limiting, ...) as an HTTPError
    # instead of an opaque KeyError on 'results' further down.
    response.raise_for_status()

    # Decode the JSON body once and reuse it for both lists.
    matches = response.json()['results']
    results.append(matches)
    counts.append(len(matches))

In [None]:
# Attach the raw search results and the per-title match counts collected by the request loop.
tmdb_df = tmdb_df.assign(results=results, res_count=counts)

In [None]:
# Check that the `results` and `res_count` columns are present and fully populated.
tmdb_df.info()

In [None]:
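# Optional checkpoint: uncomment the to_pickle line to save the raw API results
# so the request loop does not have to be re-run. DataFrame.to_pickle is a
# DataFrame method, so the pickle import is not required for it.
# import pickle
# tmdb_df.to_pickle('../data/tmdb_raw.pkl')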

In [None]:
# How many records were retrieved in total?
# This counts only what each single response returned; the requests send no
# `page` parameter, so any further result pages are not included.
tmdb_df['res_count'].sum()

#### Normalize the records retrieved

In [None]:
# Keep only the titles whose search returned at least one match.
has_matches = tmdb_df['res_count'] > 0
matched_results = tmdb_df.loc[has_matches, 'results']

# Spread each title's list of matches across columns, then melt to long form
# so that every match ends up in a single `value` column.
tmdb_details = pd.json_normalize(matched_results).melt()

In [None]:
# Titles with fewer matches than the widest row leave empty cells after the melt; skip those.
match_records = tmdb_details.loc[tmdb_details['value'].notna(), 'value']

tmdb_details = (
    pd.json_normalize(match_records)
    # Remove irrelevant columns
    .drop(columns=['adult', 'backdrop_path', 'poster_path'])
    # Convert release_date column to datetime
    .assign(release_date=lambda details: pd.to_datetime(details['release_date']))
)

tmdb_details.info()

In [None]:
# Remove rows for movies released prior to 1992. Rows with a missing
# release_date (NaT) compare False and are therefore dropped as well.
# The .copy() makes tmdb_filtered an independent frame, so columns can be
# added to it later without SettingWithCopyWarning.
tmdb_filtered = tmdb_details.loc[tmdb_details['release_date'] > '1991-12-31'].copy()

In [None]:
# Summarize the columns, dtypes and non-null counts left after the date filter.
tmdb_filtered.info()

In [None]:
# Create a release_year column to use as part of the process for ultimately merging this info back to the mpaa dataframe.
# assign() returns a new frame instead of writing into tmdb_filtered in place,
# which avoids SettingWithCopyWarning when tmdb_filtered is a filtered slice
# of tmdb_details.
tmdb_filtered = tmdb_filtered.assign(release_year=tmdb_filtered['release_date'].dt.year)

In [None]:
# Optional checkpoint: uncomment to save the filtered TMDB records to disk.
# tmdb_filtered.to_pickle('../data/tmdb_filtered.pkl')