In [None]:
import pandas as pd
import pickle

import requests

import json
from tqdm import tqdm

Pull in original TMDB dataset

In [None]:
# Read the previously filtered TMDB movies back from disk and print a column summary.
# NOTE: unpickling executes whatever the file contains -- only load pickles you created or trust.
tmdb_pickle_path = '../data/tmdb_filtered.pkl'
tmdb_filtered = pd.read_pickle(tmdb_pickle_path)
tmdb_filtered.info()

Do some more cleaning

In [None]:
# Keep only the first row seen for each TMDB id, then renumber the index from 0.
tmdb_filtered = (
    tmdb_filtered
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)
tmdb_filtered.info()

In [None]:
# Load the cleaned MPAA data and discard the 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved -- TODO confirm).
mpaa = (
    pd.read_csv('../data/clean_mpaa_data.csv')
    .drop(columns='Unnamed: 0')
)
mpaa.info()

In [None]:
# Add a column to the MPAA data such that title formatting matches TMDB title formatting.
#
# MPAA titles carry a leading article at the end ("Matrix, The"); `tmdb_title` moves it
# back to the front ("The Matrix").
#
# The pattern is anchored with ^...$ so only a *trailing* ", The" / ", A" / ", An" is
# rewritten. An unanchored replace would also strip those characters from the middle of
# a title (e.g. the ", A" inside "..., And ...") and would mangle ", An" into "n".
# regex=True is passed explicitly because the default of `str.replace` differs between
# pandas versions.
trailing_article = r'^(.*), (The|An|A)$'
mpaa['tmdb_title'] = (
    mpaa['title']
    .str.replace(trailing_article, r'\2 \1', regex=True)
    # .str methods return NaN for non-string entries; fall back to the raw title there.
    .fillna(mpaa['title'])
)

In [None]:
# Restrict TMDB to movies whose title also occurs among the reformatted MPAA titles.
has_mpaa_match = tmdb_filtered['title'].isin(mpaa['tmdb_title'])
tmdb = tmdb_filtered[has_mpaa_match].reset_index(drop=True)
tmdb.info()

The TMDB dataset still has ~7000 more rows than the MPAA dataset. Take a look at movies with the same titles.

In [None]:
# Flag every row whose (title, release_year) pair occurs more than once, then count
# the non-null ids per title. In the result, column 'id' holds that count.
same_title_and_year = tmdb.duplicated(subset=['title', 'release_year'], keep=False)
tmdb_dups = (
    tmdb[same_title_and_year]
    .groupby('title')['id']
    .count()
    .reset_index()
)

In [None]:
# Display the per-title duplicate counts. Uncomment the trailing filter to show only
# titles whose count is 10 or more.
tmdb_dups#.loc[tmdb_dups['id'] >= 10]

4633 movie titles appear at least twice, and 391 of them appear at least 10 times.   
1176 movie titles appear twice in the same year

#### Bring in additional data about these movies to hopefully make matching more successful

In [None]:
# open and access api key
# Read the TMDB bearer token from the local JSON credentials file.
with open('../data/mpaa_keys.json') as key_file:
    credentials = json.loads(key_file.read())

my_token = credentials['tmdb_token']

In [None]:
# Smoke-test the API with a single movie id before requesting the whole dataset.
# `append_to_response=release_dates` must be spelled without surrounding '%' characters:
# a bare '%' is not a valid percent-escape, so the value sent would no longer be
# `release_dates`.
url = "https://api.themoviedb.org/3/movie/812?append_to_response=release_dates&language=en-US"

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {my_token}"
}

# requests has no default timeout; set one so the cell cannot hang indefinitely.
response = requests.get(url, headers=headers, timeout=30)


In [None]:
# Show the Response object from the test request (its repr includes the HTTP status code).
response

In [None]:
# Decode the test reply's JSON body to inspect which fields the API returns.
response.json()

In [None]:
# Fetch the TMDB detail record (with release dates appended) for every movie in `tmdb`.
# Each decoded JSON reply is appended to `results` as-is -- including error payloads --
# so `results` ends up with one entry per row of `tmdb`, in the same order.
results = []

# Request pieces that do not change between movies are built once, outside the loop.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {my_token}"
}
# Authentication is done by the bearer token above, so no `api_key` query parameter
# is sent (the previous URL carried a literal placeholder value for it).
params = {"language": "en-US", "append_to_response": "release_dates"}

# A Session reuses the underlying connection instead of reconnecting for every movie.
with requests.Session() as session:
    session.headers.update(headers)

    for MOVIE_ID in tqdm(tmdb['id'], total=len(tmdb)):
        endpoint = f'https://api.themoviedb.org/3/movie/{MOVIE_ID}'

        # requests has no default timeout; without one a stalled reply blocks the crawl.
        response = session.get(endpoint, params=params, timeout=30)

        results.append(response.json())

In [None]:
# Flatten the nested JSON replies into one column per (dotted) field and discard the
# fields that are not needed downstream.
#
# errors='ignore': which columns json_normalize produces depends on the replies it is
# given. Presumably 'success' / 'status_code' / 'status_message' exist only when at
# least one request returned an error payload (TODO confirm against the API), so a
# strict drop would raise KeyError on a run where a listed column is absent.
unused_columns = [
    'adult', 'backdrop_path', 'homepage', 'poster_path',
    'belongs_to_collection.poster_path', 'belongs_to_collection.backdrop_path',
    'belongs_to_collection', 'success', 'status_code', 'status_message',
]
tmdb_results = pd.json_normalize(results).drop(columns=unused_columns, errors='ignore')

In [None]:
# Keep the analysis columns in a fixed order and parse `release_date` into datetimes.
kept_columns = [
    'id', 'imdb_id', 'title', 'original_title', 'release_date',
    'budget', 'revenue', 'popularity', 'vote_average', 'vote_count',
    'release_dates.results', 'genres', 'original_language', 'overview',
    'production_companies', 'production_countries', 'runtime',
    'spoken_languages', 'status', 'tagline', 'video',
    'belongs_to_collection.id', 'belongs_to_collection.name',
]
tmdb_results = tmdb_results[kept_columns].assign(
    release_date=lambda frame: pd.to_datetime(frame['release_date'])
)

There were some null values in the results (6). These appear to occur when the same movie was somehow associated with multiple id values, so we simply drop the 6 null rows associated with the extraneous ids.

In [None]:
# Discard rows in which every column is null, then renumber the index from 0.
tmdb_results = (
    tmdb_results
    .dropna(how='all')
    .reset_index(drop=True)
)

In [None]:
# Derive the calendar year of each movie from its parsed release date.
release_dates = tmdb_results['release_date']
tmdb_results['release_year'] = release_dates.dt.year

In [None]:
# Print the column summary of the title-matched TMDB frame (`tmdb`).
# NOTE(review): the preceding cells build `tmdb_results`; confirm that `tmdb`, not
# `tmdb_results`, is the frame meant to be inspected here.
tmdb.info()

In [None]:
# NOTE(review): `pickle` is already imported in the notebook's first cell, so this
# repeated import is redundant.
import pickle
# Saving the enriched results is currently disabled; uncomment the line below to
# write `tmdb_results` to disk as a pickle.
#tmdb_results.to_pickle('../data/tmdb_results.pkl')