In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests


# Additional Imports
# os - for saving and loading files
# json - to work with json files
# math - to round up results
# time - to add a short pause to not overwhelm the server
import os, json, math, time

# to make yelpapi calls
from yelpapi import YelpAPI
import tmdbsimple as tmdb

# progress bar from tqdm_notebook
from tqdm.notebook import tqdm_notebook

In [2]:
pip install tmdbsimple

Note: you may need to restart the kernel to use updated packages.


In [3]:
import json
with open('/Users/lidiv/.secret/tmdb_api.json') as f: 
    login = json.load(f)
login.keys()

dict_keys(['API Read Access Token', 'api-key'])

In [4]:
# InstantiateAPI Variable
tmdb_api_key= login['api-key']

In [5]:
filtered_df1 = pd.read_csv(r'\Users\lidiv\project03\project3\Data/title_basic_filtered.csv.gz', compression='gzip', low_memory=False)
filtered_df2 = pd.read_csv(r'\Users\lidiv\project03\project3\Data/title_rating_filtered.csv.gz', compression='gzip', low_memory=False)
filtered_df3 = pd.read_csv(r'\Users\lidiv\project03\project3\Data/title_akas_filtereds.csv.gz', compression='gzip', low_memory=False)

In [6]:
# Function to make TMDB API calls and retrieve financial data
def get_movie_info(movie_id):
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb_api_key}&language=en-US"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return {
            'movie_id': movie_id,
            'budget': data.get('budget', 0),
            'revenue': data.get('revenue', 0),
            'certification': data.get('certification', 'N/A')
        }
    else:
        return {
            'movie_id': movie_id,
            'budget': 0,
            'revenue': 0,
            'certification': 'N/A'
        }

In [7]:
# Create empty lists to store results for each year
results_2000 = []
results_2001 = []

In [8]:
# Loop through filtered_df1 and filtered_df2 and retrieve financial data
for index, row in filtered_df1.iterrows():
    if row['startYear'] == 2000 or row['startYear'] == 2001:
        movie_info = get_movie_info(row['tconst'])
        if row['startYear'] == 2000:
            results_2000.append(movie_info)
        elif row['startYear'] == 2001:
            results_2001.append(movie_info)


In [9]:
# Create DataFrames for each year
df_2000 = pd.DataFrame(results_2000)
df_2001 = pd.DataFrame(results_2001)


In [10]:
# Save the results as separate CSV files
df_2000.to_csv('tmdb_results_2000.csv.gz', index=False, compression='gzip')
df_2001.to_csv('tmdb_results_2001.csv.gz', index=False, compression='gzip')

In [11]:
# Concatenate the data into one DataFrame for EDA
combined_df = pd.concat([df_2000, df_2001], ignore_index=True)

In [12]:
# EDA tasks
# Count movies with valid financial information (exclude 0s for budget and revenue)
valid_financial_data = combined_df[(combined_df['budget'] > 0) | (combined_df['revenue'] > 0)]

In [13]:
# Filter movies with budget or revenue > 0
valid_financial_df = combined_df[(combined_df['budget'] > 0) | (combined_df['revenue'] > 0)]

# Calculate the number of movies with valid financial information
num_movies_with_valid_financial_info = len(valid_financial_df)


In [14]:
num_movies_with_valid_financial_info

777

In [15]:
# Count movies in each certification category
certification_counts = combined_df['certification'].value_counts()

In [16]:
certification_counts

N/A    8465
Name: certification, dtype: int64

In [17]:
# Group the data by certification and calculate the average revenue and budget
certification_stats = combined_df.groupby('certification').agg({'revenue': 'mean', 'budget': 'mean'})


In [18]:
certification_stats

Unnamed: 0_level_0,revenue,budget
certification,Unnamed: 1_level_1,Unnamed: 2_level_1
,3351450.0,1611756.0


In [19]:
# Calculate average revenue and budget per certification category
average_revenue_by_cert = combined_df.groupby('certification')['revenue'].mean()
average_budget_by_cert = combined_df.groupby('certification')['budget'].mean()

In [20]:
average_revenue_by_cert

certification
N/A    3.351450e+06
Name: revenue, dtype: float64

In [21]:
average_budget_by_cert

certification
N/A    1.611756e+06
Name: budget, dtype: float64

In [22]:
# Save the final merged CSV file
combined_df.to_csv('tmdb_results_combined.csv.gz', index=False, compression='gzip')

In [23]:
combined_df.head()

Unnamed: 0,movie_id,budget,revenue,certification
0,tt0015414,0,0,
1,tt0102362,0,0,
2,tt0113026,10000000,0,
3,tt0113086,0,0,
4,tt0113092,0,0,


In [24]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8465 entries, 0 to 8464
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   movie_id       8465 non-null   object
 1   budget         8465 non-null   int64 
 2   revenue        8465 non-null   int64 
 3   certification  8465 non-null   object
dtypes: int64(2), object(2)
memory usage: 264.7+ KB
