In [1]:
import pandas as pd
import numpy as np
import pickle

Read in TMDB data

In [2]:
# Load the pickled TMDB results and summarise the columns.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
tmdb_path = '../data/tmdb_results.pkl'
tmdb = pd.read_pickle(tmdb_path)
tmdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28952 entries, 0 to 28951
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          28952 non-null  float64       
 1   imdb_id                     24983 non-null  object        
 2   title                       28952 non-null  object        
 3   original_title              28952 non-null  object        
 4   release_date                28951 non-null  datetime64[ns]
 5   budget                      28952 non-null  float64       
 6   revenue                     28952 non-null  float64       
 7   popularity                  28952 non-null  float64       
 8   vote_average                28952 non-null  float64       
 9   vote_count                  28952 non-null  float64       
 10  release_dates.results       28952 non-null  object        
 11  genres                      28952 non-null  object    

In [3]:
# Pull out only the US certification data.
# Each 'release_dates.results' cell is iterated as a collection of per-country
# dicts; only those whose 'iso_3166_1' code is 'US' are kept (possibly none).
# The column's values are iterated directly rather than looked up by integer
# label, so this no longer depends on `tmdb` having a default 0..n-1 index.
tmdb_certs = [
    [entry for entry in country_entries if entry['iso_3166_1'] == 'US']
    for country_entries in tmdb['release_dates.results']
]

tmdb['certifications'] = tmdb_certs

In [4]:
# Drop rows with no US data: an empty 'certifications' list casts to False
has_us_data = tmdb['certifications'].astype(bool)
tmdb_us = tmdb.loc[has_us_data].reset_index(drop=True)

In [5]:
# Drill down to the relevant mpaa data.
# For every row, look only at the first US entry and keep the records in its
# 'release_dates' that carry a non-empty certification. Rows without any such
# record get a single placeholder record whose certification is the string
# 'None', so every row contributes exactly one non-empty list.
tmdb_mpaa = []

for us_entries in tmdb_us['certifications']:
    # Build the filtered list once instead of evaluating it twice per row
    rated = [d for d in us_entries[0]['release_dates'] if d['certification'] != '']
    tmdb_mpaa.append(rated if rated else [{'certification' : 'None'}])

In [6]:
# Normalize the per-row lists, keep the first column of the result
# (presumably the first record of each row -- confirm), then normalize those
# records and store their 'certification' field as the 'mpaa' column
first_records = pd.json_normalize(tmdb_mpaa).iloc[:, 0]
tmdb_us['mpaa'] = pd.json_normalize(first_records)['certification']

In [7]:
# How often each certification value occurs
certification_counts = tmdb_us['mpaa'].value_counts()
certification_counts

None       8648
R          6270
PG-13      2634
PG         1270
NR          714
G           370
NC-17        40
UR            1
PG-13         1
Unrated       1
Name: mpaa, dtype: int64

In [8]:
# Remove the columns that are not used from here on
unneeded_columns = [
    'release_dates.results',
    'status',
    'belongs_to_collection.id',
    'belongs_to_collection.name',
    'tagline',
    'certifications',
]
tmdb_us = tmdb_us.drop(columns=unneeded_columns)

Take a look at the financial data

In [9]:
# Summary statistics for the two financial columns
financial_columns = ['budget', 'revenue']
tmdb_us[financial_columns].describe()

Unnamed: 0,budget,revenue
count,19949.0,19949.0
mean,7622580.0,20287650.0
std,23937990.0,94755900.0
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,300000.0,9692.0
max,356000000.0,2923706000.0


In [10]:
# Get rows whose budget exceeds $1000 and whose revenue exceeds $10000
# (both comparisons are strict, so rows exactly at a threshold are excluded)
budget_floor = 1000
revenue_floor = 10000
above_budget_floor = tmdb_us['budget'] > budget_floor
above_revenue_floor = tmdb_us['revenue'] > revenue_floor
tmdb_fin = tmdb_us.loc[above_budget_floor & above_revenue_floor].reset_index(drop=True)
tmdb_fin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3841 entries, 0 to 3840
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    3841 non-null   float64       
 1   imdb_id               3838 non-null   object        
 2   title                 3841 non-null   object        
 3   original_title        3841 non-null   object        
 4   release_date          3841 non-null   datetime64[ns]
 5   budget                3841 non-null   float64       
 6   revenue               3841 non-null   float64       
 7   popularity            3841 non-null   float64       
 8   vote_average          3841 non-null   float64       
 9   vote_count            3841 non-null   float64       
 10  genres                3841 non-null   object        
 11  original_language     3841 non-null   object        
 12  overview              3841 non-null   object        
 13  production_compani

In [11]:
# Summary statistics for the financial columns after filtering
filtered_financial_columns = ['budget', 'revenue']
tmdb_fin[filtered_financial_columns].describe()

Unnamed: 0,budget,revenue
count,3841.0,3841.0
mean,36852690.0,102992200.0
std,42627260.0,194861000.0
min,6000.0,10300.0
25%,9804690.0,8112712.0
50%,22000000.0,34619700.0
75%,49900000.0,114281100.0
max,356000000.0,2923706000.0


In [12]:
# Check the financial data for movies that share both title and release year;
# keep=False marks every member of a duplicate group.
# There are none in this dataset
fin_is_duplicate = tmdb_fin.duplicated(subset=['title', 'release_year'], keep=False)
tmdb_fin.loc[fin_is_duplicate]

Unnamed: 0,id,imdb_id,title,original_title,release_date,budget,revenue,popularity,vote_average,vote_count,genres,original_language,overview,production_companies,production_countries,runtime,spoken_languages,video,release_year,mpaa


In [13]:
# Create a revenue multiplier: revenue divided by budget, i.e. how many
# dollars of revenue were earned per dollar of budget
tmdb_fin['budge:rev'] = tmdb_fin['revenue'].div(tmdb_fin['budget'])

tmdb_fin.head()

Unnamed: 0,id,imdb_id,title,original_title,release_date,budget,revenue,popularity,vote_average,vote_count,...,original_language,overview,production_companies,production_countries,runtime,spoken_languages,video,release_year,mpaa,budge:rev
0,812.0,tt0103639,Aladdin,Aladdin,1992-11-25,28000000.0,504050219.0,52.485,7.645,10297.0,...,en,Disney’s animated take on the classic Arabian ...,"[{'id': 2, 'logo_path': '/wdrCwmRnLFJhEoH8GSfy...","[{'iso_3166_1': 'US', 'name': 'United States o...",95.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,1992.0,G,18.001794
1,10437.0,tt0104940,The Muppet Christmas Carol,The Muppet Christmas Carol,1992-12-10,12000000.0,27281507.0,8.653,7.367,797.0,...,en,A retelling of the classic Dickens tale of Ebe...,"[{'id': 2, 'logo_path': '/wdrCwmRnLFJhEoH8GSfy...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",85.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,1992.0,G,2.273459
2,16314.0,tt0103596,3 Ninjas,3 Ninjas,1992-08-07,6500000.0,29000301.0,11.402,5.701,384.0,...,en,"Each year, three brothers Samuel, Jeffrey and ...","[{'id': 9195, 'logo_path': '/ou5BUbtulr6tIt699...","[{'iso_3166_1': 'US', 'name': 'United States o...",84.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,1992.0,PG,4.461585
3,10406.0,tt0104187,Encino Man,Encino Man,1992-05-22,7000000.0,40693477.0,9.375,5.799,566.0,...,en,High school misfits Stoney and Dave discover a...,"[{'id': 915, 'logo_path': '/4neXXpjSJDZPBGBnfW...","[{'iso_3166_1': 'US', 'name': 'United States o...",88.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,1992.0,PG,5.813354
4,137.0,tt0107048,Groundhog Day,Groundhog Day,1993-02-11,14600000.0,71074049.0,26.404,7.603,6984.0,...,en,"A narcissistic TV weatherman, along with his a...","[{'id': 5, 'logo_path': '/71BqEFAF4V3qjjMPCpLu...","[{'iso_3166_1': 'US', 'name': 'United States o...",101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,1993.0,PG,4.868086


Take a look at the popularity/watcher rating data

In [14]:
# Summary statistics for the vote and popularity columns
audience_columns = ['vote_count', 'vote_average', 'popularity']
tmdb_us[audience_columns].describe()

Unnamed: 0,vote_count,vote_average,popularity
count,19949.0,19949.0,19949.0
mean,528.690862,4.558984,8.272913
std,1831.619479,2.667507,72.108074
min,0.0,0.0,0.6
25%,1.0,3.0,0.679
50%,15.0,5.5,2.789
75%,191.0,6.445,8.944
max,33697.0,10.0,8687.747


In [15]:
# Distribution of vote counts among rows that received at least one vote
has_votes = tmdb_us['vote_count'] > 0
tmdb_us.loc[has_votes, 'vote_count'].describe()

count    15901.000000
mean       663.282435
std       2029.695108
min          1.000000
25%          6.000000
50%         40.000000
75%        331.000000
max      33697.000000
Name: vote_count, dtype: float64

In [16]:
# Filter to rows that have at least 40 votes
vote_floor = 40
enough_votes = tmdb_us['vote_count'].ge(vote_floor)
tmdb_pop = tmdb_us.loc[enough_votes].reset_index(drop=True)
tmdb_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7999 entries, 0 to 7998
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    7999 non-null   float64       
 1   imdb_id               7999 non-null   object        
 2   title                 7999 non-null   object        
 3   original_title        7999 non-null   object        
 4   release_date          7999 non-null   datetime64[ns]
 5   budget                7999 non-null   float64       
 6   revenue               7999 non-null   float64       
 7   popularity            7999 non-null   float64       
 8   vote_average          7999 non-null   float64       
 9   vote_count            7999 non-null   float64       
 10  genres                7999 non-null   object        
 11  original_language     7999 non-null   object        
 12  overview              7999 non-null   object        
 13  production_compani

In [17]:
# Summary statistics for the vote and popularity columns after filtering
filtered_audience_columns = ['vote_count', 'vote_average', 'popularity']
tmdb_pop[filtered_audience_columns].describe()

Unnamed: 0,vote_count,vote_average,popularity
count,7999.0,7999.0,7999.0
mean,1308.712089,6.249899,17.937572
std,2711.346533,0.859618,112.975531
min,40.0,1.86,0.6
25%,112.0,5.708,7.2815
50%,326.0,6.277,10.423
75%,1183.5,6.844,16.3435
max,33697.0,8.702,8687.747


In [18]:
# Check the popularity data for movies that share both title and release year
# (the original note reports 8 movies that appear twice); every member of a
# duplicate group is shown, ordered by title
pop_is_duplicate = tmdb_pop.duplicated(subset=['title', 'release_year'], keep=False)
tmdb_pop.loc[pop_is_duplicate].sort_values('title')

Unnamed: 0,id,imdb_id,title,original_title,release_date,budget,revenue,popularity,vote_average,vote_count,genres,original_language,overview,production_companies,production_countries,runtime,spoken_languages,video,release_year,mpaa
0,812.0,tt0103639,Aladdin,Aladdin,1992-11-25,28000000.0,504050219.0,52.485,7.645,10297.0,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",en,Disney’s animated take on the classic Arabian ...,"[{'id': 2, 'logo_path': '/wdrCwmRnLFJhEoH8GSfy...","[{'iso_3166_1': 'US', 'name': 'United States o...",95.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,1992.0,G
7707,343693.0,tt0827990,Aladdin,Aladdin,1992-04-27,0.0,0.0,4.454,6.1,60.0,"[{'id': 16, 'name': 'Animation'}, {'id': 14, '...",en,A young man's life is turned around with the h...,"[{'id': 10820, 'logo_path': None, 'name': 'Ame...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",49.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,1992.0,
1907,509635.0,tt7711170,Alone,Alone,2020-09-10,0.0,0.0,24.161,6.29,538.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",en,A recently widowed traveler is kidnapped by a ...,"[{'id': 111865, 'logo_path': '/4UtwPhdPtJX519T...","[{'iso_3166_1': 'US', 'name': 'United States o...",98.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,2020.0,R
7333,661950.0,tt10192566,Alone,Alone,2020-10-16,0.0,0.0,11.253,5.895,147.0,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",en,"When an outbreak hits, Aidan barricades himsel...","[{'id': 141537, 'logo_path': None, 'name': 'JA...","[{'iso_3166_1': 'US', 'name': 'United States o...",92.0,"[{'english_name': 'German', 'iso_639_1': 'de',...",False,2020.0,R
6536,526052.0,tt2573372,Becoming,Becoming,2020-04-16,0.0,0.0,10.025,5.5,54.0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",en,A couple on a road trip through America encoun...,"[{'id': 44632, 'logo_path': '/neCKH4sJCBhZ1B4Q...","[{'iso_3166_1': 'US', 'name': 'United States o...",98.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,2020.0,
7684,699280.0,tt12221748,Becoming,Becoming,2020-05-06,0.0,0.0,6.297,7.5,135.0,"[{'id': 99, 'name': 'Documentary'}]",en,Join former first lady Michelle Obama in an in...,"[{'id': 122147, 'logo_path': '/eotra9uY7zyivEz...","[{'iso_3166_1': 'US', 'name': 'United States o...",89.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,2020.0,PG
3146,257874.0,tt2991296,Beneath,Beneath,2013-10-08,89.0,0.0,8.74,5.397,136.0,"[{'id': 27, 'name': 'Horror'}]",en,A crew of coal miners becomes trapped 600 feet...,"[{'id': 24428, 'logo_path': '/zH6o8LH5x3mpaJgs...","[{'iso_3166_1': 'US', 'name': 'United States o...",89.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,2013.0,NR
7459,191619.0,tt2325518,Beneath,Beneath,2013-07-19,0.0,0.0,7.122,3.677,110.0,"[{'id': 27, 'name': 'Horror'}]",en,Six high school seniors celebrating with day's...,"[{'id': 12671, 'logo_path': None, 'name': 'Chi...","[{'iso_3166_1': 'US', 'name': 'United States o...",90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,2013.0,
5363,340601.0,tt2069797,Delirium,Delirium,2018-05-10,0.0,0.0,8.039,5.753,215.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",en,A man recently released from a mental institut...,"[{'id': 562, 'logo_path': '/azANEzu3H3Kztzt63s...","[{'iso_3166_1': 'US', 'name': 'United States o...",96.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,2018.0,R
7612,401732.0,tt3131050,Delirium,Delirium,2018-01-19,1000000.0,0.0,6.358,4.8,41.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",en,A group of young men dare a classmate to reach...,"[{'id': 3312, 'logo_path': None, 'name': 'Mart...","[{'iso_3166_1': 'US', 'name': 'United States o...",86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",False,2018.0,


In [None]:
# Show the US certification entries of the first 2020 movie titled 'Alone'
# that has any US data
is_alone = tmdb['title'] == 'Alone'
is_2020 = tmdb['release_year'] == 2020
has_us_entries = tmdb['certifications'].astype(bool)
alone_certs = tmdb.loc[is_alone & is_2020 & has_us_entries, 'certifications']
alone_certs.reset_index(drop=True)[0]



Read in MPAA data

In [19]:
# Load the cleaned MPAA data and discard the 'Unnamed: 0' column
# (presumably an index written out when the csv was saved -- confirm)
mpaa_with_index = pd.read_csv('../data/clean_mpaa_data.csv')
mpaa = mpaa_with_index.drop(columns = 'Unnamed: 0')
mpaa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21685 entries, 0 to 21684
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        21685 non-null  object
 1   rating       21685 non-null  object
 2   reason       20862 non-null  object
 3   distributor  21672 non-null  object
 4   alt_titles   6222 non-null   object
 5   other        1308 non-null   object
 6   year         21685 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 1.2+ MB


In [21]:
# Number of MPAA records per year, ordered by year
records_per_year = mpaa['year'].value_counts()
records_per_year.sort_index()

1992    606
1993    602
1994    617
1995    692
1996    693
1997    658
1998    661
1999    679
2000    756
2001    735
2002    782
2003    937
2004    857
2005    926
2006    849
2007    848
2008    902
2009    789
2010    701
2011    751
2012    732
2013    709
2014    709
2015    614
2016    608
2017    571
2018    578
2019    512
2020    509
2021    521
2022    581
Name: year, dtype: int64

In [None]:
# Adjust title formats to align with TMDB format: move a trailing article
# ("Movie, The" / "Movie, A") to the front ("The Movie" / "A Movie").
# The pattern is anchored to the end of the title, so a ", The" or ", A" that
# occurs elsewhere in a title (for example the ", A" inside ", And") is left
# untouched. The previous unanchored replace removed those occurrences too,
# which corrupted such titles and prevented them from matching TMDB.
# Titles without a trailing article are copied unchanged.
trailing_article = r'^(?P<rest>.*), (?P<article>The|A)$'
mpaa['tmdb_title'] = mpaa['title'].str.replace(
    trailing_article, r'\g<article> \g<rest>', regex=True
)

In [None]:
# Label the year column as coming from the MPAA data
year_rename = {'year' : 'mpaa_year'}
mpaa = mpaa.rename(columns=year_rename)

In [None]:
# Add shifted copies of the MPAA year: 'mpaa_year-1', 'mpaa_year+1' and
# 'mpaa_year+2' (the signed format spec produces the -/+ in each column name)
for year_shift in (-1, 1, 2):
    mpaa[f'mpaa_year{year_shift:+d}'] = mpaa['mpaa_year'] + year_shift

In [None]:
# Keep only the TMDB rows whose certification is one of these five labels
ratings = ['G', 'PG', 'PG-13', 'R', 'NC-17']

has_listed_rating = tmdb_us['mpaa'].isin(ratings)
tmdb_w_rating = tmdb_us.loc[has_listed_rating]
tmdb_w_rating

In [None]:
# MPAA records whose distributor contains 'Home Video'; rows with a missing
# distributor count as non-matches
is_home_video = mpaa['distributor'].str.contains('Home Video', na=False)
mpaa.loc[is_home_video]

In [None]:
# MPAA records that share title, year and rating with another record
# (keep=False shows every member of each duplicate group)
mpaa_is_duplicate = mpaa.duplicated(subset=['tmdb_title', 'mpaa_year', 'rating'], keep=False)
mpaa.loc[mpaa_is_duplicate]

In [None]:
# TMDB rows that share title, release year and certification with another row,
# ordered by title
tmdb_is_duplicate = tmdb_us.duplicated(subset=['title', 'release_year', 'mpaa'], keep=False)
tmdb_us.loc[tmdb_is_duplicate].sort_values('title')

In [None]:
# Match on title where the TMDB release year is one year before the MPAA year
merge1 = mpaa.merge(
    tmdb_us,
    left_on=['tmdb_title', 'mpaa_year-1'],
    right_on=['title', 'release_year'],
)
len(merge1)

In [None]:
# Match on title where the TMDB release year is one year after the MPAA year
merge2 = mpaa.merge(
    tmdb_us,
    left_on=['tmdb_title', 'mpaa_year+1'],
    right_on=['title', 'release_year'],
)
len(merge2)

In [None]:
# Match on title where the TMDB release year is two years after the MPAA year
merge3 = mpaa.merge(
    tmdb_us,
    left_on=['tmdb_title', 'mpaa_year+2'],
    right_on=['title', 'release_year'],
)

In [None]:
# Stack the three year-shifted matches into one frame (displayed, not stored)
shifted_year_matches = [merge1, merge2, merge3]
pd.concat(shifted_year_matches)

In [None]:
# Match on title alone, ignoring the year
mpaa.merge(tmdb_us, left_on=['tmdb_title'], right_on=['title'])

In [None]:
# Re-check how often each certification value occurs in the TMDB data
tmdb_us.loc[:, 'mpaa'].value_counts()

In [None]:
# Match MPAA records to rated TMDB rows on both title and rating
mpaa.merge(
    tmdb_w_rating,
    how='inner',
    left_on=['tmdb_title', 'rating'],
    right_on=['title', 'mpaa'],
)

In [None]:
# Load the numbers_data pickle, drop its 'Unnamed: 0' column and give the
# remaining columns explicit names (assigned by position)
num_column_names = ['release_date', 'title', 'budget_num', 'domestic_gross_num', 'worldwide_gross_num']
num = pd.read_pickle('../data/numbers_data.pkl').drop(columns = 'Unnamed: 0')
num.columns = num_column_names
# Blank out the 'Unknown' placeholder text before parsing the dates
release_text = num['release_date'].str.replace('Unknown', '')
num['release_date'] = pd.to_datetime(release_text)
num['release_year'] = num['release_date'].dt.year
num.info()

In [None]:
# Keep releases from 1992 through 2022, both ends included
in_year_range = num['release_year'].between(1992, 2022)
num = num.loc[in_year_range].reset_index(drop=True)

In [None]:
# Summarise the columns of `num` as it currently stands
num.info()