# Movie Database with SQL 
Author: Kim Hazed Delfino


## Imports 

In [53]:
import pandas as pd
import numpy as np

## Load Dataset

In [54]:
# Load files
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

### Preprossing - Title Basics

In [55]:
# Drop missing values from runtimeMinutes and genre
runtime_filter = basics['runtimeMinutes'] != '\\N'

genre_filter = basics['genres'] != '\\N'



In [56]:
# Create new filtered df 
basics_rtime_genre_filtered = basics[runtime_filter & genre_filter]

In [57]:
# Filter titleType and startYear
type_filter = basics_rtime_genre_filtered['titleType'] == 'movie'

years_filter = basics_rtime_genre_filtered['startYear'] != '\\N'

In [58]:
# Create filtered df 
basics_df = basics_rtime_genre_filtered[type_filter & years_filter]

In [59]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 377026 entries, 8 to 9909030
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          377026 non-null  object
 1   titleType       377026 non-null  object
 2   primaryTitle    377026 non-null  object
 3   originalTitle   377026 non-null  object
 4   isAdult         377026 non-null  object
 5   startYear       377026 non-null  object
 6   endYear         377026 non-null  object
 7   runtimeMinutes  377026 non-null  object
 8   genres          377026 non-null  object
dtypes: object(9)
memory usage: 28.8+ MB


In [60]:
# Convert startYear value into int dtype
basics_df['startYear'] = basics_df['startYear'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics_df['startYear'] = basics_df['startYear'].astype(int)


In [61]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 377026 entries, 8 to 9909030
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          377026 non-null  object
 1   titleType       377026 non-null  object
 2   primaryTitle    377026 non-null  object
 3   originalTitle   377026 non-null  object
 4   isAdult         377026 non-null  object
 5   startYear       377026 non-null  int64 
 6   endYear         377026 non-null  object
 7   runtimeMinutes  377026 non-null  object
 8   genres          377026 non-null  object
dtypes: int64(1), object(8)
memory usage: 28.8+ MB


In [62]:
max(basics_df['startYear'])

2029

In [63]:
# Filter movies with only startYear 2000 to 2022
year_filter_2000 = basics_df['startYear'] >= 2000
year_filter_2021 = basics_df['startYear'] < 2022
basics_20_to_21 = basics_df[year_filter_2000 & year_filter_2021]

In [64]:
basics_20_to_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210722 entries, 13082 to 9909030
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          210722 non-null  object
 1   titleType       210722 non-null  object
 2   primaryTitle    210722 non-null  object
 3   originalTitle   210722 non-null  object
 4   isAdult         210722 non-null  object
 5   startYear       210722 non-null  int64 
 6   endYear         210722 non-null  object
 7   runtimeMinutes  210722 non-null  object
 8   genres          210722 non-null  object
dtypes: int64(1), object(8)
memory usage: 16.1+ MB


In [65]:
# Replace '\N' values with np.nan
basics_20_to_21.replace({'\\N':np.nan},inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics_20_to_21.replace({'\\N':np.nan},inplace=True)


In [66]:
# Check df
basics_20_to_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210722 entries, 13082 to 9909030
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          210722 non-null  object 
 1   titleType       210722 non-null  object 
 2   primaryTitle    210722 non-null  object 
 3   originalTitle   210722 non-null  object 
 4   isAdult         210722 non-null  object 
 5   startYear       210722 non-null  int64  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  210722 non-null  object 
 8   genres          210722 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 16.1+ MB


In [67]:
basics_20_to_21.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,94,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
42384,tt0043139,movie,Life of a Beijing Policeman,Wo zhe yi bei zi,0,2013,,120,"Drama,History"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67668,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama


In [68]:
# Filter out Documentary in our df
is_documentary = basics_20_to_21['genres'].str.contains('documentary',case=False)
basics_20_21_filtered = basics_20_to_21[~is_documentary]



In [69]:
basics_20_21_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138585 entries, 34803 to 9908930
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          138585 non-null  object 
 1   titleType       138585 non-null  object 
 2   primaryTitle    138585 non-null  object 
 3   originalTitle   138585 non-null  object 
 4   isAdult         138585 non-null  object 
 5   startYear       138585 non-null  int64  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  138585 non-null  object 
 8   genres          138585 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 10.6+ MB


### Preprocessing - AKAs

In [70]:
# Check df 
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36139188 entries, 0 to 36139187
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


In [71]:
# Keep only US region
us_akas = akas[akas['region']== 'US']

In [72]:
# Replace '\\N' with np.nan
us_akas.replace({'\\N':np.nan},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_akas.replace({'\\N':np.nan},inplace=True)


In [73]:
# Double check
us_akas.sample(5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
31144829,tt6998836,1,American Dream,US,,imdbDisplay,,0
1137083,tt0151246,1,Kooky Loopy,US,,imdbDisplay,,0
17818293,tt1796491,2,Geocache: A Global Scavenger Hunt,US,,imdbDisplay,,0
955620,tt0113100,3,The Four Diamonds,US,,imdbDisplay,,0
3636348,tt10111096,1,The First Pitch,US,,imdbDisplay,,0


In [74]:
# Filter only US using AKAs dataset 
us_filter = basics_20_21_filtered['tconst'].isin(us_akas['titleId'])


In [75]:
# Create filtered df
us_basics_df = basics_20_21_filtered[us_filter]

In [76]:
# Doublec check df 
us_basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81745 entries, 34803 to 9908846
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81745 non-null  object 
 1   titleType       81745 non-null  object 
 2   primaryTitle    81745 non-null  object 
 3   originalTitle   81745 non-null  object 
 4   isAdult         81745 non-null  object 
 5   startYear       81745 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  81745 non-null  object 
 8   genres          81745 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 6.2+ MB


In [77]:
us_basics_df.sample(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
2550533,tt1288411,movie,You Are Here,You Are Here,0,2010,,79,"Drama,Fantasy,Sci-Fi"
8416442,tt6652576,movie,Ten Short Films,Ten Short Films,0,2017,,65,"Comedy,Drama,Family"
7965850,tt5639650,movie,Phantom of the Theatre,Mo gong mei ying,0,2016,,103,"Drama,Mystery,Romance"
908485,tt0938341,movie,Tokyo Sonata,Tokyo Sonata,0,2008,,120,Drama
6800431,tt2984576,movie,Death House,Death House,0,2017,,95,"Action,Crime,Horror"


### Preprocessing - Ratings

In [78]:
# Check df
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1320058 entries, 0 to 1320057
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1320058 non-null  object 
 1   averageRating  1320058 non-null  float64
 2   numVotes       1320058 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.2+ MB


In [79]:
ratings.sample(5)

Unnamed: 0,tconst,averageRating,numVotes
152695,tt0246992,8.7,30
776484,tt1754533,5.3,17
828867,tt2015548,8.1,349
1151363,tt6039532,6.9,1285
795657,tt1840672,6.9,19


In [80]:
# Filter only US region
us_rating_filter =  ratings['tconst'].isin(us_akas['titleId'])

In [81]:
# Create filtered df 
us_ratings_df = ratings[us_rating_filter]

# Replace '\\N' with np.nan
us_ratings_df.replace({'\\N':np.nan},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_ratings_df.replace({'\\N':np.nan},inplace=True)


In [82]:
# Double check df
us_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 501007 entries, 0 to 1320033
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         501007 non-null  object 
 1   averageRating  501007 non-null  float64
 2   numVotes       501007 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB


In [83]:
us_ratings_df.sample(5)

Unnamed: 0,tconst,averageRating,numVotes
631695,tt1355997,8.3,573
447778,tt0993781,8.2,44
453693,tt10086060,9.0,5
184735,tt0311821,6.9,272
926146,tt2597178,8.8,13


In [84]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")


['title_basics.csv.gz', 'title_akas.csv.gz', 'title_ratings.csv.gz']

In [85]:
## Save dataframe to file.
us_basics_df.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

us_ratings_df.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

us_akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)



# API Calls 

## Imports

In [86]:
import os, time, json
import tmdbsimple as tmdb
from tqdm import tqdm_notebook
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['title_basics.csv.gz', 'title_akas.csv.gz', 'title_ratings.csv.gz']

## Credentials

In [87]:
with open("/Users/kim/.secret/tmdb_api.json", "r") as f:
    login = json.load(f)

# Display loaded dict 
login.keys()

dict_keys(['client-id', 'api-key'])

In [88]:
# Set API Key 
tmdb.API_KEY = login['client-id'] # api key stored as client-id and api_token as api-key

## Custom Function 

In [89]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [90]:
def get_movie_with_rating(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # Save the .info .realeases dict
    info = movie.info()

    release = movie.releases()
    # Loop through countries in releases
    for c in release['countries']:
        # Store if c == US 
        if c['iso_3166_1'] == 'US':
            info['certification'] = c['certification']

    return info

In [91]:
# Testing custom func
avenger_info = get_movie_with_rating("tt0848228")
avenger_info

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 108.942,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [92]:
the_notebook_info = get_movie_with_rating("tt0332280")
the_notebook_info

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 51.088,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/5ThIuO93vsk47oexKTSdfKEr7EC.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 26

In [93]:
# Load basics csv
basics_df = pd.read_csv("/Users/kim/Documents/GitHub/repo/Movie_database/Data/title_basics.csv.gz")

In [94]:
# Set target 
YEARS_TO_GET  = [2000,2001]


In [95]:
# Capture errors
errors = []

In [96]:
# Loop through the data
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position = 0):
    JSON_FILE = f"{FOLDER}tmdb_api_results_{YEAR}.json"
    is_file = os.path.isfile(JSON_FILE)
    if not is_file:
        with open(JSON_FILE,'w') as f:
            json.dump([{"imdb_id":0}],f)

        df = basics_df.loc[basics_df['startYear']== YEAR].copy()
        movie_ids = df['tconst'].copy()
        # load existing data
        previous_df = pd.read_json(JSON_FILE)

        movie_id_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

        for movie_id in tqdm_notebook(movie_id_to_get,
                                      desc= f'Movies from {YEAR}',
                                      position =1,
                                      leave=True):
            try:
                # Store data for the movie id 
                temp = get_movie_with_rating(movie_id)
                write_json(temp,JSON_FILE)
                time.sleep(.02)
                
            except Exception as e:
                errors.append([movie_id,e])
        final_year_df = pd.read_json(JSON_FILE)
        final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression='gzip',
                     index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position = 0):


YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie_id in tqdm_notebook(movie_id_to_get,


Movies from 2000:   0%|          | 0/1449 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie_id in tqdm_notebook(movie_id_to_get,


Movies from 2001:   0%|          | 0/1569 [00:00<?, ?it/s]

In [97]:
print(f" total errors: {len(errors)}")

 total errors: 444


In [110]:
# load the final datas
tmdb_2000 = pd.read_csv("/Users/kim/Documents/GitHub/repo/Movie_database/Data/final_tmdb_data_2000.csv.gz", sep=',',lineterminator='\n', low_memory=False)
tmdb_2001 = pd.read_csv("/Users/kim/Documents/GitHub/repo/Movie_database/Data/final_tmdb_data_2001.csv.gz", sep=',',lineterminator='\n',low_memory=False)


In [111]:
# Combine both years into one dataframe 
combined_df = pd.concat([tmdb_2000, tmdb_2001])

## Exploratory Data Analysis

In [112]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2576 entries, 0 to 1335
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                2576 non-null   object 
 1   adult                  2574 non-null   float64
 2   backdrop_path          1408 non-null   object 
 3   belongs_to_collection  208 non-null    object 
 4   budget                 2574 non-null   float64
 5   genres                 2574 non-null   object 
 6   homepage               172 non-null    object 
 7   id                     2574 non-null   float64
 8   original_language      2574 non-null   object 
 9   original_title         2574 non-null   object 
 10  overview               2524 non-null   object 
 11  popularity             2574 non-null   float64
 12  poster_path            2316 non-null   object 
 13  production_companies   2574 non-null   object 
 14  production_countries   2574 non-null   object 
 15  rele

In [113]:
combined_df.head()


Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
4,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,"{'id': 1131062, 'name': 'Wong Kar-Wai’s Love T...",150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,...,14204632.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.112,2221.0,PG


### Questions to answer

- How many movies had at least some valid financial info? (budget OR revenue > 0)

In [116]:
# Create filter
has_budget = combined_df["budget"] > 0
has_revenue = combined_df["revenue"] > 0


In [118]:
# Filter with financial movies
with_financial_filtered = combined_df[has_budget | has_revenue]
with_financial_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638 entries, 1 to 1312
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                638 non-null    object 
 1   adult                  638 non-null    float64
 2   backdrop_path          541 non-null    object 
 3   belongs_to_collection  110 non-null    object 
 4   budget                 638 non-null    float64
 5   genres                 638 non-null    object 
 6   homepage               76 non-null     object 
 7   id                     638 non-null    float64
 8   original_language      638 non-null    object 
 9   original_title         638 non-null    object 
 10  overview               636 non-null    object 
 11  popularity             638 non-null    float64
 12  poster_path            619 non-null    object 
 13  production_companies   638 non-null    object 
 14  production_countries   638 non-null    object 
 15  relea

- How many movies are there in each of the certification categories? 

In [121]:
# counter per categories
with_financial_filtered['certification'].value_counts()

R        232
PG-13    131
PG        35
NR        17
G         15
Name: certification, dtype: int64

- What is the average revenue per certification catergory? 

In [138]:
# get avg rev per certification
for cert in with_financial_filtered['certification'].unique():
    certification_filtered = with_financial_filtered[with_financial_filtered['certification']== cert]
    avg_rev = certification_filtered['revenue'].mean()
    print(f"Ave revenue for {cert} is ${avg_rev}")

Ave revenue for nan is $nan
Ave revenue for PG is $110679080.08571428
Ave revenue for R is $32465051.612068966
Ave revenue for G is $117364760.8
Ave revenue for PG-13 is $99287858.32824427
Ave revenue for NR is $9588674.352941176


- What is the ave budget per cetification? 

In [139]:
# get avg rev per certification
for cert in with_financial_filtered['certification'].unique():
    certification_filtered = with_financial_filtered[with_financial_filtered['certification']== cert]
    avg_rev = certification_filtered['budget'].mean()
    print(f"Ave budget for {cert} is ${avg_rev}")

Ave budget for nan is $nan
Ave budget for PG is $44828492.4
Ave budget for R is $19484070.905172415
Ave budget for G is $38133333.333333336
Ave budget for PG-13 is $42993571.96183206
Ave budget for NR is $6302358.470588235


In [140]:
# Convert df back to csv
combined_df.to_csv(f"tmdb_results_combined.csv.gz", compression='gzip',
                     index=False)