# Part 1

## Filtering/Cleaning

In [1]:
import pandas as pd
import numpy as np

### Title Basics

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [3]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9826265 entries, 0 to 9826264
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 674.7+ MB


#### Replace "\N" with np.nan

In [4]:
basics['runtimeMinutes']= basics['runtimeMinutes'].replace('')

In [5]:
basics = basics.replace({'\\N':np.nan})

#### Eliminate movies that are null for runtimeMinutes and genre

In [6]:
basics.dropna(subset =['runtimeMinutes','genres'], inplace= True)

In [7]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


#### Keep only titleType == Movie

In [8]:
basics_mov = basics[basics['titleType']== 'movie']
basics_mov.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


#### keep startYear 2000-2022

In [9]:
basics_mov['startYear'].value_counts()

2017    14366
2018    14325
2019    14057
2016    13953
2015    13477
        ...  
1899        1
1904        1
1897        1
1896        1
1894        1
Name: startYear, Length: 130, dtype: int64

In [10]:
basics_mov = basics_mov.astype({"startYear": float})
basics_mov.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 381850 entries, 8 to 9826215
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          381850 non-null  object 
 1   titleType       381850 non-null  object 
 2   primaryTitle    381850 non-null  object 
 3   originalTitle   381850 non-null  object 
 4   isAdult         381850 non-null  object 
 5   startYear       375411 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  381850 non-null  object 
 8   genres          381850 non-null  object 
dtypes: float64(1), object(8)
memory usage: 29.1+ MB


In [11]:
basics_year = basics_mov.loc[(basics_mov['startYear'] >= 2000) & (basics_mov['startYear'] <= 2022)]
basics_year

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,,94,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
76059,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,0,2022.0,,46,Documentary
...,...,...,...,...,...,...,...,...,...
9826031,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"
9826115,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019.0,,123,Drama
9826156,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015.0,,57,Documentary
9826183,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007.0,,100,Documentary


#### Eliminate movies that include "Documentary" in genre 

In [12]:
doc = basics_year['genres'].str.contains('Documentary',case=False)
basics_doc = basics_year[~doc]

In [13]:
basics_doc.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [14]:
basics_doc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147425 entries, 34803 to 9826115
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          147425 non-null  object 
 1   titleType       147425 non-null  object 
 2   primaryTitle    147425 non-null  object 
 3   originalTitle   147425 non-null  object 
 4   isAdult         147425 non-null  object 
 5   startYear       147425 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  147425 non-null  object 
 8   genres          147425 non-null  object 
dtypes: float64(1), object(8)
memory usage: 11.2+ MB


### AKA

#### keep only US movies

In [15]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

In [16]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [17]:
akas_us = akas[akas['region']== 'US']
akas_us.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


#### Replace '\N' with np.nan

In [18]:
akas_us = akas_us.replace({'\\N':np.nan})

In [19]:
akas_us.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [20]:
akas_us.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1434625 entries, 5 to 35794469
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1434625 non-null  object
 1   ordering         1434625 non-null  int64 
 2   title            1434625 non-null  object
 3   region           1434625 non-null  object
 4   language         3905 non-null     object
 5   types            978439 non-null   object
 6   attributes       46518 non-null    object
 7   isOriginalTitle  1433280 non-null  object
dtypes: int64(1), object(7)
memory usage: 98.5+ MB


### Ratings

In [21]:
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [22]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1967
1,tt0000002,5.8,263
2,tt0000003,6.5,1812
3,tt0000004,5.6,178
4,tt0000005,6.2,2610


#### Replace '\N' with np.nan

In [23]:
ratings = ratings.replace({'\\N':np.nan})

In [24]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1307127 entries, 0 to 1307126
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1307127 non-null  object 
 1   averageRating  1307127 non-null  float64
 2   numVotes       1307127 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.9+ MB


### Filtered one dataset with another

#### Basics

In [25]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers1 =basics_doc['tconst'].isin(akas_us['titleId'])
keepers1

34803       True
61116       True
67669       True
77964      False
86801       True
           ...  
9825938     True
9825947     True
9825986    False
9826031     True
9826115    False
Name: tconst, Length: 147425, dtype: bool

In [26]:
basics_a = basics_doc.loc[keepers1]
basics_a

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
9825403,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
9825798,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
9825938,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9825947,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


#### Ratings

In [27]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =ratings['tconst'].isin(akas_us['titleId'])
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
1307122    False
1307123    False
1307124    False
1307125    False
1307126    False
Name: tconst, Length: 1307127, dtype: bool

In [28]:
basics_n = ratings.loc[keepers]
basics_n

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1967
1,tt0000002,5.8,263
4,tt0000005,6.2,2610
5,tt0000006,5.2,181
6,tt0000007,5.4,816
...,...,...,...
1307088,tt9916200,8.1,229
1307089,tt9916204,8.1,262
1307096,tt9916348,8.3,18
1307097,tt9916362,6.4,5313


## Export data

In [29]:
## Save current dataframe to file.
basics_doc.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [30]:
basics_f = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics_f.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [31]:
## Save current dataframe to file.
akas_us.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [32]:
# Open saved file and preview again
akas_f = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
akas_f.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [33]:
## Save current dataframe to file.
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [34]:
# Open saved file and preview again
ratings_f = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
ratings_f.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1967
1,tt0000002,5.8,263
2,tt0000003,6.5,1812
3,tt0000004,5.6,178
4,tt0000005,6.2,2610


In [35]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['.ipynb_checkpoints',
 'final_tmdb_data_2000.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json']

# Part 2.b

### Concatenate the data into 1 dataframe for the remainder of the analysis.

In [43]:
year_2000 = pd.read_csv('Data/final_tmdb_data_2000.csv.gz')
year_2001 = pd.read_csv('Data/final_tmdb_data_2001.csv.gz')

In [44]:
df = pd.concat([year_2000, year_2001], ignore_index=True)
df.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG
2,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG
3,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG
4,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13707 entries, 0 to 13706
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                13707 non-null  object 
 1   adult                  13705 non-null  float64
 2   backdrop_path          13705 non-null  object 
 3   belongs_to_collection  8223 non-null   object 
 4   budget                 13705 non-null  float64
 5   genres                 13705 non-null  object 
 6   homepage               13705 non-null  object 
 7   id                     13705 non-null  float64
 8   original_language      13705 non-null  object 
 9   original_title         13705 non-null  object 
 10  overview               13705 non-null  object 
 11  popularity             13705 non-null  float64
 12  poster_path            13705 non-null  object 
 13  production_companies   13705 non-null  object 
 14  production_countries   13705 non-null  object 
 15  re

In [48]:
type['revenue']

type['revenue']

### How many movies had at least some valid financial information (values > 0 for budget OR revenue)?

In [50]:
valid_movies = df['revenue'] > 0
df[valid_movies].head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
1,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG
2,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG
3,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG
4,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG
5,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG


In [None]:
df= df.dropna(subset = ['']

### How many movies are there in each of the certification categories (G/PG/PG-13/R)?

In [55]:
df.groupby('certification')
df.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG
2,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG
3,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG
4,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,"{'id': 1118731, 'name': 'Tom and Jerry Collect...",50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,136536687.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.897,2213.0,PG


### What is the average revenue per certification category?

In [None]:
ave = df.groupby(['certification'])['Revenue'].apply(lambda x:x.average())
print('ave')

### What is the average budget per certification category?

## Data was collected from TMBD
https://www.themoviedb.org/about/logos-attribution