In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Additional Imports
import os, json, math, time
from tqdm.notebook import tqdm_notebook
import tmdbsimple as tmdb

In [4]:
df_2000 = pd.read_csv("Data/final_tmdb_data2000.csv.gz")
df_2000.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1233 entries, 0 to 1232
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                1233 non-null   object 
 1   adult                  1232 non-null   float64
 2   backdrop_path          674 non-null    object 
 3   belongs_to_collection  112 non-null    object 
 4   budget                 1232 non-null   float64
 5   genres                 1232 non-null   object 
 6   homepage               64 non-null     object 
 7   id                     1232 non-null   float64
 8   original_language      1232 non-null   object 
 9   original_title         1232 non-null   object 
 10  overview               1209 non-null   object 
 11  popularity             1232 non-null   float64
 12  poster_path            1108 non-null   object 
 13  production_companies   1232 non-null   object 
 14  production_countries   1232 non-null   object 
 15  rele

In [3]:
df_2001 = pd.read_csv("Data/final_tmdb_data2001.csv.gz")
df_2001.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1323 entries, 0 to 1322
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                1323 non-null   object 
 1   adult                  1322 non-null   float64
 2   backdrop_path          706 non-null    object 
 3   belongs_to_collection  92 non-null     object 
 4   budget                 1322 non-null   float64
 5   genres                 1322 non-null   object 
 6   homepage               108 non-null    object 
 7   id                     1322 non-null   float64
 8   original_language      1322 non-null   object 
 9   original_title         1322 non-null   object 
 10  overview               1293 non-null   object 
 11  popularity             1322 non-null   float64
 12  poster_path            1189 non-null   object 
 13  production_companies   1322 non-null   object 
 14  production_countries   1322 non-null   object 
 15  rele

In [13]:
## Combining two dataframes
df_merge = pd.concat([df_2001, df_2000])
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2556 entries, 0 to 1232
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                2556 non-null   object 
 1   adult                  2554 non-null   float64
 2   backdrop_path          1380 non-null   object 
 3   belongs_to_collection  204 non-null    object 
 4   budget                 2554 non-null   float64
 5   genres                 2554 non-null   object 
 6   homepage               172 non-null    object 
 7   id                     2554 non-null   float64
 8   original_language      2554 non-null   object 
 9   original_title         2554 non-null   object 
 10  overview               2502 non-null   object 
 11  popularity             2554 non-null   float64
 12  poster_path            2297 non-null   object 
 13  production_companies   2554 non-null   object 
 14  production_countries   2554 non-null   object 
 15  rele

In [17]:
## How many movies had at least some valid financial information (values > 0 for budget OR revenue)?
revenue = df_merge["revenue"] > 0
budget = df_merge["budget"] > 0

movies_financial = df_merge.loc[revenue | budget]
movies_financial["id"].value_counts().sum()

634

In [27]:
## How many movies are there in each of the certification categories (G/PG/PG-13/R)?
df_merge["certifcation"].value_counts()

R          461
PG-13      183
NR          73
PG          62
G           24
NC-17        6
-            1
Unrated      1
Name: certifcation, dtype: int64

In [28]:
keys

array([nan, 'PG-13', 'R', 'NR', 'PG', 'G', '-', 'NC-17', 'Unrated'],
      dtype=object)

In [42]:
## What is the average revenue per certification category?
revenue_dict = {}
keys = df_merge["certifcation"].unique()
for char in keys:
    filter_cert = df_merge["certifcation"] == char
    df = df_merge.loc[filter_cert, :]
    total = df["revenue"].sum()
    revenue_dict.update({char : total})
    
revenue_dict

{nan: 0.0,
 'PG-13': 13003451782.0,
 'R': 7523091974.0,
 'NR': 163007464.0,
 'PG': 3900441687.0,
 'G': 1732447849.0,
 '-': 0.0,
 'NC-17': 0.0,
 'Unrated': 0.0}

In [43]:
## What is the average budget per certification category?
budget_dict = {}
for char in keys:
    filter_cert = df_merge["certifcation"] == char
    df = df_merge.loc[filter_cert, :]
    total = df["budget"].sum()
    revenue_dict.update({char : total})
    
revenue_dict

{nan: 0.0,
 'PG-13': 5634157927.0,
 'R': 4493604450.0,
 'NR': 107140094.0,
 'PG': 1567497234.0,
 'G': 572000000.0,
 '-': 0.0,
 'NC-17': 0.0,
 'Unrated': 0.0}

In [45]:
df_merge.to_csv("Data/TMDB_MERGED.cvs.gz", compression = "gzip", index = False)
