# IMDB Movie Exploratory Data Analysis

## Library Importation and Data Loading

In [1]:
import numpy as np
import pandas as pd

import glob
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#Lifting pandas' display limits so neither columns nor rows get truncated
pd.options.display.max_columns = None #None = show every column
pd.options.display.max_rows = None #None = show every row

In [3]:
#Collecting the paths of every yearly TMDB export that matches the glob pattern
query = "Data/final_tmdb_data_*.csv.gz"
all_files = glob.glob(query)
all_files.sort() #sorting in place so the years come out in ascending order
# Showing the full list of matched files
all_files

['Data/final_tmdb_data_2000.csv.gz',
 'Data/final_tmdb_data_2001.csv.gz',
 'Data/final_tmdb_data_2002.csv.gz',
 'Data/final_tmdb_data_2003.csv.gz',
 'Data/final_tmdb_data_2004.csv.gz',
 'Data/final_tmdb_data_2005.csv.gz',
 'Data/final_tmdb_data_2006.csv.gz',
 'Data/final_tmdb_data_2007.csv.gz',
 'Data/final_tmdb_data_2008.csv.gz',
 'Data/final_tmdb_data_2009.csv.gz',
 'Data/final_tmdb_data_2010.csv.gz',
 'Data/final_tmdb_data_2011.csv.gz',
 'Data/final_tmdb_data_2012.csv.gz',
 'Data/final_tmdb_data_2013.csv.gz',
 'Data/final_tmdb_data_2014.csv.gz',
 'Data/final_tmdb_data_2015.csv.gz',
 'Data/final_tmdb_data_2016.csv.gz',
 'Data/final_tmdb_data_2017.csv.gz',
 'Data/final_tmdb_data_2018.csv.gz',
 'Data/final_tmdb_data_2019.csv.gz',
 'Data/final_tmdb_data_2020.csv.gz',
 'Data/final_tmdb_data_2021.csv.gz',
 'Data/final_tmdb_data_2022.csv.gz']

In [None]:
#Loading all files into dfs and appending them to list
#df_list = []
#for file in all_files:
#    temp_df = pd.read_csv(file, lineterminator='\n')
#    df_list.append(temp_df)
    
## Concatenating the list of dfs into 1 combined
#df_combined = pd.concat(df_list, ignore_index = True)
#df_combined

In [None]:
#Only needs to be done once

#Saving data
#df_combined.to_csv('Data/tmdb_results_combined.csv.gz', compression='gzip',index=False)

In [None]:
#Reading in the TMDB data for the year 2000 from its gzip-compressed csv
tmdbdata2000 = pd.read_csv("Data/final_tmdb_data_2000.csv.gz")
#Previewing the first 2 rows
tmdbdata2000.head(2)

In [None]:
#Reading in the TMDB data for the year 2001 from its gzip-compressed csv
tmdbdata2001 = pd.read_csv("Data/final_tmdb_data_2001.csv.gz")
#Previewing the first 2 rows
tmdbdata2001.head(2)

In [None]:
#Only needs to be done once

#concatenating data
#results_combined = pd.concat([tmdbdata2000, tmdbdata2001], ignore_index = True)

#loading in the combined dataset from its gzip-compressed csv
#(same path the commented-out save cells write to)
results_combined = pd.read_csv("Data/tmdb_results_combined.csv.gz")

In [None]:
#Counting rows whose id already appeared earlier in the dataframe
results_combined["id"].duplicated().sum()

In [None]:
#Dropping rows that repeat an already-seen id, keeping the first occurrence.
#Without subset, drop_duplicates only removes rows that match in EVERY column,
#which can leave repeated ids behind even though the checks count duplicates by id.
results_combined.drop_duplicates(subset = "id", inplace = True)

#Confirming that no duplicated ids remain (expected result: 0)
results_combined.duplicated(subset = "id").sum()

In [None]:
#Only needs to be done once

#Saving data
#results_combined.to_csv('Data/tmdb_results_combined.csv.gz', compression='gzip',index=False)

In [None]:
#Summarizing the columns, their dtypes and non-null counts
results_combined.info()

## How many movies had at least some valid financial information (values > 0 for budget OR revenue)?

In [None]:
#Displaying the combined dataframe before flagging financial data
results_combined 

In [None]:
#function to determine if a row has valid financial data (budget OR revenue > 0)
def financial_ternary(row):
    """Flag whether a movie row carries any valid financial information.

    A row is valid when its budget OR its revenue is strictly greater
    than 0. Missing (NaN) and negative amounts do not count as valid;
    the earlier ``== 0`` comparison labelled such rows "yes".

    Parameters
    ----------
    row : mapping-like
        Anything indexable by "budget" and "revenue", e.g. a DataFrame row.

    Returns
    -------
    str
        "yes" if budget > 0 or revenue > 0, otherwise "no".
    """
    # NaN > 0 evaluates to False, so missing values fall through to "no"
    if row["budget"] > 0 or row["revenue"] > 0:
        return "yes"
    return "no"

#labelling every row by handing financial_ternary straight to apply (row-wise)
results_combined["finances"] = results_combined.apply(financial_ternary, axis = 1)
#previewing the first rows with the new column
results_combined.head()

In [None]:
#Counting movies with vs. without financial data, styled through the returned Axes
ax = sns.countplot(x = "finances", data = results_combined)

ax.set_title("# Movies with financial budget or revenue data")
ax.set_xlabel("Has Budget or Revenue Data")
ax.set_ylabel("# Movies")
ax.grid(axis = "y")

We can see that a significant portion of the data has neither budget nor revenue information.

In [None]:
#keeping only the rows that were flagged as having financial data
has_information = results_combined["finances"].eq("yes")
results_combined = results_combined.loc[has_information]
results_combined

## How many movies are there in each of the certification categories (G/PG/PG-13/R)?

Excluding any movies with 0's for budget AND revenue

count plot

In [None]:
#Counting movies per certification, styled through the returned Axes
ax = sns.countplot(x = "certification", data = results_combined)

ax.set_title("Number Movies by Rating")
ax.set_xlabel("Movie Rating")
ax.set_ylabel("# Movies")
ax.grid(axis = "y")

We can see that there are over 200 movies rated R, roughly 125 movies rated PG-13, and fewer than 50 movies each rated PG, G, or NR.

## What is the average revenue per certification category?

Excluding any movies with 0's for budget AND revenue

bar plot

In [None]:
#Plotting the mean revenue per certification, styled through the returned Axes
ax = sns.barplot(x = "certification", y = "revenue", data = results_combined)

ax.set_title("Average Revenue per Movie Rating")
ax.set_xlabel("Movie Rating")
ax.set_ylabel("Average Revenue")
ax.grid(axis = "y")

We can see here that movies rated PG, G, and PG-13 all have average revenues greater than or equal to 10 million USD.

## What is the average budget per certification category?

Excluding any movies with 0's for budget AND revenue

bar plot

In [None]:
#Plotting the mean budget per certification, styled through the returned Axes
ax = sns.barplot(x = "certification", y = "budget", data = results_combined)

ax.set_title("Average Budget per Movie Rating")
ax.set_xlabel("Movie Rating")
ax.set_ylabel("Average Budget")
ax.grid(axis = "y")

We can see here that the average budget for PG and PG-13 movies is over 4 million USD.