# Project 3 Part 2 - Exploratory Data Analysis (EDA)

In [3]:
import pandas as pd

In [4]:
# Input files: one gzip-compressed movie data file per year (2000 and 2001).
data_file_paths = [
    "movie_results/movie_data_2000.csv.gz",
    "movie_results/movie_data_2001.csv.gz",
]
# Will hold one DataFrame per input file once they have been read.
dataframes = list()

In [6]:
# Read every yearly file (tab-separated, gzip-compressed) into its own DataFrame.
# The list is reset here so this cell is idempotent: re-running it on its own
# no longer appends the same files a second time, which would duplicate every
# row in the concatenated data and inflate all downstream counts.
dataframes = []
for file_path in data_file_paths:
    df = pd.read_csv(file_path, sep='\t', compression='gzip')
    dataframes.append(df)

In [7]:
# Stack the per-file DataFrames row-wise into one DataFrame, discarding the
# original row labels in favour of a fresh 0..n-1 index.
all_movie_data = pd.concat(objs=dataframes, axis=0, ignore_index=True)

In [8]:
# Keep a movie when its budget or its revenue is above zero; rows where
# neither value is positive are dropped.
has_budget = all_movie_data['budget'] > 0
has_revenue = all_movie_data['revenue'] > 0
valid_financial_data = all_movie_data[has_budget | has_revenue]

In [9]:
# Row count of the filtered frame = number of movies with valid financial data
num_movies_with_financial_info = valid_financial_data.shape[0]

In [10]:
# Tally the financially-valid movies per certification category.
# Note: value_counts() leaves out rows whose certification is missing.
certification_column = valid_financial_data['certification']
certification_counts = certification_column.value_counts()

In [11]:
# Mean revenue and mean budget for each certification category, both taken
# from a single shared grouping of the filtered data.
movies_by_certification = valid_financial_data.groupby('certification')
average_revenue_per_certification = movies_by_certification['revenue'].mean()
average_budget_per_certification = movies_by_certification['budget'].mean()

In [12]:
# Report the headline count first, then each per-certification summary
# underneath its own heading line.
print("Number of movies with valid financial information:", num_movies_with_financial_info)
for heading, summary in (
    ("Number of movies in each certification category:", certification_counts),
    ("Average revenue per certification category:", average_revenue_per_certification),
    ("Average budget per certification category:", average_budget_per_certification),
):
    print(heading)
    print(summary)

Number of movies with valid financial information: 1294
Number of movies in each certification category:
R        448
PG-13    266
PG        70
NR        44
G         28
Name: certification, dtype: int64
Average revenue per certification category:
certification
G        1.126328e+08
NR       1.606649e+07
PG       1.069017e+08
PG-13    9.833302e+07
R        3.023678e+07
Name: revenue, dtype: float64
Average budget per certification category:
certification
G        3.478571e+07
NR       8.794413e+06
PG       4.442849e+07
PG-13    4.310882e+07
R        1.819752e+07
Name: budget, dtype: float64


In [14]:
# Destination for the combined (unfiltered) movie data. The path is relative to
# the notebook's working directory, the same base the input files are read
# from, instead of a machine-specific absolute path, so the notebook runs
# unchanged on any checkout of the project.
output_file = "movie_results/tmdb_results_combined.csv.gz"

# Save the DataFrame as a tab-separated, gzip-compressed file without the index
# column (same separator and compression used when reading the inputs).
all_movie_data.to_csv(output_file, sep='\t', compression='gzip', index=False)