In [None]:
import os
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# from verstack import NaNImputer
# from functools import partial

# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import FunctionTransformer

# from sklearn.model_selection import train_test_split

In [None]:
# Make the project root (as located by pyprojroot's `here()`) the working
# directory, to ensure consistent behaviour across environments: this must run
# before the project-specific import below and before any relative path such as
# "./data/..." is used.
from pyprojroot import here
os.chdir(here())

# Project-specific imports
# NOTE(review): `save_dataframe` is not called in the cells visible here — confirm it is still needed.
from src.utils import save_dataframe


In [None]:
# Read the full imputed dataset into `data`; the path is relative to the
# current working directory.
data = pd.read_csv(
    filepath_or_buffer="./data/1.1_imputed_data_full.csv",
)

In [None]:
# Quick structural overview of the loaded frame.
# A notebook cell only renders its LAST bare expression, so a bare
# `data.head()` in the middle of the cell is silently discarded. Each table is
# therefore shown explicitly with `display()` (available by default in IPython
# kernels); `info()` prints its own report and needs no wrapper.
display(data.head())
data.info()
display(data.describe())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Numeric columns to profile. `numeric_cols` is reused by the correlation cell.
numeric_cols = [
    'metascore',
    'rotten_tomatoes_rating',
    'meta_critic_rating',
    'budget',
    'revenue',
    'tmdb_vote_count',
    'tmdb_vote_average',
    'runtime_in_min',
    'tmdb_popularity',
    'imdb_rating',
    'imdb_votes',
    'release_year',
    'release_month',
    'release_day',
    'movie_age',
]
# Alternative (disabled): derive the list from dtypes instead of hard-coding it
# numeric_cols = data.select_dtypes(include=['number']).columns.tolist()

# Grid of histograms, one panel per numeric column
numeric_frame = data[numeric_cols]
numeric_frame.hist(figsize=(15, 10), bins=30)
plt.tight_layout()
plt.show()

# Boxplot of a single column (revenue) as an example
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x='revenue')
plt.title("Boxplot of Revenue")
plt.show()


In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
corr_matrix = data.loc[:, numeric_cols].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    cmap="coolwarm",
    annot=True,   # write the coefficient in every cell
    fmt=".2f",    # two decimals
)
plt.title("Correlation Matrix")
plt.show()


In [None]:
# Scatter of revenue against budget, one point per row of `data`
plt.figure(figsize=(8, 6))
scatter_ax = sns.scatterplot(data=data, x='budget', y='revenue')
scatter_ax.set_title("Budget vs. Revenue")
scatter_ax.set_xlabel("Budget")
scatter_ax.set_ylabel("Revenue")
plt.show()


In [None]:
# Frequency table for each categorical column, separated by a dashed rule
cat_cols = ['production_country_name', 'spoken_languages', 'age_rating']
for col in cat_cols:
    print(
        f"Value counts for {col}:",
        data[col].value_counts(),
        "-" * 50,
        sep="\n",
    )


In [None]:
# Horizontal bar chart of production countries, most frequent first
country_order = data['production_country_name'].value_counts().index

plt.figure(figsize=(10, 5))
sns.countplot(data=data, y='production_country_name', order=country_order)
plt.title("Production Country Frequency")
plt.show()


In [None]:
# Count of movies per release year and per release month, one figure each.
# Each entry is (column, figure title, x-tick rotation in degrees or None).
release_plots = (
    ('release_year', "Movies per Release Year", 45),
    ('release_month', "Movies per Release Month", None),
)
for release_col, plot_title, tick_rotation in release_plots:
    plt.figure(figsize=(10, 4))
    sns.countplot(data=data, x=release_col)
    plt.title(plot_title)
    if tick_rotation is not None:
        # Rotate the x-axis labels (only the year plot asks for this)
        plt.xticks(rotation=tick_rotation)
    plt.show()


In [None]:
# Relative frequency of each value of the two calendar flag columns
for flag_col in ('is_weekend', 'is_holiday_season'):
    flag_shares = data[flag_col].value_counts(normalize=True)
    print(flag_shares)


In [None]:
# Split the comma-separated genre strings into lists (rows without genres are
# dropped), flatten to one genre per row, then count how often each occurs.
genre_lists = data['genre_names'].dropna().str.split(r',\s*')
genres = genre_lists.explode()
genre_counts = genres.value_counts()
print(genre_counts)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Production company to analyse. It is matched case-insensitively as a literal
# substring of `production_company_name` and reused in the figure title, so the
# filter and the labels cannot drift apart (previously the filter selected
# "Marvel" while comments and title said "Disney").
company_name = "Marvel"

# NOTE: the variable keeps its historical name `disney_data` so that any other
# cell reading it continues to work; it holds the rows matching `company_name`.
disney_data = data[
    data["production_company_name"].str.contains(company_name, case=False, na=False, regex=False)
]

# Keep only rows where both budget_missing and revenue_missing are 0
# (presumably: the value was observed rather than imputed — confirm upstream).
disney_data = disney_data[(disney_data["budget_missing"] == 0) & (disney_data["revenue_missing"] == 0)]

# Mean budget and mean revenue per release year
budget_revenue_by_year = disney_data.groupby("release_year")[["budget", "revenue"]].mean().reset_index()

# Plot the yearly average budget. Only the budget series is drawn (average
# revenue is available in `budget_revenue_by_year["revenue"]` but not plotted),
# so the title and y-label describe the budget alone.
plt.figure(figsize=(10, 6))
plt.plot(
    budget_revenue_by_year["release_year"],
    budget_revenue_by_year["budget"],
    marker="o",
    linestyle="-",
    label="Average Budget",
)
plt.xlabel("Release Year")
plt.ylabel("Average Budget")
plt.title(f"Average Budget Across Release Years for {company_name} Movies")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Production company to analyse. It is matched case-insensitively as a literal
# substring of `production_company_name` and reused in the figure title
# (previously the filter selected "Marvel" while comments and title said "Disney").
company_name = "Marvel"

company_data = data[
    data["production_company_name"].str.contains(company_name, case=False, na=False, regex=False)
]

# Keep only rows where both budget_missing and revenue_missing are 0
# (presumably: the value was observed rather than imputed — confirm upstream).
company_data = company_data[(company_data["budget_missing"] == 0) & (company_data["revenue_missing"] == 0)]

# One point per movie. These are raw, un-aggregated rows in frame order, so a
# connected line would zigzag between years and between movies of the same
# year; a scatter shows the same values without implying a sequence. Nothing is
# averaged here, hence the "Budget" (not "Average Budget") labels.
plt.figure(figsize=(10, 6))
plt.scatter(company_data["release_year"], company_data["budget"], marker="o", label="Budget")
plt.xlabel("Release Year")
plt.ylabel("Budget")
plt.title(f"Budget per Movie Across Release Years for {company_name} Movies")
plt.legend()
plt.grid(True)
plt.show()