In [29]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Load the dataset
df = pd.read_excel('Godzilla.xlsx')

# Convert the 'Movie Budget (Yen)' and 'Final Revenue (Yen)' columns to numeric values, removing any non-numeric characters
df['Movie Budget (Yen)'] = df['Movie Budget (Yen)'].replace('[\¥,\,]', '', regex=True).replace('-', np.nan).astype(float)
df['Final Revenue (Yen)'] = df['Final Revenue (Yen)'].replace('[\¥,\,]', '', regex=True).replace('-', np.nan).astype(float)

# Convert Yen to Dollars (1 USD = 140 JPY)
conversion_rate = 140
df['Movie Budget (USD)'] = df['Movie Budget (Yen)'] / conversion_rate
df['Final Revenue (USD)'] = df['Final Revenue (Yen)'] / conversion_rate

# Drop the Yen columns as we are using the USD columns for modeling
df = df.drop(columns=['Movie Budget (Yen)', 'Final Revenue (Yen)'])

df['Movie Budget (USD)'] = df.groupby('Era')['Movie Budget (USD)'].transform(lambda x: x.fillna(x.mean()))

# Select relevant features and target
features = ['Movie Budget (USD)', 'Release Date', 'Era', 'Runtime (mins)', "Godzilla's Sizes (In Meters)"]
target = 'Final Revenue (USD)'

# Define transformers for numerical and categorical features
numerical_features = ['Movie Budget (USD)', 'Runtime (mins)', "Godzilla's Sizes (In Meters)", 'Release Year']
categorical_features = ['Era']

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Calculate the correlation matrix using the relevant numeric columns
numeric_columns = [
    'IMDb Rating', 'Rotten Tomatoes Rating', 'Runtime (mins)', 
    'Year', 'Num Votes', "Godzilla's Sizes (In Feet)", "Godzilla's Sizes (In Meters)", 
    'Number of Monsters', 'Tickets Sold', 'Movie Budget (USD)', 'Final Revenue (USD)'
]

corr_matrix = df[numeric_columns].corr()

# Create a heatmap with Seaborn
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True, linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

In [30]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [31]:
# Extract features and target
X = df[features]
y = df[target]

# Convert 'Release Date' to a numerical format (e.g., year only)
X['Release Year'] = X['Release Date'].dt.year
X = X.drop(columns=['Release Date'])

X_preprocessed = preprocessor.fit_transform(X)

# Convert the preprocessed features back to a DataFrame for easier inspection
preprocessed_feature_names = numerical_features + \
                            list(preprocessor.transformers_[1][1]['onehot'].get_feature_names_out(categorical_features))

X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=preprocessed_feature_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Release Year'] = X['Release Date'].dt.year


In [32]:
# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)


In [33]:
# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

In [34]:
# Predict on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 318004577942746.9
R-squared: 0.7222010945989945


In [35]:
# Get feature importances
importances = model.feature_importances_

# Create a DataFrame for feature importances
feature_importances = pd.DataFrame({
    'Feature': preprocessed_feature_names,
    'Importance': importances
})

# Sort by importance
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

print(feature_importances)

                        Feature  Importance
0            Movie Budget (USD)    0.384941
6              Era_Monsterverse    0.329113
3                  Release Year    0.118742
1                Runtime (mins)    0.108769
2  Godzilla's Sizes (In Meters)    0.030445
8                    Era_Showa     0.018261
9                  Era_Tri-Star    0.003478
5                 Era_Milennium    0.002899
4                    Era_Heisei    0.001863
7                     Era_Reiwa    0.001489
