In [34]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
df = pd.read_excel('../Godzilla.xlsx')

# Clean the yen-denominated money columns: strip the yen sign and thousands
# separators, treat a lone '-' placeholder as missing, then cast to float.
# A raw string with a plain character class matches the same characters as the
# old pattern without relying on invalid "\¥" / "\," string escapes.
for yen_column in ('Movie Budget (Yen)', 'Final Revenue (Yen)'):
    df[yen_column] = (
        df[yen_column]
        .replace(r'[¥,]', '', regex=True)
        .replace('-', np.nan)
        .astype(float)
    )

# Convert Yen to Dollars (1 USD = 140 JPY)
conversion_rate = 140
df['Movie Budget (USD)'] = df['Movie Budget (Yen)'] / conversion_rate
df['Final Revenue (USD)'] = df['Final Revenue (Yen)'] / conversion_rate

# Ordinal code for each era name.
# NOTE(review): keys must match the spreadsheet's spelling exactly (including
# 'Milennium'); any era not listed here maps to NaN — confirm against the data.
era_to_numeric = {
    'Showa': 1,
    'Heisei': 2,
    'Reiwa': 3,
    'Milennium': 4,
    'Monsterverse': 5,
    'Tri-Star': 6
    # Add more mappings as needed
}

# Apply the mapping to the 'Era' column. Unmapped or missing values will become NaN.
df['Era'] = df['Era'].map(era_to_numeric)

# Bucket the era codes into six equal-width bins labelled 0-5 (a categorical column).
E_bins = pd.cut(df['Era'], bins=6, labels=range(6))
df['Era_bins'] = E_bins

# Drop the Yen columns (superseded by the USD columns) and the raw era code
# (superseded by 'Era_bins').
df = df.drop(columns=['Movie Budget (Yen)', 'Final Revenue (Yen)', 'Era'])

# Fill missing budgets with the mean budget of the film's era.
# - observed=True is passed explicitly because 'Era_bins' is categorical and the
#   implicit default is deprecated.
# - fillna() only touches rows whose budget is missing, so a row with a known
#   budget but no era keeps its value regardless of how groupby handles null keys.
# Budgets still missing afterwards are left as NaN.
# NOTE(review): the era means are computed on the full frame, i.e. before any
# train/test split, so held-out rows contribute to the imputed values.
era_mean_budget = df.groupby('Era_bins', observed=True)['Movie Budget (USD)'].transform('mean')
df['Movie Budget (USD)'] = df['Movie Budget (USD)'].fillna(era_mean_budget)

# Define success based on revenue (revenue > 50 million USD).
# NOTE(review): a missing revenue compares as False and is therefore labelled 0.
SUCCESS_REVENUE_THRESHOLD_USD = 50_000_000
df['Success'] = (df['Final Revenue (USD)'] > SUCCESS_REVENUE_THRESHOLD_USD).astype(int)

# Represent the release date by its year only. to_datetime is a no-op for a
# column that is already datetime and also accepts date strings.
df['Release Year'] = pd.to_datetime(df['Release Date']).dt.year

# Select relevant features and target
features = [
    'Movie Budget (USD)',
    'Era_bins',
    'Runtime (mins)',
    "Godzilla's Sizes (In Meters)",
    'Release Year',
]
target = 'Success'

# Feature matrix and label vector
X = df[features]
y = df[target]

# Column groups for the numerical and categorical transformers
numerical_features = ['Movie Budget (USD)', 'Runtime (mins)', "Godzilla's Sizes (In Meters)", 'Release Year']
categorical_features = ['Era_bins']

  df['Movie Budget (USD)'] = df.groupby('Era_bins')['Movie Budget (USD)'].transform(lambda x: x.fillna(x.mean()))


In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Numerical columns: replace missing values with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: replace missing values with the most frequent value,
# then one-hot encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own transformer.
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Fit the preprocessing on the training rows only, then apply it to both splits.
# From here on X_train / X_test hold the transformed matrices.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [36]:
# Train a 300-tree random forest on the preprocessed training data
# (fit() returns the estimator itself, so the call can be chained).
model = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)

# Persist the fitted model and the fitted preprocessor, each to its own pickle file.
for pickle_name, fitted_object in (
    ('godzilla_era_model.pkl', model),
    ('preprocessor.pkl', preprocessor),
):
    with open(pickle_name, 'wb') as f:
        pickle.dump(fitted_object, f)

In [37]:

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Print each evaluation summary under its heading, in the same order as before.
for heading, metric in (
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Accuracy Score:", accuracy_score),
):
    print(heading, metric(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.86      1.00      0.92         6
           1       1.00      0.50      0.67         2

    accuracy                           0.88         8
   macro avg       0.93      0.75      0.79         8
weighted avg       0.89      0.88      0.86         8

Confusion Matrix:
 [[6 0]
 [1 1]]
Accuracy Score: 0.875
