In [21]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_excel('../../Godzilla.xlsx')

# Convert the 'Movie Budget (Yen)' and 'Final Revenue (Yen)' columns to numeric values, removing any non-numeric characters
df['Movie Budget (Yen)'] = df['Movie Budget (Yen)'].replace('[\¥,\,]', '', regex=True).replace('-', np.nan).astype(float)
df['Final Revenue (Yen)'] = df['Final Revenue (Yen)'].replace('[\¥,\,]', '', regex=True).replace('-', np.nan).astype(float)

# Convert Yen to Dollars (1 USD = 140 JPY)
conversion_rate = 140
df['Movie Budget (USD)'] = df['Movie Budget (Yen)'] / conversion_rate
df['Final Revenue (USD)'] = df['Final Revenue (Yen)'] / conversion_rate

# Drop the Yen columns as we are using the USD columns for modeling
df = df.drop(columns=['Movie Budget (Yen)', 'Final Revenue (Yen)', 'Position', 'URL', 'Japanese Title', 'Godzilla\'s Sizes (In Meters)'])

# Multi-label binarizer for genres
mlb = MultiLabelBinarizer()
df['Genres'] = df['Genres'].apply(lambda x: x.split(', '))  # assuming genres are separated by ', '
genres = mlb.fit_transform(df['Genres'])

# Create a DataFrame for genres
genre_df = pd.DataFrame(genres, columns=mlb.classes_)

# Drop original 'Genres' column and concatenate the new genre columns
df = df.drop(columns=['Genres'])
df = pd.concat([df, genre_df], axis=1)

# Select relevant features for modeling
features = ['Movie Budget (USD)', 'Runtime (mins)', "Godzilla's Sizes (In Feet)", 'IMDb Rating', 'Rotten Tomatoes Rating', 'Year']
target = mlb.classes_

In [22]:
# Split the data into training and testing sets
X = df[features]
y = genre_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define transformers for numerical features
numerical_features = ['Movie Budget (USD)', 'Runtime (mins)', "Godzilla's Sizes (In Feet)", 'IMDb Rating', 'Rotten Tomatoes Rating', 'Year']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Fill missing values with median
    ('scaler', StandardScaler())  # Scale numerical values
])

# Combine transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),  # Apply numerical transformer to numerical features
    ])

In [23]:
# Preprocess the data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Train the model
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=300, random_state=42))
model.fit(X_train, y_train)

In [25]:
# Save the trained model and the preprocessor
with open('godzilla_multilabel_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)
with open('mlb.pkl', 'wb') as f:
    pickle.dump(mlb, f)

In [24]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
for i, genre in enumerate(target):
    print(f"Classification Report for genre {genre}:\n", classification_report(y_test.iloc[:, i], y_pred[:, i]))
    print(f"Accuracy Score for genre {genre}:", accuracy_score(y_test.iloc[:, i], y_pred[:, i]))

print("\nOverall Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report for genre Action:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00         8

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8

Accuracy Score for genre Action: 1.0
Classification Report for genre Adventure:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.38      1.00      0.55         3

    accuracy                           0.38         8
   macro avg       0.19      0.50      0.27         8
weighted avg       0.14      0.38      0.20         8

Accuracy Score for genre Adventure: 0.375
Classification Report for genre Animation:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00         1

    accuracy                

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
