In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_excel('../Godzilla.xlsx')

# Exchange rate used for the Yen -> USD conversion (1 USD = 140 JPY)
conversion_rate = 140


def yen_to_usd(yen_column, rate=conversion_rate):
    """Return a float Series of USD amounts computed from a Yen column.

    String cells have every '¥' and ',' character removed, and a cell that is
    exactly '-' becomes NaN. The result is cast to float (a cell that still
    is not numeric after cleaning raises ValueError) and divided by `rate`.

    Parameters
    ----------
    yen_column : pandas.Series
        Column holding Yen amounts.
    rate : float, default ``conversion_rate``
        Number of Yen per one US dollar.
    """
    # Raw string: the original non-raw pattern '[\¥,\,]' relied on invalid
    # escape sequences, which newer Python versions warn about. Inside a
    # character class neither '¥' nor ',' needs escaping.
    cleaned = yen_column.replace(r'[¥,]', '', regex=True).replace('-', np.nan)
    return cleaned.astype(float) / rate


# Build the USD columns straight from the raw Yen columns
df['Movie Budget (USD)'] = yen_to_usd(df['Movie Budget (Yen)'])
df['Final Revenue (USD)'] = yen_to_usd(df['Final Revenue (Yen)'])

# Drop the Yen columns as we are using the USD columns for modeling
df = df.drop(columns=['Movie Budget (Yen)', 'Final Revenue (Yen)'])

# Select relevant features for clustering
features = ['Movie Budget (USD)', 'Runtime (mins)', "Godzilla's Sizes (In Meters)", 'IMDb Rating', 'Rotten Tomatoes Rating']


In [5]:
# Show floats with two decimal places in DataFrame output, then preview the
# first rows of the cleaned frame.
pd.set_option('display.float_format', '{:.2f}'.format)
df.head()

Unnamed: 0,Position,English Title,Japanese Title,IMDb Rating,Rotten Tomatoes Rating,Runtime (mins),Year,Genres,Num Votes,Release Date,Directors,Godzilla's Sizes (In Feet),Godzilla's Sizes (In Meters),Era,Number of Monsters,Tickets Sold,URL,Movie Budget (USD),Final Revenue (USD)
0,1,Godzilla (1954),Gojira,7.5,0.93,96,1954,"Horror, Sci-Fi",33861,1954-11-03,Ishirô Honda,164,50,Showa,1,9610000,https://www.imdb.com/title/tt0047034/,721428.57,1307142.86
1,2,Godzilla Raids Again,Gojira no gyakushû,5.8,0.6,82,1955,"Action, Horror, Sci-Fi",6922,1955-04-24,"Ishirô Honda, Motoyoshi Oda",164,50,Showa,2,8340000,https://www.imdb.com/title/tt0048127/,714285.71,1214285.71
2,3,King Kong vs. Godzilla,King Kong vs. Godzilla,5.7,0.5,97,1962,"Action, Adventure, Fantasy, Sci-Fi",10398,1963-06-03,"Tom Montgomery, Ishirô Honda",164,50,Showa,1,11200000,https://www.imdb.com/title/tt0056142/,1071428.57,3071428.57
3,4,Mothra vs. Godzilla,Mosura tai Gojira,6.5,0.92,89,1964,"Adventure, Fantasy, Sci-Fi",8934,1964-04-29,Ishirô Honda,164,50,Showa,2,7220000,https://www.imdb.com/title/tt0058379/,878571.43,4464285.71
4,5,Ghidorah: The Three-Headed Monster,San daikaijû: Chikyû saidai no kessen,6.5,0.75,93,1964,"Action, Adventure, Fantasy, Sci-Fi, Thriller",6703,1964-12-20,Ishirô Honda,164,50,Showa,4,5410000,https://www.imdb.com/title/tt0058544/,,2785714.29


In [None]:

# Drop rows with missing values in the selected features. The explicit copy
# makes `df` an independent frame, so adding the 'Cluster' column below does
# not risk pandas' SettingWithCopyWarning (seen on some pandas versions when
# assigning to the result of a filtering call).
df = df.dropna(subset=features).copy()

# Number of clusters to fit
n_clusters = 3

# Fail early with a readable message if filtering left too few movies.
if len(df) < n_clusters:
    raise ValueError(
        f"Only {len(df)} rows have all clustering features; "
        f"at least {n_clusters} are required for K-Means."
    )

# Scale the features so that no single column dominates the distance metric
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[features])

# Apply K-Means clustering. n_init is pinned explicitly because its default
# differs between scikit-learn releases (10 vs 'auto'); fixing it keeps the
# cluster assignment reproducible and avoids the related FutureWarning.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

# Add cluster labels to the dataframe
df['Cluster'] = clusters

In [None]:
# Scatter plot of budget against revenue, one colour per K-Means cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Movie Budget (USD)',
    y='Final Revenue (USD)',
    hue='Cluster',
    palette='viridis',
    s=100,
    ax=ax,
)
ax.set_title('K-Means Clustering of Godzilla Movies')
ax.set_xlabel('Movie Budget (USD)')
ax.set_ylabel('Final Revenue (USD)')
ax.legend(title='Cluster')
plt.show()
