In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# Load the menu data a single time and keep the full frame around as df_org
df_org = pd.read_csv("burgerking.csv", encoding='latin-1')
# Working frame: same rows, minus the 'Item' column (df_org itself is not modified here)
df = df_org.drop(columns='Item')

In [None]:
# Preview the first five rows (head() defaults to n=5)
df.head()

# Get unique Categories


In [None]:
# Distinct values of the 'Category' column, in order of first appearance
unique_categories = pd.unique(df['Category'])
unique_categories

# Create a dictionary of DataFrames for each unique category

In [None]:

# One DataFrame per category, with the 'Category' column itself removed
dfs = {}
for category in unique_categories:
    dfs[category] = df.loc[df['Category'] == category].drop(columns='Category')


# Scale each DataFrame and store the scaled versions in a new dictionary

In [None]:
# Standardize every per-category frame with its own, freshly fitted scaler
dfs_scaled = {}
for category in dfs:
    df_cat = dfs[category]
    scaler = StandardScaler()
    # fit-then-transform on the same data is what fit_transform does
    scaled_data = scaler.fit(df_cat).transform(df_cat)
    # Column names are kept; the row index restarts at 0
    dfs_scaled[category] = pd.DataFrame(data=scaled_data, columns=df_cat.columns)

# Determine the optimal number of clusters using the Elbow Method

In [None]:
def calculate_wcss(data, max_clusters=10):
    """Compute the within-cluster sum of squares (WCSS) for k = 1..k_max.

    One KMeans model is fitted per candidate number of clusters and its
    ``inertia_`` is recorded, producing the curve used by the Elbow Method.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix to cluster (e.g. one of the scaled DataFrames).
    max_clusters : int, default 10
        Largest number of clusters to try. The effective upper bound is
        additionally capped at the number of distinct rows in ``data``.

    Returns
    -------
    list of float
        ``wcss[i]`` is the inertia of the model fitted with ``i + 1``
        clusters. The list is empty when ``data`` has no rows.

    Raises
    ------
    ValueError
        If ``max_clusters`` is smaller than 1.
    """
    if max_clusters < 1:
        raise ValueError("max_clusters must be at least 1")

    # Never ask for more clusters than there are distinct points
    distinct_rows = len(np.unique(data, axis=0))
    k_max = min(max_clusters, distinct_rows)

    wcss = []
    for k in range(1, k_max + 1):
        # Fixed random_state keeps the curve reproducible across runs
        kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=1000, n_init=10, random_state=0)
        kmeans.fit(data)
        wcss.append(kmeans.inertia_)
    return wcss

# Plot the Elbow Method results

In [None]:
num_plots = len(dfs_scaled)
num_cols = 3
# Ceiling division gives the rows needed; keep at least one row so subplots() stays valid
num_rows = max(1, (num_plots + num_cols - 1) // num_cols)

# squeeze=False always yields a 2-D array of Axes, so flattening works for any
# grid size (with the default squeeze a single row comes back as a 1-D array,
# never as a bare Axes, which made the old `axs = [axs]` special case fail).
fig, axs = plt.subplots(num_rows, num_cols, figsize=(10, 5 * num_rows), squeeze=False)
axs = axs.flatten()

for idx, (category, df_scaled) in enumerate(dfs_scaled.items()):
    print(f"Category: {category}, DataFrame shape: {df_scaled.shape}")  # Debug print
    wcss = calculate_wcss(df_scaled)
    print(f"WCSS for {category}: {wcss}")  # Debug print
    # x axis is the number of clusters (1-based), y axis the matching WCSS
    axs[idx].plot(range(1, len(wcss) + 1), wcss)
    axs[idx].set_title(f'Elbow Method for Optimal k - {category}')
    axs[idx].set_xlabel('Number of Clusters')
    axs[idx].set_ylabel('WCSS')

# Remove the grid cells that did not receive a plot; slicing by num_plots
# also works when there are no categories at all (loop variable never set)
for unused_ax in axs[num_plots:]:
    fig.delaxes(unused_ax)

# Adjust layout and display the plots
plt.tight_layout()
plt.show()

## Loop over all categories and perform clustering on each one. I could have clustered each category individually; however, a loop saves time, and the dataset is relatively small.

In [None]:


def find_elbow_k(wcss):
    """Return the number of clusters at the elbow of a WCSS curve.

    The curve is rescaled so that it starts at (0, 1) and ends at (1, 0); the
    elbow is the point lying furthest below the straight line (chord) joining
    those two end points. Simply taking the minimum WCSS is not usable here:
    WCSS does not increase as k grows, so the minimum is always the largest k
    that was tried.

    Parameters
    ----------
    wcss : sequence of float
        ``wcss[i]`` is the WCSS obtained with ``i + 1`` clusters.

    Returns
    -------
    int
        The selected number of clusters (1-based).

    Raises
    ------
    ValueError
        If ``wcss`` is empty.
    """
    n_candidates = len(wcss)
    if n_candidates == 0:
        raise ValueError("wcss must contain at least one value")
    if n_candidates < 3:
        # No interior point can form an elbow; use the largest k evaluated
        return n_candidates

    values = np.asarray(wcss, dtype=float)
    total_drop = values[0] - values[-1]
    if total_drop <= 0:
        # Flat (or non-decreasing) curve: extra clusters bring no benefit
        return 1

    x = np.linspace(0.0, 1.0, n_candidates)
    y = (values - values[-1]) / total_drop
    # The chord is x + y = 1; (1 - x - y) is proportional to the distance below it
    return int(np.argmax(1.0 - x - y)) + 1  # +1 because index starts at 0


# Initialize the dictionary to store the predictions
predictions = {}

# Loop through each category to perform clustering
for category, df_scaled in dfs_scaled.items():
    # Determine the optimal number of clusters using the elbow method
    wcss = calculate_wcss(df_scaled)
    optimal_clusters = find_elbow_k(wcss)

    # Perform clustering with the optimal number of clusters
    kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', max_iter=1000, n_init=10, random_state=0)
    pred = kmeans.fit_predict(df_scaled)

    # Store the predictions
    predictions[category] = pred

    # Save the results back to the original DataFrame; pred is in the same
    # row order as the rows selected by the mask
    category_mask = df_org['Category'] == category
    df_org.loc[category_mask, 'Pred'] = pred
    df_org[category_mask].to_csv(f'{category}_clustered.csv')

# Optionally, save the entire DataFrame with predictions
df_org.to_csv('all_categories_clustered.csv', index=False)
