In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from joblib import load
from joblib import dump
import pickle
import boto3
from google.colab import userdata
import sqlalchemy

In [None]:
# AWS credentials come from the Colab secret store so they never appear in the notebook.
Access_key = userdata.get('access_key')
Secret_key = userdata.get('secret_key')

In [None]:
# Build the S3 client used by the upload cells further down.
s3_client = boto3.client(
    's3',
    aws_access_key_id=Access_key,
    aws_secret_access_key=Secret_key,
)

In [None]:
# MySQL connection settings
server = 'endpoint'
port = 3306
user = 'admin'
# Never keep the database password in the notebook: read it from the Colab
# secret store, exactly like the AWS keys. Requires a Colab secret named
# 'mysql_password'.
# NOTE(review): any password that was previously committed in this cell should
# be rotated, since it remains in the notebook's history.
password = userdata.get('mysql_password')
database = 'chefmate'
sqltype = "mysql+mysqlconnector"

# URL.create escapes special characters (e.g. '@', '/', ':') in the credentials,
# which a hand-built f-string URL would silently corrupt.
engine = sqlalchemy.create_engine(
    sqlalchemy.engine.URL.create(
        drivername=sqltype,
        username=user,
        password=password,
        host=server,
        port=port,
        database=database,
    )
)

In [None]:
# Pull the full restaurant table into a DataFrame through the SQLAlchemy engine.
query = "SELECT * FROM restaurant"
df = pd.read_sql(sql=query, con=engine)

In [None]:
# Preview the first five rows of the loaded table.
df.head(5)

In [None]:
# Drop the redundant 'index' column that comes back from the SQL table.
# errors='ignore' keeps this cell idempotent: re-running it (or loading a table
# that has no 'index' column) no longer raises KeyError.
df = df.drop(columns=['index'], errors='ignore')

In [None]:
# Keep only the columns that feed the clustering pipeline.
feature_columns = ['Cuisines', 'Restaurant_id', 'Aggregate_rating']
features = df[feature_columns]

In [None]:
# One-hot encode the 'Cuisines' column into a dense matrix, one column per category.
encoder = OneHotEncoder(sparse_output=False)
cuisines_encoded = encoder.fit_transform(features[['Cuisines']])
cuisines_df = pd.DataFrame(
    cuisines_encoded,
    columns=encoder.get_feature_names_out(['Cuisines']),
)

In [None]:
# Assemble the clustering matrix: encoded cuisines side by side with the numeric columns.
# The numeric frame's index is reset so both frames align row-by-row on a 0..n-1 index.
# NOTE(review): 'Restaurant_id' is fed to the clusterers unscaled; if it is a plain
# identifier it will likely dominate the distance computations - confirm this is intended.
numerical_features = features[['Restaurant_id', 'Aggregate_rating']]
clustering_data = pd.concat(
    [cuisines_df, numerical_features.reset_index(drop=True)],
    axis=1,
)

In [None]:
# Sweep k = 2..10 and record, for each k, the KMeans inertia (elbow method)
# and the silhouette score of the resulting labelling.
inertia, silhouette_scores = [], []
K_range = range(2, 11)

for n_clusters in K_range:
    candidate = KMeans(n_clusters=n_clusters, random_state=42).fit(clustering_data)
    inertia.append(candidate.inertia_)
    silhouette_scores.append(silhouette_score(clustering_data, candidate.labels_))

In [None]:
# Side-by-side diagnostics: inertia (left) and silhouette score (right) against k.
fig, (ax_inertia, ax_silhouette) = plt.subplots(1, 2, figsize=(10, 5))

ax_inertia.plot(K_range, inertia, marker='o')
ax_inertia.set_title('Elbow Method')
ax_inertia.set_xlabel('Number of Clusters')
ax_inertia.set_ylabel('Inertia')

ax_silhouette.plot(K_range, silhouette_scores, marker='o')
ax_silhouette.set_title('Silhouette Score')
ax_silhouette.set_xlabel('Number of Clusters')
ax_silhouette.set_ylabel('Silhouette Score')

plt.show()

In [None]:
# Final KMeans model: use the k with the highest silhouette score
# (the first such k if several tie).
best_position = int(np.argmax(silhouette_scores))
best_k = K_range[best_position]
kmeans = KMeans(n_clusters=best_k, random_state=42).fit(clustering_data)
kmeans_labels = kmeans.labels_

In [None]:
# Density-based clustering; points DBSCAN treats as noise receive the label -1.
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(clustering_data)
dbscan_labels = dbscan.labels_

In [None]:
# Hierarchical clustering with the same number of clusters chosen for KMeans.
agglo = AgglomerativeClustering(n_clusters=best_k).fit(clustering_data)
agglo_labels = agglo.labels_

In [None]:
# Gaussian mixture with one component per cluster.
# random_state is pinned (same seed as KMeans) so the fit, and therefore the
# labels, are reproducible across kernel restarts.
gmm = GaussianMixture(n_components=best_k, random_state=42)
gmm_labels = gmm.fit_predict(clustering_data)

In [None]:
def _cluster_labels_for(model, data):
    """Return cluster labels for ``data``, reusing an existing fit whenever possible."""
    stored = getattr(model, 'labels_', None)
    if stored is not None and len(stored) == len(data):
        # Already fitted on a dataset of this size: use the labels as-is.
        return stored
    if hasattr(model, 'predict'):
        # Local import: only needed on this path, keeps the notebook's import cell untouched.
        from sklearn.exceptions import NotFittedError
        try:
            # Fitted estimators without labels_ (e.g. GaussianMixture) can label data directly.
            return model.predict(data)
        except NotFittedError:
            pass
    # Unfitted model: fit it now, as a last resort.
    return model.fit_predict(data)


def evaluate_clustering_model(model, data, model_name):
    """Print internal clustering quality metrics for a model.

    Parameters
    ----------
    model : clustering estimator
        Preferably already fitted. Existing ``labels_`` (or ``predict``) are
        reused so the metrics describe the very fit that is plotted and saved;
        the model is only fitted here if it has not been fitted yet.
    data : array-like
        The samples the model was (or will be) fitted on.
    model_name : str
        Display name used in the printed report.

    Returns
    -------
    None
        Results are printed: silhouette score, inertia (when the model exposes
        ``inertia_``), Davies-Bouldin index and Calinski-Harabasz index.

    Notes
    -----
    The metric functions raise ``ValueError`` unless the labelling contains
    between 2 and ``n_samples - 1`` distinct labels. That case (e.g. DBSCAN
    marking every point as noise) is reported and skipped instead of aborting
    the whole evaluation cell. A DBSCAN noise label (-1) is still counted as
    a label of its own by the metrics.
    """
    labels = _cluster_labels_for(model, data)

    n_labels = len(np.unique(labels))
    if not 2 <= n_labels <= len(labels) - 1:
        print(f"Model: {model_name}")
        print(f"Skipping metrics: found {n_labels} distinct label(s); "
              "scores need between 2 and n_samples - 1.")
        print("-" * 50)
        return

    # Calculate evaluation metrics
    silhouette = silhouette_score(data, labels)
    davies_bouldin = davies_bouldin_score(data, labels)
    calinski_harabasz = calinski_harabasz_score(data, labels)
    # Inertia only exists on KMeans-style models; detect it by attribute rather than by name.
    inertia = getattr(model, 'inertia_', None)

    # Print the results
    print(f"Model: {model_name}")
    print(f"Silhouette Score: {silhouette:.4f}")
    if inertia is not None:
        print(f"Inertia: {inertia:.4f}")
    print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")
    print(f"Calinski-Harabasz Index: {calinski_harabasz:.4f}")
    print("-" * 50)

In [None]:
# Report the metrics for every fitted model, in a fixed order.
# Mean Shift is intentionally left out (no mean_shift model is fitted in this notebook).
models_to_evaluate = [
    (kmeans, 'KMeans'),
    (dbscan, 'DBSCAN'),
    (agglo, 'Agglomerative Clustering'),
    (gmm, 'Gaussian Mixture Model'),
]
for fitted_model, display_name in models_to_evaluate:
    evaluate_clustering_model(fitted_model, clustering_data, display_name)

In [None]:
def plot_clusters(data, labels, model_name):
    """Scatter-plot cluster assignments on the first two principal components.

    ``data`` is projected to 2-D with PCA, each point is coloured by its entry
    in ``labels``, and ``model_name`` is used in the figure title.
    """
    projected = PCA(n_components=2).fit_transform(data)

    fig, ax = plt.subplots(figsize=(8, 6))
    points = ax.scatter(
        projected[:, 0],
        projected[:, 1],
        c=labels,
        cmap='viridis',
        marker='o',
        edgecolor='k',
        s=50,
    )
    ax.set_title(f'Clusters formed by {model_name}')
    ax.set_xlabel('PCA Component 1')
    ax.set_ylabel('PCA Component 2')
    fig.colorbar(points, ax=ax)
    plt.show()

In [None]:
# Draw one PCA scatter per model. GaussianMixture exposes no labels_ attribute,
# so its labels are obtained with predict().
# Mean Shift is intentionally left out (no mean_shift model is fitted in this notebook).
labels_by_model = [
    ('KMeans', kmeans.labels_),
    ('DBSCAN', dbscan.labels_),
    ('Agglomerative Clustering', agglo.labels_),
    ('Gaussian Mixture Model', gmm.predict(clustering_data)),
]
for display_name, cluster_labels in labels_by_model:
    plot_clusters(clustering_data, cluster_labels, display_name)

In [None]:
# Persist the fitted KMeans model to a local pickle file.
with open('kmeans_model.pkl', mode='wb') as model_file:
    pickle.dump(kmeans, model_file)

In [None]:
# Push the pickled KMeans model to S3 (object key is "folder/filename").
file_path = "kmeans_model.pkl"
s3_client.upload_file(
    Filename=file_path,
    Bucket='chefmatebucket1',
    Key='datas/kmeans_model.pkl',
)

In [None]:
# Persist the fitted one-hot encoder to a local pickle file.
# NOTE(review): unlike the KMeans model, the encoder is not uploaded to S3 in the
# cells visible here - confirm whether downstream consumers need it there too.
with open('onehot_encoder.pkl', mode='wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

In [None]:
# Attach each restaurant's KMeans cluster id as a new 'Cluster' column.
df = df.assign(Cluster=kmeans.labels_)

In [None]:
# Preview the labelled frame rather than rendering the whole table.
df.head()

In [None]:
# Write the frame, including the new 'Cluster' column, to CSV without the row index.
df.to_csv(path_or_buf='Zomato_cluster_data.csv', index=False)

In [None]:
# Push the clustered CSV to S3 (object key is "folder/filename").
file_path = "Zomato_cluster_data.csv"
s3_client.upload_file(
    Filename=file_path,
    Bucket='chefmatebucket1',
    Key='datas/Zomato_cluster_data.csv',
)