In [1]:
# Import Basis
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.preprocessing import StandardScaler as SS

from sklearn.model_selection import GridSearchCV as GCV

# Model
from sklearn.cluster import KMeans as KM, AgglomerativeClustering

# Model Evaluator
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [2]:
live = pd.read_csv('Live_cln_encd.csv')
live.sample(10)

Unnamed: 0,status_type,num_reactions,num_comments,num_shares,num_likes,num_loves,num_wows,num_hahas,num_sads,num_angrys
765,1,217,1,0,215,2,0,0,0,0
985,1,302,9,0,302,0,0,0,0,0
5767,1,42,0,0,40,0,2,0,0,0
2254,1,14,23,0,14,0,0,0,0,0
3945,1,128,5,4,123,1,4,0,0,0
1603,1,326,9,0,326,0,0,0,0,0
4746,3,20,0,1,20,0,0,0,0,0
1000,1,136,5,1,136,0,0,0,0,0
4193,3,1,0,0,1,0,0,0,0,0
3734,1,5,0,0,5,0,0,0,0,0


In [3]:
kmeans1 = KM()

# Define the parameter grid to search
param_grid = {
    'n_clusters': range(1,11),
    'init' : ['k-means++', 'random'],
    'n_init': range(1,16),
    # 'max_iter': range(1,400),
    # 'tol': np.linspace(1e-5, 1e-3, 10),
    # 'verbose': range(0,10),
    # 'random_state': range(0,50),
    # 'copy_x': [True, False],
    # 'algorithm': ["lloyd", "elkan", "auto", "full"],
}

# Create GridSearchCV object
grid_model = GCV(
    estimator=kmeans1, 
    param_grid=param_grid, 
    cv=5, 
    return_train_score=True
)

# Fit the model
grid_model.fit(live)


# Get the best parameters
best_params = grid_model.best_params_
print(f"Best parameters found: {best_params}")

Best parameters found: {'init': 'k-means++', 'n_clusters': 10, 'n_init': 11}


In [12]:
# Convert cv_results_ to a DataFrame
results_df = pd.DataFrame(grid_model.cv_results_)
results_df.sample(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_init,param_n_clusters,param_n_init,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
225,0.009324,0.007613,0.0,0.0,random,6,1,"{'init': 'random', 'n_clusters': 6, 'n_init': 1}",-809691500.0,-66480310.0,...,-430797500.0,288719000.0,149,-766112900.0,-1123709000.0,-991953700.0,-870303100.0,-1014001000.0,-953216100.0,123406300.0
171,0.010395,0.008689,0.0,0.0,random,2,7,"{'init': 'random', 'n_clusters': 2, 'n_init': 7}",-2480455000.0,-201681500.0,...,-1453064000.0,1001171000.0,268,-2624002000.0,-5354088000.0,-4599785000.0,-4227286000.0,-3131593000.0,-3987351000.0,989106500.0
247,0.061498,0.022766,0.002905,0.00581,random,7,8,"{'init': 'random', 'n_clusters': 7, 'n_init': 8}",-783926900.0,-55271550.0,...,-400670300.0,280112000.0,119,-657735500.0,-958567600.0,-870811000.0,-755343900.0,-800317900.0,-808555200.0,102005300.0
96,0.043274,0.006753,0.0,0.0,k-means++,7,7,"{'init': 'k-means++', 'n_clusters': 7, 'n_init...",-512505500.0,-68326040.0,...,-328140000.0,192833600.0,93,-637205000.0,-866060000.0,-778521300.0,-612225000.0,-559174000.0,-690637100.0,113826900.0
173,0.020141,0.006171,0.002974,0.005948,random,2,9,"{'init': 'random', 'n_clusters': 2, 'n_init': 9}",-2480455000.0,-201681500.0,...,-1337850000.0,1072085000.0,244,-2624002000.0,-5354088000.0,-4544491000.0,-4227286000.0,-3131588000.0,-3976291000.0,982485100.0


In [13]:
kmeans = KM(
    n_clusters=10,
    init='k-means++',
    n_init=11
)
kmeans_labels = kmeans.fit_predict(live)

In [14]:
silhouette_avg = silhouette_score(live, kmeans_labels)
print("The average silhouette score is :", silhouette_avg)

The average silhouette score is : 0.6820234019228122


In [15]:
db_index = davies_bouldin_score(live, kmeans_labels)
print("The davies-bouldin index for is :", db_index)

The davies-bouldin index for is : 0.5835141434583972
