# Clustering
## Ikjot Singh
### 102116071
### 3CS11

### Imports
*Importing libraries*

In [1]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from pycaret.clustering import *

*Importing Dataset*

Estimation of obesity levels based on eating habits and physical condition . (2019). UCI Machine Learning Repository. https://doi.org/10.24432/C5H31Z.

In [2]:
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544) 

### Preprocessing

*Splitting Dataset into x and y* 

In [3]:
X = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features 
y = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.targets

In [4]:
X.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation


In [5]:
y.head()


Unnamed: 0,NObeyesdad
0,Normal_Weight
1,Normal_Weight
2,Normal_Weight
3,Overweight_Level_I
4,Overweight_Level_II


*Preprocessing Data*

In [6]:
X["Gender"]=X["Gender"].apply(lambda x: {True: 1, False:0}[x=="Male"])
X["FAVC"]=X["FAVC"].apply(lambda x: {True: 1, False:0}[x=="yes"])
X["family_history_with_overweight"]=X["family_history_with_overweight"].apply(lambda x: {True: 1, False:0}[x=="yes"])
X["SMOKE"]=X["SMOKE"].apply(lambda x: {True: 1, False:0}[x=="yes"])
X["SCC"]=X["SCC"].apply(lambda x: {True: 1, False:0}[x=="yes"])

In [7]:
X['CAEC']=X['CAEC'].apply(lambda x: {'no':0,'Sometimes':1,'Frequently':2,'Always':3}[x])
X['CALC']=X['CALC'].apply(lambda x: {'no':0,'Sometimes':1,'Frequently':2,'Always':3}[x])
X=pd.get_dummies(X,columns=['MTRANS'],drop_first=True)

### Metrics

*Creating dataframes for storing metrics*

In [8]:
rows = [ 'Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']
kmeans_metrics = pd.DataFrame(index = rows)
hier_metrics = pd.DataFrame(index = rows)
kmeans_shift_metrics = pd.DataFrame(index = rows)

*Defining a function to get the metrics*

In [9]:
def get_scores(Model, metrics):
    silhouette_score_function = metrics.loc['silhouette', 'Score Function']
    silhouette_score = silhouette_score_function(X, Model.labels_)

    chs_score_function = metrics.loc['chs', 'Score Function']
    Calinski_Harabasz_score = chs_score_function(X, Model.labels_)

    db_score_function = metrics.loc['db', 'Score Function']
    Davies_Bouldin_score = db_score_function(X, Model.labels_)

    return (silhouette_score, Calinski_Harabasz_score, Davies_Bouldin_score)

### Running Models

*Defining a function to run models*

In [10]:
def run_clustering_model(model_name, setup_args, num_clusters_range, data):
    metrics_dict = {}

    for setup_arg in setup_args:
        for num_clusters in num_clusters_range:
            print(model_name, setup_arg['Type'], num_clusters)
            model_params = setup(data=X, **setup_arg['setup_kwargs'])
            model = create_model(model_name, num_clusters=num_clusters)
            metrics = get_metrics()

            a,b,c = get_scores(model, metrics)

            metrics_dict[(setup_arg['Type'], f'c={num_clusters}')] = [a,b,c]

    return pd.DataFrame(data=metrics_dict, index=rows)


*Defining rows for metrics and number of clusters*

In [11]:
rows = ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']
num_clusters_range = range(3, 6) 

*Defining setup args for all scenarios*

In [12]:
setup_args = [
    {'Type': 'No Data Preprocessing', 'setup_kwargs': {'verbose': False}},
    {'Type': 'Using Normalization', 'setup_kwargs': {'normalize': True, 'normalize_method': 'zscore', 'verbose': False}},
    {'Type': 'Using Transform', 'setup_kwargs': {'transformation': True, 'transformation_method': 'yeo-johnson', 'verbose': False}},
    {'Type': 'Using PCA', 'setup_kwargs': {'pca': True, 'pca_method': 'linear', 'verbose': False}},
    {'Type': 'Using T+N', 'setup_kwargs': {'transformation': True, 'transformation_method': 'yeo-johnson', 'normalize': True, 'normalize_method': 'zscore', 'verbose': False}},
    {'Type': 'Using T+N+PCA', 'setup_kwargs': {'pca': True, 'pca_method': 'linear', 'normalize': True, 'normalize_method': 'zscore', 'transformation': True, 'transformation_method': 'yeo-johnson', 'verbose': False}}
]



*Creating a dictionary to store results*

In [13]:
result_dict={}

*Running Models*
- Kmeans Clustering
- Hierarchical Clustering
- Mean Shift Clustering

In [14]:
models = ['kmeans', 'hclust', 'meanshift']
for model in models:
    result_dict[model] = run_clustering_model(model_name=model, setup_args=setup_args, num_clusters_range=num_clusters_range, data=X)

kmeans No Data Preprocessing 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5031,4729.2488,0.6675,0,0,0


kmeans No Data Preprocessing 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4784,4817.9545,0.6844,0,0,0


kmeans No Data Preprocessing 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4317,4599.8858,0.752,0,0,0


kmeans Using Normalization 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1488,199.3682,2.4448,0,0,0


kmeans Using Normalization 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1034,184.2594,2.1646,0,0,0


kmeans Using Normalization 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1089,187.386,1.8318,0,0,0


kmeans Using Transform 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6937,219757.5847,0.4166,0,0,0


kmeans Using Transform 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5892,199768.9289,0.4992,0,0,0


kmeans Using Transform 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5517,211877.6767,0.6193,0,0,0


kmeans Using PCA 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5031,4729.2478,0.6675,0,0,0


kmeans Using PCA 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4748,4817.8761,0.6987,0,0,0


kmeans Using PCA 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4324,4599.9982,0.749,0,0,0


kmeans Using T+N 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1431,195.121,2.5347,0,0,0


kmeans Using T+N 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1053,189.0776,2.2774,0,0,0


kmeans Using T+N 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1016,181.9662,2.0372,0,0,0


kmeans Using T+N+PCA 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1431,195.121,2.5347,0,0,0


kmeans Using T+N+PCA 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1074,188.6004,2.1608,0,0,0


kmeans Using T+N+PCA 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1543,179.9479,2.0194,0,0,0


hclust No Data Preprocessing 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4757,4302.4911,0.7001,0,0,0


hclust No Data Preprocessing 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4675,4346.0394,0.667,0,0,0


hclust No Data Preprocessing 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4151,4114.6325,0.7314,0,0,0


hclust Using Normalization 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1477,172.5966,2.6492,0,0,0


hclust Using Normalization 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1529,170.969,2.2802,0,0,0


hclust Using Normalization 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1596,174.4808,1.8468,0,0,0


hclust Using Transform 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6937,219757.5847,0.4166,0,0,0


hclust Using Transform 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5923,199358.3515,0.4885,0,0,0


hclust Using Transform 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5548,211263.2927,0.6107,0,0,0


hclust Using PCA 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4757,4302.4942,0.7001,0,0,0


hclust Using PCA 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4675,4346.0409,0.667,0,0,0


hclust Using PCA 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4151,4114.6364,0.7314,0,0,0


hclust Using T+N 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2153,168.9287,2.4209,0,0,0


hclust Using T+N 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.198,168.1136,2.0814,0,0,0


hclust Using T+N 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2046,172.0094,1.7095,0,0,0


hclust Using T+N+PCA 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2153,168.9288,2.4209,0,0,0


hclust Using T+N+PCA 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.198,168.1137,2.0814,0,0,0


hclust Using T+N+PCA 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2046,172.0095,1.7095,0,0,0


meanshift No Data Preprocessing 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5592,4309.4903,0.5983,0,0,0


meanshift No Data Preprocessing 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5592,4309.4903,0.5983,0,0,0


meanshift No Data Preprocessing 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5592,4309.4903,0.5983,0,0,0


meanshift Using Normalization 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3003,30.6594,0.776,0,0,0


meanshift Using Normalization 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3003,30.6594,0.776,0,0,0


meanshift Using Normalization 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3003,30.6594,0.776,0,0,0


meanshift Using Transform 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.964,231576.6177,0.0608,0,0,0


meanshift Using Transform 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.964,231576.6177,0.0608,0,0,0


meanshift Using Transform 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.964,231576.6177,0.0608,0,0,0


meanshift Using PCA 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5592,4309.4964,0.5983,0,0,0


meanshift Using PCA 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5592,4309.4964,0.5983,0,0,0


meanshift Using PCA 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5592,4309.4964,0.5983,0,0,0


meanshift Using T+N 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3182,36.0077,0.7927,0,0,0


meanshift Using T+N 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3182,36.0077,0.7927,0,0,0


meanshift Using T+N 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3182,36.0077,0.7927,0,0,0


meanshift Using T+N+PCA 3


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3182,36.0077,0.7927,0,0,0


meanshift Using T+N+PCA 4


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3182,36.0077,0.7927,0,0,0


meanshift Using T+N+PCA 5


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3182,36.0077,0.7927,0,0,0


### Results
*Kmeans Clustering*

In [15]:
result_dict['kmeans']

Unnamed: 0_level_0,No Data Preprocessing,No Data Preprocessing,No Data Preprocessing,Using Normalization,Using Normalization,Using Normalization,Using Transform,Using Transform,Using Transform,Using PCA,Using PCA,Using PCA,Using T+N,Using T+N,Using T+N,Using T+N+PCA,Using T+N+PCA,Using T+N+PCA
Unnamed: 0_level_1,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Silhouette,0.503127,0.478386,0.431739,0.153024,-0.02445,-0.050929,0.062337,-0.098156,-0.128453,0.503127,0.474788,0.432419,0.148261,-0.025149,-0.005336,0.148261,-0.027766,0.012361
Calinski-Harabasz,4729.250305,4817.956074,4599.889932,830.362258,458.815214,342.532376,384.60245,277.072482,216.554677,4729.250305,4817.879404,4599.999533,818.502893,446.300778,439.334417,818.502893,434.891653,441.930868
Davies-Bouldin,0.667549,0.684391,0.752037,1.852094,5.426774,5.27287,2.814921,3.761741,8.267418,0.667549,0.698727,0.748969,1.816003,4.733638,2.469125,1.816003,4.807169,3.495536


*Hierarchial Clustering*

In [16]:
result_dict['hclust']

Unnamed: 0_level_0,No Data Preprocessing,No Data Preprocessing,No Data Preprocessing,Using Normalization,Using Normalization,Using Normalization,Using Transform,Using Transform,Using Transform,Using PCA,Using PCA,Using PCA,Using T+N,Using T+N,Using T+N,Using T+N+PCA,Using T+N+PCA,Using T+N+PCA
Unnamed: 0_level_1,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Silhouette,0.475726,0.467548,0.415117,0.060391,-0.029765,-0.042453,0.062337,-0.099628,-0.129376,0.475726,0.467548,0.415117,0.098841,-0.014811,-0.029596,0.098841,-0.014811,-0.029596
Calinski-Harabasz,4302.495839,4346.043925,4114.637298,436.363741,309.647539,232.338639,384.60245,276.709678,216.276108,4302.495839,4346.043925,4114.637298,392.483965,266.722296,200.14228,392.483965,266.722296,200.14228
Davies-Bouldin,0.700061,0.666997,0.731406,2.471153,2.976022,3.615776,2.814921,3.737062,8.253889,0.700061,0.666997,0.731406,4.727314,4.503711,4.842121,4.727314,4.503711,4.842121


*Kmeans Shift*

In [17]:
result_dict['meanshift']

Unnamed: 0_level_0,No Data Preprocessing,No Data Preprocessing,No Data Preprocessing,Using Normalization,Using Normalization,Using Normalization,Using Transform,Using Transform,Using Transform,Using PCA,Using PCA,Using PCA,Using T+N,Using T+N,Using T+N,Using T+N+PCA,Using T+N+PCA,Using T+N+PCA
Unnamed: 0_level_1,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Silhouette,0.559179,0.559179,0.559179,-0.740924,-0.740924,-0.740924,0.019492,0.019492,0.019492,0.559179,0.559179,0.559179,-0.742555,-0.742555,-0.742555,-0.742555,-0.742555,-0.742555
Calinski-Harabasz,4309.498187,4309.498187,4309.498187,4.867212,4.867212,4.867212,158.963052,158.963052,158.963052,4309.498187,4309.498187,4309.498187,5.356018,5.356018,5.356018,5.356018,5.356018,5.356018
Davies-Bouldin,0.598328,0.598328,0.598328,2.757867,2.757867,2.757867,1.720077,1.720077,1.720077,0.598328,0.598328,0.598328,3.614551,3.614551,3.614551,3.614551,3.614551,3.614551


### Conclusion
*Based on results*
- Best Clustering Algorithm -> Kmean Shift
- Silhouette score -> 0.559179
- Best number of clusters -> 3