## Implementing K Means Clustering

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# Fetch dataset id=544 from the UCI ML Repository via ucimlrepo.
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544)

# Short alias for readability; the long name above is kept because other
# cells refer to it.
obesity_dataset = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition

# Feature and target tables (as pandas dataframes).
X, y = obesity_dataset.data.features, obesity_dataset.data.targets

# Last expression of the cell: display the dataset metadata.
obesity_dataset.metadata

In [None]:
# Variable information: display the `variables` attribute of the fetched
# dataset object (one entry per dataset variable).
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.variables 


#### K-Means is distance-based, so the categorical features need to be converted to one-hot encoding

In [None]:
import pandas as pd
# One-hot encode the multi-level categorical features. drop_first=True drops
# the first level of each feature so the dummy columns are not redundant.
X_cat = pd.get_dummies(
    X,
    columns=['Gender', 'CAEC', 'CALC', 'MTRANS'],
    drop_first=True,
)

In [None]:
# Display the column labels of the one-hot encoded feature table.
X_cat.columns

In [None]:
# List the value frequencies of every column that is still stored as `object`
# (i.e. the text-valued columns that have not been encoded yet).
object_columns = X_cat.select_dtypes(include='object').columns
for col in object_columns:
    counts = X_cat[col].value_counts()
    print(f"\nColumn: {col}")
    print(counts)


In [None]:
# Binary yes/no features are mapped to 1/0.
yes_no_cols = ['family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
# `replace` on object columns only yields integer columns through silent
# downcasting, which pandas has deprecated; the explicit cast guarantees an
# integer dtype on every pandas version. Re-running the cell is harmless:
# already-encoded 0/1 values pass through `replace` unchanged.
X_cat[yes_no_cols] = X_cat[yes_no_cols].replace({'yes': 1, 'no': 0}).astype(int)

# Dummy columns produced by get_dummies may be boolean; convert them to 0/1.
bool_cols = X_cat.select_dtypes(include='bool').columns
X_cat[bool_cols] = X_cat[bool_cols].astype(int)

# Last expression of the cell: confirm that every column is now numeric.
X_cat.dtypes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 80/20 split, stratified on the target so both parts keep the class balance.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_cat, y, **split_options)

# Learn the scaling statistics on the training part only, then apply the same
# transformation to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Display the column labels of the training features.
X_train.columns

In [None]:
# Number of samples per target class in the full dataset.
y.value_counts()

### Clustering using sklearn

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, silhouette_score
from sklearn.preprocessing import StandardScaler


In [None]:
# K-Means with 7 clusters, 10 restarts (n_init) and a fixed seed.
kmeans = KMeans(n_clusters=7, n_init=10, random_state=42)
kmeans = kmeans.fit(X_train_scaled)  # fit() returns the estimator itself
kmeans


In [None]:
# Cluster assignment (index of the nearest fitted centroid) for both splits.
y_train_pred, y_test_pred = (
    kmeans.predict(split) for split in (X_train_scaled, X_test_scaled)
)

In [None]:
# Display the column labels of the training features.
X_train.columns

In [None]:
import pandas as pd

# Wrap the cluster assignments in a Series so the number of training samples
# per cluster can be counted.
y_train_pred_series = pd.Series(y_train_pred)
y_train_pred_series.value_counts()


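##### Why does sklearn's silhouette score take labels as input if it is an unsupervised metric?

The `labels` argument is the predicted cluster assignment of each sample, not the ground-truth class, so the metric is still unsupervised.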

In [None]:
# Silhouette score of the cluster assignments on the training split.
print("Silhouette Score (train):", silhouette_score(X_train_scaled, y_train_pred))

In [None]:
# Silhouette score of the cluster assignments on the held-out test split.
# The label says "(test)" because the score is computed on the test data.
print("Silhouette Score (test):", silhouette_score(X_test_scaled, y_test_pred))

In [None]:
# Flatten the target tables to 1-D arrays, the shape the clustering metrics
# and the crosstab below are given.
y_train_flat, y_test_flat = np.ravel(y_train), np.ravel(y_test)

In [None]:
# Compare the cluster assignments with the true classes on both splits.
train_pair = (y_train_flat, y_train_pred)
test_pair = (y_test_flat, y_test_pred)

report = [
    ("Adjusted Rand Index (train):", adjusted_rand_score(*train_pair)),
    ("Adjusted Mutual Info (train):", adjusted_mutual_info_score(*train_pair)),
    ("Adjusted Rand Index (test):", adjusted_rand_score(*test_pair)),
    ("Adjusted Mutual Info (test):", adjusted_mutual_info_score(*test_pair)),
]
for label, value in report:
    print(label, value)


In [None]:
import pandas as pd

# Contingency table: rows are the true classes (y_train_flat), columns are the
# K-Means cluster assignments (y_train_pred).
cluster_vs_label = pd.crosstab(
    index=y_train_flat,
    columns=y_train_pred,
    rownames=['True Class'],
    colnames=['Cluster'],
)
print(cluster_vs_label)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Row-normalise the contingency table so each true class (row) sums to 1
# across the clusters.
class_totals = cluster_vs_label.sum(axis=1)
cluster_vs_label_norm = cluster_vs_label.div(class_totals, axis=0)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(cluster_vs_label_norm, annot=True, cmap='Blues', fmt='.2f', ax=ax)
ax.set_title("Proportion of True Classes in K-Means Clusters")
plt.show()


### Implementation from scratch

##### We have X_train_scaled and X_test_scaled; we need to compute y_train_pred and y_test_pred

In [None]:
# Number of clusters for the from-scratch implementation.
K = 7
# Rows = training samples, columns = features of the scaled training matrix.
n_samples_train, n_features = X_train_scaled.shape


In [None]:
# Display the number of feature columns.
n_features


In [None]:
# Summary statistics of the (unscaled) training features.
X_train.describe()

In [None]:
# Sanity check of the standardisation: after StandardScaler every column of
# the training matrix should have mean ~0 and standard deviation ~1.
arr = X_train_scaled

# Column-wise mean
col_means = np.mean(arr, axis=0)
print("Column means:", col_means)

# Column-wise population standard deviation (ddof=0). StandardScaler
# normalises with the population estimator, so ddof=0 is the one that yields
# exactly 1 here; ddof=1 would be the sample standard deviation.
col_stds = np.std(arr, axis=0, ddof=0)
print("Column stds:", col_stds)

In [None]:
## Randomly initialised centroids: K points drawn from a standard normal
## distribution in feature space. A seeded generator keeps the from-scratch
## run reproducible, in line with the fixed seed (42) used elsewhere.
rng = np.random.default_rng(42)
centroids = rng.standard_normal((K, n_features))

In [None]:
centroids  # shape (K, n_features): 7 x 23

In [None]:
# Euclidean distance from every training sample to every centroid.
# D[i, j] is the distance between sample i and centroid j, so each entry must
# be computed from a single row of the data. Slicing with `[i:]` would take
# every row from i onwards and yield a matrix norm, not a point distance.
points = np.asarray(X_train_scaled, dtype=float)

# Broadcasting: (n_samples, 1, n_features) - (1, K, n_features) gives all
# pairwise differences; the norm over the last axis collapses the feature
# dimension, leaving an (n_samples, K) distance matrix.
D = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)

In [None]:
# Assign every training sample to its nearest centroid, i.e. the column index
# of the smallest distance in its row of D.
y_train_pred = D.argmin(axis=1)
print(y_train_pred.shape)
        

In [None]:
# Number of training samples assigned to each centroid.
pd.DataFrame(y_train_pred).value_counts()

In [None]:
# Silhouette score of the nearest-centroid assignments computed above.
# NOTE(review): sklearn's silhouette_score needs at least two distinct labels,
# so this raises if every sample ended up in the same cluster.
print("Silhouette Score (train):", silhouette_score(X_train_scaled, y_train_pred))

### Data needs preprocessing: all column means are close to 0 and all stds are close to 1, so all data points are close to a single point, the zero vector (x = 0)

In [None]:
# Summary statistics for every column of the training features, whatever its dtype.
X_train.describe(include='all')

In [None]:
# Column dtypes of the raw (unencoded) feature table.
X.dtypes

In [None]:
# Rebuild the encoded feature table from the raw features: one-hot encode the
# multi-level categoricals, then turn yes/no and boolean columns into 0/1.
X_cat = pd.get_dummies(X,columns=['Gender','CAEC','CALC','MTRANS'],drop_first=True)
yes_no_cols = ['family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
# `replace` on object columns only yields integer columns through silent
# downcasting, which pandas has deprecated; the explicit cast guarantees an
# integer dtype on every pandas version.
X_cat[yes_no_cols] = X_cat[yes_no_cols].replace({'yes': 1, 'no': 0}).astype(int)
bool_cols = X_cat.select_dtypes(include='bool').columns
X_cat[bool_cols] = X_cat[bool_cols].astype(int)

# Continuous columns that are standardised later; the 0/1 indicator columns
# are not in this list and are left unscaled.
cols_to_scale = ['Age','Weight','Height','FCVC','NCP','CH2O','FAF','TUE']

X_cat.describe()

In [None]:
# Divide X_cat into train and test sets (80/20, stratified on the target).
from sklearn.preprocessing import StandardScaler

split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_cat, y, **split_kwargs)

In [None]:
# Use the standard scaler on the chosen subset of columns only: statistics are
# learned from the training rows and reused for the test rows.
scaler = StandardScaler().fit(X_train[cols_to_scale])
train_scaled = scaler.transform(X_train[cols_to_scale])
test_scaled = scaler.transform(X_test[cols_to_scale])

In [None]:
# Check the container type of the training features.
type(X_train)

In [None]:
# Check the container type returned by the scaler.
type(train_scaled)

In [None]:
# Fit a fresh scaler on the continuous training columns.
scaler = StandardScaler()

# Work on a copy so X_train itself keeps the unscaled values, and overwrite
# only the continuous columns with their standardised version.
X_train_scaled = X_train.copy()
X_train_scaled[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])

# NumPy view of the whole (partly scaled) feature table.
X_train_scaled_array = X_train_scaled.to_numpy()

print("Scaled DataFrame as array:\n", X_train_scaled_array)

In [None]:
# Apply the scaler fitted on the training columns to a copy of the test
# features, then expose the result as a NumPy array.
X_test_scaled = X_test.copy()
X_test_scaled[cols_to_scale] = scaler.transform(X_test[cols_to_scale])
X_test_scaled_array = X_test_scaled.to_numpy()

In [None]:
# Check the container type of the converted test features.
type((X_test_scaled_array))

## tSNE

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


In [None]:
# Embed the training features into 2 dimensions with a fixed seed.
tsne = TSNE(random_state=42, n_components=2)
# Result has one row per training sample and two columns.
X_tsne = tsne.fit_transform(X_train_scaled_array)


In [None]:
# Colour each embedded point by its true class. Without a `c=` array the
# colormap is ignored and legend_elements() has nothing to build a legend from.
# factorize(sort=True) gives integer codes 0..n-1 whose order matches
# `class_names`, so handle i of the legend corresponds to class_names[i].
class_codes, class_names = pd.factorize(np.ravel(y_train), sort=True)

fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=class_codes, cmap='viridis', s=50)
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
ax.set_title("t-SNE visualization")

# num=None asks for one legend entry per distinct class code (in sorted
# order); the numeric labels are replaced by the class names.
handles, _ = scatter.legend_elements(num=None)
ax.legend(handles, list(class_names), title="Classes")
plt.show()
