In [135]:

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import davies_bouldin_score

In [136]:
# Load the cleaned symptoms-checker dataset.
# Source: https://www.kaggle.com/datasets/iamhungundji/covid19-symptoms-checker/download
# The CSV is read from the directory above the notebook.
csv_path = '..//Cleaned-Data.csv'
data = pd.read_csv(csv_path)


#### Data Processing

In [137]:
# Discard rows containing missing values, then discard exact duplicate rows.
# Written as one non-mutating chain; `data` ends up with the same rows and
# index as before.
data = data.dropna().drop_duplicates()
data.head()

In [138]:
# Feature selection: the severity columns and the country column are not
# used for clustering, so they are removed before scaling.
unused_columns = [
    'Severity_Mild',
    'Severity_Moderate',
    'Severity_None',
    'Severity_Severe',
    'Country',
]
data1 = data.drop(columns=unused_columns)
data1.head()

In [139]:
# Standardise every feature (zero mean, unit variance) so that all columns
# weigh equally in the distances K-means computes.

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data1)
train_df_scaled = scaler.transform(data1)

## Elbow Method
This method is used to find the optimal number of clusters into which the data can be grouped.
It is well established that specifying more clusters always lowers the inertia (the within-cluster variability), so the fit appears to keep improving; beyond a certain point, however, the extra clusters bring only marginal gains, hence an optimal number of clusters should be used.

So, this method helps to determine the k-value which is at the elbow point

In [140]:
# Fit K-means for k = 1..14 and record each model's inertia
# (the within-cluster sum of squared distances) for the elbow plot.
inertias = []

for n_clusters in range(1, 15):
    km = KMeans(n_clusters=n_clusters, random_state=2).fit(train_df_scaled)
    inertias.append(km.inertia_)

In [141]:
# Elbow curve: inertia against the number of clusters tried (1..14).
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(1, 15),
    inertias,
    color='blue',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
ax.set_title('Inertias vs. number of clusters')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
plt.show()


In [142]:
# The Davies-Bouldin score (lower is better) gives a second opinion on the
# number of clusters; it is evaluated here for k = 4..14.
bouldin_score = []

for n_clusters in range(4, 15):
    km = KMeans(n_clusters=n_clusters, random_state=2).fit(train_df_scaled)
    labels = km.labels_
    bouldin_score.append(davies_bouldin_score(train_df_scaled, labels))

In [143]:

# Davies-Bouldin score against the number of clusters tried (4..14).
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(4, 15),
    bouldin_score,
    color='blue',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
ax.set_title('Davies Bouldin Score vs. number of clusters')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Davies Bouldin Score')
plt.show()

In [144]:

# Using the optimal number of clusters (7, read off the elbow point), refit
# the final model.
# random_state is pinned to the same seed as the elbow / Davies-Bouldin sweeps:
# without it the centroid initialisation is random, so cluster ids (and every
# plot built from them) change on each Restart & Run All.
kmeans = KMeans(n_clusters=7, random_state=2)

# Assign every scaled row to a cluster; `label` holds one cluster id per row.
label = kmeans.fit_predict(train_df_scaled)
 

In [145]:
# Use PCA (Principal Component Analysis) to project the scaled features onto
# two uncorrelated components so the clusters can be drawn in 2-D.

from sklearn.decomposition import PCA
pca = PCA(n_components=2).fit(train_df_scaled)

projected = pca.transform(train_df_scaled)
df = pd.DataFrame(projected, columns=['C1', 'C2'])
df['label'] = label

df.head()

In [146]:
# Keep only the projected points that were assigned to cluster 0.
# The filter uses the frame's own 'label' column rather than the separate
# `label` array, so it cannot drift out of alignment with the rows.
filtered_label0 = df[df['label'] == 0]

# Plot that single cluster in the 2-D PCA plane.
fig, ax = plt.subplots()
ax.scatter(filtered_label0['C1'], filtered_label0['C2'])
ax.set_title('Cluster 0 in PCA space')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
plt.show()

In [147]:
# Getting the centroids.
# K-means was fitted on the scaled feature matrix, so `cluster_centers_` is
# expressed in that feature space. The points below are drawn in PCA
# coordinates, so the centroids must be projected with the same fitted `pca`;
# plotting their first two raw columns would put them in the wrong place.
centroids = kmeans.cluster_centers_
centroids_2d = pca.transform(centroids)
u_labels = np.unique(label)

# Plotting the results: one scatter per cluster id actually present, each
# with a legend entry (the default colour cycle gives each a distinct colour).
fig, ax = plt.subplots()
for cluster_id in u_labels:
    members = df[df['label'] == cluster_id]
    ax.scatter(members['C1'], members['C2'], label='Cluster {}'.format(cluster_id))

ax.scatter(centroids_2d[:, 0], centroids_2d[:, 1], s=50, color='k', label='Centroids')
ax.set_title('K-means clusters in PCA space')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.legend()
plt.show()

In [148]:
from matplotlib import animation

# Three-component PCA for the 3-D animated scatter.
# Separate names are used so the 2-component `pca` and the 2-D DataFrame `df`
# defined earlier are not overwritten; the earlier plotting cells can then be
# re-run after this one.
pca_3d = PCA(n_components=3)
pca_3d.fit(train_df_scaled)

# NOTE: despite its name, `tsne_df` holds PCA (not t-SNE) coordinates; the
# name is kept because the animation call below refers to it.
tsne_df = pd.DataFrame(pca_3d.transform(train_df_scaled), columns=['C1', 'C2', 'C3'])
tsne_df['label'] = label

def plot_animation(df, label_column, name):
    """Draw a rotating 3-D scatter of the projected points and save it as a GIF.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the 'C1', 'C2' and 'C3' coordinates plus
        ``label_column``. The function plots this argument; it does not
        read any global frame.
    label_column : str
        Column of ``df`` whose values colour the points.
    name : str
        Output file stem; the animation is written to ``'<name>.gif'``.

    Notes
    -----
    The GIF is saved with matplotlib's ``'imagemagick'`` writer, so
    ImageMagick must be available on the machine running the notebook.
    """
    n_frames = 360  # one frame per degree of azimuth

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df['C1'], df['C2'], df['C3'], c=df[label_column],
               s=6, depthshade=True, cmap='Paired')
    ax.set_zlim(-15, 25)
    ax.set_xlim(-20, 20)
    plt.tight_layout()

    def update(num):
        # Elevation stays fixed at 200; the frame number is used as the azimuth.
        ax.view_init(200, num)

    ani = animation.FuncAnimation(fig, update, n_frames, blit=False, interval=50)
    ani.save('{}.gif'.format(name), writer='imagemagick')
    plt.show()
    
# Render the rotating 3-D view of the K-means clusters; writes 'kmeans.gif'.
plot_animation(df=tsne_df, label_column='label', name='kmeans')