In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from kneed import KneeLocator
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score

In [9]:
# Load the regional indicators and replace every missing value with 0.
init_df = pd.read_csv('Mapped Data.csv').fillna(0)

# Hand-entered values for two rows, addressed by their default integer row
# label (the frame is not yet indexed by AREA at this point).
manual_patches = {
    10: {'Land Area': 23234.78, 'Pop Density': 177},
    11: {'Land Area': 16904.03, 'Pop Density': 202},
}
for row_label, fixes in manual_patches.items():
    for column, value in fixes.items():
        init_df.at[row_label, column] = value

# Use the region name as the index so only the numeric columns remain as features.
init_df = init_df.set_index('AREA')
init_df

Unnamed: 0_level_0,Doctors,Nurses,Beds,Mortality,Host_Count,Pop Density,Land Area
AREA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NCR,10.0,12.0,13.5,71050.0,30.0,19137.0,619.54
CAR,6.0,14.8,6.5,7349.0,6.0,82.0,19818.12
REGION I,3.8,10.5,4.8,32983.0,2.0,366.0,12964.62
REGION II,3.2,11.4,5.1,19374.0,8.0,108.0,29836.88
REGION III,3.4,7.2,5.0,60409.0,2.0,463.0,21906.19
REGION IVA,2.7,6.3,5.3,75743.0,20.0,761.0,16576.26
REGION IVB,1.8,5.5,1.0,13907.0,28.0,93.0,29606.25
REGION V,2.4,7.3,3.4,33751.0,98.0,299.0,18114.47
REGION VI,2.9,6.8,5.9,48990.0,94.0,328.0,12773.46
REGION VII,2.9,9.9,6.6,43500.0,35.0,528.0,10452.01


In [10]:
# Standardise every column of init_df before clustering.
# NOTE: a later cell adds a 'KCluster_Labels' column to init_df; re-running this
# cell after that one would feed the cluster labels into the scaler as well.
scaler = StandardScaler().fit(init_df)
df_scaled = scaler.transform(init_df)

In [11]:
# Sweep the number of clusters (k = 2..14) and report, for each k, the
# silhouette, Calinski-Harabasz and Davies-Bouldin scores plus the inertia.
#
# `ssd` is created once, before the loop, so that it accumulates the inertia of
# every k (e.g. for an elbow plot afterwards). It was previously re-created on
# every iteration, which left only the last k's inertia in it.
ssd = []
for cluster in range(2, 15):
    kmeans = KMeans(n_clusters=cluster, random_state=42, n_init=15)
    kmeans.fit(df_scaled)
    cluster_labels = kmeans.predict(df_scaled)
    silhouette_avg = silhouette_score(df_scaled, cluster_labels)
    calinski = metrics.calinski_harabasz_score(df_scaled, cluster_labels)
    davies = davies_bouldin_score(df_scaled, cluster_labels)
    ssd.append(kmeans.inertia_)
    # ssd[-1:] is the one-element list for the current k, so the printed text
    # keeps the same "SSD=[value]" shape as before.
    print("""For cluster={0}, the silhouette is {1}, calinski is {2}, and davies is {3}
            SSD={4}""".format(cluster, silhouette_avg, calinski, davies, ssd[-1:]))

For cluster=2, the silhouette is 0.5714321689870848, calinski is 10.466973518109471, and davies is 0.2734340452400449
            SSD=[70.0907785030166]
For cluster=3, the silhouette is 0.24383538453893858, calinski is 9.767551173608327, and davies is 0.9865932531956793
            SSD=[49.67928777286927]
For cluster=4, the silhouette is 0.2956026788098978, calinski is 9.655080250399774, and davies is 0.6925196161360835
            SSD=[36.863841891715786]
For cluster=5, the silhouette is 0.26918403857441114, calinski is 10.582624675885707, and davies is 0.8072851149314493
            SSD=[26.283579832240378]
For cluster=6, the silhouette is 0.3243810482977718, calinski is 12.119306331852485, and davies is 0.7426564854657122
            SSD=[18.283008543342685]
For cluster=7, the silhouette is 0.29851947477644986, calinski is 12.604996971957464, and davies is 0.7911105516802495
            SSD=[13.897001663952732]
For cluster=8, the silhouette is 0.28429625155625504, calinski is 12.429

In [12]:
# Stability check: refit the 2-cluster model under a range of random seeds
# (2, 9, 16, ..., 100) and print the three cluster-quality scores for each.
for seed in np.arange(2, 105, 7):
    kmeans = KMeans(n_clusters=2, random_state=seed).fit(df_scaled)
    cluster_labels = kmeans.predict(df_scaled)
    silhouette_avg, calinski, davies = (
        silhouette_score(df_scaled, cluster_labels),
        metrics.calinski_harabasz_score(df_scaled, cluster_labels),
        davies_bouldin_score(df_scaled, cluster_labels),
    )
    print("For seed={0}, the silhouette is {1}, calinski is {2}, and davies is {3}".format(seed, silhouette_avg, calinski, davies))

For seed=2, the silhouette is 0.5714321689870848, calinski is 10.466973518109471, and davies is 0.2734340452400449
For seed=9, the silhouette is 0.5714321689870848, calinski is 10.466973518109471, and davies is 0.2734340452400449
For seed=16, the silhouette is 0.5714321689870848, calinski is 10.466973518109471, and davies is 0.2734340452400449
For seed=23, the silhouette is 0.5714321689870848, calinski is 10.466973518109471, and davies is 0.2734340452400449
For seed=30, the silhouette is 0.5714321689870848, calinski is 10.466973518109471, and davies is 0.2734340452400449
For seed=37, the silhouette is 0.5714321689870848, calinski is 10.466973518109471, and davies is 0.2734340452400449
For seed=44, the silhouette is 0.5714321689870848, calinski is 10.466973518109471, and davies is 0.2734340452400449
For seed=51, the silhouette is 0.5714321689870848, calinski is 10.466973518109471, and davies is 0.2734340452400449
For seed=58, the silhouette is 0.5714321689870848, calinski is 10.46697351

In [13]:
# Final model: k=2 with a fixed seed and 15 initialisations.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=15).fit(df_scaled)
cluster_labels = kmeans.predict(df_scaled)

# Quality scores for the chosen partition.
silhouette_avg, calinski, davies = (
    silhouette_score(df_scaled, cluster_labels),
    metrics.calinski_harabasz_score(df_scaled, cluster_labels),
    davies_bouldin_score(df_scaled, cluster_labels),
)

# Attach the labels to the unscaled frame and show how many rows fall in each cluster.
init_df['KCluster_Labels'] = cluster_labels
init_df['KCluster_Labels'].value_counts()

0    16
1     1
Name: KCluster_Labels, dtype: int64

In [None]:
import seaborn as sns

# One box plot per feature, split by cluster label, laid out on a 2x3 grid.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12,8))

# (column in c_df, panel title), listed in row-major panel order.
boxplot_panels = [
    ("adhesion", "adhesion"),
    ("assembly", "assembly"),
    ("encapsidation", "encapsidation"),
    ("polyprotein", "polyprotein"),
    ("virulence_score", "virulence_score"),
    ("GC%", "GC Content"),
]

# axes.flat walks the grid row by row, matching the order of the panel list.
for panel_ax, (feature, panel_title) in zip(axes.flat, boxplot_panels):
    sns.boxplot(x="Cluster_Labels", y=feature, data=c_df, ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=16)

plt.tight_layout()
plt.show();

In [None]:
# Pairwise scatter matrix of the selected features, coloured by cluster label.
#
# sns.pairplot is a figure-level function: it creates and sizes its own figure.
# A preceding plt.figure(...) call is never drawn on and only shows up as an
# empty canvas in the output, so none is created here.
pairplot_columns = ['GC%', 'adhesion', 'encapsidation', 'polyprotein', 'virulence_score', 'Cluster_Labels']
data = init_df[pairplot_columns]
sns.set_style("whitegrid", {'axes.grid' : False})
# Trailing semicolon suppresses the PairGrid repr in the cell output.
sns.pairplot(data, hue='Cluster_Labels', palette='rocket_r');

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale df1, keeping its row index and column names.
# NOTE: this rebinds `scaler`, which an earlier cell bound to a StandardScaler.
scaler = MinMaxScaler()
df_minmax = pd.DataFrame(
    scaler.fit_transform(df1),
    index=df1.index,
    columns=df1.columns,
)
df_minmax['Cluster_Labels'] = cluster_labels

# Per-cluster mean of every scaled feature, with the label back as the first column.
df_clusters = (
    df_minmax
    .set_index("Cluster_Labels")
    .groupby("Cluster_Labels")
    .mean()
    .reset_index()
)
df_clusters

In [None]:
from math import pi
def make_spider(row, title, color, grid=(3, 3)):
    """Draw one radar ("spider") chart for a single row of ``df_clusters``.

    Reads the module-level ``df_clusters`` frame: every column after the first
    becomes one axis of the chart, and the values of ``df_clusters.loc[row]``
    (without ``'Cluster_Labels'``) are plotted on those axes. The chart is
    drawn into slot ``row + 1`` of a subplot grid on the current figure.

    Parameters
    ----------
    row : int
        Row label in ``df_clusters`` to plot; also selects the subplot slot.
    title : str
        Title shown above the chart.
    color : matplotlib colour
        Colour used for the outline, the fill and the title.
    grid : tuple of (int, int), optional
        ``(n_rows, n_cols)`` of the subplot grid. Defaults to ``(3, 3)``;
        pass a larger grid when there are more than nine rows to draw.

    Raises
    ------
    ValueError
        If ``row`` does not fit inside ``grid``.
    """
    n_rows, n_cols = grid
    if not 0 <= row < n_rows * n_cols:
        raise ValueError(
            "row={0} does not fit in a {1}x{2} subplot grid; pass a larger grid".format(
                row, n_rows, n_cols))

    # Axis names: every column except the first one.
    categories = list(df_clusters)[1:]
    N = len(categories)

    # Spread the N axes evenly around the circle, then repeat the first angle
    # so the outline closes on itself.
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    # Initialise the polar subplot for this row.
    ax = plt.subplot(n_rows, n_cols, row + 1, polar=True)

    # Rotate the zero angle by pi/3.5 rad (about 51 degrees) and lay the axes
    # out clockwise. NOTE: an offset of pi/2 would put the first axis exactly
    # on top; pi/3.5 does not.
    ax.set_theta_offset(pi / 3.5)
    ax.set_theta_direction(-1)

    # One tick per variable, labelled with the variable name.
    plt.xticks(angles[:-1], categories, color='black', size=8)

    # Radial scale from -0.25 to 1 (presumably chosen for min-max scaled
    # inputs in [0, 1] -- TODO confirm).
    ax.set_rlabel_position(0)
    plt.yticks([-0.25, 0, 0.25, 0.5, 0.75, 1], [-0.25, 0, 0.25, 0.5, 0.75, 1], color="grey", size=7)
    plt.ylim(-0.25, 1)

    # Values for this row, closed by repeating the first value.
    values = df_clusters.loc[row].drop('Cluster_Labels').values.flatten().tolist()
    values += values[:1]
    ax.plot(angles, values, color=color, linewidth=2, linestyle='solid')
    ax.fill(angles, values, color=color, alpha=0.4)

    # Add a title
    plt.title(title, size=14, color=color, y=1.1)
 

In [None]:
# Draw one radar chart per row of df_clusters on a single 1400x1400 px figure
# and save it to 'Spiders.png'.
my_dpi=100
figz = plt.figure(figsize=(1400/my_dpi, 1400/my_dpi), dpi=my_dpi)
plt.subplots_adjust(hspace=0.5, wspace=0.6)
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap takes the same (name, lut) arguments and works on both sides.
my_palette = plt.get_cmap("Set2", len(df_clusters.index))
figz.patch.set_facecolor('white')

for row in range(len(df_clusters.index)):
    # str() works for any label type, unlike .astype(str), which only exists
    # on NumPy scalars.
    make_spider(row=row,
                title='Segment ' + str(df_clusters['Cluster_Labels'][row]),
                color=my_palette(row))

plt.savefig('Spiders.png', transparent=False)