In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Name of the CSV file that holds the dataset
file_name = "D2.csv"

In [None]:
# The CSV file is expected to sit in the same folder as this notebook
df = pd.read_csv(filepath_or_buffer=file_name)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()

# Question 1: Preprocessing
- What pre-processing was required on the dataset (D2.csv) before building the clustering model on the chosen attributes?

#### Proposed changes:
- Readmitted is currently stored in binary format  while change and readmitted are bool. Will need to identify which format is required for use in clustering. 

In [None]:
# Summary statistics for the loaded dataset
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Preprocessing for Q2 - 3.
# Build the dataframe of variables selected for the clustering model.
# .copy() makes df1 an independent frame rather than a slice of df, so columns
# written into it later (the cluster labels) do not raise
# SettingWithCopyWarning.
df1 = df[["num_lab_procedures", "number_outpatient", "number_inpatient", "num_medications", "time_in_hospital"]].copy()
# info() prints its own report and returns None; print() around it would add "None".
df1.info()

In [None]:
# Visualise the distribution of each variable to identify potential data problems.
# Each entry pairs a column with the `bins` argument for its histogram;
# None keeps seaborn's default binning.
dist_specs = [
    ("num_lab_procedures", None),
    ("number_outpatient", 100),
    ("number_inpatient", 100),
    ("num_medications", None),
    ("time_in_hospital", 100),
]

for col, bins in dist_specs:
    # Missing values are dropped before plotting; one figure per variable.
    sns.distplot(df1[col].dropna(), bins=bins)
    plt.show()

In [None]:
# The variables sit on different scales, so they have to be standardised before
# they are used to build the clustering model.
feature_summary = df1.describe()
print(feature_summary)

In [None]:
# Scaling process.
from sklearn.preprocessing import StandardScaler

# Select the clustering features by name: df1 gains a Cluster_ID column in a
# later cell, and re-running this cell must not feed that label in as a feature.
feature_cols = ["num_lab_procedures", "number_outpatient", "number_inpatient", "num_medications", "time_in_hospital"]

# convert the selected features to a matrix
X = df1[feature_cols].to_numpy()

# standardise each feature column
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Preprocessing for Q4: list the distinct values held in the age column
age_values = df['age'].unique()
print(age_values)

In [None]:
# Map each age bracket label to an ordinal code (1 for '[0-10)' up to 10 for '[90-100)').
age_map = {
    '[0-10)': 1,
    '[10-20)': 2,
    '[20-30)': 3,
    '[30-40)': 4,
    '[40-50)': 5,
    '[50-60)': 6,
    '[60-70)': 7,
    '[70-80)': 8,
    '[80-90)': 9,
    '[90-100)': 10,
}
# Guard against re-running the cell: once the column holds numeric codes,
# mapping it again would turn every value into NaN, because the codes are not
# keys of age_map.
if not pd.api.types.is_numeric_dtype(df['age']):
    df['age'] = df['age'].map(age_map)
print(df['age'].unique())

In [None]:
# Build the dataframe of variables selected for the second clustering model
# (the same features as the first model, plus age).
df2 = df[["num_lab_procedures", "number_outpatient", "number_inpatient", "num_medications", "time_in_hospital", "age"]]
# info() prints its own report and returns None; print() around it would add "None".
df2.info()
print("AGE Values:", df2['age'].unique())

In [None]:
# StandardScaler is already imported by the earlier scaling cell.
# Turn df2 into a matrix and standardise it with a fresh scaler instance.
scaler = StandardScaler()
X2 = scaler.fit_transform(df2.to_numpy())

# Question 2 & 3 
## Clustering Model 1 

In [None]:
# Build the first K-means model on the scaled matrix X created above.
from sklearn.cluster import KMeans

# Seed for the random state (42 instead of 10, for a change). Different seeds
# might result in different cluster centres.
rs = 42

model = KMeans(n_clusters=3, random_state=rs).fit(X)

# Sum of intra-cluster distances
print("Sum of intra-cluster distance:", model.inertia_)

print("Centroid locations:")
for centre in model.cluster_centers_:
    print(centre)

In [None]:
# Fit K-means with K=3 in this cell so that the labels below always come from
# a 3-cluster model, whatever `model` was last bound to.
model = KMeans(n_clusters=3, random_state=rs).fit(X)

# Assign a cluster ID to each record in X.
y = model.predict(X)
# df1 was taken as a slice of df, so writing a column into it in place raises
# SettingWithCopyWarning. assign() returns a new frame carrying the extra
# column, which avoids the warning instead of ignoring it.
df1 = df1.assign(Cluster_ID=y)

# how many records are in each cluster
print("Cluster membership")
print(df1['Cluster_ID'].value_counts())

# pairplot of the features, coloured by cluster
cluster_g = sns.pairplot(df1, hue='Cluster_ID', diag_kind='hist')
plt.show()

## MP Insights
- In cluster 0, the greater the number of medications, the fewer inpatient or outpatient hospital visits.
- In cluster 2, the number of medications does not appear to affect the length of stay in hospital.


In [None]:
# Plot each variable's distribution inside a cluster against its distribution
# over the whole data set.
cols = ["num_lab_procedures", "number_outpatient", "number_inpatient", "num_medications", "time_in_hospital"]
# Number of bin edges passed to np.linspace (which yields n_bins - 1 histogram
# bars). Increase it for finer detail. This was previously declared as 20 but
# never used, while 30 was hard-coded in the loop; it is now the single source
# of truth and keeps the value the plots actually used.
n_bins = 30

# clusters to inspect
clusters_to_inspect = [0, 1, 2]

for cluster in clusters_to_inspect:
    print("Distribution for cluster {}".format(cluster))

    # one subplot row per variable
    fig, ax = plt.subplots(nrows=len(cols))
    ax[0].set_title("Cluster {}".format(cluster))

    # rows belonging to the cluster being inspected
    cluster_rows = df1[df1['Cluster_ID'] == cluster]

    for j, col in enumerate(cols):
        # shared bin edges spanning the variable's full range in df1
        bins = np.linspace(df1[col].min(), df1[col].max(), n_bins)
        # histogram of the variable within the cluster
        sns.distplot(cluster_rows[col], bins=bins, ax=ax[j], norm_hist=True)
        # density curve of the variable over all records, drawn in black
        sns.distplot(df1[col], bins=bins, ax=ax[j], hist=False, color="k")

    plt.show()

### Determining K

In [None]:
# Lists that collect the fitted models and their inertia (cost).
clusters = []
inertia_vals = []

# Fit one K-means model per candidate K; this can take a while.
for k in range(2, 15, 2):
    # n_jobs is deliberately not passed: KMeans deprecated that argument in
    # scikit-learn 0.23 and removed it in 1.0, where it raises TypeError.
    model = KMeans(n_clusters=k, random_state=rs)
    model.fit(X)

    # keep the fitted model and its inertia
    clusters.append(model)
    inertia_vals.append(model.inertia_)

In [None]:
# Plot the inertia against K. The K values are read from the fitted models so
# the x axis always matches the models that produced inertia_vals.
k_values = [fitted.n_clusters for fitted in clusters]
plt.plot(k_values, inertia_vals, marker='*')
plt.xlabel("Number of clusters (K)")
plt.ylabel("Inertia")
plt.show()

In [None]:
# Calculate silhouette scores for k = 4, k = 6 and k = 8.
from sklearn.metrics import silhouette_score

# Look each model up by its own n_clusters rather than by list position, so
# the printed model is always the one being scored. (The k=8 line used to
# print clusters[2], the k=6 model, while scoring clusters[3].)
models_by_k = {fitted.n_clusters: fitted for fitted in clusters}

for k in (4, 6, 8):
    candidate = models_by_k[k]
    print(candidate)
    print("Silhouette score for k={}".format(k), silhouette_score(X, candidate.predict(X)))

### MP Insights
- It appears that k=6 is the optimal number of clusters for this model according to the silhouette score.

In [None]:
# Visualisation of the K=6 clustering solution.
model = KMeans(n_clusters=6, random_state=rs)
model.fit(X)

# sum of intra-cluster distances
print("Sum of intra-cluster distance:", model.inertia_)

print("Centroid locations:")
for centroid in model.cluster_centers_:
    print(centroid)

y = model.predict(X)
# df1 was taken as a slice of df, so writing a column into it in place raises
# SettingWithCopyWarning. assign() returns a new frame whose Cluster_ID column
# holds the K=6 labels (replacing any earlier Cluster_ID column).
df1 = df1.assign(Cluster_ID=y)

# how many records are in each cluster
print("Cluster membership")
print(df1['Cluster_ID'].value_counts())

# pairplot of the features, coloured by cluster
cluster_g = sns.pairplot(df1, hue='Cluster_ID', diag_kind='hist')
plt.show()

# Question 4
## Clustering Model 2 - includes AGE

In [None]:
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes

In [None]:
# Lists that collect the fitted models and their cost.
clusters = []
cost_vals = []

# Position of the age column in X2 (X2 is built from df2, so it keeps df2's
# column order). Age is the variable this model adds, so it is the one marked
# as categorical; the previous hard-coded index 1 pointed at number_outpatient.
# NOTE(review): confirm that age is the intended categorical attribute.
age_idx = df2.columns.get_loc("age")

# Fit one K-prototypes model per candidate K; this can take a while.
for k in range(2, 10, 2):
    model = KPrototypes(n_clusters=k, random_state=rs, n_jobs=10)
    model.fit_predict(X2, categorical=[age_idx])

    # keep the fitted model and its cost
    clusters.append(model)
    cost_vals.append(model.cost_)