In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, KBinsDiscretizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import f1_score, confusion_matrix

In [None]:
path = './ctg_data_full.xls'

# Read the CTG feature columns (K:AE of the 'Data' sheet).
df_raw = pd.read_excel(path, sheet_name='Data', usecols='K:AE', skiprows=0, header=1, nrows=2126)

# Read the results column.
# For the 3-class NSP column use 'AT', for the 10-class FHR use 'AR'.
df_y_raw = pd.read_excel(path, sheet_name='Data', usecols='AR', skiprows=0, header=1, nrows=2126)

# Drop incomplete rows from the features and the labels TOGETHER.
# Calling dropna() on each frame separately would remove different rows whenever a
# value is missing in only one of them, leaving X[i] and y[i] describing different samples.
n_features = df_raw.shape[1]
complete = pd.concat([df_raw, df_y_raw], axis=1).dropna()
df = complete.iloc[:, :n_features]
df_y = complete.iloc[:, n_features:]

X = df.to_numpy()
y = df_y.to_numpy()  # kept 2-D, shape (n_samples, 1)
assert len(X) == len(y), 'features and labels must contain the same samples'

print(df.head(5))
print(df_y.head(5))

# NOTE: Uncomment whichever part you want to use. Choose no more than one from every category.

### Data scaling

# Standardizing
scaler = StandardScaler()
X = scaler.fit_transform(X)

# # Normalizing, though better results are achieved with standardizing
# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)

In [None]:
### Data Discretization

# # K-Means Discretization
# discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')
# X = discretizer.fit_transform(X)

# # Equal-Frequency Discretization
# discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
# X = discretizer.fit_transform(X)

# # Equal-Width Discretization
# discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
# X = discretizer.fit_transform(X)

In [None]:
### Dimensionality reduction 

# # Principal Component Analysis
# pca = PCA(n_components=17)
# X = pca.fit_transform(X)

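# # Linear Discriminant Analysis
# # NOTE: LDA is supervised, so fitting it needs the labels as well,
# # e.g. lda.fit_transform(X, y.ravel()), and n_components must not exceed
# # min(n_classes - 1, n_features) -- 17 is too large for either label column.
# lda = LDA(n_components=17)
# X = lda.fit_transform(X)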

# # Singular Value Decomposition
# svd = TruncatedSVD(n_components=17)
# X = svd.fit_transform(X)

In [None]:
### Clustering techniques

# K-Means clustering
# NOTE: Several random_state values were tried; 15 gave the best performance
kmeans = KMeans(n_clusters=3, random_state=15)
kmeans.fit(X)
labels = kmeans.labels_

# # Agglomerative clustering
# # NOTE: The best performing linkage was the default one (ward)
# agglomerative = AgglomerativeClustering(n_clusters=3)
# agglomerative.fit(X)
# labels = agglomerative.labels_

# # DBSCAN clustering
# # NOTE: The only setting that produced 3 classes (2 plus the noise) was eps=2, min_samples=12
# dbscan = DBSCAN(eps=2, min_samples=12)
# dbscan.fit(X)
# labels = dbscan.labels_

In [None]:
# Tally the samples per original class.
# Each row of the result is (class value, number of occurrences).
true_classes, true_counts = np.unique(y, return_counts=True)
true_results = np.column_stack((true_classes, true_counts))

# Tally the samples per predicted cluster, in the same (value, occurrences) layout.
cluster_ids, cluster_counts = np.unique(labels, return_counts=True)
predicted_results = np.column_stack((cluster_ids, cluster_counts))

In [None]:
### Reverse the scaling process

# Undo the preprocessing so that X is back in the original feature units for the
# plots that follow. The inverse steps are listed in the reverse of the order in
# which they are applied above; uncomment the ones matching the steps you enabled.
# X = pca.inverse_transform(X)
# X = discretizer.inverse_transform(X)
# NOTE: X is overwritten here, so running this cell a second time would un-scale
# the data twice. Re-run from the data-loading cell before repeating it.
X = scaler.inverse_transform(X)

In [None]:
### Plots

# Draw the first two features of the unscaled data set twice: first coloured by the
# original classes, then by the clusters predicted by the clustering algorithm.
for colours, heading in ((y, 'Original'), (labels, 'Clustering Algorithm')):
    fig, ax = plt.subplots()
    scatter = ax.scatter(X[:, 0], X[:, 1], c=colours)
    handles, names = scatter.legend_elements()
    # anchor the legend outside the graph so it does not cover any points
    legend = ax.legend(handles, names, title='Classes',
                       bbox_to_anchor=(1.05, 1.0), loc='upper left')
    ax.add_artist(legend)
    plt.title(heading)
    plt.show()

In [None]:
### Metrics

# Both metrics compare one true label with one predicted label PER SAMPLE.
# Passing the per-class occurrence counts instead compares unrelated numbers
# (and fails outright when the number of classes and clusters differ).
y_true = np.asarray(y).ravel()
cluster_ids = np.asarray(labels)
if y_true.shape[0] != cluster_ids.shape[0]:
    raise ValueError('y and labels must describe the same samples '
                     '({} vs {})'.format(y_true.shape[0], cluster_ids.shape[0]))

# Cluster ids are arbitrary, so translate every cluster into the true class that
# is most frequent among its members (majority vote). DBSCAN's noise id (-1) is
# handled like any other cluster.
y_pred = np.empty_like(y_true)
for cluster_id in np.unique(cluster_ids):
    members = cluster_ids == cluster_id
    classes, votes = np.unique(y_true[members], return_counts=True)
    y_pred[members] = classes[np.argmax(votes)]

# confusion matrix (rows: true classes, columns: classes assigned via the clusters)
print('Confusion Matrix:\n {}'.format(confusion_matrix(y_true, y_pred)))

# F1-score
print('\nF1: {}'.format(f1_score(y_true, y_pred, average='micro')))