# Load Datasets

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the datasets
# Both workbooks are referenced by relative path, so they are looked up in the
# current working directory. Later cells read the feature columns p1..p9 and
# 'Segment ID' from dataset_2, and join dataset_3 on 'Segment ID' to obtain 'label'.
file_2_path, file_3_path = 'Project data set 2.xlsx', 'Project data set 3.xlsx'

# Read each workbook with pandas' default read_excel settings, in path order.
dataset_2, dataset_3 = (pd.read_excel(path) for path in (file_2_path, file_3_path))

# Preprocessing

In [None]:
# Preprocessing: standardize the nine feature columns p1..p9
from sklearn.preprocessing import StandardScaler

# Select columns 'p1' through 'p9' of data set 2 as the raw feature table.
features = dataset_2[[f'p{i}' for i in range(1, 10)]]

# Fit the scaler on the feature table, then transform that same table.
# `scaler` is kept as a named object so it stays available after this cell.
scaler = StandardScaler()
features_scaled = scaler.fit(features).transform(features)

# Exploratory Analysis

In [None]:
# Exploratory analysis: summary statistics
# describe() is applied to the nine feature columns p1..p9 of data set 2 only,
# so any other columns (e.g. 'Segment ID') are left out of the table.
summary_stats = dataset_2.loc[:, [f'p{i}' for i in range(1, 10)]].describe()

# Emit the statistics table as plain text.
print(summary_stats)

In [None]:
# Correlation matrix of the feature columns p1..p9 (corr() with its defaults)
correlation_matrix = dataset_2.loc[:, [f'p{i}' for i in range(1, 10)]].corr()

# Text dump of the matrix
print(correlation_matrix)

# Annotated heatmap of the same matrix, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Features (p1 to p9)')
plt.show()

# Advanced Analysis

In [None]:
# Feature selection: keep p1..p9 except p2 and p4
from sklearn.preprocessing import StandardScaler

# Resulting columns, in order: p1, p3, p5, p6, p7, p8, p9.
features = dataset_2[[f'p{i}' for i in range(1, 10) if i not in (2, 4)]]

# A fresh scaler is fitted on the reduced table. This rebinds `features`,
# `scaler` and `features_scaled`, so every later cell works on the 7-column version.
scaler = StandardScaler()
features_scaled = scaler.fit(features).transform(features)


In [None]:
# K-means clustering (two clusters)

# Fit a two-cluster model with a fixed seed, then read the per-row cluster ids
# off the fitted estimator.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(features_scaled)
kmeans_labels = kmeans.labels_

# Internal validity metrics, both computed on the scaled features and the
# cluster ids: silhouette first, Davies-Bouldin second.
silhouette_avg_kmeans, davies_bouldin_kmeans = (
    metric(features_scaled, kmeans_labels)
    for metric in (silhouette_score, davies_bouldin_score)
)

# Report both metrics to four decimal places.
print(f"K-means Clustering:")
print(f"Silhouette Score: {silhouette_avg_kmeans:.4f}")
print(f"Davies-Bouldin Index: {davies_bouldin_kmeans:.4f}")

In [None]:
# Compare Clustering Results with True Labels
from sklearn.metrics import confusion_matrix

# The two expected true-label values. This single list fixes the order of BOTH
# axes of the confusion matrix below, so the row/column captions always match
# the counts they sit next to.
class_names = ['Awake', 'Deep anaesthetic']

# Attach the K-means cluster ids to the segment ids, then bring in the actual
# labels from Project Data Set 3 (merge default: inner join on 'Segment ID').
comparison_df = dataset_2[['Segment ID']].copy()
comparison_df['KMeans_Label'] = kmeans_labels
comparison_df = comparison_df.merge(dataset_3, on='Segment ID')

# Clean up true labels by removing extra spaces
comparison_df['label'] = comparison_df['label'].str.strip()

# Fail with a clear message if the true labels contain anything other than the
# two expected classes (including missing values); otherwise such rows would
# be silently left out of the matrix once `labels=` is passed below.
unexpected_labels = set(comparison_df['label'].unique()) - set(class_names)
if unexpected_labels:
    raise ValueError(
        f"Unexpected true labels: {sorted(unexpected_labels, key=str)}"
    )

# K-means cluster ids (0/1) carry no meaning of their own, so do not hard-code
# which id is "Awake". Try both assignments and keep the one that agrees with
# the true labels on the most rows. On a tie, max() returns the first
# candidate, i.e. the original {0: 'Awake', 1: 'Deep anaesthetic'} mapping.
candidate_mappings = [
    dict(zip((0, 1), class_names)),
    dict(zip((0, 1), reversed(class_names))),
]
label_mapping = max(
    candidate_mappings,
    key=lambda mapping: (
        comparison_df['KMeans_Label'].map(mapping) == comparison_df['label']
    ).sum(),
)
comparison_df['KMeans_Label_Categorized'] = comparison_df['KMeans_Label'].map(label_mapping)

# Confusion Matrix for K-means Clustering vs. Actual Labels.
# Rows are true labels, columns are mapped clusters; both follow `class_names`.
conf_matrix_kmeans = confusion_matrix(
    comparison_df['label'],
    comparison_df['KMeans_Label_Categorized'],
    labels=class_names,
)
# Column captions show the raw cluster id together with the class it was mapped to.
cluster_of = {name: cluster_id for cluster_id, name in label_mapping.items()}
conf_matrix_df_kmeans = pd.DataFrame(
    conf_matrix_kmeans,
    index=class_names,
    columns=[f'Cluster {cluster_of[name]} ({name})' for name in class_names],
)

# Plot confusion matrix for K-means
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df_kmeans, annot=True, fmt='d', cmap='Blues', linewidths=0.5)
plt.title('Confusion Matrix: KMeans Clustering vs Actual Labels')
plt.xlabel('Cluster')
plt.ylabel('True Label')
plt.show()

In [None]:
# Hierarchical clustering
from scipy.cluster.hierarchy import fcluster

# Build the merge tree with Ward linkage on the scaled features.
Z = linkage(features_scaled, method='ward')

# Dendrogram of the merge tree, leaves labelled with the segment ids.
plt.figure(figsize=(12, 8))
dendrogram(
    Z,
    orientation='top',
    leaf_rotation=90,
    labels=dataset_2['Segment ID'].values,
)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Segment ID')
plt.ylabel('Distance')
plt.show()

# The linkage matrix alone carries no flat cluster ids, so cut the tree into
# at most two flat clusters to obtain one id per row.
hierarchical_labels = fcluster(Z, t=2, criterion='maxclust')

# Same two internal metrics as computed for K-means: silhouette first,
# Davies-Bouldin second, both on the scaled features.
silhouette_avg_hierarchical, davies_bouldin_hierarchical = (
    metric(features_scaled, hierarchical_labels)
    for metric in (silhouette_score, davies_bouldin_score)
)

# Report both metrics to four decimal places.
print(f"\nHierarchical Clustering:")
print(f"Silhouette Score: {silhouette_avg_hierarchical:.4f}")
print(f"Davies-Bouldin Index: {davies_bouldin_hierarchical:.4f}")



In [None]:
# Confusion Matrix for Hierarchical Clustering

# The two expected true-label values. This single list fixes the order of BOTH
# axes of the confusion matrix below, so the row/column captions always match
# the counts they sit next to.
class_names = ['Awake', 'Deep anaesthetic']

# Fail with a clear message if the true labels contain anything other than the
# two expected classes (including missing values); otherwise such rows would
# be silently left out of the matrix once `labels=` is passed below.
unexpected_labels = set(comparison_df['label'].unique()) - set(class_names)
if unexpected_labels:
    raise ValueError(
        f"Unexpected true labels: {sorted(unexpected_labels, key=str)}"
    )

# Keep the raw flat-cluster ids in their own column. The assignment is
# positional, so it relies on comparison_df still having one row per clustered
# sample, in the same order as the rows that were clustered.
comparison_df['Hierarchical_Label'] = hierarchical_labels

# Flat-cluster ids (1/2) carry no meaning of their own, so do not hard-code
# which id is "Awake". Try both assignments and keep the one that agrees with
# the true labels on the most rows. On a tie, max() returns the first
# candidate, i.e. the original {1: 'Awake', 2: 'Deep anaesthetic'} mapping.
hierarchical_candidate_mappings = [
    dict(zip((1, 2), class_names)),
    dict(zip((1, 2), reversed(class_names))),
]
hierarchical_label_mapping = max(
    hierarchical_candidate_mappings,
    key=lambda mapping: (
        comparison_df['Hierarchical_Label'].map(mapping) == comparison_df['label']
    ).sum(),
)
comparison_df['Hierarchical_Label_Categorized'] = (
    comparison_df['Hierarchical_Label'].map(hierarchical_label_mapping)
)

# Confusion Matrix for Hierarchical Clustering vs. Actual Labels.
# Rows are true labels, columns are mapped clusters; both follow `class_names`.
conf_matrix_hierarchical = confusion_matrix(
    comparison_df['label'],
    comparison_df['Hierarchical_Label_Categorized'],
    labels=class_names,
)
# Column captions show the raw cluster id together with the class it was mapped to.
hierarchical_cluster_of = {
    name: cluster_id for cluster_id, name in hierarchical_label_mapping.items()
}
conf_matrix_df_hierarchical = pd.DataFrame(
    conf_matrix_hierarchical,
    index=class_names,
    columns=[f'Cluster {hierarchical_cluster_of[name]} ({name})' for name in class_names],
)

# Plot confusion matrix for Hierarchical clustering
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df_hierarchical, annot=True, fmt='d', cmap='Blues', linewidths=0.5)
plt.title('Confusion Matrix: Hierarchical Clustering vs Actual Labels')
plt.xlabel('Cluster')
plt.ylabel('True Label')
plt.show()