## Clustering
### Author: Jennifer Nguyen
### UH ID: 2381357

In [35]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from collections import Counter

In [36]:
# Load the clinical records dataset; the path is relative to the notebook's working directory.
records = pd.read_csv('clinical_records_dataset.csv')

def prepare_data(records):
    """Split the records table into clustering features and ground-truth labels.

    Parameters
    ----------
    records : pandas.DataFrame
        Table containing at least the 'time' and 'DEATH_EVENT' columns.

    Returns
    -------
    tuple
        ``(X, y_true)`` where ``X`` is ``records`` without the 'time' and
        'DEATH_EVENT' columns and ``y_true`` holds the 'DEATH_EVENT' values.
    """
    target_column = 'DEATH_EVENT'
    excluded_columns = ['time', target_column]

    labels = records[target_column].values
    features = records.drop(columns=excluded_columns)
    return features, labels

---

### Function for MinMax feature normalization
The input `x` is the raw data in a 2-D array of the shape `(number of data points, number of features)`.

The output `x_norm` is the normalized data of the input `x` with the same shape as the input.

This function will be used for normalizing data before using DBSCAN for clustering.


In [37]:
def feature_norm(x):
    """MinMax-normalize every feature (column) of ``x``.

    ``x`` is a 2-D array of the shape (number of data points, number of features).
    The result has the same shape as the input; each column is shifted by its
    minimum and divided by its (max - min) range.
    """
    # Machine epsilon is added to the range so constant columns divide by a
    # tiny positive number instead of zero.
    tiny = np.finfo(float).eps

    col_min = np.expand_dims(x.min(0), axis=0)
    col_span = np.expand_dims(x.max(0) - x.min(0), axis=0)

    shifted = x - col_min
    return shifted / (col_span + tiny)


---

### **Task 1:** Function for computing purity
This is your function of purity.

The indices of the clusters in `y_true` and `y_pred` start from 0 in `compute_purity`, e.g., [1, 1, 0, 0, 2, 2, 2].

`y_true` is the array of true class indices (the ground-truth class labels) of all data points, `len(y_true) = number of data points`.

`y_pred` is the array of cluster indices (the predicted cluster labels) of all data points, `len(y_pred) = number of data points`.

`purity` is a number between 0 and 1, with 1 being the best possible purity score.

In [38]:
def compute_purity(y_true, y_pred):
    """Compute the purity of a clustering against ground-truth class labels.

    For every distinct value in ``y_pred`` the most frequent true label among
    that cluster's points is counted as "correct"; purity is the total of those
    counts divided by the number of data points.

    Parameters
    ----------
    y_true : array-like
        True class label of every data point.
    y_pred : array-like
        Cluster label of every data point, same length as ``y_true``.

    Returns
    -------
    float
        Purity between 0 and 1 (1 is best). Returns 0.0 for empty input.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different lengths.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if len(y_true) != len(y_pred):
        raise ValueError("ERROR: The length of y_true and y_pred must equal!")

    n_points = len(y_true)
    if n_points == 0:
        # Nothing to score: report 0 rather than dividing by zero below.
        return 0.0

    total_correct = 0
    # Every value returned by np.unique occurs at least once in y_pred, so each
    # cluster is non-empty and max() always has something to work with.
    for cluster in np.unique(y_pred):
        cluster_true_labels = y_true[y_pred == cluster]
        total_correct += max(Counter(cluster_true_labels).values())

    return total_correct / n_points



Testing if function works w/ example from Task 1 Assignment

In [39]:
# Sanity check using the worked example from the Task 1 assignment.
# The majority-label counts of the three clusters are 3, 5 and 4, so the
# expected purity is 12 / 16 = 0.75.
print("Test 1: Assignment example")
y_true_test = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]
y_pred_test = [2, 2, 1, 2, 2, 2, 0, 0, 0, 1, 2, 1, 1, 1, 1, 1]
purity_score = compute_purity(y_true_test, y_pred_test)
print(f"Purity score: {purity_score:.3f}")

# Fail loudly if compute_purity ever regresses instead of only printing.
assert np.isclose(purity_score, 0.75), f"expected 0.750, got {purity_score:.3f}"

Test 1: Assignment example
Purity score: 0.750


---

### **Task 2:** Running K-means
- k = 2
- Calculate percentage of points in each cluster
- Calculate overall purity score
- Calculate purity for each individual cluster
- Determine which cluster has highest purity


In [40]:
def run_kmeans_analysis(X, y_true, n_clusters=2, random_state=42):
    """Cluster ``X`` with K-means and summarize the result against ``y_true``.

    Parameters
    ----------
    X : array-like of shape (number of data points, number of features)
        Data passed to ``KMeans.fit_predict``.
    y_true : array-like
        True class label of every data point.
    n_clusters : int, default 2
        Number of K-means clusters.
    random_state : int or None, default 42
        Seed forwarded to ``KMeans``.

    Returns
    -------
    dict
        'cluster_percentages': {"Cluster <id>": "<share>%"} for non-empty clusters,
        'overall_purity': purity of the whole clustering,
        'cluster_purities': {"Cluster <id>": purity of that cluster},
        'predictions': the cluster label of every data point.
    """
    # Boolean-mask indexing below needs an array, not a plain list.
    y_true = np.asarray(y_true)

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    y_pred = kmeans.fit_predict(X)
    total_points = len(y_pred)

    # Walk the cluster ids in ascending order so both dictionaries are reported
    # in a stable order, independent of which label happens to appear first.
    cluster_percentages = {}
    cluster_purities = {}
    for cluster_id in range(n_clusters):
        cluster_mask = (y_pred == cluster_id)
        cluster_size = int(np.count_nonzero(cluster_mask))
        if cluster_size == 0:
            continue

        label = f"Cluster {cluster_id}"
        cluster_percentages[label] = f"{(cluster_size/total_points)*100:.1f}%"
        # Purity restricted to one cluster is its majority-label share, which is
        # exactly what compute_purity returns for a single-cluster subset.
        cluster_purities[label] = compute_purity(y_true[cluster_mask], y_pred[cluster_mask])

    # Calculate overall purity
    overall_purity = compute_purity(y_true, y_pred)

    return {
        'cluster_percentages': cluster_percentages,
        'overall_purity': overall_purity,
        'cluster_purities': cluster_purities,
        'predictions': y_pred
    }

# Task 2 driver: cluster the prepared features with k=2 and report the summary.
X, y_true = prepare_data(records)
results = run_kmeans_analysis(X, y_true)

# Results:
print("\nK-means Clustering Results (k=2):")

print("\nPercentage of points in each cluster:")
for cluster, percentage in results['cluster_percentages'].items():
    print(f"{cluster}: {percentage}")

print(f"\nOverall purity score: {results['overall_purity']:.3f}")

print("\nPurity score for each cluster:")
for cluster, purity in results['cluster_purities'].items():
    print(f"{cluster}: {purity:.3f}")

# (label, purity) pair of the purest cluster.
highest_purity_cluster = max(results['cluster_purities'].items(), key=lambda item: item[1])
# The dictionary keys already read "Cluster <id>", so print the key as-is
# instead of prefixing it with "Cluster" a second time.
print(f"\n{highest_purity_cluster[0]} has the highest purity: {highest_purity_cluster[1]:.3f}")


K-means Clustering Results (k=2):

Percentage of points in each cluster:
Cluster 0: 78.3%
Cluster 1: 21.7%

Overall purity score: 0.679

Purity score for each cluster:
Cluster 0: 0.692
Cluster 1: 0.631

Cluster Cluster 0 has the highest purity: 0.692


---

### **Task 3:** Experimenting with Different Values of k
- k = 2, 10, 30, 50, 100
- Run K-means 10x for each value
- Compute average purity of clustering over the 10 runs
- Compute average Silhouette coefficient of clustering over the 10 runs
- Silhouette coefficient is computed using Euclidean distance

In [41]:
def run_multiple_kmeans(X, y_true, k_values, n_runs=10, random_state=42):
    """Average purity and silhouette coefficient of K-means over repeated runs.

    Parameters
    ----------
    X : array-like of shape (number of data points, number of features)
        Data to cluster.
    y_true : array-like
        True class label of every data point.
    k_values : iterable of int
        Numbers of clusters to try.
    n_runs : int, default 10
        Number of K-means runs per value of k.
    random_state : int or None, default 42
        Base seed. Run ``i`` uses ``random_state + i`` so the runs differ from
        each other but the averages are reproducible. Pass ``None`` for
        unseeded (non-reproducible) runs.

    Returns
    -------
    pandas.DataFrame
        One row per k with columns 'k', 'Purity' and 'Silhouette\\nCoefficient'
        (the last two are means over the runs).
    """
    results = {
        'k': [],
        'Purity': [],
        'Silhouette\nCoefficient': []
    }

    for k in k_values:
        purities = []
        silhouettes = []

        for run_idx in range(n_runs):
            # A distinct but deterministic seed per run keeps the experiment
            # repeatable under Restart & Run All.
            seed = None if random_state is None else random_state + run_idx
            kmeans = KMeans(n_clusters=k, random_state=seed)
            y_pred = kmeans.fit_predict(X)

            purities.append(compute_purity(y_true, y_pred))
            silhouettes.append(silhouette_score(X, y_pred, metric='euclidean'))

        results['k'].append(k)
        results['Purity'].append(np.mean(purities))
        results['Silhouette\nCoefficient'].append(np.mean(silhouettes))

    return pd.DataFrame(results)

# Task 3 driver: sweep k and average both metrics over repeated K-means runs.
k_values = [2, 10, 30, 50, 100]
results_df = run_multiple_kmeans(X, y_true, k_values)

# Show the summary table with three-decimal metrics and centered cells/headers.
print("\nK-Means Experiments Results:")
styled_results = results_df.style.format({
    'Purity': '{:.3f}',
    'Silhouette\nCoefficient': '{:.3f}'
})
styled_results = styled_results.set_properties(**{'text-align': 'center'})
styled_results = styled_results.set_table_styles([
    {'selector': 'th', 'props': [('text-align', 'center')]}
])
display(styled_results)

# Value of k that maximizes each metric.
purity_argmax = results_df['Purity'].idxmax()
silhouette_argmax = results_df['Silhouette\nCoefficient'].idxmax()
best_k_purity = results_df.loc[purity_argmax, 'k']
best_k_silhouette = results_df.loc[silhouette_argmax, 'k']

print(f"\nBest k for Purity: {best_k_purity}")
print(f"Best k for Silhouette Coefficient: {best_k_silhouette}")

# Step-to-step change in average purity; the first row has no predecessor and is skipped.
purity_changes = results_df['Purity'].diff()
print("\nPurity changes with increasing k:")
for row_pos in range(1, len(results_df)):
    k = results_df['k'].iloc[row_pos]
    change = purity_changes.iloc[row_pos]
    print(f"k={k}: {change:+.3f}")


K-Means Experiments Results:


Unnamed: 0,k,Purity,Silhouette Coefficient
0,2,0.679,0.568
1,10,0.685,0.581
2,30,0.7,0.553
3,50,0.722,0.562
4,100,0.765,0.513



Best k for Purity: 100
Best k for Silhouette Coefficient: 10

Purity changes with increasing k:
k=10: +0.006
k=30: +0.015
k=50: +0.021
k=100: +0.043


---

### **Task 4:** DBSCAN Experiments
- Apply MinMax normalization
- Run DBSCAN on normalized data:
    - eps = 0.3, 0.5, 0.7
    - fix minPts = 5
    - metric = Euclidean distance
- Count total number of clusters
- Count total number of anomalies 
- Calculate purity of clustering

In [43]:
def run_dbscan_experiments(X, y_true, eps_values, min_pts):
    """Run DBSCAN (Euclidean metric) for several ``eps`` values and summarize each run.

    Parameters
    ----------
    X : array-like of shape (number of data points, number of features)
        Data to cluster.
    y_true : array-like
        True class label of every data point.
    eps_values : iterable of float
        Neighborhood radii to try.
    min_pts : int
        Value passed to DBSCAN as ``min_samples``.

    Returns
    -------
    pandas.DataFrame
        One row per eps with columns 'eps', 'Number of\\nClusters',
        'Number of\\nAnomalies' and 'Purity'. Purity is computed over the
        clustered points only (points labelled -1 are excluded) and is 0.0 when
        no cluster was found.
    """
    # Boolean-mask indexing below needs an array, not a plain list.
    y_true = np.asarray(y_true)

    results = {
        'eps': [],
        'Number of\nClusters': [],
        'Number of\nAnomalies': [],
        'Purity': []
    }

    for eps in eps_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_pts, metric='euclidean')
        y_pred = dbscan.fit_predict(X)

        # DBSCAN labels anomalies (noise) as -1; everything else is a cluster id.
        clustered = (y_pred != -1)
        n_clusters = len(np.unique(y_pred[clustered]))
        n_anomalies = int(np.count_nonzero(~clustered))

        # At least one cluster implies at least one clustered point, so a
        # single check is enough before calling compute_purity.
        if n_clusters > 0:
            purity = compute_purity(y_true[clustered], y_pred[clustered])
        else:
            purity = 0.0

        results['eps'].append(eps)
        results['Number of\nClusters'].append(n_clusters)
        results['Number of\nAnomalies'].append(n_anomalies)
        results['Purity'].append(purity)

    return pd.DataFrame(results)

# Task 4 driver: MinMax-normalize the features, then sweep eps for DBSCAN.
X_normalized = feature_norm(X)

eps_values = [0.3, 0.5, 0.7]
min_pts = 5

results_df = run_dbscan_experiments(X_normalized, y_true, eps_values, min_pts)

# The count columns have a newline in their labels. Keeping the labels in
# variables avoids backslashes inside f-string replacement fields, which are a
# SyntaxError before Python 3.12.
clusters_col = 'Number of\nClusters'
anomalies_col = 'Number of\nAnomalies'

# Results:
print("\nDBSCAN Experiments Results:")
display(results_df.style.format({
    'eps': '{:.1f}',
    clusters_col: '{:d}',
    anomalies_col: '{:d}',
    'Purity': '{:.3f}'
}).set_properties(**{'text-align': 'center'})
.set_table_styles([
    {'selector': 'th', 'props': [('text-align', 'center')]}
]))

# Find best eps for purity
best_eps = results_df.loc[results_df['Purity'].idxmax(), 'eps']
best_purity = results_df['Purity'].max()
print(f"\nBest eps for Purity: {best_eps:.1f} (Purity: {best_purity:.3f})")

print("\nDetailed Analysis:")
n_points = len(X)
# Iterate column-wise rather than with iterrows(): iterrows() builds one
# mixed-dtype Series per row, which turns the integer counts into floats
# (printed as "18.0" instead of "18").
for row_eps, n_clusters, n_anomalies, row_purity in zip(
    results_df['eps'],
    results_df[clusters_col],
    results_df[anomalies_col],
    results_df['Purity'],
):
    print(f"\neps = {row_eps:.1f}:")
    print(f"- Number of Clusters: {n_clusters}")
    print(f"- Number of Anomalies: {n_anomalies} "
          f"({n_anomalies/n_points*100:.1f}% of data)")
    print(f"- Purity: {row_purity:.3f}")


DBSCAN Experiments Results:


Unnamed: 0,eps,Number of Clusters,Number of Anomalies,Purity
0,0.3,18,146,0.778
1,0.5,22,21,0.701
2,0.7,22,13,0.703



Best eps for Purity: 0.3 (Purity: 0.778)

Detailed Analysis:

eps = 0.3:
- Number of Clusters: 18.0
- Number of Anomalies: 146.0 (48.8% of data)
- Purity: 0.778

eps = 0.5:
- Number of Clusters: 22.0
- Number of Anomalies: 21.0 (7.0% of data)
- Purity: 0.701

eps = 0.7:
- Number of Clusters: 22.0
- Number of Anomalies: 13.0 (4.3% of data)
- Purity: 0.703
