In [None]:
%pip install kmodes
%pip install scikit-learn
%pip install gower
%pip install seaborn
%pip install matplotlib
%pip install pandas
%pip install numpy
%pip install scipy

In [4]:
## Import required packages ##

import gower
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from kmodes.kprototypes import KPrototypes
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
## Load the dataset ##

# Read the preprocessed table; the clustering feature subset is cut from this frame.
data = pd.read_csv("preprocessed_data.csv")

In [4]:
## Clustering subset ##

# Identifier column followed by the feature columns kept for clustering
# (the order here is the column order of the saved CSV).
selected_columns = (
    ["subj_num", "gender", "education", "income", "SVO_score"]
    + ["TICS_total", "BIS_total", "BIS15_total"]
    + ["STAI_moment", "STAI_general", "e_score", "SDS_score"]
    + ["BAS_drive", "BAS_fun", "BAS_reward"]
    + ["BFI_E", "BFI_A", "BFI_C", "BFI_N", "BFI_O"]
)

# Restrict the full table to exactly those columns
data_subset = data.loc[:, selected_columns]

# Persist the subset; the clustering cells reload it from this file
data_subset.to_csv("cluster_features.csv", index=False)


In [None]:
## Check for missing values in data_subset ##

# Number of missing entries in each column of the clustering subset
missing_per_column = data_subset.isna().sum()
print(missing_per_column)

In [6]:
## Preprocess the data for clustering ##

# Reload the feature subset from disk
df = pd.read_csv("cluster_features.csv")

# Keep the participant IDs aside so they are neither scaled nor encoded
subject_ids = df['subj_num']

# Feature groups: three columns are treated as categorical, every other
# column except the ID is treated as numerical
categorical = ['gender', 'education', 'SVO_score']
numerical = [col for col in df.columns if col not in categorical + ['subj_num']]

# Numerical features: fill gaps with the column mean, then standardise
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill gaps with the most frequent level, then one-hot
# encode (first level dropped, dense output)
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numerical),
    ('cat', categorical_pipeline, categorical)
])

# Fit & transform
X_processed = preprocessor.fit_transform(df)

# Label the transformed columns with the names the transformer generates.
# Without this the saved CSV only has anonymous integer headers (0, 1, 2, ...),
# so nothing downstream can tell which feature a column holds.
feature_names = preprocessor.get_feature_names_out()
df_final = pd.DataFrame(X_processed, columns=feature_names, index=df.index)

# Put the participant ID back as the first column
df_final.insert(0, 'subj_num', subject_ids)

# Save final file
df_final.to_csv("clustering_input_with_id.csv", index=False)


In [None]:
## Some EDA ##

# 1. Histogram with a KDE overlay for every numerical feature
for feature in numerical:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.histplot(data_subset[feature], kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")
    fig.tight_layout()
    plt.show()

# 2. Level frequencies for the categorical features
for feature in ('gender', 'education', 'SVO_score'):
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.countplot(data=data_subset, x=feature, ax=ax)
    ax.set_title(f"Count of {feature}")
    fig.tight_layout()
    plt.show()


# 3. Pairwise correlations between the numerical features
corr = data_subset[numerical].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True, ax=ax)
ax.set_title("Correlation Heatmap (Numerical Features)")
fig.tight_layout()
plt.show()

In [None]:
## Optimum number of clusters for KMeans using Elbow and Silhouette methods ##

# Preprocessed feature matrix (everything except the participant ID)
df = pd.read_csv("clustering_input_with_id.csv")
X = df.drop(columns=['subj_num'])

# Candidate k values: 2 to 10 inclusive (range() excludes its stop value)
k_range = range(2, 11)

# Fit each model once and read both criteria from that single fit;
# refitting with the same random_state would only repeat the same work.
inertias = []
silhouettes = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=42).fit(X)
    inertias.append(km.inertia_)
    silhouettes.append(silhouette_score(X, km.labels_))

# Elbow method: inertia against k
plt.figure(figsize=(6, 4))
plt.plot(k_range, inertias, marker='o')
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.title("Elbow Method for KMeans")
plt.grid(True)
plt.tight_layout()
plt.show()

# Silhouette scores for the same fits
for k, score in zip(k_range, silhouettes):
    print(f"k={k} → silhouette score: {score:.3f}")

In [None]:
## Optimum number of clusters for KPrototypes using Elbow and Silhouette methods ##

# Prepare data for KPrototypes
df = pd.read_csv("cluster_features.csv")
categorical = ['gender', 'education', 'SVO_score']
numerical = [col for col in df.columns if col not in categorical + ['subj_num']]

# Numerical: mean-impute, then standardise
num_imputer = SimpleImputer(strategy='mean')
X_num = pd.DataFrame(num_imputer.fit_transform(df[numerical]), columns=numerical)
scaler = StandardScaler()
X_num_scaled = pd.DataFrame(scaler.fit_transform(X_num), columns=numerical)

# Categorical: impute with the most frequent level, keep the levels unencoded
cat_imputer = SimpleImputer(strategy='most_frequent')
X_cat = pd.DataFrame(cat_imputer.fit_transform(df[categorical]), columns=categorical)

# Combined matrix and the positions of its categorical columns
X_full = pd.concat([X_num_scaled, X_cat], axis=1)
X_matrix = X_full.to_numpy()
cat_indices = [X_full.columns.get_loc(col) for col in categorical]

# Gower distances for the silhouette, built from the same imputed features that
# are clustered so that missing values cannot leak into the distance matrix.
# The categorical columns are flagged explicitly: gower's auto-detection goes by
# dtype, so numerically coded categories would otherwise be treated as numbers.
gower_matrix = gower.gower_matrix(
    X_full,
    cat_features=[col in categorical for col in X_full.columns],
)

# Candidate k values: 2 to 10 inclusive (range() excludes its stop value)
k_range = range(2, 11)

# Fit each model once and record both the cost and the Gower silhouette
costs = []
gower_silhouettes = []
for k in k_range:
    kproto = KPrototypes(n_clusters=k, init='Cao', random_state=42, verbose=0)
    labels = kproto.fit_predict(X_matrix, categorical=cat_indices)
    costs.append(kproto.cost_)
    gower_silhouettes.append(silhouette_score(gower_matrix, labels, metric="precomputed"))

# Elbow method: K-Prototypes cost against k
plt.figure(figsize=(6, 4))
plt.plot(k_range, costs, marker='o')
plt.xlabel("Number of clusters (k)")
plt.ylabel("Cost")
plt.title("Elbow Method for KPrototypes")
plt.grid(True)
plt.tight_layout()
plt.show()

# Silhouette scores using Gower distance
for k, sil in zip(k_range, gower_silhouettes):
    print(f"k={k} → Gower silhouette score: {sil:.3f}")

In [None]:
## optimum number of clusters for agglomerative clustering using Elbow and Silhouette methods using Gower distance ##

# Build the Gower distance matrix here so this cell runs on a fresh kernel
# (it previously relied on `gower_dist` from a cell further down the notebook).
df = pd.read_csv("cluster_features.csv")
gower_dist = gower.gower_matrix(df.drop(columns=['subj_num']))

# linkage() interprets a 2-D array as raw observation vectors and would compute
# Euclidean distances between the ROWS of the distance matrix. A precomputed
# distance matrix has to be passed in condensed (1-D) form instead.
# checks=False: skip the exact-symmetry validation in squareform.
condensed_dist = squareform(gower_dist, checks=False)

# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances; Gower distances are not guaranteed to be Euclidean, so consider
# 'average' or 'complete' here.
Z = linkage(condensed_dist, method='ward')

# Plot dendrogram (visual elbow)
plt.figure(figsize=(10, 5))
dendrogram(Z, truncate_mode='lastp', p=30, no_labels=True)
plt.title("Dendrogram - Hierarchical Clustering")
plt.xlabel("Sample Index / Cluster Size")
plt.ylabel("Distance")
plt.tight_layout()
plt.show()

# Silhouette scores for k=2 to 10 inclusive (range() excludes its stop value),
# evaluated on the square Gower matrix

for k in range(2, 11):
    labels = fcluster(Z, t=k, criterion='maxclust')
    sil = silhouette_score(gower_dist, labels, metric="precomputed")
    print(f"k={k} → Gower silhouette score: {sil:.3f}")

In [None]:
## Clustering with KMeans ##

df = pd.read_csv("clustering_input_with_id.csv")

# The participant ID is kept aside; every other column is a feature
subj_ids = df['subj_num']
X = df.drop(columns=['subj_num'])

# Number of clusters picked from the elbow plot and silhouette scores
best_k = 3
kmeans = KMeans(n_clusters=best_k, random_state=42)
clusters = kmeans.fit(X).labels_

# Attach the labels and save one (ID, cluster) row per participant
df = df.assign(cluster=clusters)
df.loc[:, ['subj_num', 'cluster']].to_csv("kmeans_cluster_assignments.csv", index=False)


In [None]:
## clustering with KPrototypes ##

df = pd.read_csv("cluster_features.csv")

# Participant IDs, kept out of the feature matrix
ids = df['subj_num']

# Three columns are categorical; all remaining non-ID columns are numerical
categorical = ['gender', 'education', 'SVO_score']
numerical = [col for col in df.columns if col not in categorical + ['subj_num']]

# Numerical block: mean imputation followed by standardisation
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
X_num_scaled = pd.DataFrame(numeric_steps.fit_transform(df[numerical]), columns=numerical)

# Categorical block: most-frequent imputation, levels left unencoded
X_cat = pd.DataFrame(
    SimpleImputer(strategy='most_frequent').fit_transform(df[categorical]),
    columns=categorical,
)

# Numerical columns first, categorical columns last
X_full = pd.concat([X_num_scaled, X_cat], axis=1)

# Plain NumPy matrix handed to KPrototypes
X_matrix = X_full.to_numpy()

# Column positions of the categorical features inside X_matrix
cat_indices = [position for position, name in enumerate(X_full.columns) if name in categorical]

# Fit K-Prototypes with three clusters and get one label per row
kproto = KPrototypes(n_clusters=3, init='Cao', verbose=1, random_state=42)
clusters = kproto.fit_predict(X_matrix, categorical=cat_indices)

# Attach the labels and save one (ID, cluster) row per participant
df = df.assign(kproto_cluster=clusters)
df.loc[:, ['subj_num', 'kproto_cluster']].to_csv("kproto_assignments.csv", index=False)


In [None]:
## clustering with agglomerative clustering ##


df = pd.read_csv("cluster_features.csv")
subject_ids = df['subj_num']
X = df.drop(columns=['subj_num'])

# Compute the square (n x n) Gower distance matrix
gower_dist = gower.gower_matrix(X)

# Hierarchical clustering.
# linkage() interprets a 2-D array as raw observation vectors and would compute
# Euclidean distances between the ROWS of the distance matrix. A precomputed
# distance matrix has to be passed in condensed (1-D) form instead.
# checks=False: skip the exact-symmetry validation in squareform.
condensed_dist = squareform(gower_dist, checks=False)

# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances; Gower distances are not guaranteed to be Euclidean.
Z = linkage(condensed_dist, method='ward')  # or try 'average', 'complete'

# Dendrogram (truncated to the last 30 merges)
plt.figure(figsize=(10, 5))
dendrogram(Z, no_labels=True, truncate_mode='lastp', p=30)
plt.title("Dendrogram - Agglomerative Clustering (Gower)")
plt.xlabel("Sample Index / Cluster Size")
plt.ylabel("Distance")
plt.tight_layout()
plt.show()

# Cut the tree into a fixed number of clusters
n_clusters = 3
labels = fcluster(Z, t=n_clusters, criterion='maxclust')

# Save assignments
df_result = pd.DataFrame({
    'subj_num': subject_ids,
    'agglo_cluster': labels - 1  # subtract 1 for 0-based labels
})
df_result.to_csv("agglo_cluster_assignments.csv", index=False)
print("✅ Saved to 'agglo_cluster_assignments.csv'")

In [None]:
## plot cluster distributions (cluster counts) for each method ##


def plot_cluster_sizes(assignments_csv, label_col, title_suffix, report_name):
    """Plot and print the number of participants per cluster for one method.

    Parameters
    ----------
    assignments_csv : str
        Path of the CSV holding the cluster assignments.
    label_col : str
        Name of the column in that file containing the cluster label.
    title_suffix : str
        Method name appended to the figure title.
    report_name : str
        Method name used in the printed heading.

    Returns
    -------
    pandas.Series
        Cluster sizes indexed by cluster label, sorted by label.
    """
    assignments = pd.read_csv(assignments_csv)

    # Count members per cluster straight from the assignment file;
    # no feature columns are needed for a size count.
    counts = assignments[label_col].value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(x=counts.index, y=counts.values, palette='Set2', ax=ax)
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Number of Participants")
    ax.set_title(f"Cluster Size Distribution - {title_suffix}")
    fig.tight_layout()
    plt.show()

    print(f"Cluster sizes for {report_name}:")
    print(counts)
    return counts


# cluster size distribution for kmeans
cluster_counts = plot_cluster_sizes(
    "kmeans_cluster_assignments.csv", "cluster", "KMeans", "KMeans"
)

# cluster size distribution for kprototypes
cluster_counts1 = plot_cluster_sizes(
    "kproto_assignments.csv", "kproto_cluster", "KPrototypes", "KPrototypes"
)

# cluster size distribution for agglomerative clustering
cluster_counts2 = plot_cluster_sizes(
    "agglo_cluster_assignments.csv", "agglo_cluster", "Agglomerative", "Agglomerative Clustering"
)

In [None]:
## visualize k-means clusters ##

# Join the preprocessed features with the K-Means labels on the participant ID
df_features = pd.read_csv("clustering_input_with_id.csv")
df_clusters = pd.read_csv("kmeans_cluster_assignments.csv")
df = df_features.merge(df_clusters, on="subj_num")

# Feature matrix: every column except the ID and the cluster label
X = df.drop(columns=["subj_num", "cluster"])

# --- PCA: linear projection onto the first two components ---
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df['cluster'], palette='Set2', s=60, ax=ax)
ax.set_title("PCA Projection - KMeans")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.legend(title="Cluster")
fig.tight_layout()
plt.show()

# --- t-SNE: nonlinear embedding that emphasises local structure ---
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X)

fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1], hue=df['cluster'], palette='Set2', s=60, ax=ax)
ax.set_title("t-SNE Projection of Clusters - KMeans")
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
ax.legend(title="Cluster")
fig.tight_layout()
plt.show()


In [None]:
## visualize k-prototypes clusters ##

# Load and merge
df_clusters = pd.read_csv("kproto_assignments.csv")
df_features = pd.read_csv("cluster_features.csv")
df = df_features.merge(df_clusters, on="subj_num")

# Feature groups used by both projections
categorical = ['gender', 'education', 'SVO_score']
numerical_cols = [
    'income', 'TICS_total', 'BIS_total', 'BIS15_total',
    'STAI_moment', 'STAI_general', 'e_score', 'SDS_score',
    'BAS_drive', 'BAS_fun', 'BAS_reward',
    'BFI_E', 'BFI_A', 'BFI_C', 'BFI_N', 'BFI_O'
]

# Impute the same way as the K-Prototypes clustering cell (mean / most frequent).
# StandardScaler lets NaNs through and PCA rejects them, so without this step
# any missing value would make the PCA below fail.
X_num = pd.DataFrame(
    SimpleImputer(strategy='mean').fit_transform(df[numerical_cols]),
    columns=numerical_cols,
)
X_cat = pd.DataFrame(
    SimpleImputer(strategy='most_frequent').fit_transform(df[categorical]),
    columns=categorical,
)

# Gower distances computed from this cell's merged frame, so the rows are
# guaranteed to line up with df['kproto_cluster'] and the cell no longer depends
# on a `gower_matrix` left behind by another cell. Categorical columns are
# flagged explicitly because gower's auto-detection goes by dtype.
gower_input = pd.concat([X_num, X_cat], axis=1)
gower_matrix = gower.gower_matrix(
    gower_input,
    cat_features=[col in categorical for col in gower_input.columns],
)

# t-SNE visualization on the precomputed distances
# (init='random' is set because the input is a distance matrix, not coordinates)
tsne = TSNE(metric='precomputed', init='random', random_state=42)
X_embedded = tsne.fit_transform(gower_matrix)

plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=df['kproto_cluster'], palette='Set2', s=60)
plt.title("t-SNE Projection of K-Prototypes Clusters (Gower Distance)")
plt.xlabel("t-SNE 1")
plt.ylabel("t-SNE 2")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()


## PCA visualization (numerical features only)

# Standardize the imputed numerical features
X_scaled = StandardScaler().fit_transform(X_num)

# Run PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Plot
plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df['kproto_cluster'], palette='Set2', s=60)
plt.title("PCA Projection - K-Prototypes")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()

In [None]:
## Visualize agglomerative clusters with PCA ##

# Merge cluster assignments with original features
df_agglo = pd.read_csv("agglo_cluster_assignments.csv")
df_features = pd.read_csv("cluster_features.csv")
df = df_features.merge(df_agglo, on="subj_num")

# Select numerical columns for PCA
numerical_cols = [
    'income', 'TICS_total', 'BIS_total', 'BIS15_total',
    'STAI_moment', 'STAI_general', 'e_score', 'SDS_score',
    'BAS_drive', 'BAS_fun', 'BAS_reward',
    'BFI_E', 'BFI_A', 'BFI_C', 'BFI_N', 'BFI_O'
]

# Mean-impute before scaling, as the clustering cells do. StandardScaler lets
# NaNs through and PCA rejects them, so a single missing value would otherwise
# make this cell fail.
X_num = pd.DataFrame(
    SimpleImputer(strategy='mean').fit_transform(df[numerical_cols]),
    columns=numerical_cols,
)

# Standardize numerical features
X_scaled = StandardScaler().fit_transform(X_num)

# Run PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Plot
plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df['agglo_cluster'], palette='Set2', s=60)
plt.title("PCA Projection - Hierarchical Clustering")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()


In [None]:
## K-Means Silhouette Score ##

# Join the K-Means labels onto the preprocessed feature matrix by participant ID
df_clusters = pd.read_csv("kmeans_cluster_assignments.csv")
df_features = pd.read_csv("clustering_input_with_id.csv")
df = df_features.merge(df_clusters, on="subj_num")

# Split the merged frame into labels and pure feature columns
labels = df["cluster"]
X = df.drop(["subj_num", "cluster"], axis=1)

# Mean silhouette coefficient over all participants
score = silhouette_score(X, labels)
print(f"Silhouette Score for K-Means: {score:.3f}")

In [None]:
## Calculate Gower-based silhouette scores for K-Prototypes clusters ##


# Merge the K-Prototypes labels onto the raw feature subset
df_labels = pd.read_csv("kproto_assignments.csv")
df_features = pd.read_csv("cluster_features.csv")
df = df_features.merge(df_labels, on="subj_num")

# Same feature split and imputation as the K-Prototypes clustering cell, so the
# silhouette is evaluated on the data that was actually clustered and missing
# values cannot end up in the distance matrix.
categorical = ['gender', 'education', 'SVO_score']
numerical = [col for col in df_features.columns if col not in categorical + ['subj_num']]
X_num = pd.DataFrame(
    SimpleImputer(strategy='mean').fit_transform(df[numerical]),
    columns=numerical,
)
X_cat = pd.DataFrame(
    SimpleImputer(strategy='most_frequent').fit_transform(df[categorical]),
    columns=categorical,
)
gower_input = pd.concat([X_num, X_cat], axis=1)

# Flag the categorical columns explicitly: gower's auto-detection goes by dtype,
# so numerically coded categories would otherwise be treated as numbers.
gower_matrix = gower.gower_matrix(
    gower_input,
    cat_features=[col in categorical for col in gower_input.columns],
)
labels = df["kproto_cluster"]

# Per-participant silhouette on the precomputed distances
sil_samples = silhouette_samples(gower_matrix, labels, metric="precomputed")
df["gower_silhouette"] = sil_samples

# Print average silhouette score
print(f"Average Gower Silhouette Score: {sil_samples.mean():.3f}")


In [None]:
## Silhouette Score from PCA Space ##

# Build everything this cell needs from the saved files instead of relying on
# `df` and `X_pca` left in the kernel by other cells. All tables are merged on
# subj_num into one frame so features and both label sets share one row order.
df_features = pd.read_csv("cluster_features.csv")
df_kproto = pd.read_csv("kproto_assignments.csv")  # subj_num + kproto_cluster
df_kmeans = (
    pd.read_csv("kmeans_cluster_assignments.csv")  # subj_num + cluster (K-means)
    .rename(columns={'cluster': 'kmeans_cluster'})  # rename by name, not by position
)
df = df_features.merge(df_kproto, on="subj_num").merge(df_kmeans, on="subj_num")
df_compare = df[['subj_num', 'kproto_cluster', 'kmeans_cluster']]

# PCA space: first two components of the mean-imputed, standardised numerical features
numerical_cols = [
    'income', 'TICS_total', 'BIS_total', 'BIS15_total',
    'STAI_moment', 'STAI_general', 'e_score', 'SDS_score',
    'BAS_drive', 'BAS_fun', 'BAS_reward',
    'BFI_E', 'BFI_A', 'BFI_C', 'BFI_N', 'BFI_O'
]
X_imputed = SimpleImputer(strategy='mean').fit_transform(df[numerical_cols])
X_scaled = StandardScaler().fit_transform(X_imputed)
X_pca = PCA(n_components=2).fit_transform(X_scaled)

## kprototypes

# Compute silhouette scores from PCA space
labels = df['kproto_cluster']
sil_pca = silhouette_samples(X_pca, labels)
avg_sil_pca = silhouette_score(X_pca, labels)

# Add to dataframe
df['pca_silhouette'] = sil_pca

# Plot the per-participant coefficients by cluster; the dashed line marks 0
plt.figure(figsize=(8, 5))
sns.boxplot(data=df, x='kproto_cluster', y='pca_silhouette', palette='Set2')
plt.axhline(0, color='red', linestyle='--')
plt.title("Silhouette Scores by Cluster (from PCA Space)")
plt.ylabel("Silhouette Coefficient")
plt.xlabel("Cluster")
plt.tight_layout()
plt.show()

# Print average
print(f"\n🔢 Average Silhouette Score kprototypes (PCA space): {avg_sil_pca:.3f}")


#################################################

## compare pca silhouette scores ##

# Step 1: Measure agreement between the two labelings (Adjusted Rand Index)

ari = adjusted_rand_score(df_compare['kproto_cluster'], df_compare['kmeans_cluster'])
print(f"🔁 Adjusted Rand Index (K-Means vs. K-Prototypes): {ari:.3f}")

# Step 2: Compare silhouette scores in the same PCA space

sil_kmeans = silhouette_score(X_pca, df_compare['kmeans_cluster'])
sil_kproto = avg_sil_pca  # same quantity as computed above for K-Prototypes

print(f"K-Means silhouette (PCA space): {sil_kmeans:.3f}")
print(f"K-Prototypes silhouette (PCA space): {sil_kproto:.3f}")

# Step 3: Side-by-Side PCA Plot

fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)

# K-Means
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df_compare['kmeans_cluster'],
                ax=axes[0], palette='Set1', s=60)
axes[0].set_title("K-Means Clusters (PCA)")

# K-Prototypes
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df_compare['kproto_cluster'],
                ax=axes[1], palette='Set2', s=60)
axes[1].set_title("K-Prototypes Clusters (PCA)")

plt.tight_layout()
plt.show()

In [None]:
# silhouette scores for agglomerative clustering (PCA space)

# Join the agglomerative labels onto the preprocessed features by participant ID
df_agglo = pd.read_csv("agglo_cluster_assignments.csv")
df_features = pd.read_csv("clustering_input_with_id.csv")
df = df_features.merge(df_agglo, on="subj_num")

# NOTE(review): X_pca is not computed in this cell; it is whatever PCA
# projection an earlier cell left in the kernel, and its rows are assumed to be
# in the same order as `df` — confirm before trusting these scores.
labels = df['agglo_cluster']
sil_samples = silhouette_samples(X_pca, labels)
df = df.assign(pca_silhouette=sil_samples)

# Per-participant coefficients by cluster; the dashed line marks 0
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='agglo_cluster', y='pca_silhouette', palette='Set2', ax=ax)
ax.axhline(0, color='red', linestyle='--')
ax.set_title("Silhouette Scores by Cluster (Agglomerative, PCA space)")
fig.tight_layout()
plt.show()

# Mean silhouette over all participants
avg_sil = silhouette_score(X_pca, labels)
print(f"📏 Average Silhouette Score (Agglomerative in PCA): {avg_sil:.3f}")


In [None]:
## compare agglomerative clustering with kmeans and kprototypes ##

# Load all three assignment files here so the cell does not depend on a
# `df_agglo` variable left in the kernel by another cell.
df_kproto = pd.read_csv("kproto_assignments.csv")
df_kmeans = pd.read_csv("kmeans_cluster_assignments.csv")
df_agglo = pd.read_csv("agglo_cluster_assignments.csv")

# Merge all labels on the participant ID so each row holds one participant's three labels
df_all = df_kproto.merge(df_kmeans, on="subj_num").merge(df_agglo, on="subj_num")

# Adjusted Rand Index between the agglomerative labels and each other method
ari_kproto_vs_agglo = adjusted_rand_score(df_all['kproto_cluster'], df_all['agglo_cluster'])
ari_kmeans_vs_agglo = adjusted_rand_score(df_all['cluster'], df_all['agglo_cluster'])

print(f"🔁 ARI: K-Prototypes vs Agglomerative: {ari_kproto_vs_agglo:.3f}")
print(f"🔁 ARI: K-Means vs Agglomerative: {ari_kmeans_vs_agglo:.3f}")

In [None]:
# K-Prototypes Clustering

# Join the K-Prototypes labels onto the raw feature subset by participant ID
df_clusters = pd.read_csv("kproto_assignments.csv")
df_features = pd.read_csv("cluster_features.csv")
df = df_features.merge(df_clusters, on="subj_num")

# Numerical columns summarised per cluster
traits = [
    'income', 'TICS_total', 'BIS_total', 'BIS15_total',
    'STAI_moment', 'STAI_general', 'e_score', 'SDS_score',
    'BAS_drive', 'BAS_fun', 'BAS_reward',
    'BFI_E', 'BFI_A', 'BFI_C', 'BFI_N', 'BFI_O'
]
categorical = ['gender', 'education', 'SVO_score']

# One grouping object reused for every per-cluster summary below
by_cluster = df.groupby("kproto_cluster")

# Per-cluster means, transposed so each trait is a row and each cluster a column
numerical_summary = by_cluster[traits].mean().T
print("\n=== Cluster Profile Summary (Means) ===")
print(numerical_summary)

# Share of each category level within every cluster
for col in categorical:
    print(f"\n=== {col} distribution per cluster ===")
    print(by_cluster[col].value_counts(normalize=True))

#####

# Descriptive statistics per cluster: mean and standard deviation of each trait,
# then raw counts and proportions of each category level
numerical_stats = by_cluster[traits].agg(['mean', 'std']).T
print("\n=== Numerical Feature Means and STDs by Cluster ===")
print(numerical_stats)

for col in categorical:
    level_counts = by_cluster[col].value_counts()
    level_shares = by_cluster[col].value_counts(normalize=True)
    print(f"\n=== {col} counts per cluster ===")
    print(level_counts)
    print(f"\n=== {col} proportions per cluster ===")
    print(level_shares)
