In [None]:
# Import packages

# Data managing
import numpy as np
import pandas as pd

# Machine Learning
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Plotting
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from tabulate import tabulate

# 3 METRICS METHODOLOGY

In [None]:
# Data load
# Reads the Excel workbook into `df`. Later cells expect it to contain the
# 'CellType' and 'ExperimentType' columns alongside the metric columns.
# Change the path to where the file 'tabla_6_mod.xlsx' is located.
df = pd.read_excel('tabla_6_mod.xlsx')

In [None]:
# Clustering below is distance-based, so the features are standardised first:
# each column is shifted to zero mean and scaled to unit variance (z-score).

# Remove the two label columns; everything that remains is used as a feature.
data = df.drop(columns=['CellType', 'ExperimentType'])

# Fit the scaler on the feature table, then apply it to that same table.
scaler = StandardScaler().fit(data)
scaled_data = scaler.transform(data)

In [None]:
# Check the appropriate number of clusters for K-Means and agglomerative clustering
# by computing the silhouette score for every cluster count from 2 to max_clusters.
max_clusters = 10
silhouette_scores_kmeans = []
silhouette_scores_agg = []

cluster_counts = range(2, max_clusters + 1)

for num_clusters in cluster_counts:

    # K-Means with a fixed seed and 10 initialisations.
    k_means = KMeans(n_clusters=num_clusters, random_state=0, n_init=10)
    cluster_labels_kmeans = k_means.fit_predict(scaled_data)
    silhouette_scores_kmeans.append(silhouette_score(scaled_data, cluster_labels_kmeans))

    # Ward-linkage agglomerative clustering on the same standardised data.
    agg_clustering = AgglomerativeClustering(n_clusters=num_clusters, metric='euclidean', linkage='ward')
    cluster_labels_agg = agg_clustering.fit_predict(scaled_data)
    silhouette_scores_agg.append(silhouette_score(scaled_data, cluster_labels_agg))

# One figure per algorithm: silhouette score against number of clusters.
for scores, plot_title in (
    (silhouette_scores_kmeans, 'Silhouette Score for Different Numbers of Clusters using K-Means'),
    (silhouette_scores_agg, 'Silhouette Score for Different Numbers of Clusters using Agglomerative Clustering '),
):
    plt.figure(figsize=(10, 6))
    plt.plot(cluster_counts, scores, marker='o')
    plt.title(plot_title)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.xticks(cluster_counts)
    plt.show()


In [None]:
# Agglomerative Clustering
# Runs Ward-linkage agglomerative clustering on `scaled_data` for each cluster
# count in `n_clusters` and shows one 3D scatter per count.

n_clusters = [3, 4]
all_labels = []  # one label array per entry of n_clusters, in the same order

# squeeze=False keeps `axs` two-dimensional, so indexing also works when
# n_clusters holds a single value (plt.subplots would otherwise return a bare Axes).
fig, axs = plt.subplots(
    1, len(n_clusters), figsize=(15, 5), subplot_kw={'projection': '3d'}, squeeze=False
)

for idx, num_clusters in enumerate(n_clusters):

    agg_clustering = AgglomerativeClustering(
        n_clusters=num_clusters, metric='euclidean', linkage='ward', compute_full_tree=True
    )
    cluster_labels = agg_clustering.fit_predict(scaled_data)

    all_labels.append(cluster_labels)

    ax = axs[0, idx]

    # Points are positioned with the first three columns of the unscaled `data`;
    # only the colours come from the clustering of `scaled_data`.
    scatter = ax.scatter(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2], c=cluster_labels, cmap='viridis')
    ax.view_init(elev=30, azim=80)
    ax.set_title(f'{num_clusters} Clusters')
    ax.set_xlabel('Metric 1')
    ax.set_ylabel('Metric 2')
    ax.set_zlabel('Metric 3')
    plt.colorbar(scatter, ax=ax)
    ax.grid(False)

plt.tight_layout()
plt.show()

# Agglomerative Clustering pie charts
# For every clustering stored in `all_labels`, draw the CellType / ExperimentType
# composition of each cluster as pie charts and collect the same proportions
# in a text table that is written to disk and printed.

# Iterate over the label sets that were actually computed (instead of a
# hard-coded range(0, 2)) so this loop stays in sync with `n_clusters`.
for idx, cluster_labels in enumerate(all_labels):
    results_table = []

    # np.unique returns the distinct cluster ids already sorted.
    for cluster_id in np.unique(cluster_labels):
        # Labels are positional, so a boolean mask selects this cluster's rows of `df`.
        cluster_data = df.loc[cluster_labels == cluster_id]
        total_num_cluster = len(cluster_data)

        # Share of each category within the cluster, rounded to two decimals.
        cell_type_prop = (cluster_data['CellType'].value_counts() / total_num_cluster).round(2)
        expe_type_prop = (cluster_data['ExperimentType'].value_counts() / total_num_cluster).round(2)

        fig, axs = plt.subplots(1, 2, figsize=(7, 5))

        axs[0].pie(cell_type_prop, labels=cell_type_prop.index, autopct='%1.1f%%', startangle=140)
        axs[0].set_title(f"Cluster {cluster_id} - Cell Type Proportions")

        axs[1].pie(expe_type_prop, labels=expe_type_prop.index, autopct='%1.1f%%', startangle=140)
        axs[1].set_title(f"Cluster {cluster_id} - Experiment Type Proportions")

        plt.tight_layout()
        plt.show()
        plt.close(fig)  # one figure is created per cluster; release it once shown

        results_table.append([
            f"Cluster {cluster_id}",
            tabulate(cell_type_prop.reset_index(), headers=['Cell Type', 'Proportion'], tablefmt='plain', showindex=False),
            tabulate(expe_type_prop.reset_index(), headers=['Experiment Type', 'Proportion'], tablefmt='plain', showindex=False)
        ])

    # Render the table once and reuse it for both the file and the notebook output.
    table_text = tabulate(
        results_table,
        headers=['Cluster', 'Cell Type Proportions', 'Experiment Type Proportions'],
        tablefmt='grid',
    )

    # NOTE(review): other cells in this notebook also write 'table_{idx}.txt', so a
    # full run overwrites these files - confirm whether distinct names are wanted.
    with open(f'table_{idx}.txt', 'w') as f:
        f.write(table_text)
    print(table_text)

In [None]:
# KMeans
# Runs K-Means (fixed seed, 10 initialisations) on `scaled_data` for each
# cluster count in `n_clusters` and shows one 3D scatter per count.

n_clusters = [3, 4]
all_labels = []  # one label array per entry of n_clusters, in the same order

# squeeze=False keeps `axs` two-dimensional, so indexing also works when
# n_clusters holds a single value (plt.subplots would otherwise return a bare Axes).
fig, axs = plt.subplots(
    1, len(n_clusters), figsize=(15, 5), subplot_kw={'projection': '3d'}, squeeze=False
)

for idx, num_clusters in enumerate(n_clusters):

    kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=10)
    cluster_labels = kmeans.fit_predict(scaled_data)

    all_labels.append(cluster_labels)

    ax = axs[0, idx]

    # Points are positioned with the first three columns of the unscaled `data`;
    # only the colours come from the clustering of `scaled_data`.
    scatter = ax.scatter(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2], c=cluster_labels, cmap='viridis')
    ax.view_init(elev=30, azim=80)
    ax.set_title(f'{num_clusters} Clusters')
    ax.set_xlabel('Metric 1')
    ax.set_ylabel('Metric 2')
    ax.set_zlabel('Metric 3')
    plt.colorbar(scatter, ax=ax)
    ax.grid(False)

plt.tight_layout()
plt.show()

# Pie charts
# For every K-Means clustering stored in `all_labels`, draw the CellType /
# ExperimentType composition of each cluster as pie charts and collect the same
# proportions in a text table that is written to disk and printed.

# Iterate over the label sets that were actually computed (instead of a
# hard-coded range(0, 2)) so this loop stays in sync with `n_clusters`.
for idx, cluster_labels in enumerate(all_labels):
    results_table = []

    # np.unique returns the distinct cluster ids already sorted.
    for cluster_id in np.unique(cluster_labels):
        # Labels are positional, so a boolean mask selects this cluster's rows of `df`.
        cluster_data = df.loc[cluster_labels == cluster_id]
        total_num_cluster = len(cluster_data)

        # Share of each category within the cluster, rounded to two decimals.
        cell_type_prop = (cluster_data['CellType'].value_counts() / total_num_cluster).round(2)
        expe_type_prop = (cluster_data['ExperimentType'].value_counts() / total_num_cluster).round(2)

        fig, axs = plt.subplots(1, 2, figsize=(7, 5))

        axs[0].pie(cell_type_prop, labels=cell_type_prop.index, autopct='%1.1f%%', startangle=140)
        axs[0].set_title(f"Cluster {cluster_id} - Cell Type Proportions")

        axs[1].pie(expe_type_prop, labels=expe_type_prop.index, autopct='%1.1f%%', startangle=140)
        axs[1].set_title(f"Cluster {cluster_id} - Experiment Type Proportions")

        plt.tight_layout()
        plt.show()
        plt.close(fig)  # one figure is created per cluster; release it once shown

        results_table.append([
            f"Cluster {cluster_id}",
            tabulate(cell_type_prop.reset_index(), headers=['Cell Type', 'Proportion'], tablefmt='plain', showindex=False),
            tabulate(expe_type_prop.reset_index(), headers=['Experiment Type', 'Proportion'], tablefmt='plain', showindex=False)
        ])

    # Render the table once and reuse it for both the file and the notebook output.
    table_text = tabulate(
        results_table,
        headers=['Cluster', 'Cell Type Proportions', 'Experiment Type Proportions'],
        tablefmt='grid',
    )

    # NOTE(review): other cells in this notebook also write 'table_{idx}.txt', so a
    # full run overwrites these files - confirm whether distinct names are wanted.
    with open(f'table_{idx}.txt', 'w') as f:
        f.write(table_text)
    print(table_text)



# 6 Metrics Methodology




In [None]:
# Data load
# Reads the Excel workbook into `df`, rebinding the name used for the table
# loaded earlier in this notebook. Later cells expect the 'CellType' and
# 'ExperimentType' columns alongside the metric columns.
# Change the path to where the file 'tabla_7_mod.xlsx' is located.
df = pd.read_excel('tabla_7_mod.xlsx')

In [None]:
# With 6 dimensions the clusters cannot be drawn directly in 3D, so the data is
# reduced with a 3-component PCA further down. PCA is sensitive to scale, so each
# feature is first standardised to zero mean and unit variance (z-score).

# Remove the two label columns; everything that remains is used as a feature.
data = df.drop(columns=['CellType', 'ExperimentType'])

# Fit the scaler on the feature table, then apply it to that same table.
scaler = StandardScaler().fit(data)
scaled_data = scaler.transform(data)

# NOTE(review): the standardised column at position 3 is removed here, so
# `scaled_data_5` has one feature fewer than `data`. The reason is not recorded
# in this notebook - confirm that dropping this column is intended.
scaled_data_5 = np.delete(scaled_data, 3, axis=1)

In [None]:
# PCA
# Project the standardised features onto three principal components and report
# how much variance each component explains.

n_components = 3
pca = PCA(n_components=n_components)

# Fit and project in one step, then give the component columns readable names.
pca_components = pd.DataFrame(pca.fit_transform(scaled_data_5)).rename(
    columns={0: 'PCA 1', 1: 'PCA 2', 2: 'PCA 3'}
)

# Carry the label columns along so clusters can be broken down by them later.
pca_components = pca_components.assign(
    CellType=df['CellType'],
    ExperimentType=df['ExperimentType'],
)

expvar_ratio = pca.explained_variance_ratio_
cum_expvar = np.cumsum(expvar_ratio)

print("Explained Variance Ratios for Each Component:")
for rank, ratio in enumerate(expvar_ratio, start=1):
    print(f"Component {rank}: {ratio:.2%}")

print("\nCumulative Explained Variance:")
for rank, running_total in enumerate(cum_expvar, start=1):
    print(f"Components {rank}: {running_total:.2%}")

In [None]:
# Check the appropriate number of clusters on the PCA-reduced data by computing
# the silhouette score of K-Means and agglomerative clustering for every cluster
# count from 2 to max_clusters.

max_clusters = 10
silhouette_scores_agg = []
# Start from an empty list here: this name is also used earlier in the notebook,
# and appending without resetting it would mix the two analyses.
silhouette_scores_kmeans = []

# Only the principal-component columns are clustered; the label columns are dropped.
data_to_clus = pca_components.drop(['CellType', 'ExperimentType'], axis=1)

for num_clusters in range(2, max_clusters + 1):

    k_means = KMeans(n_clusters=num_clusters, random_state=0, n_init=10)
    agg_clustering = AgglomerativeClustering(n_clusters=num_clusters, metric='euclidean', linkage='ward')

    cluster_labels_agg = agg_clustering.fit_predict(data_to_clus)
    cluster_labels_kmeans = k_means.fit_predict(data_to_clus)

    # Each score is evaluated in the same space the labels were fitted in.
    silhouette_scores_agg.append(silhouette_score(data_to_clus, cluster_labels_agg))
    silhouette_scores_kmeans.append(silhouette_score(data_to_clus, cluster_labels_kmeans))

# Plot the silhouette scores to find the optimal number of clusters
plt.figure(figsize=(10, 6))
plt.plot(range(2, max_clusters + 1), silhouette_scores_agg, marker='o')
plt.title('Silhouette Score for Different Numbers of Clusters using Agglomerative Clustering')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(range(2, max_clusters + 1))

plt.show()

plt.figure(figsize=(10, 6))
# The list now holds exactly one score per cluster count, so no slicing is needed.
plt.plot(range(2, max_clusters + 1), silhouette_scores_kmeans, marker='o')
plt.title('Silhouette Score for Different Numbers of Clusters using K-Means')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(range(2, max_clusters + 1))

plt.show()

In [None]:
# K-means
# Runs K-Means (fixed seed, 10 initialisations) on the principal components for
# each cluster count in `n_clusters` and shows one 3D scatter per count.
n_clusters = [3, 4]
all_labels = []  # one label array per entry of n_clusters, in the same order

# Only the principal-component columns are clustered; the label columns are dropped.
data_to_clus = pca_components.drop(['CellType', 'ExperimentType'], axis=1)

# squeeze=False keeps `axs` two-dimensional, so indexing also works when
# n_clusters holds a single value (plt.subplots would otherwise return a bare Axes).
fig, axs = plt.subplots(
    1, len(n_clusters), figsize=(15, 5), subplot_kw={'projection': '3d'}, squeeze=False
)

for idx, num_clusters in enumerate(n_clusters):
    kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=10)
    cluster_labels = kmeans.fit_predict(data_to_clus)

    all_labels.append(cluster_labels)

    ax = axs[0, idx]

    # The three principal components are the plot axes; colour encodes the cluster.
    scatter = ax.scatter(data_to_clus.iloc[:, 0], data_to_clus.iloc[:, 1], data_to_clus.iloc[:, 2], c=cluster_labels, cmap='viridis')
    ax.view_init(elev=30, azim=80)
    ax.set_title(f'{num_clusters} Clusters')
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    plt.colorbar(scatter, ax=ax)
    ax.grid(False)

plt.tight_layout()
plt.show()

# Pie charts
# For every K-Means clustering stored in `all_labels`, draw the CellType /
# ExperimentType composition of each cluster as pie charts and collect the same
# proportions in a text table that is written to disk and printed.

# Iterate over the label sets that were actually computed (instead of a
# hard-coded range(0, 2)) so this loop stays in sync with `n_clusters`.
for idx, cluster_labels in enumerate(all_labels):
    results_table = []

    # np.unique returns the distinct cluster ids already sorted.
    for cluster_id in np.unique(cluster_labels):
        # Labels are positional, so a boolean mask selects this cluster's rows
        # of `pca_components` (which carries the two label columns).
        cluster_data = pca_components.loc[cluster_labels == cluster_id]
        total_num_cluster = len(cluster_data)

        # Share of each category within the cluster, rounded to two decimals.
        cell_type_prop = (cluster_data['CellType'].value_counts() / total_num_cluster).round(2)
        expe_type_prop = (cluster_data['ExperimentType'].value_counts() / total_num_cluster).round(2)

        fig, axs = plt.subplots(1, 2, figsize=(7, 5))

        axs[0].pie(cell_type_prop, labels=cell_type_prop.index, autopct='%1.1f%%', startangle=140)
        axs[0].set_title(f"Cluster {cluster_id} - Cell Type Proportions")

        axs[1].pie(expe_type_prop, labels=expe_type_prop.index, autopct='%1.1f%%', startangle=140)
        axs[1].set_title(f"Cluster {cluster_id} - Experiment Type Proportions")

        plt.tight_layout()
        plt.show()
        plt.close(fig)  # one figure is created per cluster; release it once shown

        results_table.append([
            f"Cluster {cluster_id}",
            tabulate(cell_type_prop.reset_index(), headers=['Cell Type', 'Proportion'], tablefmt='plain', showindex=False),
            tabulate(expe_type_prop.reset_index(), headers=['Experiment Type', 'Proportion'], tablefmt='plain', showindex=False)
        ])

    # Render the table once and reuse it for both the file and the notebook output.
    table_text = tabulate(
        results_table,
        headers=['Cluster', 'Cell Type Proportions', 'Experiment Type Proportions'],
        tablefmt='grid',
    )

    # NOTE(review): other cells in this notebook also write 'table_{idx}.txt', so a
    # full run overwrites these files - confirm whether distinct names are wanted.
    with open(f'table_{idx}.txt', 'w') as f:
        f.write(table_text)

    print(table_text)

In [None]:
# Agglomerative clustering
# Runs Ward-linkage agglomerative clustering on the principal components for
# each cluster count in `n_clusters` and shows one 3D scatter per count.

n_clusters = [3, 4]
all_labels = []  # one label array per entry of n_clusters, in the same order

# Derive the clustering input here as well, so this cell does not depend on
# another cell having defined `data_to_clus` first.
data_to_clus = pca_components.drop(['CellType', 'ExperimentType'], axis=1)

# squeeze=False keeps `axs` two-dimensional, so indexing also works when
# n_clusters holds a single value (plt.subplots would otherwise return a bare Axes).
fig, axs = plt.subplots(
    1, len(n_clusters), figsize=(15, 5), subplot_kw={'projection': '3d'}, squeeze=False
)

for idx, num_clusters in enumerate(n_clusters):
    agg_clustering = AgglomerativeClustering(
        n_clusters=num_clusters, metric='euclidean', linkage='ward', compute_full_tree=True
    )
    cluster_labels = agg_clustering.fit_predict(data_to_clus)

    all_labels.append(cluster_labels)

    ax = axs[0, idx]

    # The three principal components are the plot axes; colour encodes the cluster.
    scatter = ax.scatter(data_to_clus.iloc[:, 0], data_to_clus.iloc[:, 1], data_to_clus.iloc[:, 2], c=cluster_labels, cmap='viridis')
    ax.view_init(elev=30, azim=80)
    ax.set_title(f'{num_clusters} Clusters')
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    plt.colorbar(scatter, ax=ax)
    ax.grid(False)

plt.tight_layout()
plt.show()

# Agglomerative Clustering pie charts
# For every agglomerative clustering stored in `all_labels`, draw the CellType /
# ExperimentType composition of each cluster as pie charts and collect the same
# proportions in a text table that is written to disk and printed.

# Iterate over the label sets that were actually computed (instead of a
# hard-coded range(0, 2)) so this loop stays in sync with `n_clusters`.
for idx, cluster_labels in enumerate(all_labels):
    results_table = []

    # np.unique returns the distinct cluster ids already sorted.
    for cluster_id in np.unique(cluster_labels):
        # Labels are positional, so a boolean mask selects this cluster's rows
        # of `pca_components` (which carries the two label columns).
        cluster_data = pca_components.loc[cluster_labels == cluster_id]
        total_num_cluster = len(cluster_data)

        # Share of each category within the cluster, rounded to two decimals.
        cell_type_prop = (cluster_data['CellType'].value_counts() / total_num_cluster).round(2)
        expe_type_prop = (cluster_data['ExperimentType'].value_counts() / total_num_cluster).round(2)

        fig, axs = plt.subplots(1, 2, figsize=(7, 5))

        axs[0].pie(cell_type_prop, labels=cell_type_prop.index, autopct='%1.1f%%', startangle=140)
        axs[0].set_title(f"Cluster {cluster_id} - Cell Type Proportions")

        axs[1].pie(expe_type_prop, labels=expe_type_prop.index, autopct='%1.1f%%', startangle=140)
        axs[1].set_title(f"Cluster {cluster_id} - Experiment Type Proportions")

        plt.tight_layout()
        plt.show()
        plt.close(fig)  # one figure is created per cluster; release it once shown

        results_table.append([
            f"Cluster {cluster_id}",
            tabulate(cell_type_prop.reset_index(), headers=['Cell Type', 'Proportion'], tablefmt='plain', showindex=False),
            tabulate(expe_type_prop.reset_index(), headers=['Experiment Type', 'Proportion'], tablefmt='plain', showindex=False)
        ])

    # Render the table once and reuse it for both the file and the notebook output.
    table_text = tabulate(
        results_table,
        headers=['Cluster', 'Cell Type Proportions', 'Experiment Type Proportions'],
        tablefmt='grid',
    )

    # NOTE(review): other cells in this notebook also write 'table_{idx}.txt', so a
    # full run overwrites these files - confirm whether distinct names are wanted.
    with open(f'table_{idx}.txt', 'w') as f:
        f.write(table_text)

    print(table_text)