<a href="https://colab.research.google.com/github/MatRitchie/Sunshine/blob/main/Sunshine_but_wrapped_In_funcs_for_AUTOMATION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import libraries

In [53]:
from google.colab import userdata, files
import os
import pandas as pd
import numpy as np
import plotly.express as px

# Setup Github access

In [54]:
os.environ['GITHUB_TOKEN'] = userdata.get('GITHUB_TOKEN')

%cd /content
!git clone https://$GITHUB_TOKEN@github.com/MatRitchie/Sunshine.git

%cd /content/Sunshine
!git pull https://$GITHUB_TOKEN@github.com/MatRitchie/Sunshine.git main

/content
fatal: destination path 'Sunshine' already exists and is not an empty directory.
/content/Sunshine
From https://github.com/MatRitchie/Sunshine
 * branch            main       -> FETCH_HEAD
Already up to date.


# Data

In [55]:
df_all = pd.read_csv("combined_project_status.csv")
status = df_all['status']

##Normalizing the data

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler

def scale_data(df_all, method='minmax'):
  """
  Scales the input data using the specified method.

  Args:
    df_all: A NumPy array or pandas DataFrame representing the data to be scaled.
    method: The scaling method to use. Options are:
      - 'minmax': Min-Max scaling (default)
      - 'standard': Standard scaling (Z-score normalization)
      - 'robust': Robust scaling
      - 'maxabs': MaxAbs scaling

  Returns:
    A NumPy array representing the scaled data, or None if an invalid method is provided.
  """
  if method == 'minmax':
    scaler = MinMaxScaler()
  elif method == 'standard':
    scaler = StandardScaler()
  elif method == 'robust':
    scaler = RobustScaler()
  elif method == 'maxabs':
    scaler = MaxAbsScaler()
  else:
    print("Invalid scaling method. Choose from 'minmax', 'standard', 'robust', or 'maxabs'.")
    return None

  df_numeric = df_all.select_dtypes(include=[np.number])
  scaled_data = pd.DataFrame(scaler.fit_transform(df_numeric), columns=df_numeric.columns)
  return scaled_data

df_norm = scale_data(df_all, 'minmax')
#df_norm.head()
df_norm.columns

Index(['COM-1', 'COM-2', 'POP-1', 'STA-1', 'STA-2', 'STA-3', 'STA-4', 'STA-5',
       'STA-6', 'STA-7', 'STA-8', 'STA-9', 'TEC-1', 'TEC-2', 'TEC-3', 'TEC-4',
       'SWQ-1', 'SWQ-2.1', 'SWQ-2.2', 'SWQ-2.3', 'SWQ-2.4', 'SWQ-2.5',
       'SWQ-2.6', 'SWQ-2.7'],
      dtype='object')

In [None]:
def get_features(df, features_to_include):
  """
  Returns a DataFrame containing only the specified features.

  Args:
    df: The input DataFrame.
    features_to_include: A list of feature names or feature groups to include.

  Returns:
    A DataFrame with the selected features, or None if an invalid feature is provided.
  """

  communication = ['COM-1', 'COM-2']
  popularity = ['POP-1']
  stability = ['STA-1','STA-2', 'STA-3', 'STA-4', 'STA-5', 'STA-6', 'STA-7', 'STA-8', 'STA-9']
  technical_activity = ['TEC-1', 'TEC-2', 'TEC-3', 'TEC-4']
  quality = ['SWQ-1', 'SWQ-2.1', 'SWQ-2.2', 'SWQ-2.3', 'SWQ-2.4', 'SWQ-2.5', 'SWQ-2.6', 'SWQ-2.7']

  all_features = {
      "communication": communication,
      "popularity": popularity,
      "stability": stability,
      "technical_activity": technical_activity,
      "quality": quality
  }

  selected_features = []
  for feature_group_or_name in features_to_include:
    if feature_group_or_name in all_features:
      selected_features.extend(all_features[feature_group_or_name])
    elif feature_group_or_name in df.columns:
      selected_features.append(feature_group_or_name)
    else:
      print(f"Invalid feature or feature group name: {feature_group_or_name}")
      return None

  return df[selected_features]

features = ['communication', 'popularity', 'stability', 'technical_activity', 'quality']
X = get_features(df_norm, features)
X.head()
X.columns


Index(['COM-1', 'COM-2', 'POP-1', 'STA-1', 'STA-2', 'STA-3', 'STA-4', 'STA-5',
       'STA-6', 'STA-7', 'STA-8', 'STA-9', 'TEC-1', 'TEC-2', 'TEC-3', 'TEC-4',
       'SWQ-1', 'SWQ-2.1', 'SWQ-2.2', 'SWQ-2.3', 'SWQ-2.4', 'SWQ-2.5',
       'SWQ-2.6', 'SWQ-2.7'],
      dtype='object')

#Initial vizualization




##Imports

In [58]:
from sklearn.manifold import TSNE
from umap import UMAP

##t-sne

[t-sne intro](https://www.datacamp.com/tutorial/introduction-t-sne) <br>
[Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)

>Parameters:
>>Perplexity: how much to focus on local vs global aspect of data. (5-50)low perplexity is more focus on local structure, while higher perplexity focus more in global structure

In [59]:
tsne_2 = TSNE(n_components=2, perplexity=30, random_state=42)
proj_2d = tsne_2.fit_transform(X)
kl = tsne_2.kl_divergence_

fig = px.scatter(x=proj_2d[:, 0], y=proj_2d[:, 1], color=status)
fig.update_layout(
    title=f"t-SNE vizualization kl_divergence:{kl}",
    xaxis_title="x",
    yaxis_title="y",
)
fig.show()

In [60]:
tsne_3 = TSNE(n_components=3, perplexity=50, random_state=42)
proj_3d = tsne_3.fit_transform(X)
kl = tsne_3.kl_divergence_


fig = px.scatter_3d(x=proj_3d[:, 0], y=proj_3d[:, 1], z=proj_3d[:, 2], color=status)
fig.update_layout(
    title=f"3D t-SNE vizualization kl_divergence:{kl}",
    scene=dict(
        xaxis_title="x",
        yaxis_title="y",
        zaxis_title="z"
    )
)

fig.show()

##UMAP
Uniform Manifold Approximation and Projection for Dimension Reduction <br>

[Understanding UMAP](https://pair-code.github.io/understanding-umap/) <br>
[Documentation](https://umap-learn.readthedocs.io/en/latest/)

Parameters:
>n_neighbors: determine the size of the local neighborhood, low is local, high is more global. default=15<br>
>min_dist: minimum distance points may have. low values is more clumps, while high is more spread out <br>
>

In [61]:
umap_2d = UMAP(n_neighbors=15, min_dist=0.1, n_components=2)
proj_2d = umap_2d.fit_transform(X)

fig = px.scatter(x=proj_2d[:, 0], y=proj_2d[:, 1], color=status)
fig.update_layout(
    title="UMAP vizualization",
    xaxis_title="x",
    yaxis_title="y",
)
fig.show()


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [62]:
umap_3d = UMAP(n_neighbors=15, min_dist=0.1, n_components=3)
proj_3d = umap_3d.fit_transform(X)


fig = px.scatter_3d(x=proj_3d[:, 0], y=proj_3d[:, 1], z=proj_3d[:, 2], color=status)
fig.update_layout(
    title="UMAP vizualization",
    scene=dict(
        xaxis_title="x",
        yaxis_title="y",
        zaxis_title="z"
    )
)

fig.show()


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



#KMeans

## Elbow method
Used to determine how many clusters should be used in K-means. <br>

For each cluster Within-Cluster Sum of Squares (wcss) is calculated, which is a measure of how well points are clustered around their centroid. The elbow point is when the graph break and adding more cluster doesnt significantly reduce the wcss score. The elbow point is then the optimal amount of clusters, where adding would be unnesesary and could lead to overfitting and less to not capture the complete variability of the data.

[Explanation](https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans/) <br>

[Link to implementation](https://www.comet.com/site/blog/how-to-evaluate-clustering-models-in-python/)

In [None]:
import numpy as np

def find_optimal_k_w_angle(wcss):
    """Finds the optimal k using the angle method.

    Args:
        wcss: A list of WCSS values for different k values.

    Returns:
        The optimal k value, or None if the list is too short.
    """

    if len(wcss) < 3:
        print("WCSS list is too short to calculate angles.")
        return None

    angles = []
    for i in range(1, len(wcss) - 2):
        a = np.array([i, wcss[i]]) 
        b = np.array([i + 1, wcss[i + 1]])
        c = np.array([i + 2, wcss[i + 2]])

        ba = a - b
        bc = c - b
        cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
        angle = np.arccos(cosine_angle)
        angles.append(angle)

    optimal_k = np.argmax(angles) + 2
    return optimal_k

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

def find_optimal_n_clusters(X, max_k=10, model_type='kmeans', **kwargs):
    silhouette_avg = []
    wcss = []

    for i in range(2, max_k + 1):
        if model_type == 'kmeans':
            model = KMeans(n_clusters=i, init='k-means++', random_state=42)
            model.fit(X)
            wcss.append(model.inertia_)
        elif model_type == 'agglomerativeclustering':
            model = AgglomerativeClustering(n_clusters=i, **kwargs)
            model.fit(X)

        score = silhouette_score(X, model.labels_)
        silhouette_avg.append(score)

    #if model_type == 'kmeans':
    #    plt.plot(range(2, max_k + 1), wcss)
    #    plt.title('Elbow Method')
    #    plt.xlabel('Number of clusters')
    #    plt.ylabel('WCSS')
    #    plt.show()
    
    optimal_k_angle = find_optimal_k_w_angle(wcss) if model_type == 'kmeans' else None

    optimal_k = optimal_k_angle if model_type == 'kmeans' else np.argmax(silhouette_avg) + 2

    optimal_k_sil = np.argmax(silhouette_avg) + 2
    return optimal_k

In [None]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

def fit_unsupervised_model(X, model_type='kmeans', n_clusters=None, eps=0.5, min_samples=5, linkage='ward'):

    if model_type == 'kmeans':
        if n_clusters is None:
            n_clusters = find_optimal_n_clusters(X, max_k=20)
        model = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    elif model_type == 'dbscan':
        model = DBSCAN(eps=eps, min_samples=min_samples)
    elif model_type == 'agglomerativeclustering':
        if n_clusters is None:
            n_clusters = find_optimal_n_clusters(X, max_k=20, model_type='agglomerativeclustering')
        model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
    elif model_type == 'spectral':
        model = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors')

    labels = model.fit_predict(X)
    return model, labels

model, labels = fit_unsupervised_model(X, model_type='kmeans', n_clusters=None)
print(model)
model, labels = fit_unsupervised_model(X, model_type='dbscan')
print('DBSCAN n_clusters:', max(labels)+1)
model, labels = fit_unsupervised_model(X, model_type='agglomerativeclustering', linkage='ward')
print(model)
model, labels = fit_unsupervised_model(X, model_type='spectral', n_clusters=3)
print(model)


KMeans(n_clusters=np.int64(16), random_state=42)
DBSCAN n_clusters: 2
AgglomerativeClustering(n_clusters=np.int64(2))
SpectralClustering(affinity='nearest_neighbors', n_clusters=3)


##Training

In [None]:
#using k-means to test evaluation metrics
from sklearn.cluster import KMeans
import plotly.graph_objects as go

Kmean = KMeans(n_clusters=16)
Kmean.fit(X)
y_pred = Kmean.predict(X)

umap_2d = UMAP(n_neighbors=15, min_dist=0.1, n_components=2)
proj_2d = umap_2d.fit_transform(X)

def UMAP_plot(X, y_pred, status):
    df = pd.DataFrame({
        'x': proj_2d[:, 0],
        'y': proj_2d[:, 1],
        'cluster': y_pred,
        'status': status.astype(str)
    })

    fig = go.Figure()
    status_symbols = {
        'evolved': 'circle',
        'active': 'square',
        'graduated': 'diamond',
        'retired': 'cross'
    }

    cluster_colors = {
        0: 'blue',
        1: 'red',
        2: 'green',
        3: 'orange',
        4: 'purple',
        5: 'cyan',
        6: 'magenta',
        7: 'lime',
        8: 'pink',
        9: 'brown',
        10: 'gray',
        11: 'olive',
        12: 'teal',
        13: 'navy',
        14: 'maroon',
        15: 'black'
    }

    for cluster in df['cluster'].unique():
        for status_val in df['status'].unique():
            df_subset = df[(df['cluster'] == cluster) & (df['status'] == status_val)]
            fig.add_trace(go.Scatter(
                x=df_subset['x'],
                y=df_subset['y'],
                mode='markers',
                name=f"Cluster {cluster}, {status_val} (n={len(df_subset)})",
                marker=dict(
                    symbol=status_symbols.get(status_val, 'circle'),
                    color=cluster_colors.get(cluster, 'gray')
                )
            ))

    fig.update_layout(
        title="UMAP vizualization",
        xaxis_title="x",
        yaxis_title="y",
    )
    fig.show()

UMAP_plot(X, y_pred, status)


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



#Evaluation of clustering algorithms

Evaluation of clusters is done by different metrics of combining cohesion within a cluster and seperation between clusters.

* Internal cluster validation
  * Evaluation based on the clusters themselves
* External cluster validation
  * Evaluating of clusters with external labels eg. graduated/retired
* Relative cluster validation
  *  Evaluation results by variyng different parameters of the models


##Internal validation

###Silhuette score

The silhuette score for a sample is given as: <br>
s = a - b / max(a, b) <br>

a = mean distance between sample and all other points in cluster <br>
b = mean distance bewteen sample and all other in next nearest cluster <br>

The Silhuette Coefficient for a set sample is then the mean of the coefficient for each sample in the set. <br>

-1 is an incorrect cluster, 1 is highly dense, and around 0 indicates overlapping clusters.


[Documentation](https://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient)

In [67]:
from sklearn.metrics import silhouette_score

print(silhouette_score(X,y_pred))
print(silhouette_score(X,status))

0.1894176611212186
-0.06544323988003009


### Calinski Harabaz Index (Variance Ratio Criterion)

A hihger score indicates a model with better defined clusters. Assesment of how well seperated and compact clusters are. Ther isn't a upper level to the index.

Index = ratio of between-clusters dispersion and within-cluster dispersion

between-cluster dispersion = how well **seperated** clusters are

within-cluster dispersion = measures the **compactness** of each cluster

[Documentation](https://scikit-learn.org/stable/modules/clustering.html#calinski-harabasz-index)

In [68]:
from sklearn.metrics import calinski_harabasz_score
print(calinski_harabasz_score(X,y_pred))
print(calinski_harabasz_score(X,status))

86.02611162241325
4.374930551532691


###Davies Bouldin index

Lower index (0 is lowest possible score) = better seperation between clusters

Index = average similarity between clusters

Similarity compares the distance between clusters with the size of the clusters



[Documentation](https://scikit-learn.org/stable/modules/clustering.html#davies-bouldin-index)

In [69]:
from sklearn.metrics import davies_bouldin_score
print(davies_bouldin_score(X,y_pred))
print(davies_bouldin_score(X,status))

1.5041705369158915
10.660554076058627


In [None]:
from pprint import pprint
from itertools import combinations

def generate_combinations(features, min_length = 1, max_length = 1000):
  all_combinations = []
  for i in range(min_length, (len(features) if len(features)<max_length else max_length) + 1):
    for combination in combinations(features, i):
      all_combinations.append(list(combination))
  return all_combinations

#feature_groups = ['communication', 'popularity', 'stability', 'technical_activity', 'quality']
feature_groups = range(1,6)
combs = generate_combinations(feature_groups, 2, 3)
pprint(combs)
print(len(combs))

[[1, 2],
 [1, 3],
 [1, 4],
 [1, 5],
 [2, 3],
 [2, 4],
 [2, 5],
 [3, 4],
 [3, 5],
 [4, 5],
 [1, 2, 3],
 [1, 2, 4],
 [1, 2, 5],
 [1, 3, 4],
 [1, 3, 5],
 [1, 4, 5],
 [2, 3, 4],
 [2, 3, 5],
 [2, 4, 5],
 [3, 4, 5]]
20


In [None]:
def get_subset_by_status(df, status_list):
  if 'status' not in df.columns:
    df['status'] = status

  status_set = set(status_list)

  data_subset = df[df['status'].isin(status_set)]
  status_subset = data_subset['status']
  data_subset = data_subset.drop(columns=['status'])
  return data_subset, status_subset

data_subset = ['graduated', 'retired', 'evolved', 'bypassed']
data_subsets = generate_combinations(data_subset, min_length=2)
for subset in data_subsets:
  b, h = get_subset_by_status(df_all, subset)
  print(f"{subset}, {b.shape}, {h.shape}")

if data_subsets is not None:
  print(data_subsets)
else:
  print("No 'status' column found in the df.")


['graduated', 'retired'], (236, 28), (236,)
['graduated', 'evolved'], (378, 28), (378,)
['graduated', 'bypassed'], (288, 28), (288,)
['retired', 'evolved'], (236, 28), (236,)
['retired', 'bypassed'], (146, 28), (146,)
['evolved', 'bypassed'], (288, 28), (288,)
['graduated', 'retired', 'evolved'], (425, 28), (425,)
['graduated', 'retired', 'bypassed'], (335, 28), (335,)
['graduated', 'evolved', 'bypassed'], (477, 28), (477,)
['retired', 'evolved', 'bypassed'], (335, 28), (335,)
['graduated', 'retired', 'evolved', 'bypassed'], (524, 28), (524,)
[['graduated', 'retired'], ['graduated', 'evolved'], ['graduated', 'bypassed'], ['retired', 'evolved'], ['retired', 'bypassed'], ['evolved', 'bypassed'], ['graduated', 'retired', 'evolved'], ['graduated', 'retired', 'bypassed'], ['graduated', 'evolved', 'bypassed'], ['retired', 'evolved', 'bypassed'], ['graduated', 'retired', 'evolved', 'bypassed']]


## Cross checking clustering algorithms with hyper parameters

In [None]:
import pandas as pd
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import spectral_clustering

scale_methods = ['standard'] #, 'minmax', 'robust', 'maxabs']
feature_groups = ['communication', 'popularity', 'stability', 'technical_activity', 'quality']
feature_groups_combinations = generate_combinations(feature_groups)
model_types = ['spectral'] #, 'kmeans', 'dbscan', 'agglomerativeclustering']
data_subset = ['graduated', 'retired', 'evolved', 'bypassed']
data_subsets = generate_combinations(data_subset, min_length=2)

results = []
index = 0

for scale_method_index, scale_method in enumerate(scale_methods):
    scaled_data = scale_data(df_all, scale_method)
    for data_subset_index, data_subset in enumerate(data_subsets):
        data_sub, status_subset = get_subset_by_status(scaled_data, data_subset)
        if data_sub is not None: 
          for features_to_include_index, features_to_include in enumerate(feature_groups_combinations):
            X = get_features(data_sub, features_to_include)
            if X is not None:
              for model_type_index, model_type in enumerate(model_types):

                if model_type == 'kmeans':
                    for n_clusters in range(1, 16):
                        index += 1
                        if n_clusters == 1:
                            model, labels = fit_unsupervised_model(X, model_type=model_type, n_clusters=None)
                        else:
                            model, labels = fit_unsupervised_model(X, model_type=model_type, n_clusters=n_clusters)
                        if model is not None and labels is not None and max(labels)+1 >= 2:
                            silhouette = silhouette_score(X, labels)
                            calinski = calinski_harabasz_score(X, labels)
                            davies = davies_bouldin_score(X, labels)
                            ars = adjusted_rand_score(status_subset, labels)
                            results.append([scale_method, features_to_include, model_type, data_subset, X, labels, status_subset, f"n_clusters = {n_clusters}", n_clusters, silhouette, calinski, davies, ars])
                            print(f"{index} Scale method: {scale_method}, Features: {features_to_include}, Model: {model_type}, Data subset: {data_subset}")
                        else:
                            print(f"KMeans with n_clusters={n_clusters}, Scale method={scale_method}, features={features_to_include}, data_subset={data_subset} resulted in only one cluster or all noise. Skipping metrics calculation.")
                elif model_type == 'dbscan':
                    for eps in [0.1, 0.2, 0.3, 0.5, 0.8, 1.0]:
                        for min_samples in [3, 5, 10]:
                          index += 1
                          model, labels = fit_unsupervised_model(X, model_type=model_type, eps=eps, min_samples=min_samples)
                          if model is not None and labels is not None and max(labels)+1 >= 2:
                              #silhouette = silhouette_score(X, labels)
                              calinski = calinski_harabasz_score(X, labels)
                              davies = davies_bouldin_score(X, labels)
                              ars = adjusted_rand_score(status_subset, labels)
                              results.append([scale_method, features_to_include, model_type, data_subset, X, labels, status_subset, f"eps:{eps}_min_samples:{min_samples}", max(labels)+1, np.nan, calinski, davies, ars])
                              print(f"{index} Scale method: {scale_method}, Features: {features_to_include}, Model: {model_type}, Data subset: {data_subset}")
                          else:
                              print(f"DBSCAN with eps={eps}, min_samples={min_samples}, Scale method={scale_method}, features={features_to_include}, data_subset={data_subset} resulted in only one cluster or all noise. Skipping metrics calculation.")
                elif model_type == 'agglomerativeclustering':
                    for n_clusters in range(1, 16):
                        for linkage in ['ward', 'complete', 'average', 'single']:
                          index += 1
                          if n_clusters == 1:
                              model, labels = fit_unsupervised_model(X, model_type=model_type, n_clusters=None, linkage=linkage)
                          else:
                              model, labels = fit_unsupervised_model(X, model_type=model_type, n_clusters=n_clusters, linkage=linkage)
                          if model is not None and labels is not None and max(labels)+1 >= 2:
                              silhouette = silhouette_score(X, labels)
                              calinski = calinski_harabasz_score(X, labels)
                              davies = davies_bouldin_score(X, labels)
                              ars = adjusted_rand_score(status_subset, labels)
                              results.append([scale_method, features_to_include, model_type, data_subset, X, labels, status_subset, f"n_clusters = {n_clusters}, linkage={linkage}", n_clusters, silhouette, calinski, davies, ars])
                              print(f"{index} Scale method: {scale_method}, Features: {features_to_include}, Model: {model_type}, Data subset: {data_subset}")
                          else:
                              print(f"AgglomerativeClustering with n_clusters={n_clusters}, linkage={linkage}, Scale method={scale_method}, features={features_to_include}, data_subset={data_subset} resulted in only one cluster. Skipping metrics calculation.")
                elif model_type == 'spectral':
                    if X is not None:
                        index += 1
                        model, labels = fit_unsupervised_model(X, model_type=model_type, n_clusters=len(data_subset))
                        if model is not None and labels is not None and max(labels)+1 >= 2:
                            silhouette = silhouette_score(X, labels)
                            calinski = calinski_harabasz_score(X, labels)
                            davies = davies_bouldin_score(X, labels)
                            ars = adjusted_rand_score(status_subset, labels)
                            results.append([scale_method, features_to_include, model_type, data_subset, X, labels, status_subset, f"n_clusters = {len(data_subset)}", len(data_subset), silhouette, calinski, davies, ars])
                            print(f"{index} Scale method: {scale_method}, Features: {features_to_include}, Model: {model_type}, Data subset: {data_subset}")
                        else:
                            print(f"AgglomerativeClustering with n_clusters={n_clusters}, linkage={linkage}, Scale method={scale_method}, features={features_to_include}, data_subset={data_subset} resulted in only one cluster. Skipping metrics calculation.")

df_results = pd.DataFrame(results, columns=['Scaling', 'Features', 'Model', 'Data_subset', 'Data', 'Labels', 'Status', 'Parameters', 'Clusters', 'Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin', 'Adjusted_Rand_Score'])
df_results


1 Scale method: standard, Features: ['communication'], Model: spectral, Data subset: ['graduated', 'retired']
2 Scale method: standard, Features: ['popularity'], Model: spectral, Data subset: ['graduated', 'retired']
3 Scale method: standard, Features: ['stability'], Model: spectral, Data subset: ['graduated', 'retired']
4 Scale method: standard, Features: ['technical_activity'], Model: spectral, Data subset: ['graduated', 'retired']
5 Scale method: standard, Features: ['quality'], Model: spectral, Data subset: ['graduated', 'retired']
6 Scale method: standard, Features: ['communication', 'popularity'], Model: spectral, Data subset: ['graduated', 'retired']



Graph is not fully connected, spectral embedding may not work as expected.



7 Scale method: standard, Features: ['communication', 'stability'], Model: spectral, Data subset: ['graduated', 'retired']
8 Scale method: standard, Features: ['communication', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'retired']
9 Scale method: standard, Features: ['communication', 'quality'], Model: spectral, Data subset: ['graduated', 'retired']
10 Scale method: standard, Features: ['popularity', 'stability'], Model: spectral, Data subset: ['graduated', 'retired']
11 Scale method: standard, Features: ['popularity', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'retired']
12 Scale method: standard, Features: ['popularity', 'quality'], Model: spectral, Data subset: ['graduated', 'retired']
13 Scale method: standard, Features: ['stability', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'retired']



Graph is not fully connected, spectral embedding may not work as expected.



14 Scale method: standard, Features: ['stability', 'quality'], Model: spectral, Data subset: ['graduated', 'retired']
15 Scale method: standard, Features: ['technical_activity', 'quality'], Model: spectral, Data subset: ['graduated', 'retired']
16 Scale method: standard, Features: ['communication', 'popularity', 'stability'], Model: spectral, Data subset: ['graduated', 'retired']
17 Scale method: standard, Features: ['communication', 'popularity', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'retired']
18 Scale method: standard, Features: ['communication', 'popularity', 'quality'], Model: spectral, Data subset: ['graduated', 'retired']
19 Scale method: standard, Features: ['communication', 'stability', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'retired']
20 Scale method: standard, Features: ['communication', 'stability', 'quality'], Model: spectral, Data subset: ['graduated', 'retired']
21 Scale method: standard, Features: ['communication'


Graph is not fully connected, spectral embedding may not work as expected.



34 Scale method: standard, Features: ['stability'], Model: spectral, Data subset: ['graduated', 'evolved']
35 Scale method: standard, Features: ['technical_activity'], Model: spectral, Data subset: ['graduated', 'evolved']
36 Scale method: standard, Features: ['quality'], Model: spectral, Data subset: ['graduated', 'evolved']
37 Scale method: standard, Features: ['communication', 'popularity'], Model: spectral, Data subset: ['graduated', 'evolved']
38 Scale method: standard, Features: ['communication', 'stability'], Model: spectral, Data subset: ['graduated', 'evolved']
39 Scale method: standard, Features: ['communication', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'evolved']
40 Scale method: standard, Features: ['communication', 'quality'], Model: spectral, Data subset: ['graduated', 'evolved']
41 Scale method: standard, Features: ['popularity', 'stability'], Model: spectral, Data subset: ['graduated', 'evolved']
42 Scale method: standard, Features: ['populari


Graph is not fully connected, spectral embedding may not work as expected.



67 Scale method: standard, Features: ['quality'], Model: spectral, Data subset: ['graduated', 'bypassed']
68 Scale method: standard, Features: ['communication', 'popularity'], Model: spectral, Data subset: ['graduated', 'bypassed']
69 Scale method: standard, Features: ['communication', 'stability'], Model: spectral, Data subset: ['graduated', 'bypassed']
70 Scale method: standard, Features: ['communication', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'bypassed']
71 Scale method: standard, Features: ['communication', 'quality'], Model: spectral, Data subset: ['graduated', 'bypassed']



Graph is not fully connected, spectral embedding may not work as expected.



72 Scale method: standard, Features: ['popularity', 'stability'], Model: spectral, Data subset: ['graduated', 'bypassed']
73 Scale method: standard, Features: ['popularity', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'bypassed']
74 Scale method: standard, Features: ['popularity', 'quality'], Model: spectral, Data subset: ['graduated', 'bypassed']
75 Scale method: standard, Features: ['stability', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'bypassed']
76 Scale method: standard, Features: ['stability', 'quality'], Model: spectral, Data subset: ['graduated', 'bypassed']
77 Scale method: standard, Features: ['technical_activity', 'quality'], Model: spectral, Data subset: ['graduated', 'bypassed']
78 Scale method: standard, Features: ['communication', 'popularity', 'stability'], Model: spectral, Data subset: ['graduated', 'bypassed']
79 Scale method: standard, Features: ['communication', 'popularity', 'technical_activity'], Model: spectral, Da


Graph is not fully connected, spectral embedding may not work as expected.



132 Scale method: standard, Features: ['communication', 'technical_activity'], Model: spectral, Data subset: ['retired', 'bypassed']
133 Scale method: standard, Features: ['communication', 'quality'], Model: spectral, Data subset: ['retired', 'bypassed']
134 Scale method: standard, Features: ['popularity', 'stability'], Model: spectral, Data subset: ['retired', 'bypassed']
135 Scale method: standard, Features: ['popularity', 'technical_activity'], Model: spectral, Data subset: ['retired', 'bypassed']
136 Scale method: standard, Features: ['popularity', 'quality'], Model: spectral, Data subset: ['retired', 'bypassed']
137 Scale method: standard, Features: ['stability', 'technical_activity'], Model: spectral, Data subset: ['retired', 'bypassed']
138 Scale method: standard, Features: ['stability', 'quality'], Model: spectral, Data subset: ['retired', 'bypassed']



Graph is not fully connected, spectral embedding may not work as expected.



139 Scale method: standard, Features: ['technical_activity', 'quality'], Model: spectral, Data subset: ['retired', 'bypassed']
140 Scale method: standard, Features: ['communication', 'popularity', 'stability'], Model: spectral, Data subset: ['retired', 'bypassed']
141 Scale method: standard, Features: ['communication', 'popularity', 'technical_activity'], Model: spectral, Data subset: ['retired', 'bypassed']
142 Scale method: standard, Features: ['communication', 'popularity', 'quality'], Model: spectral, Data subset: ['retired', 'bypassed']
143 Scale method: standard, Features: ['communication', 'stability', 'technical_activity'], Model: spectral, Data subset: ['retired', 'bypassed']
144 Scale method: standard, Features: ['communication', 'stability', 'quality'], Model: spectral, Data subset: ['retired', 'bypassed']
145 Scale method: standard, Features: ['communication', 'technical_activity', 'quality'], Model: spectral, Data subset: ['retired', 'bypassed']
146 Scale method: standard,


Graph is not fully connected, spectral embedding may not work as expected.



190 Scale method: standard, Features: ['technical_activity'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved']
191 Scale method: standard, Features: ['quality'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved']
192 Scale method: standard, Features: ['communication', 'popularity'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved']
193 Scale method: standard, Features: ['communication', 'stability'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved']
194 Scale method: standard, Features: ['communication', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved']
195 Scale method: standard, Features: ['communication', 'quality'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved']
196 Scale method: standard, Features: ['popularity', 'stability'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved']
197 Scale method: standard, Features: ['popularity', 'technical_activi


Graph is not fully connected, spectral embedding may not work as expected.



222 Scale method: standard, Features: ['quality'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
223 Scale method: standard, Features: ['communication', 'popularity'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
224 Scale method: standard, Features: ['communication', 'stability'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
225 Scale method: standard, Features: ['communication', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
226 Scale method: standard, Features: ['communication', 'quality'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
227 Scale method: standard, Features: ['popularity', 'stability'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
228 Scale method: standard, Features: ['popularity', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
229 Scale method: standard, Features: ['popularit


Graph is not fully connected, spectral embedding may not work as expected.



230 Scale method: standard, Features: ['stability', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
231 Scale method: standard, Features: ['stability', 'quality'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
232 Scale method: standard, Features: ['technical_activity', 'quality'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
233 Scale method: standard, Features: ['communication', 'popularity', 'stability'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
234 Scale method: standard, Features: ['communication', 'popularity', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
235 Scale method: standard, Features: ['communication', 'popularity', 'quality'], Model: spectral, Data subset: ['graduated', 'retired', 'bypassed']
236 Scale method: standard, Features: ['communication', 'stability', 'technical_activity'], Model: spectral, Data subset: ['gra


Graph is not fully connected, spectral embedding may not work as expected.



251 Scale method: standard, Features: ['stability'], Model: spectral, Data subset: ['graduated', 'evolved', 'bypassed']
252 Scale method: standard, Features: ['technical_activity'], Model: spectral, Data subset: ['graduated', 'evolved', 'bypassed']
253 Scale method: standard, Features: ['quality'], Model: spectral, Data subset: ['graduated', 'evolved', 'bypassed']
254 Scale method: standard, Features: ['communication', 'popularity'], Model: spectral, Data subset: ['graduated', 'evolved', 'bypassed']
255 Scale method: standard, Features: ['communication', 'stability'], Model: spectral, Data subset: ['graduated', 'evolved', 'bypassed']
256 Scale method: standard, Features: ['communication', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'evolved', 'bypassed']
257 Scale method: standard, Features: ['communication', 'quality'], Model: spectral, Data subset: ['graduated', 'evolved', 'bypassed']
258 Scale method: standard, Features: ['popularity', 'stability'], Model: spe


Graph is not fully connected, spectral embedding may not work as expected.



312 Scale method: standard, Features: ['popularity'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved', 'bypassed']
313 Scale method: standard, Features: ['stability'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved', 'bypassed']
314 Scale method: standard, Features: ['technical_activity'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved', 'bypassed']
315 Scale method: standard, Features: ['quality'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved', 'bypassed']
316 Scale method: standard, Features: ['communication', 'popularity'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved', 'bypassed']
317 Scale method: standard, Features: ['communication', 'stability'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved', 'bypassed']
318 Scale method: standard, Features: ['communication', 'technical_activity'], Model: spectral, Data subset: ['graduated', 'retired', 'evolved', 'bypassed']
319 Scale meth

Unnamed: 0,Scaling,Features,Model,Data_subset,Data,Labels,Status,Parameters,Clusters,Silhouette,Calinski-Harabasz,Davies-Bouldin,Adjusted_Rand_Score
0,standard,[communication],spectral,"[graduated, retired]",COM-1 COM-2 189 -0.510398 -0.34314...,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",189 graduated 190 graduated 191 gradu...,n_clusters = 2,2,0.651491,284.146052,0.641365,-0.016849
1,standard,[popularity],spectral,"[graduated, retired]",POP-1 189 -0.378903 190 -0.363747 191 ...,"[1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, ...",189 graduated 190 graduated 191 gradu...,n_clusters = 2,2,0.211792,35.134024,1.095949,0.045342
2,standard,[stability],spectral,"[graduated, retired]",STA-1 STA-2 STA-3 STA-4 ...,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, ...",189 graduated 190 graduated 191 gradu...,n_clusters = 2,2,0.287066,51.635937,1.147767,0.096899
3,standard,[technical_activity],spectral,"[graduated, retired]",TEC-1 TEC-2 TEC-3 TEC-4 18...,"[1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, ...",189 graduated 190 graduated 191 gradu...,n_clusters = 2,2,0.340822,65.059009,1.236161,0.020827
4,standard,[quality],spectral,"[graduated, retired]",SWQ-1 SWQ-2.1 SWQ-2.2 SWQ-2.3 ...,"[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, ...",189 graduated 190 graduated 191 gradu...,n_clusters = 2,2,0.301120,46.934585,1.488118,-0.068791
...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,standard,"[communication, popularity, stability, quality]",spectral,"[graduated, retired, evolved, bypassed]",COM-1 COM-2 POP-1 STA-1 ...,"[0, 2, 1, 1, 1, 1, 2, 2, 0, 1, 1, 0, 0, 0, 2, ...",0 evolved 1 evolved 2 evolve...,n_clusters = 4,4,0.086733,38.959234,2.033220,0.006078
337,standard,"[communication, popularity, technical_activity...",spectral,"[graduated, retired, evolved, bypassed]",COM-1 COM-2 POP-1 TEC-1 ...,"[2, 1, 0, 0, 0, 0, 3, 2, 1, 0, 0, 3, 2, 3, 1, ...",0 evolved 1 evolved 2 evolve...,n_clusters = 4,4,0.021862,41.443252,2.055126,0.027924
338,standard,"[communication, stability, technical_activity,...",spectral,"[graduated, retired, evolved, bypassed]",COM-1 COM-2 STA-1 STA-2 ...,"[3, 0, 1, 1, 1, 1, 0, 0, 3, 1, 1, 3, 3, 3, 0, ...",0 evolved 1 evolved 2 evolve...,n_clusters = 4,4,0.076605,37.916133,2.104624,-0.000041
339,standard,"[popularity, stability, technical_activity, qu...",spectral,"[graduated, retired, evolved, bypassed]",POP-1 STA-1 STA-2 STA-3 ...,"[3, 1, 0, 2, 3, 2, 1, 1, 3, 2, 2, 0, 3, 0, 1, ...",0 evolved 1 evolved 2 evolve...,n_clusters = 4,4,0.000909,33.492470,2.558306,0.029453


### Showing result

#### Best scores for each metric

In [None]:
for metric in ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']:
    if metric == 'Davies-Bouldin':
        df_results[metric] = 1 - MinMaxScaler().fit_transform(df_results[[metric]])
    else:
        df_results[metric] = MinMaxScaler().fit_transform(df_results[[metric]])

df_filtered = df_results[df_results['Clusters'] >= 2]

best_scores = pd.DataFrame()
for metric in ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']:
    idxmax_values = df_filtered.groupby('Model')[metric].idxmax()
    valid_idxmax_values = idxmax_values[idxmax_values.notna()]
    metric_best_scores = df_filtered.loc[valid_idxmax_values]
    best_scores = pd.concat([best_scores, metric_best_scores])

print("Best scores for each metric:")
best_scores

Best scores for each metric:


Unnamed: 0,Scaling,Features,Model,Data_subset,Data,Labels,Status,Parameters,Clusters,Silhouette,Calinski-Harabasz,Davies-Bouldin,Adjusted_Rand_Score
32,standard,[popularity],spectral,"[graduated, evolved]",POP-1 0 1.371812 1 -0.378903 2 ...,"[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, ...",0 evolved 1 evolved 2 evo...,n_clusters = 2,2,1.0,1.0,0.945836,-0.001604
32,standard,[popularity],spectral,"[graduated, evolved]",POP-1 0 1.371812 1 -0.378903 2 ...,"[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, ...",0 evolved 1 evolved 2 evo...,n_clusters = 2,2,1.0,1.0,0.945836,-0.001604
0,standard,[communication],spectral,"[graduated, retired]",COM-1 COM-2 189 -0.510398 -0.34314...,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",189 graduated 190 graduated 191 gradu...,n_clusters = 2,2,0.885407,0.840391,1.0,-0.016849


In [74]:
for index, score in best_scores.iterrows():
  UMAP_plot(score['Data'], score['Labels'], score['Status'])

ValueError: All arrays must be of the same length

#### Top 10 best scores from a combined overall score

In [None]:
# Calculate the overall score
df_filtered.loc[:, 'Overall_Score'] = df_filtered[['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']].mean(axis=1)

top_10_overall = df_filtered.nlargest(10, 'Overall_Score')
print("\nTop 10 models overall:")
top_10_overall

In [None]:
for label in top_10_overall['Labels']:
  UMAP_plot(X, label, status)

#### Top 10 best scores from a combined overall score with 3 clusters

In [None]:
top_10_3_clusters = df_filtered[df_filtered['Clusters'] == 3].nlargest(10, 'Overall_Score')
print("\nTop 10 models with exactly 3 clusters:")
top_10_3_clusters

In [None]:
for label in top_10_3_clusters['Labels']:
  UMAP_plot(X, label, status)

#### Top 10 best scores from a combined overall score with 3 clusters and only DBSCAN

In [None]:
top_10_3_clusters_dbscan = df_filtered[(df_filtered['Model'] == 'dbscan') & (df_filtered['Clusters'] == 3)].nlargest(10, 'Overall_Score')
top_10_3_clusters_dbscan

In [None]:
for label in top_10_3_clusters_dbscan['Labels']:
  UMAP_plot(X, label, status)

#### Top 10 best scores from a combined overall score with 3 clusters and only KMeans

In [None]:
top_10_3_clusters_kmeans = df_filtered[(df_filtered['Model'] == 'kmeans') & (df_filtered['Clusters'] == 3)].nlargest(10, 'Overall_Score')
top_10_3_clusters_kmeans

In [None]:
for label in top_10_3_clusters_kmeans['Labels']:
  UMAP_plot(X, label, status)

#### Top 10 best scores from a combined overall score with 3 clusters and only AgglomerativeClustering

In [None]:
top_10_3_clusters_agglomerativeclustering = df_filtered[(df_filtered['Model'] == 'agglomerativeclustering') & (df_filtered['Clusters'] == 3)].nlargest(10, 'Overall_Score')
top_10_3_clusters_agglomerativeclustering

In [None]:
for label in top_10_3_clusters_agglomerativeclustering['Labels']:
  UMAP_plot(X, label, status)

#### Top 10 best scores from a combined overall score with 3 clusters and all features

In [None]:
top_10_3_clusters_specific_features = df_filtered[
    (df_filtered['Clusters'] == 3) &
    (df_filtered['Features'].apply(lambda x: set(x) == set(['stability', 'technical_activity', 'quality', 'popularity', 'communication'])))
].nlargest(10, 'Overall_Score')

print("\nTop 10 models with exactly 3 clusters and specific features:")
top_10_3_clusters_specific_features


In [None]:
for label in top_10_3_clusters_specific_features['Labels']:
  UMAP_plot(X, label, status)

#### Top 10 best Adjusted Rand index (ARI) scores

In [None]:
top_10_ari = df_results.nlargest(10, 'Adjusted_Rand_Score')

print("\nTop 10 ARI score models")
top_10_ari

In [None]:
for label in top_10_ari['Labels']:
  UMAP_plot(X, label, status)

##External validation

We want to identity inconsistencies between (retired/graduated) labels and clusters

###Homogeneity and completeness

* homogeneity: each cluster contains only members of a single class.

* completeness: all members of a given class are assigned to the same cluster.

1 = perfect score, 0 = bad

[Source](https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness-and-v-measure)

In [None]:
from sklearn import metrics

metrics.homogeneity_completeness_v_measure(status, y_pred)

###Comparing with classification, accurary and feature importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y_clusters, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, rf_pred)
print(f"Random Forest Accuracy: {accuracy:.2f}")


Feature importance

In [None]:
import seaborn as sns


overall_importance = rf.feature_importances_

plt.figure(figsize=(20, 5))
sns.barplot(x=features, y=overall_importance, palette="viridis")
plt.xlabel("Feature")
plt.ylabel("Overall Importance Score")
plt.title("Overall Feature Importance (Random Forest)")
plt.show()

df_cluster = pd.DataFrame({
    "cluster": y_clusters,
    "status": status
})
status_distribution = df_cluster.groupby(["cluster", "status"]).size().unstack(fill_value=0)
status_percentage = status_distribution.div(status_distribution.sum(axis=1), axis=0)* 100

number_clusters = len(df_cluster["cluster"].unique())
cluster_importance = []
for cluster in range(number_clusters):
    binary_target = (y_clusters == cluster)
    rf_cluster = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_cluster.fit(X, binary_target)
    cluster_importance.append(rf_cluster.feature_importances_)


#get most importent feature for each cluster
most_important_feature_idx = np.argmax(cluster_importance, axis=1)
most_important_feature = [features[idx] for idx in most_important_feature_idx]
max_importance_values = np.max(cluster_importance, axis=1)
clusters = [f"Cluster {i}" for i in range(number_clusters)]

#most importent feature for each cluster
plt.figure(figsize=(12, 5))
sns.barplot(x=clusters, y=max_importance_values, hue=most_important_feature, palette="viridis")
plt.xlabel("Clusters")
plt.ylabel("Importance")
plt.title("Most important feature for each cluster")
plt.legend(title="Features", bbox_to_anchor=(1.2, 1), loc='upper right')
plt.show()

#status distribution
plt.figure(figsize=(12, 5))
status_percentage.plot(kind="bar", stacked=True, colormap="viridis")
plt.xlabel("Clusters")
plt.ylabel("Percentage")
plt.title("Status distribution")
plt.legend(title="Status", bbox_to_anchor=(1.2, 1), loc='upper right')
plt.show()