# Cluster Uncertainty
___

In [42]:
# Selects which '<strategy>_test_subset.parquet' file is loaded below.
# Options: 'BGMM' or 'MiniBatchKMeans'
clustering_strategy = 'MiniBatchKMeans'
# How a type is assigned to each cluster.
# Options: 'modal'   -> the most frequent Type within the cluster
#          'closest' -> the Type of the row with the smallest Distance_<cluster> value
assignment_type = 'closest'

## Setup

### Importing Packages

In [46]:
import os
import pandas as pd

### Directories

In [47]:
# Root data folder: 'Data' underneath the current working directory
data_root_dir = os.path.join(os.getcwd(), 'Data')

# Path of the test-subset parquet file for the selected clustering strategy
# (note: despite the `_dir` suffix this is a file path, not a directory)
test_subset_dir = os.path.join(
    data_root_dir,
    f'{clustering_strategy}_test_subset.parquet',
)

In [48]:
# Read the parquet file at `test_subset_dir` into a DataFrame
test_subset = pd.read_parquet(test_subset_dir)
# Print the (rows, columns) shape as a quick check that the load worked
print('Test Subset: ', test_subset.shape)

Test Subset:  (10624, 156)


## Final Cluster Assignments

In [49]:
# Count how many rows of each Type fall into each Cluster
counts = test_subset.groupby(['Cluster', 'Type']).size()
counts_df = counts.reset_index(name='Count')
# Total = number of rows in the cluster; Percentage = share of the cluster held by each Type
counts_df['Total'] = counts_df.groupby('Cluster')['Count'].transform('sum')
counts_df['Percentage'] = counts_df['Count'] / counts_df['Total'] * 100

if assignment_type == 'modal':
    # Label each cluster with its most frequent Type
    dominant = counts_df.loc[counts_df.groupby('Cluster')['Count'].idxmax()]
    # Summary table: one row per cluster with the dominant Type and its percentage share
    cluster_df = (
        dominant.set_index('Cluster')[['Type', 'Percentage']]
        .rename(columns={'Type': 'Dominant_Type'})
        .round({'Percentage': 2})
    )
    print(cluster_df)
elif assignment_type == 'closest':
    # Label each cluster with the Type of the row that has the smallest
    # Distance_<cluster> value among the rows assigned to that cluster
    closest = []
    # Iterate over the cluster labels actually present in the data instead of a
    # hard-coded range(20): this supports any number of clusters and avoids
    # calling idxmin() on an empty subset (which raises ValueError).
    for cluster in sorted(test_subset['Cluster'].dropna().unique()):
        # Plain int so the column name is 'Distance_3' rather than e.g. 'Distance_3.0'
        cluster = int(cluster)
        # Rows assigned to this cluster
        one_cluster = test_subset[test_subset['Cluster'] == cluster]
        # Index label of the row with the minimum distance for this cluster
        closest_idx = one_cluster[f'Distance_{cluster}'].idxmin()
        # Type recorded on that closest row
        cluster_type = one_cluster.loc[closest_idx, 'Type']
        # Share of the cluster that has the same Type as the closest row
        percentage = counts_df.loc[
            (counts_df['Cluster'] == cluster) & (counts_df['Type'] == cluster_type),
            'Percentage',
        ]
        percentage = percentage.round(2)
        closest.append({'Cluster': cluster, 'Closest_Type': cluster_type, 'Percentage': percentage.values[0]})
    # Summary table: one row per cluster
    closest_df = pd.DataFrame(closest).set_index('Cluster')
    print(closest_df)
else:
    # Fail loudly on a mistyped option instead of silently producing no output
    raise ValueError(
        f"Unknown assignment_type {assignment_type!r}; expected 'modal' or 'closest'"
    )

        Closest_Type  Percentage
Cluster                         
0                  S       39.39
1                  S       12.50
2                  E       91.72
3                  R       73.26
4                  S       91.49
5                  R       32.26
6                  S       20.69
7                  E       63.16
8                  E       94.34
9                  S       27.08
10                 E       34.29
11                 R       92.44
12                 S       95.08
13                 R       85.53
14                 R       23.81
15                 E       73.81
16                 S       39.79
17                 S       23.62
18                 R       92.99
19                 R       94.75
