# Cluster Uncertainty
___

In [1]:
# Clustering strategy whose test subset is analysed: 'BGMM' or 'MiniBatchKMeans'
clustering_strategy = 'BGMM'
# How a type is assigned to each cluster: 'modal' (most common type in the
# cluster) or 'closest' (type of the point with the smallest distance)
assignment_type = 'modal'

## Setup

### Importing Packages

In [2]:
import os
import pandas as pd

### Directories

In [3]:
# Root data folder: the 'Data' directory under the current working directory
data_root_dir = os.path.join(os.getcwd(), 'Data')

# Path of the parquet file holding the test subset for the chosen clustering
# strategy (despite the "_dir" suffix this is a file path, not a directory)
test_subset_dir = os.path.join(
    data_root_dir,
    f'{clustering_strategy}_test_subset.parquet',
)

In [4]:
# Read the test subset from parquet and report its (rows, columns) shape
test_subset = pd.read_parquet(path=test_subset_dir)
print('Test Subset: ', test_subset.shape)

Test Subset:  (10624, 156)


## Final Cluster Assignments

In [5]:
# Count how many rows of each Type fall in each Cluster
counts = test_subset.groupby(['Cluster', 'Type']).size()
counts_df = counts.reset_index(name='Count')
# Per-cluster total, and each type's share of that total in percent
counts_df['Total'] = counts_df.groupby('Cluster')['Count'].transform('sum')
counts_df['Percentage'] = counts_df['Count'] / counts_df['Total'] * 100

if assignment_type == 'modal':
    # Dominant type = the row with the largest Count within each cluster
    dominant = counts_df.loc[counts_df.groupby('Cluster')['Count'].idxmax()]
    # Summary table indexed by cluster: dominant type and its share (2 d.p.)
    cluster_df = (
        dominant
        .set_index('Cluster')[['Type', 'Percentage']]
        .rename(columns={'Type': 'Dominant_Type'})
        .round({'Percentage': 2})
    )
    print(cluster_df)
elif assignment_type == 'closest':
    # Assign each cluster the type of its member with the smallest distance
    closest = []
    # Iterate over the clusters that actually occur in the data instead of a
    # hard-coded range(20): an absent cluster would give an empty subset and
    # make idxmin() fail, and a different cluster count would be mis-covered.
    for cluster in counts_df['Cluster'].unique():
        # Plain Python int so the 'Distance_<n>' column name and the stored
        # label match what an integer loop counter would produce
        cluster = int(cluster)
        # Rows assigned to this cluster
        one_cluster = test_subset[test_subset['Cluster'] == cluster]
        # Index label of the row with the minimum distance for this cluster
        closest_idx = one_cluster[f'Distance_{cluster}'].idxmin()
        # Galaxy type of that closest row
        cluster_type = one_cluster.loc[closest_idx, 'Type']
        # Share of the cluster that has the same type as the closest row
        is_match = (counts_df['Cluster'] == cluster) & (counts_df['Type'] == cluster_type)
        percentage = counts_df.loc[is_match, 'Percentage'].round(2)
        closest.append({'Cluster': cluster, 'Closest_Type': cluster_type, 'Percentage': percentage.values[0]})
    closest_df = pd.DataFrame(closest).set_index('Cluster')
    print(closest_df)
else:
    # Fail loudly on a typo in the config cell rather than printing nothing
    raise ValueError(
        f"Unknown assignment_type {assignment_type!r}; expected 'modal' or 'closest'"
    )

        Dominant_Type  Percentage
Cluster                          
0                   S       78.06
1                   S       97.16
2                   E       48.21
3                   E       66.72
4                   E       66.67
5                   E       47.31
6                   E       47.06
7                   R       97.36
8                   E       43.75
9                   E       45.24
10                  E       84.42
11                  E       66.49
12                  S       72.75
13                  E       47.54
14                  E       97.77
15                  E       58.90
16                  R       70.96
17                  S       58.93
18                  R       94.86
19                  R       75.57
