# Cluster Uncertainty
___

In [84]:
# Clustering algorithm whose saved outputs are analysed: 'BGMM' or 'MiniBatchKMeans'.
# The value is interpolated into the parquet file names read and written below.
clustering_strategy = 'MiniBatchKMeans'

# How each cluster is labelled:
#   'modal'   - the Type with the highest count in the cluster
#   'closest' - the Type of the member with the smallest Distance_<cluster> value
assignment_type = 'closest'

# Fail fast on a typo: only these two values are handled by the assignment cell,
# and any other value would otherwise be skipped silently.
if assignment_type not in ('modal', 'closest'):
    raise ValueError(f"assignment_type must be 'modal' or 'closest', got {assignment_type!r}")

## Setup

### Importing Packages

In [85]:
import os
import pandas as pd

### Directories

In [86]:
# Folder (relative to the current working directory) that holds the parquet files
working_dir = os.getcwd()
data_root_dir = os.path.join(working_dir, 'Data')

# Full path of the test-subset parquet file for the selected clustering strategy.
# NOTE: despite the `_dir` suffix this is a file path, not a directory.
test_subset_filename = f'{clustering_strategy}_test_subset3.parquet'
test_subset_dir = os.path.join(data_root_dir, test_subset_filename)

In [87]:
# Load the test subset and report its (rows, columns) shape as a quick sanity check
test_subset = pd.read_parquet(test_subset_dir)
subset_shape = test_subset.shape
print('Test Subset: ', subset_shape)

Test Subset:  (6779, 156)


## Final Cluster Assignments

In [103]:
# Count how many galaxies of each Type fall into each Cluster
counts = test_subset.groupby(['Cluster', 'Type']).size()
counts_df = counts.reset_index(name='Count')
# Per-cluster totals and the share (in %) that each Type contributes to its cluster
counts_df['Total'] = counts_df.groupby('Cluster')['Count'].transform('sum')
counts_df['Percentage'] = counts_df['Count'] / counts_df['Total'] * 100

if assignment_type == 'modal':
    # Label each cluster with its most common Type
    dominant = counts_df.loc[counts_df.groupby('Cluster')['Count'].idxmax()]
    # Build the summary as a new frame (rename + assign) rather than assigning into
    # a column slice, which can trigger SettingWithCopyWarning
    cluster_df = (
        dominant.set_index('Cluster')[['Type', 'Percentage']]
        .rename(columns={'Type': 'Dominant_Type'})
        .assign(Percentage=lambda frame: frame['Percentage'].round(2))
    )
    print(cluster_df)
elif assignment_type == 'closest':
    # Label each cluster with the Type of its member that has the smallest
    # Distance_<cluster> value.
    # Iterate over the cluster labels actually present instead of a hard-coded
    # range(20): an empty cluster would make idxmin() raise, and labels >= 20
    # would be ignored silently.
    cluster_labels = sorted(int(label) for label in test_subset['Cluster'].dropna().unique())
    closest = []
    for cluster in cluster_labels:
        distance_col = f'Distance_{cluster}'
        # Rows assigned to this cluster
        one_cluster = test_subset[test_subset['Cluster'] == cluster]
        # Index label and Type of the member with the minimum distance
        min_val = one_cluster[distance_col].idxmin()
        cluster_type = one_cluster.loc[min_val, 'Type']
        # Index labels (galaxy tags) of the five members with the smallest distances
        closest_five = one_cluster.nsmallest(5, distance_col)
        galaxy_tags = closest_five.index.tolist()
        # Share of the cluster that has the same Type as the closest member.
        # The lookup is empty when that Type is missing (NaN keys are dropped by
        # groupby); fall back to a float so no .round() is called on a plain int.
        type_share = counts_df.loc[
            (counts_df['Cluster'] == cluster) & (counts_df['Type'] == cluster_type),
            'Percentage',
        ]
        percentage = type_share.iloc[0].round(2) if not type_share.empty else 0.0
        closest.append({'Cluster':cluster, 'Percentage':percentage, 'Closest': min_val, 'Closest_Type': cluster_type, 'Galaxy_Tags':galaxy_tags})
    closest_df = pd.DataFrame(closest).set_index('Cluster')
    # The trailing '3' is the run number; keep it in step with the suffix of the
    # test-subset file this notebook loads
    closest_df.to_parquet(os.path.join(data_root_dir, f'{clustering_strategy}_{assignment_type}_assignment3.parquet'))
    print(closest_df.columns)
else:
    # Previously an unknown value skipped both branches without any message
    raise ValueError(f"assignment_type must be 'modal' or 'closest', got {assignment_type!r}")

Index(['Percentage', 'Closest', 'Closest_Type', 'Galaxy_Tags'], dtype='object')


In [105]:
def _load_assignment_run(run_number):
    """Read the saved cluster-assignment table for one run.

    Parameters
    ----------
    run_number : int
        Suffix of the parquet file, i.e.
        ``<clustering_strategy>_<assignment_type>_assignment<run_number>.parquet``
        inside ``data_root_dir``.

    Returns
    -------
    pandas.DataFrame
        The table as stored on disk.
    """
    filename = f'{clustering_strategy}_{assignment_type}_assignment{run_number}.parquet'
    return pd.read_parquet(os.path.join(data_root_dir, filename))


# Columns shown in the per-run summary
summary_columns = ['Closest', 'Percentage', 'Closest_Type']

# Load the three runs; the numbered names are kept so other cells can still use them
run1 = _load_assignment_run(1)
run2 = _load_assignment_run(2)
run3 = _load_assignment_run(3)

# Flattened, de-duplicated galaxy tags per run (only used by the optional block below)
tags1, tags2, tags3 = (run['Galaxy_Tags'].explode().unique() for run in (run1, run2, run3))

# Per-run summary tables
closest1, closest2, closest3 = (run[summary_columns] for run in (run1, run2, run3))
for summary in (closest1, closest2, closest3):
    print(summary)

# Build each set of closest-galaxy identifiers once instead of once per print call
ids1, ids2, ids3 = (set(summary['Closest']) for summary in (closest1, closest2, closest3))

# (label, galaxies picked as "closest" by every run in the comparison)
comparisons = [
    ('Run 1 and Run 2 Common Closest: ', ids1 & ids2),
    ('Run 1 and Run 3 Common Closest: ', ids1 & ids3),
    ('Run 2 and Run 3 Common Closest: ', ids2 & ids3),
    ('All Three Runs Common Closest: ', ids1 & ids2 & ids3),
]

print('##########################\nClosest Galaxy\n##########################')
for label, shared in comparisons:
    print(label, len(shared))
    print(shared)

# Optional: the same comparison on the five closest galaxies of every cluster
#print('##########################\nClosest 5 Galaxies\n##########################')

#print('Run 1 and Run 2 Common Tags: ', len(set(tags1).intersection(set(tags2))))
#print(set(tags1).intersection(set(tags2)))
#print('Run 1 and Run 3 Common Tags: ', len(set(tags1).intersection(set(tags3))))
#print('Run 2 and Run 3 Common Tags: ', len(set(tags2).intersection(set(tags3))))
#print('All Three Runs Common Tags: ', len(set(tags1).intersection(set(tags2)).intersection(set(tags3))))

                     Closest  Percentage Closest_Type
Cluster                                              
0        J150006.06+123947.9       62.22            S
1        J213442.37-015025.7       50.00            R
2        J144456.28+123003.2       48.15            S
3        J141836.11+193453.9       91.42            R
4        J220340.40-010325.7       84.48            S
5        J215035.70-010554.3       79.59            R
6        J162033.13+082211.8       28.16            R
7        J151724.80+135712.9       98.56            R
8        J232242.90-010552.4       66.67            S
9        J223207.25+000213.5       30.77            S
10       J212854.15+000042.1       47.37            R
11       J000712.35-052950.7       69.23            R
12       J211052.54-023805.9       20.00            S
13       J154146.42+071941.4       93.76            R
14       J164248.62+275853.3        5.94            E
15       J171754.83+270047.3       98.14            R
16       J081727.99-004101.0