# BGMM Cluster Assignment
___

## Setup

### Importing Packages

In [1]:
import os
import shutil
import pandas as pd
from sklearn.mixture import BayesianGaussianMixture
from astronomaly.dimensionality_reduction import pca

### Directories

In [2]:
# Root folder for every input and output of this notebook, resolved against
# the directory the kernel was started in.
data_root_dir = os.path.join(os.getcwd(), 'Data')

# DeCaLs image folder, plus the label table that lives inside it.
# NOTE: despite the `_dir` suffix, `label_dir` points at a parquet file.
img_dir = os.path.join(data_root_dir, 'DeCaLs')
label_dir = os.path.join(data_root_dir, 'DeCaLs/gz_decals_volunteers_1_and_2.parquet')

# Feature tables (again file paths, not directories): the full feature set and
# its PCA-reduced counterpart. The CSV variant below is the original source
# that the parquet file was converted from.
#feature_dir = os.path.join(data_root_dir, 'GalaxyZoo_Main_Features.csv')
pca_feature_dir = os.path.join(data_root_dir, 'pca_features.parquet')
feature_dir = os.path.join(data_root_dir, 'features.parquet')

## Features

In [3]:
# One-off CSV -> parquet conversion, kept for provenance. To re-run it,
# `feature_dir` must point at the CSV variant defined in the paths cell.
#features = pd.read_csv(feature_dir, index_col=1)
#features.drop('Unnamed: 0', axis=1, inplace=True)
#features.to_parquet(os.path.join(data_root_dir, 'features.parquet'))

# Load the cached feature table and show its size plus a two-row preview.
features = pd.read_parquet(feature_dir)
feature_preview = features.head(n=2)
print('Features: ', features.shape)
print(feature_preview)

Features:  (230575, 512)
                     Feature: 0  Feature: 1  Feature: 2  Feature: 3  \
Image Name                                                            
J085308.16+165953.5    0.383313    1.037078    0.107918    0.696278   
J140925.33+130923.4    0.335099    0.656469    0.257390    0.540220   

                     Feature: 4  Feature: 5  Feature: 6  Feature: 7  \
Image Name                                                            
J085308.16+165953.5    0.640173    0.288795    0.087937    0.874013   
J140925.33+130923.4    0.442894    0.387346    0.488610    1.072740   

                     Feature: 8  Feature: 9  ...  Feature: 502  Feature: 503  \
Image Name                                   ...                               
J085308.16+165953.5    0.718489    0.099370  ...      1.916415      0.087020   
J140925.33+130923.4    0.414410    0.673366  ...      0.204061      0.257959   

                     Feature: 504  Feature: 505  Feature: 506  Feature: 507  \
Image

## PCA

In [4]:
# Recipe that produced the cached PCA table, kept for provenance; it is not
# re-run here, the stored result is loaded instead.
#my_pca = pca.PCA_Decomposer(force_rerun=False, n_components=29, threshold=0.95, output_dir=data_root_dir)
#pca_features = my_pca.run(features)
#pca_features.to_parquet(os.path.join(data_root_dir, 'pca_features.parquet'))

# Load the PCA-reduced features from disk and report their dimensions.
pca_features = pd.read_parquet(pca_feature_dir)
pca_shape = pca_features.shape
print('PCA Features: ', pca_shape)

PCA Features:  (230575, 29)


## BGMM Fitting

In [5]:
# Fixed seed so the fitted mixture -- and therefore the cluster ids written to
# disk -- are identical on every Restart & Run All. Without a seed each fit
# starts from a different random initialisation and the (arbitrary) cluster
# numbering changes from run to run.
BGMM_RANDOM_STATE = 42

# Fit a Bayesian Gaussian mixture on the PCA-reduced features.
#   n_components=20                 upper bound on the number of mixture components
#   weight_concentration_prior=0.5  concentration prior on the component weights
#   n_init=10                       run ten initialisations and keep the best fit
#   max_iter=1000                   cap on EM iterations per initialisation
my_bgmm = BayesianGaussianMixture(
    n_components=20,
    weight_concentration_prior=0.5,
    n_init=10,
    max_iter=1000,
    random_state=BGMM_RANDOM_STATE,
).fit(pca_features)

# Hard assignment: index of the most probable component for every row.
bgm_labels = my_bgmm.predict(pca_features)

# The labels come back as a plain positional array, so they are attached to the
# index of `features`.
# NOTE(review): this assumes `pca_features` has the same rows in the same order
# as `features` -- confirm against how pca_features.parquet was written.
clusters = pd.DataFrame(index=features.index.copy())
clusters['Cluster'] = bgm_labels
clusters.to_parquet(os.path.join(data_root_dir, 'BGMM_clusters.parquet'))
print(clusters.head(n=2))
# Note kept from an earlier run (cluster ids were not stable before seeding):
# J085308.16+165953.5       18



                     Cluster
Image Name                  
J085308.16+165953.5        7
J140925.33+130923.4       14


In [6]:
# Soft assignment: one membership probability per mixture component and row.
bgm_prob = my_bgmm.predict_proba(pca_features)

# One column per component, named Prob_0 ... Prob_{k-1}.
prob_cols = [f'Prob_{component}' for component in range(bgm_prob.shape[1])]

# Extend the existing `clusters` frame in place (later cells may rely on the
# added columns) and store the combined table under a separate file name.
clusters[prob_cols] = bgm_prob
clusters_with_prob_path = os.path.join(data_root_dir, 'BGMM_clusters_with_prob.parquet')
clusters.to_parquet(clusters_with_prob_path)
print(clusters.head(n=2))
# Note kept from an earlier run:
#J140925.33+130923.4       19

                     Cluster        Prob_0        Prob_1  Prob_2  \
Image Name                                                         
J085308.16+165953.5        7  2.346102e-28  0.000000e+00     0.0   
J140925.33+130923.4       14  2.587263e-09  3.546854e-37     0.0   

                           Prob_3  Prob_4        Prob_5  Prob_6        Prob_7  \
Image Name                                                                      
J085308.16+165953.5  1.540840e-72     0.0  4.899476e-62     0.0  1.000000e+00   
J140925.33+130923.4  6.042901e-12     0.0  2.413077e-57     0.0  1.199869e-58   

                            Prob_8  ...       Prob_10        Prob_11  \
Image Name                          ...                                
J085308.16+165953.5  3.817140e-200  ...  1.681573e-78   0.000000e+00   
J140925.33+130923.4   5.541021e-34  ...  1.231344e-66  1.083103e-146   

                          Prob_12       Prob_13   Prob_14        Prob_15  \
Image Name                           