# MiniBatchKMeans Cluster Assignment
___

## Setup

### Importing Packages

In [1]:
import os
import shutil
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from astronomaly.dimensionality_reduction import pca

### Directories

In [2]:
# Everything this notebook reads or writes lives under ./Data, resolved
# against the directory the kernel was started in.
data_root_dir = os.path.join(os.getcwd(), 'Data')

# Folder holding the DeCaLS image cut-outs.
img_dir = os.path.join(data_root_dir, 'DeCaLs')

# Volunteer label table. NOTE: despite the "_dir" suffix this is a path to a
# single parquet file, not a directory.
label_dir = os.path.join(
    data_root_dir,
    'DeCaLs/gz_decals_volunteers_1_and_2.parquet',
)

# Cached feature tables (again file paths, not directories).
# features.parquet supersedes the earlier 'GalaxyZoo_Main_Features.csv' input.
feature_dir = os.path.join(data_root_dir, 'features.parquet')
pca_feature_dir = os.path.join(data_root_dir, 'pca_features.parquet')

## Features

In [4]:
# Load the per-image feature table (one row per image, indexed by image name).
#
# The parquet file is a cached conversion of the original CSV export. When the
# cache is present it is read directly; when it is missing (e.g. on a fresh
# checkout) it is rebuilt from the CSV so the notebook still runs top to
# bottom instead of depending on manually un-commenting code.
if os.path.exists(feature_dir):
    features = pd.read_parquet(feature_dir)
else:
    features_csv_path = os.path.join(data_root_dir, 'GalaxyZoo_Main_Features.csv')
    # Column 1 of the CSV becomes the index; 'Unnamed: 0' is dropped
    # (presumably a leftover row counter from an earlier DataFrame.to_csv).
    features = pd.read_csv(features_csv_path, index_col=1).drop(columns='Unnamed: 0')
    features.to_parquet(feature_dir)
print('Features: ', features.shape)
print(features.head(n=2))

Features:  (230575, 512)
                     Feature: 0  Feature: 1  Feature: 2  Feature: 3  \
Image Name                                                            
J085308.16+165953.5    0.383313    1.037078    0.107918    0.696278   
J140925.33+130923.4    0.335099    0.656469    0.257390    0.540220   

                     Feature: 4  Feature: 5  Feature: 6  Feature: 7  \
Image Name                                                            
J085308.16+165953.5    0.640173    0.288795    0.087937    0.874013   
J140925.33+130923.4    0.442894    0.387346    0.488610    1.072740   

                     Feature: 8  Feature: 9  ...  Feature: 502  Feature: 503  \
Image Name                                   ...                               
J085308.16+165953.5    0.718489    0.099370  ...      1.916415      0.087020   
J140925.33+130923.4    0.414410    0.673366  ...      0.204061      0.257959   

                     Feature: 504  Feature: 505  Feature: 506  Feature: 507  \
Image

## PCA

In [5]:
# Load the PCA-reduced feature table.
#
# The reduction is cached on disk. If the cache is missing, recompute it from
# `features` with astronomaly's PCA_Decomposer (same arguments as the run that
# produced the cached file) and write it back, so a clean environment can
# reproduce the notebook end to end.
if os.path.exists(pca_feature_dir):
    pca_features = pd.read_parquet(pca_feature_dir)
else:
    my_pca = pca.PCA_Decomposer(force_rerun=False, n_components=29, threshold=0.95, output_dir=data_root_dir)
    pca_features = my_pca.run(features)
    pca_features.to_parquet(pca_feature_dir)
print('PCA: ', pca_features.shape)

PCA:  (230575, 29)


## MiniBatchKMeans Fitting

In [6]:
# Fit MiniBatchKMeans on the PCA features and store one cluster ID per image.
#
# MiniBatchKMeans is stochastic (random centroid initialisation and random
# mini-batches). Without a fixed seed the cluster IDs saved to disk change on
# every run, so anything downstream that refers to "cluster 5" would silently
# point at a different group. Pin the seed to make the output reproducible.
KMEANS_SEED = 42
my_kmeans = MiniBatchKMeans(
    n_clusters=20,
    max_iter=10,
    n_init="auto",
    random_state=KMEANS_SEED,
).fit(pca_features)
kmeans_labels = my_kmeans.predict(pca_features)

# The labels are attached to the index of `features`.
# NOTE(review): this assumes pca_features has the same rows in the same order
# as features — confirm against how pca_features.parquet was produced.
clusters = pd.DataFrame(index = features.index.copy())
clusters['Cluster'] = kmeans_labels
clusters.to_parquet(os.path.join(data_root_dir, 'MiniBatchKMeans_clusters.parquet'))
print(clusters.head(n=2))

                     Cluster
Image Name                  
J085308.16+165953.5        5
J140925.33+130923.4        8


In [7]:
# Append one column per cluster centre to the `clusters` table and save it.
#
# NOTE: MiniBatchKMeans.transform() maps samples into cluster-distance space,
# so these values are distances to each centre (smaller = closer), not
# probabilities. The 'Prob_' column prefix is kept as-is because the saved
# parquet schema may be consumed elsewhere.
kmeans_prob = my_kmeans.transform(pca_features)
prob_cols = [f'Prob_{cluster_id}' for cluster_id in range(kmeans_prob.shape[1])]
clusters[prob_cols] = kmeans_prob
clusters.to_parquet(
    os.path.join(data_root_dir, 'MiniBatchKMeans_clusters_with_prob.parquet')
)
print(clusters.head(n=2))

                     Cluster    Prob_0    Prob_1      Prob_2     Prob_3  \
Image Name                                                                
J085308.16+165953.5        5  8.124366  8.016758  498.391971  39.929194   
J140925.33+130923.4        8  6.856061  7.149682  498.262090  39.579373   

                       Prob_4    Prob_5     Prob_6    Prob_7    Prob_8  ...  \
Image Name                                                              ...   
J085308.16+165953.5  8.256111  5.257393  89.677064  8.553757  8.637881  ...   
J140925.33+130923.4  6.495645  7.128894  89.313525  7.293323  4.641401  ...   

                       Prob_10    Prob_11    Prob_12   Prob_13   Prob_14  \
Image Name                                                                 
J085308.16+165953.5  41.873256  25.363678  11.679500  7.144777  8.515982   
J140925.33+130923.4  41.916680  25.233109  10.730865  7.329722  7.022551   

                      Prob_15   Prob_16   Prob_17   Prob_18   Prob_19  
Image