# Creating Filtered Data Files
___

## Setup

### Importing Packages

In [1]:
import os
import pandas as pd
from astronomaly.dimensionality_reduction import pca

### Directories

In [2]:
# All inputs and outputs of this notebook live under ./Data, resolved against
# the current working directory of the kernel.
data_root_dir = os.path.join(os.getcwd(), 'Data')

# Folder that is searched (recursively) for the galaxy image cut-outs.
img_dir = os.path.join(data_root_dir, 'DeCaLs')

# NOTE: despite the `_dir` suffix, the next two names are paths to *files*.
# Volunteer vote table (parquet), stored inside the image folder.
label_dir = os.path.join(data_root_dir, 'DeCaLs/gz_decals_volunteers_1_and_2.parquet')

# Feature table (CSV).
feature_dir = os.path.join(data_root_dir, 'GalaxyZoo_Main_Features.csv')

## Volunteer Labels

In [3]:
# Read the volunteer vote table and key every row by its galaxy name (iauname).
volunteer_labels = pd.read_parquet(label_dir).set_index('iauname')
print('Volunteer Labels: ', volunteer_labels.shape)

Volunteer Labels:  (92960, 114)


### Fixing fractions and totals

In [4]:
# For each question, the answer columns whose raw vote counts are summed into a
# '<question>_manual-total' column and then turned into per-answer
# '<question>_<answer>_manual-fraction' columns.
# Order matters: it fixes both the order of the additions and the order in
# which the new columns are appended to the frame.
answers_per_question = {
    'disk-edge-on': ['yes', 'no'],
    'merging': ['merger', 'tidal-debris', 'both', 'neither'],
    'smooth-or-featured': ['smooth', 'featured-or-disk', 'artifact'],
    'how-rounded': ['completely', 'in-between', 'cigar-shaped'],
    'has-spiral-arms': ['yes', 'no'],
}

for question, answers in answers_per_question.items():
    total_col = f'{question}_manual-total'

    # Add the answer counts left to right so that missing values propagate
    # exactly as they do with a plain chain of `+`.
    vote_total = volunteer_labels[f'{question}_{answers[0]}']
    for answer in answers[1:]:
        vote_total = vote_total + volunteer_labels[f'{question}_{answer}']
    volunteer_labels[total_col] = vote_total

    # Share of the question's votes that went to each answer.
    # Rows with a zero total end up as NaN/inf rather than raising.
    for answer in answers:
        volunteer_labels[f'{question}_{answer}_manual-fraction'] = (
            volunteer_labels[f'{question}_{answer}'] / volunteer_labels[total_col]
        )

print(volunteer_labels.shape)

(92960, 133)


### Edge-on Galaxies

In [5]:
# Selection criteria for confidently classified edge-on galaxies.
# Each line is a boolean Series aligned with `volunteer_labels`.
enough_votes = volunteer_labels['smooth-or-featured_manual-total'] >= 35
enough_edge_on_votes = volunteer_labels['disk-edge-on_manual-total'] >= 5
mostly_edge_on = volunteer_labels['disk-edge-on_yes_manual-fraction'] > 0.8
enough_merging_votes = volunteer_labels['merging_manual-total'] >= 10
rarely_merger = volunteer_labels['merging_merger_manual-fraction'] < 0.2

# Keep the rows passing every criterion and tag them with type 'E' (edge-on).
edge_on_gal = volunteer_labels.loc[
    enough_votes
    & enough_edge_on_votes
    & mostly_edge_on
    & enough_merging_votes
    & rarely_merger
].assign(Type='E')
print('Edge-on Galaxies: ', edge_on_gal.shape[0])

Edge-on Galaxies:  6249


In [6]:
# File names ('<iauname>.png') expected for the selected edge-on galaxies.
filenames_to_find = {f'{name}.png' for name in edge_on_gal.index.astype(str)}

# Count every file anywhere below img_dir whose name is one of the expected ones.
count = sum(
    1
    for _root, _dirs, files in os.walk(img_dir)
    for file in files
    if file in filenames_to_find
)

print(count)

5949


### Round Elliptical Galaxies

In [7]:
# Selection criteria for confidently classified round elliptical galaxies.
# Each line is a boolean Series aligned with `volunteer_labels`.
enough_votes = volunteer_labels['smooth-or-featured_manual-total'] >= 35
mostly_smooth = volunteer_labels['smooth-or-featured_smooth_manual-fraction'] > 0.8
enough_shape_votes = volunteer_labels['how-rounded_manual-total'] >= 5
mostly_round = volunteer_labels['how-rounded_completely_manual-fraction'] > 0.8
enough_merging_votes = volunteer_labels['merging_manual-total'] >= 10
rarely_merger = volunteer_labels['merging_merger_manual-fraction'] < 0.2

# Keep the rows passing every criterion and tag them with type 'R' (round).
round_ell_gal = volunteer_labels.loc[
    enough_votes
    & mostly_smooth
    & enough_shape_votes
    & mostly_round
    & enough_merging_votes
    & rarely_merger
].assign(Type='R')
print('Round Elliptical Galaxies: ', round_ell_gal.shape[0])

Round Elliptical Galaxies:  4748


In [8]:
# File names ('<iauname>.png') expected for the selected round ellipticals.
filenames_to_find = {f'{name}.png' for name in round_ell_gal.index.astype(str)}

# Count every file anywhere below img_dir whose name is one of the expected ones.
count = sum(
    1
    for _root, _dirs, files in os.walk(img_dir)
    for file in files
    if file in filenames_to_find
)

print(count)

4174


### Spiral Galaxies

In [9]:
# Selection criteria for confidently classified spiral galaxies.
# Each line is a boolean Series aligned with `volunteer_labels`.
enough_votes = volunteer_labels['smooth-or-featured_manual-total'] >= 35
mostly_featured = volunteer_labels['smooth-or-featured_featured-or-disk_manual-fraction'] > 0.8
enough_arm_votes = volunteer_labels['has-spiral-arms_manual-total'] >= 5
mostly_spiral = volunteer_labels['has-spiral-arms_yes_manual-fraction'] > 0.8
enough_merging_votes = volunteer_labels['merging_manual-total'] >= 10
rarely_merger = volunteer_labels['merging_merger_manual-fraction'] < 0.2

# Keep the rows passing every criterion and tag them with type 'S' (spiral).
spiral_gal = volunteer_labels.loc[
    enough_votes
    & mostly_featured
    & enough_arm_votes
    & mostly_spiral
    & enough_merging_votes
    & rarely_merger
].assign(Type='S')
print('Spiral Galaxies: ', spiral_gal.shape[0])

Spiral Galaxies:  3967


In [10]:
# File names ('<iauname>.png') expected for the selected spiral galaxies.
filenames_to_find = {f'{name}.png' for name in spiral_gal.index.astype(str)}

# Count every file anywhere below img_dir whose name is one of the expected ones.
count = sum(
    1
    for _root, _dirs, files in os.walk(img_dir)
    for file in files
    if file in filenames_to_find
)

print(count)

3932


## Combining the assigned galaxies

In [11]:
# Stack the three morphology samples into one table; the 'Type' column added
# by each selection records which sample a row came from.
assigned = pd.concat([spiral_gal, round_ell_gal, edge_on_gal])
print('Galaxy Assignment: ', assigned.shape)

# Check the combined table for repeated iauname values. A repeated name means
# the same galaxy is present more than once (e.g. it passed more than one of
# the selections), so its class is ambiguous.
dupes_labels = assigned.index[assigned.index.duplicated()]
print(f"Volunteer Labels duplicates: {len(dupes_labels)}")
if len(dupes_labels):
    print(dupes_labels.unique())

# Removing ambiguous classifications: drop *every* row of any galaxy whose
# name occurs more than once. The mask is derived from the data instead of a
# hard-coded list of names, so the cell stays correct (and does not raise
# KeyError) if the input table or the selection thresholds change.
assigned = assigned[~assigned.index.duplicated(keep=False)]

assigned.to_parquet(os.path.join(data_root_dir, 'assigned_volunteer_labels.parquet'))
print('Confident Galaxy Assignment: ', assigned.shape)

Galaxy Assignment:  (14964, 134)
Volunteer Labels duplicates: 2
Index(['J143434.58+140548.3', 'J125711.89-014223.0'], dtype='object', name='iauname')
Confident Galaxy Assignment:  (14960, 134)


## Features

In [12]:
# Load the feature table, using the CSV's second column (position 1) as the
# index, and discard the leftover 'Unnamed: 0' column, which is not needed.
features = (
    pd.read_csv(feature_dir, index_col=1)
    .drop(columns='Unnamed: 0')
)

# Keep a parquet copy of the cleaned table next to the other data files.
features_parquet_path = os.path.join(data_root_dir, 'GalaxyZoo_Main_Features.parquet')
features.to_parquet(features_parquet_path)
print('Features: ', features.shape)

Features:  (230575, 512)


### PCA

In [13]:
# Reduce the feature table with astronomaly's PCA stage.
# NOTE(review): n_components and threshold are both passed; presumably
# threshold is the target explained-variance fraction — confirm how the two
# interact in PCA_Decomposer.
my_pca = pca.PCA_Decomposer(
    force_rerun=True,
    n_components=29,
    threshold=0.95,
    output_dir=data_root_dir,
)
pca_features = my_pca.run(features)

# Persist the reduced features alongside the other data files.
pca_parquet_path = os.path.join(data_root_dir, 'pca_features.parquet')
pca_features.to_parquet(pca_parquet_path)
print('PCA: ', pca_features.shape)

Running PCA_Decomposer ...
Total explained variance: 0.951611267647979
Done! Time taken: 2.8624415397644043 s
PCA:  (230575, 29)
