In [1]:
# Importing all the necessary libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline

In [2]:
# Load the galaxy classification table from disk into a pandas DataFrame
galaxy_csv_path = 'Galaxy_Classes.csv'
df = pd.read_csv(galaxy_csv_path)
df

Unnamed: 0,objid,sample,asset_id,gz2_class,category
0,5.880000e+17,original,3,Sc?l,Spiral
1,5.880000e+17,original,4,Sc?l,Spiral
2,5.880000e+17,original,5,Er,Elliptical
3,5.880000e+17,original,6,Sc1t,Spiral
4,5.880000e+17,original,7,Sc1t,Spiral
...,...,...,...,...,...
267745,5.880000e+17,stripe82,295294,SBc4m,Irregular
267746,5.880000e+17,stripe82,295295,Er,Elliptical
267747,5.880000e+17,stripe82,295296,Er,Elliptical
267748,5.880000e+17,stripe82,295304,Sc?l,Spiral


In [3]:
#Adding a 'Target' column, seeded with the detailed gz2_class label, that will hold the final classification
df = df.assign(Target=df['gz2_class'])
df

Unnamed: 0,objid,sample,asset_id,gz2_class,category,Target
0,5.880000e+17,original,3,Sc?l,Spiral,Sc?l
1,5.880000e+17,original,4,Sc?l,Spiral,Sc?l
2,5.880000e+17,original,5,Er,Elliptical,Er
3,5.880000e+17,original,6,Sc1t,Spiral,Sc1t
4,5.880000e+17,original,7,Sc1t,Spiral,Sc1t
...,...,...,...,...,...,...
267745,5.880000e+17,stripe82,295294,SBc4m,Irregular,SBc4m
267746,5.880000e+17,stripe82,295295,Er,Elliptical,Er
267747,5.880000e+17,stripe82,295296,Er,Elliptical,Er
267748,5.880000e+17,stripe82,295304,Sc?l,Spiral,Sc?l


In [4]:
#Checking the data type of every column before the labels are rewritten
df.dtypes

objid        float64
sample        object
asset_id       int64
gz2_class     object
category      object
Target        object
dtype: object

In [5]:
#Casting the Target column to str so that the substring checks used for relabelling only ever see strings
df = df.astype({'Target': str})

In [6]:
#Re-checking the data types after the cast; 'Target' is still reported as 'object' in the output
df.dtypes

objid        float64
sample        object
asset_id       int64
gz2_class     object
category      object
Target        object
dtype: object

Creating 16 different categories based on the different classes. The original classes are too detailed, so they are made more general to let the model pick up the more obvious features in the images. The new categories are more detailed than "category", but they disregard the tightness of the arms, the prominence of the bulge, the shape of the bulge and any abnormalities discovered by people (abnormalities are labelled as "A").

In [7]:
#Relabelling every row of the Target column into one of 16 usable classes (plus 'A' for anything else).
#The labels are always rebuilt from the untouched 'gz2_class' column, so re-running this cell gives the
#same result (re-applying the rules to already-collapsed labels would turn e.g. 'Se' or 'S5' into 'S').

def collapse_gz2_class(label):
    """Collapse one detailed gz2_class label into its coarse class.

    Rules are checked in order and the first match wins; every check is a
    case-sensitive substring test anywhere in the label.

    Parameters
    ----------
    label : object
        Detailed label, e.g. 'Sc?l', 'SBc4m' or 'Er'. It is passed through
        str() first, so a non-string value is handled as text instead of
        raising on the substring checks.

    Returns
    -------
    str
        'Er', 'Ei' or 'Ec' for ellipticals; 'Se' for edge-on spirals;
        'SB' / 'S' (barred / no-bar spiral) followed by '1'-'4' for the
        number of arms, '5' for a '+' marker, or nothing when the label has
        no arm-count marker; 'A' for anything else.
    """
    label = str(label)

    #Ellipticals: keep only the Er/Ei/Ec part in case there are more labels after it
    for elliptical in ('Er', 'Ei', 'Ec'):
        if elliptical in label:
            return elliptical

    #Edge-on spirals: all three variants are merged into a single class
    if any(edge_on in label for edge_on in ('Ser', 'Seb', 'Sen')):
        return 'Se'

    #Spirals: 'SB' has to be tested before the bare 'S' because every 'SB' label also contains 'S'
    if 'SB' in label:
        family = 'SB' #barred spiral
    elif 'S' in label:
        family = 'S' #no bar spiral
    else:
        return 'A' #means artifact which will be ignored

    #Number of arms: checked in the order 1, 2, 3, 4, '+' ('+' means more than 4 arms and is stored as '5')
    for marker, arms in (('1', '1'), ('2', '2'), ('3', '3'), ('4', '4'), ('+', '5')):
        if marker in label:
            return family + arms
    return family #no arm-count marker in the label (e.g. 'Sc?l' becomes 'S')


#One pass over the column instead of several df.loc lookups per row; this also does not depend on the
#index labels being exactly 0..n-1
df['Target'] = df['gz2_class'].map(collapse_gz2_class)

df['Target'] #Checking that all the values had been labelled

0           S
1           S
2          Er
3          S1
4          S1
         ... 
267745    SB4
267746     Er
267747     Er
267748      S
267749      S
Name: Target, Length: 267750, dtype: object

In [8]:
#Displaying the frame to compare the new 'Target' labels with the original 'gz2_class' values
df

Unnamed: 0,objid,sample,asset_id,gz2_class,category,Target
0,5.880000e+17,original,3,Sc?l,Spiral,S
1,5.880000e+17,original,4,Sc?l,Spiral,S
2,5.880000e+17,original,5,Er,Elliptical,Er
3,5.880000e+17,original,6,Sc1t,Spiral,S1
4,5.880000e+17,original,7,Sc1t,Spiral,S1
...,...,...,...,...,...,...
267745,5.880000e+17,stripe82,295294,SBc4m,Irregular,SB4
267746,5.880000e+17,stripe82,295295,Er,Elliptical,Er
267747,5.880000e+17,stripe82,295296,Er,Elliptical,Er
267748,5.880000e+17,stripe82,295304,Sc?l,Spiral,S


In [9]:
#Counting the rows per new class; the output shows the classes are strongly imbalanced
#(tens of thousands of rows for the largest classes, about a thousand for the smallest)
df['Target'].value_counts()

S      54148
Ei     53212
Er     49196
SB2    26401
Se     19935
SB     18069
S2     15729
Ec      9479
S3      6337
SB3     3715
S1      3295
S5      2600
S4      1778
SB5     1068
SB1     1053
SB4     1052
A        683
Name: Target, dtype: int64

In [10]:
#Writing the relabelled table to a new CSV file, without the DataFrame index.
#NOTE: rows labelled 'A' are still written here; nothing in this notebook filters them out.

output_csv_path = "Galaxy_16classes.csv"
df.to_csv(output_csv_path, index=False)