In [1]:
#Importing necessary libraries

import random as rn
import os
import numpy as np
import pandas as pd
import cv2 as cv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Data locations: a CSV with the classification information, and a folder holding the images.
# The image folder can be overridden through the GALAXY_IMAGE_DIR environment variable so the
# notebook is not tied to one machine; without it the original local path is used.

df = pd.read_csv('Galaxy_Classes.csv')
pic_all = os.environ.get("GALAXY_IMAGE_DIR", "C:/python_lessons/January_Project/images/")

In [3]:
# Peek at the first 10 rows
df.head(n=10)

Unnamed: 0,objid,sample,asset_id,gz2_class,category
0,5.88e+17,original,3,Sc?l,Spiral
1,5.88e+17,original,4,Sc?l,Spiral
2,5.88e+17,original,5,Er,Elliptical
3,5.88e+17,original,6,Sc1t,Spiral
4,5.88e+17,original,7,Sc1t,Spiral
5,5.88e+17,original,8,Sc1t,Spiral
6,5.88e+17,original,9,Sc1t,Spiral
7,5.88e+17,original,11,Sb,Spiral
8,5.88e+17,original,12,Sb,Spiral
9,5.88e+17,original,13,Ec,Elliptical


This time the categories are simplified: all elliptical sub-types are merged into one class, all edge-on spirals into another, and the remaining spirals are grouped into those with arms and those with no arms.

In [4]:
# Helper that maps a single row's gz2_class string to a coarse class label
def categorise(row):
    """Return the coarse class label for one row.

    Parameters
    ----------
    row : mapping or pandas Series
        Must provide a 'gz2_class' entry.

    Returns
    -------
    str
        'E'       - elliptical ('Er', 'Ei' or 'Ec' appears in the class string)
        'Se'      - edge on spiral ('Ser', 'Seb' or 'Sen' appears)
        'S_arms'  - spiral with arms (contains 'S' and at least one digit)
        'S_N'     - spiral with no arms (contains 'S' but no digit)
        'Unknown' - anything else, including a missing / non-string value
    """
    x = row['gz2_class']
    # A missing cell arrives as float NaN (or None); `'Er' in x` would raise
    # TypeError on those, so treat every non-string value as unclassifiable.
    if not isinstance(x, str):
        return 'Unknown'
    # Order matters: the elliptical and edge-on checks must run before the generic 'S' checks.
    if any(code in x for code in ('Er', 'Ei', 'Ec')):
        return 'E' #elliptical
    if any(code in x for code in ('Ser', 'Seb', 'Sen')):
        return 'Se' #edge on spiral
    if 'S' in x:
        # A digit in the class string is what separates "with arms" from "no arms"
        return 'S_arms' if any(char.isdigit() for char in x) else 'S_N'
    return 'Unknown'

In [5]:
# The original "category" column is not needed, so remove it from df in place
del df['category']
df

Unnamed: 0,objid,sample,asset_id,gz2_class
0,5.880000e+17,original,3,Sc?l
1,5.880000e+17,original,4,Sc?l
2,5.880000e+17,original,5,Er
3,5.880000e+17,original,6,Sc1t
4,5.880000e+17,original,7,Sc1t
...,...,...,...,...
267745,5.880000e+17,stripe82,295294,SBc4m
267746,5.880000e+17,stripe82,295295,Er
267747,5.880000e+17,stripe82,295296,Er
267748,5.880000e+17,stripe82,295304,Sc?l


In [6]:
# Create a new "category" column by running categorise on every row
# (axis=1 hands each row to the function, so no lambda wrapper is needed)
df['category'] = df.apply(categorise, axis=1)
df

Unnamed: 0,objid,sample,asset_id,gz2_class,category
0,5.880000e+17,original,3,Sc?l,S_N
1,5.880000e+17,original,4,Sc?l,S_N
2,5.880000e+17,original,5,Er,E
3,5.880000e+17,original,6,Sc1t,S_arms
4,5.880000e+17,original,7,Sc1t,S_arms
...,...,...,...,...,...
267745,5.880000e+17,stripe82,295294,SBc4m,S_arms
267746,5.880000e+17,stripe82,295295,Er,E
267747,5.880000e+17,stripe82,295296,Er,E
267748,5.880000e+17,stripe82,295304,Sc?l,S_N


In [7]:
# Rows per category: the 4 galaxy classes plus 'Unknown'
df.category.value_counts()

E          111887
S_N         75885
S_arms      59360
Se          19935
Unknown       683
Name: category, dtype: int64

In [8]:
# Which gz2_class values ended up labelled 'Unknown'?
df.loc[df['category'] == 'Unknown', 'gz2_class'].value_counts()

A    683
Name: gz2_class, dtype: int64

In [9]:
# Remove the rows labelled 'Unknown' from df (in place)
unknown_rows = df[df['category'] == 'Unknown'].index
df.drop(index=unknown_rows, inplace=True)

In [10]:
# Duplicate "category" into a new "category_encoded" column; the duplicate is the one that
# gets label-encoded, so the readable string labels stay available in "category"
df['category_encoded'] = df['category']

In [11]:
# First 5 rows, to check the new column
df.head(5)

Unnamed: 0,objid,sample,asset_id,gz2_class,category,category_encoded
0,5.88e+17,original,3,Sc?l,S_N,S_N
1,5.88e+17,original,4,Sc?l,S_N,S_N
2,5.88e+17,original,5,Er,E,E
3,5.88e+17,original,6,Sc1t,S_arms,S_arms
4,5.88e+17,original,7,Sc1t,S_arms,S_arms


In [12]:
# Label-encode the target: replace the strings in "category_encoded" with integer codes.
# Fit and transform are written as two explicit steps; `le` keeps the fitted mapping.
# For this data the resulting codes are E=0, S_N=1, S_arms=2, Se=3 (see the outputs below).

le = LabelEncoder()
le.fit(df['category_encoded'])
df['category_encoded'] = le.transform(df['category_encoded'])

In [13]:
# First 20 rows, to compare "category" with its integer code
df.head(n=20)

Unnamed: 0,objid,sample,asset_id,gz2_class,category,category_encoded
0,5.88e+17,original,3,Sc?l,S_N,1
1,5.88e+17,original,4,Sc?l,S_N,1
2,5.88e+17,original,5,Er,E,0
3,5.88e+17,original,6,Sc1t,S_arms,2
4,5.88e+17,original,7,Sc1t,S_arms,2
5,5.88e+17,original,8,Sc1t,S_arms,2
6,5.88e+17,original,9,Sc1t,S_arms,2
7,5.88e+17,original,11,Sb,S_N,1
8,5.88e+17,original,12,Sb,S_N,1
9,5.88e+17,original,13,Ec,E,0


In [14]:
# One dataframe per class, in preparation for undersampling to balance the dataset.
# Rows are selected by the readable "category" label rather than by integer code:
# the encoder's codes are E=0, S_N=1, S_arms=2, Se=3, so filtering on 0,1,2,3 in the
# order Se, E, S_arms, S_N put the wrong class into three of these four variables.
Edge_On_Spiral = df[df.category == 'Se']
Elliptical = df[df.category == 'E']
Spiral_With_Arms = df[df.category == 'S_arms']
Spiral_With_No_Arms = df[df.category == 'S_N']

In [15]:
# Class labels used for the per-class dataframes.
# NOTE: this order is not the LabelEncoder code order (E=0, S_N=1, S_arms=2, Se=3),
# so do not index `categories` with category_encoded values.
categories = ['Se', 'E', 'S_arms','S_N']
# Build the list of per-class dataframes straight from the labels, so that
# categories[k] always names the class held in categories_dfs[k]
categories_dfs = [df[df.category == label] for label in categories]

In [16]:
# Number of rows in each encoded class
df['category_encoded'].value_counts()

0    111887
1     75885
2     59360
3     19935
Name: category_encoded, dtype: int64

In [17]:
# Write the labelled and encoded dataframe to a CSV file, without the index

df.to_csv(path_or_buf="Galaxy_4classes.csv", index=False)

In [17]:
# Undersample: draw 500 rows from every per-class dataframe (fixed seed so the draw is
# repeatable) and stack the draws into one balanced dataframe
df_list = [class_df.sample(n=500, random_state=101) for class_df in categories_dfs]

df_balanced = pd.concat(df_list, axis=0)

In [18]:
# Shuffle the rows so the classes are no longer stored in blocks, then renumber the index from 0.
# random_state makes the shuffle repeatable across runs (same seed as the per-class sampling).

df_processed = df_balanced.sample(frac=1, random_state=101).reset_index(drop=True)

df_processed

Unnamed: 0,objid,sample,asset_id,gz2_class,category,category_encoded
0,5.880000e+17,original,47128,Ei,E,0
1,5.880000e+17,extra,263990,Ser,Se,3
2,5.880000e+17,original,22198,Sb?t,S_N,1
3,5.880000e+17,original,124432,Sc4m,S_arms,2
4,5.880000e+17,original,116657,Ser,Se,3
...,...,...,...,...,...,...
1995,5.880000e+17,original,122376,Sc?t,S_N,1
1996,5.880000e+17,original,141176,Ei,E,0
1997,5.880000e+17,original,30395,Ser,Se,3
1998,5.880000e+17,original,161515,Sb?t,S_N,1


In [19]:
# Optional column drops, currently disabled (commented out), so every column is kept
#df_processed.pop('objid')
#df_processed.pop('sample')
#df_processed.pop('gz2_class')
df_processed

Unnamed: 0,objid,sample,asset_id,gz2_class,category,category_encoded
0,5.880000e+17,original,47128,Ei,E,0
1,5.880000e+17,extra,263990,Ser,Se,3
2,5.880000e+17,original,22198,Sb?t,S_N,1
3,5.880000e+17,original,124432,Sc4m,S_arms,2
4,5.880000e+17,original,116657,Ser,Se,3
...,...,...,...,...,...,...
1995,5.880000e+17,original,122376,Sc?t,S_N,1
1996,5.880000e+17,original,141176,Ei,E,0
1997,5.880000e+17,original,30395,Ser,Se,3
1998,5.880000e+17,original,161515,Sb?t,S_N,1
