In [1]:
import pandas as pd
import os

# Read DR_complete.csv into a DataFrame and preview its first rows
dr_csv_path = "../Resources/DR_complete.csv"
DR_df = pd.read_csv(dr_csv_path)
DR_df.head()

Unnamed: 0.1,Unnamed: 0,objid_,ra_,dec_,u_,g_,r_,i_,z_,run_,rerun_,camcol_,field_,specobjid_,class_,redshift_,plate_,mjd_,fiberid_
0,0,1237663228535374206,300.841762,76.511282,19.19619,17.83329,17.52225,17.40237,17.35182,4134,301,1,106,1869067566828251136,STAR,-0.00022,1660,53230,268
1,1,1237663228535374382,300.730508,76.551731,21.65541,19.13715,17.92577,17.44741,17.15818,4134,301,1,106,1869069216095692800,STAR,-8e-06,1660,53230,274
2,2,1237663228535374435,300.871382,76.53057,20.70867,19.20954,18.55966,18.24395,18.10117,4134,301,1,106,1869067291950344192,STAR,9.6e-05,1660,53230,267
3,3,1237663228535374802,300.317409,76.374746,22.88806,21.209,19.9056,19.33555,19.08966,4134,301,1,106,1870200338850539520,STAR,-0.000247,1661,53240,293
4,4,1237663228535439577,301.252332,76.31952,17.82932,16.11081,15.39808,15.13612,15.00507,4134,301,1,107,1869072789508483072,STAR,-0.000131,1660,53230,287


In [2]:
# List the column labels of the loaded frame
print(DR_df.columns)

Index(['Unnamed: 0', 'objid_', 'ra_', 'dec_', 'u_', 'g_', 'r_', 'i_', 'z_',
       'run_', 'rerun_', 'camcol_', 'field_', 'specobjid_', 'class_',
       'redshift_', 'plate_', 'mjd_', 'fiberid_'],
      dtype='object')


In [3]:
# Columns removed from the dataset (dropped in the next cell via
# `features_to_drop`), with a short description of each:
#   Unnamed: 0  - ID number
#   objid_      - ID number
#   ra_         - right ascension
#   dec_        - declination
#   run_        - specifies the scan
#   camcol_     - identifies the scanline
#   field_      - field number
#   rerun_      - rerun number
#   specobjid_  - object identifier
#   plate_      - plate number
#   mjd_        - MJD of observation
#   fiberid_    - fiber ID

In [4]:
# Identifier and observation-metadata columns that are removed from the frame
features_to_drop = [
    'Unnamed: 0', 'objid_', 'ra_', 'dec_',
    'run_', 'rerun_', 'camcol_', 'field_',
    'specobjid_', 'plate_', 'mjd_', 'fiberid_',
]

# Drop every listed column in a single call instead of copying the frame once
# per column; a missing label still raises KeyError, as it did with the loop.
DR_df = DR_df.drop(columns=features_to_drop)

In [5]:
# Preview the first five rows of the reduced frame
DR_df.head(n=5)

Unnamed: 0,u_,g_,r_,i_,z_,class_,redshift_
0,19.19619,17.83329,17.52225,17.40237,17.35182,STAR,-0.00022
1,21.65541,19.13715,17.92577,17.44741,17.15818,STAR,-8e-06
2,20.70867,19.20954,18.55966,18.24395,18.10117,STAR,9.6e-05
3,22.88806,21.209,19.9056,19.33555,19.08966,STAR,-0.000247
4,17.82932,16.11081,15.39808,15.13612,15.00507,STAR,-0.000131


In [6]:
# List the column labels that remain in the frame
print(DR_df.columns)

Index(['u_', 'g_', 'r_', 'i_', 'z_', 'class_', 'redshift_'], dtype='object')


In [7]:
# Report the dtype of every column in the frame
for feature_name, feature_type in DR_df.dtypes.items():
    print(f"Column Name: {feature_name},\ntype: {feature_type}\n")

Column Name: u_,
type: float64

Column Name: g_,
type: float64

Column Name: r_,
type: float64

Column Name: i_,
type: float64

Column Name: z_,
type: float64

Column Name: class_,
type: object

Column Name: redshift_,
type: float64



In [8]:
# Replace the string labels in `class_` with integer codes via LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(DR_df['class_'])
DR_df['class_'] = le.transform(DR_df['class_'])

In [9]:
# Make the encoded class label the row index, then preview the result
DR_df = DR_df.set_index(keys='class_')
DR_df.head(n=5)

Unnamed: 0_level_0,u_,g_,r_,i_,z_,redshift_
class_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,19.19619,17.83329,17.52225,17.40237,17.35182,-0.00022
2,21.65541,19.13715,17.92577,17.44741,17.15818,-8e-06
2,20.70867,19.20954,18.55966,18.24395,18.10117,9.6e-05
2,22.88806,21.209,19.9056,19.33555,19.08966,-0.000247
2,17.82932,16.11081,15.39808,15.13612,15.00507,-0.000131


In [10]:
# Write the cleaned frame to the Resources folder
path = "../Resources"
clean_csv_path = os.path.join(path, 'DR_complete_clean.csv')
DR_df.to_csv(clean_csv_path)

The variables in the data set are on different scales, i.e. one variable is in the thousandths while others are in the tens. For example, in our data set `redshift_` has values in the thousandths, whereas the other features are on a different scale. Because these variables are on different scales, it is difficult to compare them, so we will convert them to a single common scale. `StandardScaler` standardizes the data using the formula (x - mean) / standard deviation. We will do this only for the numerical variables.

We will save this scaled DataFrame as a separate CSV file, independent of the one saved above.

In [11]:
# Create the StandardScaler instance used to standardize the numeric features;
# the bare name on the last line displays its repr as the cell output.
from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler()
std_scale

StandardScaler()

In [12]:
# Numeric columns that are standardized to zero mean and unit variance
features_to_scale = ['u_', 'g_', 'r_', 'i_', 'z_', 'redshift_']

# Fit the scaler once on all listed columns. StandardScaler standardizes each
# column independently, so the resulting values match a per-column fit, but the
# fitted scaler now keeps the mean/scale of every feature (not only the last
# one) and can be reused for transform() / inverse_transform().
DR_df[features_to_scale] = std_scale.fit_transform(DR_df[features_to_scale])

DR_df.head()

Unnamed: 0_level_0,u_,g_,r_,i_,z_,redshift_
class_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,-1.06772,-1.123867,-0.878764,-0.036867,-0.558164,-0.713568
2,-0.030126,-0.487516,-0.660599,-0.035391,-0.6683,-0.71327
2,-0.429575,-0.452186,-0.317882,-0.009295,-0.131955,-0.713123
2,0.489953,0.523653,0.40981,0.026467,0.430269,-0.713607
2,-1.644429,-1.964525,-2.027211,-0.111112,-1.892926,-0.713444


In [13]:
# Write the standardized frame to its own CSV in the Resources folder
path = "../Resources"
scaled_csv_path = os.path.join(path, 'DR_complete_clean_scaled.csv')
DR_df.to_csv(scaled_csv_path)