In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import model_selection

In [2]:
# Load the dataset from the CSV file into a pandas DataFrame
df = pd.read_csv('dataset.csv')
# Take a peek at the first five rows
df.head(n=5)

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1237666301628060000,47.372545,0.820621,18.69254,17.13867,16.55555,16.34662,16.17639,4849,301,5,771,8168632633242440000,STAR,0.000115,7255,56597,832
1,1237673706652430000,116.303083,42.45598,18.47633,17.30546,17.24116,17.3278,17.37114,6573,301,6,220,9333948945297330000,STAR,-9.3e-05,8290,57364,868
2,1237671126974140000,172.756623,-8.785698,16.47714,15.31072,15.55971,15.72207,15.82471,5973,301,1,13,3221211255238850000,STAR,0.000165,2861,54583,42
3,1237665441518260000,201.224207,28.77129,18.63561,16.88346,16.09825,15.70987,15.43491,4649,301,3,121,2254061292459420000,GALAXY,0.058155,2002,53471,35
4,1237665441522840000,212.817222,26.625225,18.88325,17.87948,17.47037,17.17441,17.05235,4649,301,3,191,2390305906828010000,GALAXY,0.07221,2123,53793,74


# Data Analysis & Preprocessing

In [3]:
# Count the missing (NaN) values in every column
print('Nan value', df.isnull().sum(axis=0))

Nan value objid        0
ra           0
dec          0
u            0
g            0
r            0
i            0
z            0
run          0
rerun        0
camcol       0
field        0
specobjid    0
class        0
redshift     0
plate        0
mjd          0
fiberid      0
dtype: int64


In [4]:
# List the column labels of the DataFrame to see which features are available
df.columns


Index(['objid', 'ra', 'dec', 'u', 'g', 'r', 'i', 'z', 'run', 'rerun', 'camcol',
       'field', 'specobjid', 'class', 'redshift', 'plate', 'mjd', 'fiberid'],
      dtype='object')

In [5]:
# Count the number of rows belonging to each class
df['class'].value_counts()

GALAXY    51323
STAR      38096
QSO       10581
Name: class, dtype: int64

#### Note by Lynda
We observe that the data are unevenly distributed across the 3 classes: the GALAXY class contains the most data (about 51%), the STAR class about 38%, and the QSO class only about 11%.

This is why we will apply random over-sampling: rows of the minority classes are duplicated (drawn with replacement) until each of them has the same number of rows as the GALAXY class.

In [6]:
# Split the frame by target class: class0 = GALAXY, class1 = STAR, class2 = QSO.
class0 = df[df['class'] == 'GALAXY']
class1 = df[df['class'] == 'STAR']
class2 = df[df['class'] == 'QSO']

# Take each count from its own subset. Unpacking value_counts() positionally
# only matches class0/class1/class2 while GALAXY > STAR > QSO, because
# value_counts() is sorted by frequency, not by label.
count_class_0, count_class_1, count_class_2 = len(class0), len(class1), len(class2)

In [7]:
# Random over-sampling: draw count_class_0 rows (the GALAXY count) with
# replacement from each minority class. A fixed random_state makes the
# resampled frame identical on every run of the notebook.
class1_sampled = class1.sample(count_class_0, replace=True, random_state=42)
class2_sampled = class2.sample(count_class_0, replace=True, random_state=42)

# NOTE(review): the over-sampling happens before the train/test split further
# down, so copies of the same STAR/QSO row can end up in both the training and
# the test set. Consider splitting first and over-sampling the training part only.
df = pd.concat([class0, class1_sampled, class2_sampled], axis=0)

print('After Random over-sampling:')
print(df['class'].value_counts())


After Random over-sampling:
QSO       51323
STAR      51323
GALAXY    51323
Name: class, dtype: int64


#### Note by Lynda
We can see in the histograms of the features (figures "X") that the features 'ra', 'dec', 'run', 'rerun', 'camcol', 'field', 'plate', 'mjd' and 'fiberid' are not distinct among the classes: they are not related to the characteristics of the objects. In addition, as the features "objid" and "specobjid" were obtained through a join of the tables holding these attributes, they are not relevant either.

Therefore, these features cannot be used to tell galaxies, stars and quasars apart, i.e. to predict the class.

In [8]:
# Drop the identifier, position and acquisition columns that carry no
# information about the class; the magnitudes, redshift and class remain.
df_features = df.drop(
    columns=[
        'objid', 'specobjid',
        'ra', 'dec',
        'run', 'rerun', 'camcol', 'field',
        'plate', 'mjd', 'fiberid',
    ]
)
df_features

Unnamed: 0,u,g,r,i,z,class,redshift
3,18.63561,16.88346,16.09825,15.70987,15.43491,GALAXY,0.058155
4,18.88325,17.87948,17.47037,17.17441,17.05235,GALAXY,0.072210
5,15.44680,13.88656,13.06969,12.68676,12.42432,GALAXY,0.020701
6,19.10002,17.10998,16.05751,15.63390,15.26723,GALAXY,0.130948
7,16.89111,16.29741,16.25994,16.86333,16.77033,GALAXY,0.004703
...,...,...,...,...,...,...,...
91885,19.00089,18.86901,18.54266,18.52032,18.48898,QSO,1.293628
61792,18.76070,18.46055,18.47091,18.63990,18.61114,QSO,0.876347
23249,18.54546,17.57167,16.67333,16.10255,15.89512,QSO,0.141208
4143,19.59640,19.60486,19.62648,19.39641,19.23944,QSO,1.953817


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_features.describe()

Unnamed: 0,u,g,r,i,z,redshift
count,153969.0,153969.0,153969.0,153969.0,153969.0,153969.0
mean,18.702853,17.726679,17.302601,17.088526,16.961053,0.436917
std,0.800442,1.081542,1.269727,1.355845,1.425836,0.700638
min,10.61181,9.668339,9.005167,8.848403,8.947795,-0.004136
25%,18.31668,17.07101,16.42555,16.10449,15.89512,7.2e-05
50%,18.93814,17.8123,17.32453,17.06649,16.93514,0.072457
75%,19.30528,18.54494,18.33519,18.23349,18.15568,0.694587
max,19.59995,19.9842,31.9901,32.14147,29.38374,7.011245


In [10]:
## Encode the classes
# Turn the class names into integer labels with scikit-learn's LabelEncoder.
# The names are read from df rather than df_features because later cells
# overwrite df_features["class"] with the integer codes and then drop it.
# df holds the same rows in the same order (df_features is df minus some
# columns), so the labels are unchanged, and re-running this cell no longer
# refits the encoder on integer codes or fails on a missing column.
le = preprocessing.LabelEncoder()
labels = le.fit_transform(df["class"])

In [11]:
# Replace the string class names with their integer codes.
# NOTE(review): the standardisation cell that follows drops the "class" column
# again, so this assignment has no lasting effect — confirm it is still needed.
df_features["class"] = labels

In [12]:
## Standardisation
# Scale the features with scikit-learn's StandardScaler.
# errors="ignore" keeps this cell re-runnable: on a second run "class" has
# already been removed and a plain drop would raise KeyError.
df_features = df_features.drop(columns=["class"], errors="ignore")
scaler = preprocessing.StandardScaler()
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split, so test rows influence the scaling statistics. Consider
# fitting on the training part only.
scaled_features = scaler.fit_transform(df_features)
scaled_features

array([[-0.08400771, -0.77964744, -0.94851443, -1.01682719, -1.07035332,
        -0.54059747],
       [ 0.22537228,  0.1412811 ,  0.13213058,  0.06334389,  0.06403052,
        -0.52053665],
       [-4.06783094, -3.55060661, -3.33372726, -3.24652115, -3.18181626,
        -0.59405553],
       ...,
       [-0.19663332, -0.14332263, -0.4955968 , -0.72720616, -0.74758723,
        -0.42205813],
       [ 1.11632017,  1.73658209,  1.83022513,  1.70217939,  1.59793567,
         2.16503211],
       [ 0.84977914,  1.41206301,  1.32059436,  1.40347175,  1.3471837 ,
         0.84661595]])

In [13]:
# Sanity check: (number of rows, number of features) of the scaled array
scaled_features.shape

(153969, 6)

In [14]:
# Feature names that remain once the "class" column has been dropped
df_features.columns

Index(['u', 'g', 'r', 'i', 'z', 'redshift'], dtype='object')

In [15]:
# Wrap the scaled NumPy array in a pandas DataFrame, reusing the feature
# names as column labels
df_temp = pd.DataFrame(data=scaled_features, columns=df_features.columns)
df_temp

Unnamed: 0,u,g,r,i,z,redshift
0,-0.084008,-0.779647,-0.948514,-1.016827,-1.070353,-0.540597
1,0.225372,0.141281,0.132131,0.063344,0.064031,-0.520537
2,-4.067831,-3.550607,-3.333727,-3.246521,-3.181816,-0.594056
3,0.496186,-0.570205,-0.980600,-1.072859,-1.187955,-0.436701
4,-2.263435,-1.321514,-0.821172,-0.166093,-0.133763,-0.616889
...,...,...,...,...,...,...
153964,0.372342,1.056209,0.976637,1.056019,1.071604,1.222761
153965,0.072269,0.678543,0.920129,1.144216,1.157280,0.627186
153966,-0.196633,-0.143323,-0.495597,-0.727206,-0.747587,-0.422058
153967,1.116320,1.736582,1.830225,1.702179,1.597936,2.165032


In [17]:
# Split the dataset into a training(+validation) set and a test set.
test_ratio = 0.2
# random_state pins the shuffle so the split is reproducible across runs;
# stratify keeps the class proportions the same in both parts.
# NOTE(review): the rows were over-sampled with replacement before this split,
# so identical rows can appear in both X_train and X_test and inflate test
# scores — consider splitting before over-sampling.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    df_temp, labels, test_size=test_ratio, random_state=42, stratify=labels
)
X_train

Unnamed: 0,u,g,r,i,z,redshift
124896,0.498122,1.132387,0.962926,1.212417,1.299212,0.955330
14519,0.075180,-1.011743,-1.341025,-1.489759,-1.616063,-0.567190
138044,1.110936,1.479744,1.468619,1.533487,1.457947,0.591329
6210,-1.637142,-0.915667,-0.316400,-0.305527,-0.006918,-0.545717
33377,0.662045,-0.364822,-0.682756,-0.772514,-0.858175,-0.498918
...,...,...,...,...,...,...
39636,-0.720945,-1.451339,-1.599633,-1.653657,-1.729414,-0.533942
47623,0.872254,0.220233,-0.034520,-0.156734,-0.231481,-0.471495
134398,0.976547,1.090530,1.334172,1.392165,1.221355,2.858523
132736,1.093195,1.534093,1.128316,0.849159,0.806987,-0.298489
