# Sloan Digital Sky Survey Classification
## Classification of Galaxies, Stars and Quasars based on the D16 from the SDSS

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
sns.set_style('whitegrid')
import tensorflow as tf
from sklearn.model_selection import train_test_split, cross_val_predict
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from tqdm import tqdm
import time
import warnings
from sklearn.metrics import mean_absolute_error
warnings.filterwarnings("ignore")
%matplotlib inline

SMALL_SIZE = 10
MEDIUM_SIZE = 12

plt.rc('font', size=SMALL_SIZE)
plt.rc('axes', titlesize=MEDIUM_SIZE)
plt.rc('axes', labelsize=MEDIUM_SIZE)
plt.rcParams['figure.dpi']=150

## Data Acquisition

### Default Query

**SELECT TOP 10000** <br/>
p.objid,p.ra,p.dec,p.u,p.g,p.r,p.i,p.z, p.run, p.rerun, p.camcol, p.field,  <br/>
s.specobjid, s.class, s.z as redshift, s.plate, s.mjd, s.fiberid  <br/>
**FROM** PhotoObj **AS** p <br/>
   **JOIN** SpecObj **AS** s **ON** s.bestobjid = p.objid <br/>
**WHERE** <br/>
   p.u **BETWEEN** 0 **AND** 19.6 <br/>
   **AND** g **BETWEEN** 0 **AND** 20 <br/>



The above query joins two tables (actually views): The image table (PhotoObj) which contains all image objects and the spectral table (SpecObj) which contains corresponding spectral data. 

### Feature Description

In [2]:
sdss_df = pd.read_csv('input/SDSS_dr16.csv', skiprows=0)
# sdss_df = pd.read_csv('input/SDSS_top.csv', skiprows=0)

In [3]:
sdss_df.head()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.23765e+18,185.18278,1.189488,19.34126,18.467,18.4494,18.33074,18.06268,756,301,6,463,4.33045e+18,QSO,2.627589,3846,55327,865
1,1.23765e+18,216.927113,0.824043,19.20285,18.19531,17.59929,17.22163,17.0241,756,301,5,675,3.44617e+17,GALAXY,0.150541,306,51637,335
2,1.237651e+18,130.183176,49.955453,18.532,17.65095,17.24701,17.12667,17.10159,1331,301,2,175,8.231498e+18,STAR,-0.000139,7311,57038,158
3,1.23765e+18,146.011531,59.07911,18.68013,18.36549,18.32169,18.33179,18.2737,1331,301,4,261,5.10087e+17,QSO,0.775121,453,51915,196
4,1.23766e+18,190.489753,63.544891,19.15626,18.12416,17.64571,17.40442,17.13662,2304,301,6,214,8.79452e+17,QSO,2.613389,781,52373,450


In [4]:
sdss_df.describe()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,redshift,plate,mjd,fiberid
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,1.23765e+18,169.220255,18.833856,18.693241,17.721518,17.299627,17.083654,16.952933,1173.076633,301.0,3.565333,286.052433,1.863896e+18,0.433206,1655.388,53026.506633,343.559367
std,1285415000000.0,67.342191,26.489236,0.799212,1.071583,1.25639,1.336041,1.406907,412.550959,0.0,1.58445,192.647294,2.397609e+18,0.704016,2129.48962,1666.154163,215.758118
min,1.237646e+18,0.039086,-11.214592,12.42139,12.35722,11.63385,11.05139,10.61626,94.0,301.0,1.0,11.0,2.9949e+17,-0.004268,266.0,51578.0,1.0
25%,1.23765e+18,134.683454,-0.425347,18.297902,17.066893,16.430835,16.11568,15.905575,756.0,301.0,2.0,129.0,3.817382e+17,0.00014,339.0,51909.0,164.0
50%,1.23765e+18,175.103945,1.044443,18.926905,17.80338,17.30628,17.045095,16.907735,1239.0,301.0,4.0,253.0,5.890115e+17,0.073726,523.0,52056.0,329.0
75%,1.23765e+18,213.058163,48.587442,19.299202,18.538702,18.335195,18.225887,18.140358,1350.0,301.0,5.0,417.0,2.881224e+18,0.668148,2559.0,54454.0,505.0
max,1.23766e+18,359.996095,68.72347,19.59986,19.91897,29.74405,27.22786,28.23451,2328.0,301.0,6.0,812.0,1.312126e+19,6.990327,11654.0,58543.0,1000.0


From the above table we can tell that are no missing values at all. This means: **no imputing!**

We also notice that most of the features stay within a reasonable scale when comparing values within **only one** column. We can recognize this from the min, max and quartil rows.

In [5]:
sdss_df['class'].value_counts()

QSO       10000
GALAXY    10000
STAR      10000
Name: class, dtype: int64

In [6]:
sdss_df_fe = sdss_df

# encode class labels to integers
le = LabelEncoder()
y_encoded = le.fit_transform(sdss_df_fe['class'])
sdss_df_fe['class'] = y_encoded


In [33]:
sdss_df_fe

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.237650e+18,185.182780,1.189488,19.34126,18.46700,18.44940,18.33074,18.06268,756,301,6,463,4.330450e+18,1,2.627589,3846,55327,865
1,1.237650e+18,216.927113,0.824043,19.20285,18.19531,17.59929,17.22163,17.02410,756,301,5,675,3.446170e+17,0,0.150541,306,51637,335
2,1.237651e+18,130.183176,49.955453,18.53200,17.65095,17.24701,17.12667,17.10159,1331,301,2,175,8.231498e+18,2,-0.000139,7311,57038,158
3,1.237650e+18,146.011531,59.079110,18.68013,18.36549,18.32169,18.33179,18.27370,1331,301,4,261,5.100870e+17,1,0.775121,453,51915,196
4,1.237660e+18,190.489753,63.544891,19.15626,18.12416,17.64571,17.40442,17.13662,2304,301,6,214,8.794520e+17,1,2.613389,781,52373,450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,1.237652e+18,245.382180,51.272421,19.34043,18.26579,17.90533,17.77829,17.70416,1412,301,5,389,3.702108e+18,2,-0.000553,3288,54908,542
29996,1.237649e+18,181.406660,-1.017241,17.77664,16.16791,15.59226,15.40551,15.33608,756,301,1,438,3.256127e+18,2,0.000045,2892,54552,87
29997,1.237650e+18,8.672036,15.573279,19.57385,18.31827,17.81188,17.61035,17.51804,1035,301,5,21,3.527563e+18,2,-0.000095,3133,54789,431
29998,1.237651e+18,112.956179,37.318673,17.47377,16.36555,16.08178,15.97640,15.96756,1402,301,4,32,2.311624e+18,2,-0.000028,2053,53446,551


In [7]:
X_df = sdss_df_fe.drop(['objid','class','run','rerun','camcol','field','specobjid','plate','mjd','fiberid','ra','dec'], axis=1).values
y_df = sdss_df_fe['class'].values

In [8]:
X_df.shape

(30000, 6)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.30, random_state=44, shuffle =True)

#### XGBoost

In [10]:
xgb = XGBClassifier(eval_metric='mlogloss')
training_start = time.perf_counter()
xgb.fit(X_train, y_train)
training_end = time.perf_counter()
prediction_start = time.perf_counter()
preds = xgb.predict(X_test)
prediction_end = time.perf_counter()
acc_xgb = (preds == y_test).sum().astype(float) / len(preds)*100
xgb_train_time = training_end-training_start
xgb_prediction_time = prediction_end-prediction_start
print("XGBoost's prediction accuracy is: %3.5f" % (acc_xgb))
print("Time consumed for training: %4.3f" % (xgb_train_time))
print("Time consumed for prediction: %6.5f seconds" % (xgb_prediction_time))

XGBoost's prediction accuracy is: 98.45556
Time consumed for training: 4.415
Time consumed for prediction: 0.02676 seconds


<center> <h1>Hyper parameter Tuning</h1>

## Learning rate estimation:

In [11]:
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [12]:
# "Learn" the mean from the training data
mean_train = np.mean(y_train)# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train# Compute MAE
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print("Baseline MAE is {:.2f}".format(mae_baseline))

Baseline MAE is 0.66


<center> <h4> That is, the prediction is, on average, 0.66 off from the actual value of class

In [13]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1
}
params['eval_metric'] = "mae"
num_boost_round = 999

In [14]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:0.59122
[1]	Test-mae:0.42430
[2]	Test-mae:0.30733
[3]	Test-mae:0.22438
[4]	Test-mae:0.16681
[5]	Test-mae:0.12630
[6]	Test-mae:0.09799
[7]	Test-mae:0.07771
[8]	Test-mae:0.06398
[9]	Test-mae:0.05402
[10]	Test-mae:0.04718
[11]	Test-mae:0.04224
[12]	Test-mae:0.03878
[13]	Test-mae:0.03643
[14]	Test-mae:0.03532
[15]	Test-mae:0.03404
[16]	Test-mae:0.03299
[17]	Test-mae:0.03267
[18]	Test-mae:0.03236
[19]	Test-mae:0.03211
[20]	Test-mae:0.03211
[21]	Test-mae:0.03222
[22]	Test-mae:0.03198
[23]	Test-mae:0.03181
[24]	Test-mae:0.03190
[25]	Test-mae:0.03187
[26]	Test-mae:0.03184
[27]	Test-mae:0.03193
[28]	Test-mae:0.03202
[29]	Test-mae:0.03201
[30]	Test-mae:0.03199
[31]	Test-mae:0.03201
[32]	Test-mae:0.03185
[33]	Test-mae:0.03176
[34]	Test-mae:0.03187
[35]	Test-mae:0.03199
[36]	Test-mae:0.03192
[37]	Test-mae:0.03210
[38]	Test-mae:0.03211
[39]	Test-mae:0.03208
[40]	Test-mae:0.03204
[41]	Test-mae:0.03193
[42]	Test-mae:0.03201
[43]	Test-mae:0.03200


In [15]:
print("Best MAE: {:.5f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

Best MAE: 0.03176 with 34 rounds


In [16]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,0.596914,0.00119,0.597781,0.00479
1,0.428381,0.001237,0.430083,0.003214
2,0.308575,0.000957,0.31082,0.001934
3,0.224239,0.000714,0.22699,0.001009
4,0.164834,0.000553,0.168042,0.000831
5,0.123166,0.000624,0.126869,0.000877
6,0.09385,0.000752,0.097861,0.001004
7,0.073564,0.001075,0.077757,0.001076
8,0.059202,0.001119,0.063753,0.001224
9,0.048777,0.001142,0.053631,0.001127


### Optimising Max Depth and Min Child Weight

In [17]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [18]:
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
params={}
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=9, min_child_weight=5
	MAE 0.0301462 for 23 rounds
CV with max_depth=9, min_child_weight=6
	MAE 0.0306392 for 28 rounds
CV with max_depth=9, min_child_weight=7
	MAE 0.030544799999999993 for 21 rounds
CV with max_depth=10, min_child_weight=5
	MAE 0.030113 for 21 rounds
CV with max_depth=10, min_child_weight=6
	MAE 0.030148799999999996 for 21 rounds
CV with max_depth=10, min_child_weight=7
	MAE 0.030614600000000002 for 23 rounds
CV with max_depth=11, min_child_weight=5
	MAE 0.029805400000000003 for 21 rounds
CV with max_depth=11, min_child_weight=6
	MAE 0.0297948 for 21 rounds
CV with max_depth=11, min_child_weight=7
	MAE 0.030454800000000004 for 22 rounds
Best params: 11, 6, MAE: 0.0297948


<center><h4> Here we find Max Depth is 11 and min_child_weight is 6

In [19]:
#Setting the found values
params['max_depth'] = 11
params['min_child_weight'] = 6

### Optimising Parameters subsample and colsample_bytree

In [20]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [21]:
min_mae = float("Inf")
best_params = None# We start by the largest values and go down to the smallest

for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 0.0297948 for 21 rounds
CV with subsample=1.0, colsample=0.9
	MAE 0.03749020000000001 for 19 rounds
CV with subsample=1.0, colsample=0.8
	MAE 0.050086 for 23 rounds
CV with subsample=1.0, colsample=0.7
	MAE 0.050086 for 23 rounds
CV with subsample=0.9, colsample=1.0
	MAE 0.030547199999999997 for 21 rounds
CV with subsample=0.9, colsample=0.9
	MAE 0.042800000000000005 for 20 rounds
CV with subsample=0.9, colsample=0.8
	MAE 0.055685799999999994 for 20 rounds
CV with subsample=0.9, colsample=0.7
	MAE 0.055685799999999994 for 20 rounds
CV with subsample=0.8, colsample=1.0
	MAE 0.0309176 for 18 rounds
CV with subsample=0.8, colsample=0.9
	MAE 0.0440716 for 18 rounds
CV with subsample=0.8, colsample=0.8
	MAE 0.0571094 for 21 rounds
CV with subsample=0.8, colsample=0.7
	MAE 0.0571094 for 21 rounds
CV with subsample=0.7, colsample=1.0
	MAE 0.0320858 for 19 rounds
CV with subsample=0.7, colsample=0.9
	MAE 0.045146599999999995 for 18 rounds
CV with subsa

In [22]:
#Setting the found paramters
params['subsample'] = 1.0
params['colsample_bytree'] = 1.0 

### Optimising ETA

In [23]:
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))    # We update our parameters
    params['eta'] = eta    # Run and time CV
    %time 
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10
          )    
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta

print("Best params: {}, MAE: {}".format(best_params, min_mae))

CV with eta=0.3
Wall time: 0 ns
	MAE 0.0297948 for 21 rounds

CV with eta=0.2
Wall time: 0 ns
	MAE 0.029613800000000003 for 38 rounds

CV with eta=0.1
Wall time: 0 ns
	MAE 0.029425599999999996 for 92 rounds

CV with eta=0.05
Wall time: 0 ns
	MAE 0.0293862 for 171 rounds

CV with eta=0.01
Wall time: 0 ns
	MAE 0.029191 for 927 rounds

CV with eta=0.005
Wall time: 0 ns
	MAE 0.033730800000000005 for 998 rounds

Best params: 0.01, MAE: 0.029191


In [24]:
params['eta'] = 0.01
params['eval_metric'] = "mae"

## Training with updated parameters

In [25]:
model_optimised = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:0.81856
[1]	Test-mae:0.81076
[2]	Test-mae:0.80303
[3]	Test-mae:0.79537
[4]	Test-mae:0.78780
[5]	Test-mae:0.78029
[6]	Test-mae:0.77287
[7]	Test-mae:0.76552
[8]	Test-mae:0.75824
[9]	Test-mae:0.75105
[10]	Test-mae:0.74392
[11]	Test-mae:0.73686
[12]	Test-mae:0.72987
[13]	Test-mae:0.72295
[14]	Test-mae:0.71610
[15]	Test-mae:0.70932
[16]	Test-mae:0.70261
[17]	Test-mae:0.69596
[18]	Test-mae:0.68937
[19]	Test-mae:0.68287
[20]	Test-mae:0.67641
[21]	Test-mae:0.66994
[22]	Test-mae:0.66354
[23]	Test-mae:0.65719
[24]	Test-mae:0.65091
[25]	Test-mae:0.64469
[26]	Test-mae:0.63854
[27]	Test-mae:0.63245
[28]	Test-mae:0.62642
[29]	Test-mae:0.62044
[30]	Test-mae:0.61453
[31]	Test-mae:0.60867
[32]	Test-mae:0.60289
[33]	Test-mae:0.59723
[34]	Test-mae:0.59154
[35]	Test-mae:0.58592
[36]	Test-mae:0.58042
[37]	Test-mae:0.57492
[38]	Test-mae:0.56946
[39]	Test-mae:0.56406
[40]	Test-mae:0.55871
[41]	Test-mae:0.55349
[42]	Test-mae:0.54825
[43]	Test-mae:0.54307
[44]	Test-mae:0.53793
[45]	Test-mae:0.5328

[361]	Test-mae:0.04949
[362]	Test-mae:0.04929
[363]	Test-mae:0.04908
[364]	Test-mae:0.04887
[365]	Test-mae:0.04866
[366]	Test-mae:0.04846
[367]	Test-mae:0.04826
[368]	Test-mae:0.04806
[369]	Test-mae:0.04787
[370]	Test-mae:0.04767
[371]	Test-mae:0.04747
[372]	Test-mae:0.04728
[373]	Test-mae:0.04709
[374]	Test-mae:0.04691
[375]	Test-mae:0.04673
[376]	Test-mae:0.04655
[377]	Test-mae:0.04637
[378]	Test-mae:0.04620
[379]	Test-mae:0.04602
[380]	Test-mae:0.04585
[381]	Test-mae:0.04568
[382]	Test-mae:0.04551
[383]	Test-mae:0.04535
[384]	Test-mae:0.04518
[385]	Test-mae:0.04502
[386]	Test-mae:0.04486
[387]	Test-mae:0.04470
[388]	Test-mae:0.04453
[389]	Test-mae:0.04437
[390]	Test-mae:0.04422
[391]	Test-mae:0.04406
[392]	Test-mae:0.04390
[393]	Test-mae:0.04375
[394]	Test-mae:0.04360
[395]	Test-mae:0.04345
[396]	Test-mae:0.04331
[397]	Test-mae:0.04316
[398]	Test-mae:0.04302
[399]	Test-mae:0.04287
[400]	Test-mae:0.04272
[401]	Test-mae:0.04258
[402]	Test-mae:0.04244
[403]	Test-mae:0.04230
[404]	Test-

[718]	Test-mae:0.02962
[719]	Test-mae:0.02962
[720]	Test-mae:0.02961
[721]	Test-mae:0.02960
[722]	Test-mae:0.02960
[723]	Test-mae:0.02959
[724]	Test-mae:0.02959
[725]	Test-mae:0.02958
[726]	Test-mae:0.02958
[727]	Test-mae:0.02958
[728]	Test-mae:0.02957
[729]	Test-mae:0.02957
[730]	Test-mae:0.02957
[731]	Test-mae:0.02957
[732]	Test-mae:0.02957
[733]	Test-mae:0.02956
[734]	Test-mae:0.02956
[735]	Test-mae:0.02956
[736]	Test-mae:0.02955
[737]	Test-mae:0.02955
[738]	Test-mae:0.02955
[739]	Test-mae:0.02954
[740]	Test-mae:0.02955
[741]	Test-mae:0.02954
[742]	Test-mae:0.02954
[743]	Test-mae:0.02954
[744]	Test-mae:0.02953
[745]	Test-mae:0.02953
[746]	Test-mae:0.02953
[747]	Test-mae:0.02953
[748]	Test-mae:0.02953
[749]	Test-mae:0.02953
[750]	Test-mae:0.02952
[751]	Test-mae:0.02952
[752]	Test-mae:0.02952
[753]	Test-mae:0.02951
[754]	Test-mae:0.02951
[755]	Test-mae:0.02951
[756]	Test-mae:0.02951
[757]	Test-mae:0.02951
[758]	Test-mae:0.02950
[759]	Test-mae:0.02950
[760]	Test-mae:0.02950
[761]	Test-

In [26]:
print("Best MAE: {:.5f} with {} rounds".format(
                 model_optimised.best_score,
                 model_optimised.best_iteration+1))

Best MAE: 0.02941 with 811 rounds


## Summary

In [27]:
print("The best parameters are: ")
print(params)

The best parameters are: 
{'max_depth': 11, 'min_child_weight': 6, 'subsample': 1.0, 'colsample_bytree': 1.0, 'eta': 0.01, 'eval_metric': 'mae'}


In [28]:
print("The improvement in MAE is : {:.5f} ".format(model.best_score - model_optimised.best_score))

The improvement in MAE is : 0.00235 


In [29]:
xgb = XGBClassifier(params,n_estimators=100, eval_metric='mlogloss')
training_start = time.perf_counter()
xgb.fit(X_train, y_train)
training_end = time.perf_counter()
prediction_start = time.perf_counter()
preds = xgb.predict(X_test)
prediction_end = time.perf_counter()
acc_xgb = (preds == y_test).sum().astype(float) / len(preds)*100
xgb_train_time = training_end-training_start
xgb_prediction_time = prediction_end-prediction_start
print("XGBoost's prediction accuracy is: %3.5f" % (acc_xgb))
print("Time consumed for training: %4.3f" % (xgb_train_time))
print("Time consumed for prediction: %6.5f seconds" % (xgb_prediction_time))

XGBoost's prediction accuracy is: 98.45556
Time consumed for training: 4.824
Time consumed for prediction: 0.02150 seconds


## Confusion Matrix

In [30]:
sdss = sdss_df
predictions = cross_val_predict(xgb, sdss.drop('class', axis=1), sdss['class'], cv=3)
confusion_matrix(sdss['class'], predictions)

array([[9917,   77,    6],
       [ 169, 9831,    0],
       [   1,    0, 9999]], dtype=int64)

## Precision and Recall

In [31]:
print("Precision:", precision_score(sdss['class'], predictions, average='micro'))
print("Recall:",recall_score(sdss['class'], predictions, average='micro'))

Precision: 0.9915666666666667
Recall: 0.9915666666666667


## F1 Score

In [32]:
print("F1-Score:", f1_score(sdss['class'], predictions, average='micro'))

F1-Score: 0.9915666666666667


<center><h1> Validationn of model with DR17

In [38]:
sdss_df = pd.read_csv('Test/Dr17.csv', skiprows=0)
sdss_df.head()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.23765e+18,348.90253,1.271886,19.38905,18.24496,17.58728,17.20807,16.90905,94,301,6,94,4.30195e+17,GALAXY,0.032125,382,51816,368
1,1.23765e+18,51.443695,1.270073,19.52808,17.96541,17.03493,16.53754,16.14154,94,301,6,512,4.66235e+17,GALAXY,0.121315,414,51869,410
2,1.23765e+18,51.483584,1.272013,18.72268,17.3852,16.81134,16.51803,16.29502,94,301,6,512,4.66233e+17,GALAXY,0.048765,414,51869,401
3,1.23765e+18,55.545963,0.866701,18.10033,16.87099,16.69033,16.64522,16.62379,109,301,6,139,8.2542e+18,STAR,-0.000109,7331,56658,835
4,1.23765e+18,55.5235,0.938595,19.15007,17.08453,16.27119,15.97778,15.82429,109,301,6,139,8.2542e+18,STAR,0.000209,7331,56658,839


In [39]:
sdss_df_fe = sdss_df

# encode class labels to integers
le = LabelEncoder()
y_encoded = le.fit_transform(sdss_df_fe['class'])
sdss_df_fe['class'] = y_encoded

sdss_df_fe

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.237650e+18,348.902530,1.271886,19.38905,18.24496,17.58728,17.20807,16.90905,94,301,6,94,4.301950e+17,0,0.032125,382,51816,368
1,1.237650e+18,51.443695,1.270073,19.52808,17.96541,17.03493,16.53754,16.14154,94,301,6,512,4.662350e+17,0,0.121315,414,51869,410
2,1.237650e+18,51.483584,1.272013,18.72268,17.38520,16.81134,16.51803,16.29502,94,301,6,512,4.662330e+17,0,0.048765,414,51869,401
3,1.237650e+18,55.545963,0.866701,18.10033,16.87099,16.69033,16.64522,16.62379,109,301,6,139,8.254200e+18,2,-0.000109,7331,56658,835
4,1.237650e+18,55.523500,0.938595,19.15007,17.08453,16.27119,15.97778,15.82429,109,301,6,139,8.254200e+18,2,0.000209,7331,56658,839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.237650e+18,94.807071,-0.784311,18.98625,17.44478,16.70986,16.41571,16.24832,211,301,2,359,1.417560e+18,2,0.000339,1259,52931,183
96,1.237650e+18,94.877282,-0.694199,17.77287,16.39544,16.32320,16.32068,16.29270,211,301,2,360,1.417560e+18,2,0.000247,1259,52931,194
97,1.237650e+18,94.892919,-0.786350,16.13102,15.06038,15.05153,15.11111,15.14127,211,301,2,360,1.417560e+18,2,0.000225,1259,52931,188
98,1.237650e+18,94.978682,-0.745533,17.89055,16.60337,16.52952,16.51860,16.59143,211,301,2,361,1.417560e+18,2,0.000207,1259,52931,187


In [40]:
X_df = sdss_df_fe.drop(['objid','class','run','rerun','camcol','field','specobjid','plate','mjd','fiberid','ra','dec'], axis=1).values
y_df = sdss_df_fe['class'].values

In [41]:
xgb = XGBClassifier(params,n_estimators=100, eval_metric='mlogloss')
training_start = time.perf_counter()
xgb.fit(X_train, y_train)
training_end = time.perf_counter()
prediction_start = time.perf_counter()
preds = xgb.predict(X_test)
prediction_end = time.perf_counter()
acc_xgb = (preds == y_test).sum().astype(float) / len(preds)*100
xgb_train_time = training_end-training_start
xgb_prediction_time = prediction_end-prediction_start
print("XGBoost's prediction accuracy is: %3.5f" % (acc_xgb))
print("Time consumed for training: %4.3f" % (xgb_train_time))
print("Time consumed for prediction: %6.5f seconds" % (xgb_prediction_time))

XGBoost's prediction accuracy is: 98.45556
Time consumed for training: 4.956
Time consumed for prediction: 0.02506 seconds


## Confusion Matrix

In [42]:
sdss = sdss_df
predictions = cross_val_predict(xgb, sdss.drop('class', axis=1), sdss['class'], cv=3)
confusion_matrix(sdss['class'], predictions)

array([[29,  0,  0],
       [ 2,  1,  0],
       [ 0,  0, 68]], dtype=int64)

## Precision and Recall

In [43]:
print("Precision:", precision_score(sdss['class'], predictions, average='micro'))
print("Recall:",recall_score(sdss['class'], predictions, average='micro'))

Precision: 0.98
Recall: 0.98


## F1 Score

In [44]:
print("F1-Score:", f1_score(sdss['class'], predictions, average='micro'))

F1-Score: 0.98
