# Sloan Digital Sky Survey Classification
## Classification of Galaxies, Stars and Quasars based on Data Release 16 (DR16) of the SDSS

### Importing Libraries

In [284]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
sns.set_style('whitegrid')
import tensorflow as tf
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import time
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# Font sizes shared by the figures in this notebook.
SMALL_SIZE = 10
MEDIUM_SIZE = 12

# Global matplotlib defaults: base font size, axes title/label size, figure resolution.
plt.rc('font', size=SMALL_SIZE)
plt.rc('axes', titlesize=MEDIUM_SIZE, labelsize=MEDIUM_SIZE)
plt.rc('figure', dpi=150)

## Data Acquisition

### Default Query

**SELECT TOP 10000** <br/>
p.objid,p.ra,p.dec,p.u,p.g,p.r,p.i,p.z, p.run, p.rerun, p.camcol, p.field,  <br/>
s.specobjid, s.class, s.z as redshift, s.plate, s.mjd, s.fiberid  <br/>
**FROM** PhotoObj **AS** p <br/>
   **JOIN** SpecObj **AS** s **ON** s.bestobjid = p.objid <br/>
**WHERE** <br/>
   p.u **BETWEEN** 0 **AND** 19.6 <br/>
   **AND** g **BETWEEN** 0 **AND** 20 <br/>



The above query joins two tables (actually views): The image table (PhotoObj) which contains all image objects and the spectral table (SpecObj) which contains corresponding spectral data. 

### Feature Description

In [285]:
# Read the SDSS sample into a DataFrame (no leading rows are skipped).
# An alternative input file, 'input/SDSS_top.csv', was used previously.
sdss_df = pd.read_csv('input/SDSS_dr16.csv')

In [286]:
# Preview the first rows to check that the columns were parsed as expected.
sdss_df.head()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.23765e+18,185.18278,1.189488,19.34126,18.467,18.4494,18.33074,18.06268,756,301,6,463,4.33045e+18,QSO,2.627589,3846,55327,865
1,1.23765e+18,216.927113,0.824043,19.20285,18.19531,17.59929,17.22163,17.0241,756,301,5,675,3.44617e+17,GALAXY,0.150541,306,51637,335
2,1.237651e+18,130.183176,49.955453,18.532,17.65095,17.24701,17.12667,17.10159,1331,301,2,175,8.231498e+18,STAR,-0.000139,7311,57038,158
3,1.23765e+18,146.011531,59.07911,18.68013,18.36549,18.32169,18.33179,18.2737,1331,301,4,261,5.10087e+17,QSO,0.775121,453,51915,196
4,1.23766e+18,190.489753,63.544891,19.15626,18.12416,17.64571,17.40442,17.13662,2304,301,6,214,8.79452e+17,QSO,2.613389,781,52373,450


In [287]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
sdss_df.describe()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,redshift,plate,mjd,fiberid
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,1.23765e+18,169.220255,18.833856,18.693241,17.721518,17.299627,17.083654,16.952933,1173.076633,301.0,3.565333,286.052433,1.863896e+18,0.433206,1655.388,53026.506633,343.559367
std,1285415000000.0,67.342191,26.489236,0.799212,1.071583,1.25639,1.336041,1.406907,412.550959,0.0,1.58445,192.647294,2.397609e+18,0.704016,2129.48962,1666.154163,215.758118
min,1.237646e+18,0.039086,-11.214592,12.42139,12.35722,11.63385,11.05139,10.61626,94.0,301.0,1.0,11.0,2.9949e+17,-0.004268,266.0,51578.0,1.0
25%,1.23765e+18,134.683454,-0.425347,18.297902,17.066893,16.430835,16.11568,15.905575,756.0,301.0,2.0,129.0,3.817382e+17,0.00014,339.0,51909.0,164.0
50%,1.23765e+18,175.103945,1.044443,18.926905,17.80338,17.30628,17.045095,16.907735,1239.0,301.0,4.0,253.0,5.890115e+17,0.073726,523.0,52056.0,329.0
75%,1.23765e+18,213.058163,48.587442,19.299202,18.538702,18.335195,18.225887,18.140358,1350.0,301.0,5.0,417.0,2.881224e+18,0.668148,2559.0,54454.0,505.0
max,1.23766e+18,359.996095,68.72347,19.59986,19.91897,29.74405,27.22786,28.23451,2328.0,301.0,6.0,812.0,1.312126e+19,6.990327,11654.0,58543.0,1000.0


From the above table we can tell that there are no missing values at all. This means: **no imputing!**

We also notice that most of the features stay within a reasonable scale when comparing values within **a single** column. We can see this from the min, max and quartile rows.

In [288]:
# Count the objects per class to see how balanced the target is.
sdss_df['class'].value_counts()

QSO       10000
GALAXY    10000
STAR      10000
Name: class, dtype: int64

In [289]:
# Columns removed before modelling: the target itself plus every column other
# than u, g, r, i, z and redshift, which remain as the model inputs.
NON_FEATURE_COLUMNS = [
    'objid', 'class', 'run', 'rerun', 'camcol', 'field',
    'specobjid', 'plate', 'mjd', 'fiberid', 'ra', 'dec',
]

# Feature matrix and target vector as plain NumPy arrays.
X_df = sdss_df.drop(columns=NON_FEATURE_COLUMNS).values
y_df = sdss_df['class'].values

In [290]:
# Confirm the feature matrix dimensions: (rows, features).
X_df.shape

(30000, 6)

In [291]:
# Hold out 30% of the shuffled samples for testing; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.30,
    shuffle=True,
    random_state=44,
)

#### XGBoost

In [293]:
# Train an XGBoost classifier on the training split, time fitting and
# prediction separately, and report accuracy on the held-out test split.

# Recent XGBoost releases expect integer class labels 0..n_classes-1 and reject
# string labels such as those in y_train, so the target is encoded for fitting
# and the predictions are decoded back to the original class names afterwards.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

xgb = XGBClassifier(n_estimators=100, eval_metric='mlogloss')

training_start = time.perf_counter()
xgb.fit(X_train, y_train_encoded)
training_end = time.perf_counter()

prediction_start = time.perf_counter()
encoded_preds = xgb.predict(X_test)
prediction_end = time.perf_counter()

# Decode outside the timed window so the measurement covers the model only.
preds = label_encoder.inverse_transform(encoded_preds)

# Accuracy in percent: share of test samples whose predicted class matches.
acc_xgb = np.mean(preds == y_test) * 100
xgb_train_time = training_end - training_start
xgb_prediction_time = prediction_end - prediction_start

print("XGBoost's prediction accuracy is: %3.2f" % (acc_xgb))
print("Time consumed for training: %4.3f" % (xgb_train_time))
print("Time consumed for prediction: %6.5f seconds" % (xgb_prediction_time))

XGBoost's prediction accuracy is: 98.46
Time consumed for training: 3.751
Time consumed for prediction: 0.02665 seconds


## Summary