<a href="https://colab.research.google.com/github/peterjsadowski/sklearn_examples/blob/master/sdss/quasars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
# Build simple model for classifying quasars, galaxies, and stars. 
# Author Peter Sadowski
# Adapted from https://www.kaggle.com/lucidlenn/sloan-digital-sky-survey/home

import os
import urllib
import urllib.request

import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
import sklearn.neighbors

# Download data from Sloan Digital Sky Survey 
# https://www.sdss.org/dr14/
url = 'https://raw.githubusercontent.com/peterjsadowski/sklearn_examples/master/sdss/sdss.csv'
filename = 'sdss.csv'

# Only hit the network when the file is not already present, so that
# Restart & Run All does not re-download the data on every run.
if not os.path.exists(filename):
    # Download under a temporary name and rename afterwards: an interrupted
    # transfer then never leaves a truncated sdss.csv that the existence
    # check above would mistake for a complete copy.
    partial_filename = filename + '.part'
    urllib.request.urlretrieve(url, partial_filename)
    os.replace(partial_filename, filename)

('sdss.csv', <http.client.HTTPMessage at 0x7f3881fff320>)

In [77]:
# Features:
# ra = J2000 Right Ascension (r-band)
# dec = J2000 Declination (r-band)
# u = better of DeV/Exp magnitude fit
# g = better of DeV/Exp magnitude fit
# r = better of DeV/Exp magnitude fit
# i = better of DeV/Exp magnitude fit
# z = better of DeV/Exp magnitude fit
# redshift = Redshift
# plate = plate number
# mjd = MJD of observation
# fiberid = fiber ID

# Magnitudes use the Thuan-Gunn astronomical magnitude system: u, g, r, i, and z are the responses in the telescope's five bands.
# Redshift is the shift of an object's electromagnetic radiation toward longer wavelengths caused by the object moving away from the observer.

# Read the downloaded CSV and discard the columns this analysis does not use.
# The frame is built in a single expression instead of being mutated with
# inplace=True after it has been created.
data = pd.read_csv(filename).drop(
    columns=["objid", "specobjid", "run", "rerun", "camcol", "field"]  # Unused columns.
)

print(data.head(n=5))
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the cell output.
data.info()

           ra       dec         u         g         r         i         z  \
0  183.531326  0.089693  19.47406  17.04240  15.94699  15.50342  15.22531   
1  183.598371  0.135285  18.66280  17.21449  16.67637  16.48922  16.39150   
2  183.680207  0.126185  19.38298  18.19169  17.47428  17.08732  16.80125   
3  183.870529  0.049911  17.76536  16.60272  16.16116  15.98233  15.90438   
4  183.883288  0.102557  17.55025  16.26342  16.43869  16.55492  16.61326   

    class  redshift  plate    mjd  fiberid  
0    STAR -0.000009   3306  54922      491  
1    STAR -0.000055    323  51615      541  
2  GALAXY  0.123111    287  52023      513  
3    STAR -0.000111   3306  54922      510  
4    STAR  0.000590   3306  54922      512  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
ra          10000 non-null float64
dec         10000 non-null float64
u           10000 non-null float64
g           10000 non-null float64
r           10000 n

In [78]:
# Preprocess data.

# Replace the class names in the "class" column with integer category codes.
# NOTE(review): the mapping printed below is hard-coded rather than read from
# the data -- confirm it matches the category order pandas assigns to the column.
print("Mapping: ", {code: name for code, name in enumerate(["GALAXY", "QUASAR", "STAR"])})
data["class"] = data["class"].astype("category").cat.codes
print(data["class"].value_counts().sort_index())

# Separate the label column from the feature columns, then hold out 20% of the
# rows as a test set (fixed seed, split stratified on the labels).
labels = data["class"].copy()
features = data.drop(columns="class")
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
print('Train data shape:', X_train.shape, y_train.shape)
print('Test data shape:', X_test.shape, y_test.shape)


Mapping:  {0: 'GALAXY', 1: 'QUASAR', 2: 'STAR'}
0    4998
1     850
2    4152
Name: class, dtype: int64
Train data shape: (8000, 11) (8000,)
Test data shape: (2000, 11) (2000,)


In [79]:
# Train and test classifier:
# Fit a 1-nearest-neighbour classifier on the training split and report its
# accuracy on the held-out test split.
# NOTE(review): the features are passed in unscaled; nearest-neighbour distances
# are scale-sensitive, so consider standardising them -- confirm whether this
# raw baseline is intentional.
classifier = sklearn.neighbors.KNeighborsClassifier(n_neighbors=1)
classifier.fit(X_train, y_train)
accuracy = classifier.score(X_test, y_test)
print(f'Test accuracy: {accuracy}')

Test accuracy: 0.7365
