# Creating Hybrid dataset from SDSS + LAMOST

    --> http://dr6.lamost.org/v2/sql/s

In [79]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
import warnings
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
import timeit
import os
warnings.filterwarnings("ignore")

In [80]:
sdss = pd.read_csv('input/combine/SDSS_final.csv')
lamost = pd.read_csv('input/combine/LAMOST_final.csv')
sdss = sdss.sample(frac=1)
lamost = lamost.sample(frac=1)

In [81]:
sdss.columns

Index(['objid', 'ra', 'dec', 'u', 'g', 'r', 'i', 'z', 'run', 'rerun', 'camcol',
       'field', 'specobjid', 'class', 'redshift', 'plate', 'mjd', 'fiberid'],
      dtype='object')

In [82]:
lamost.columns

Index(['obsid', 'ra', 'dec', 'redshift', 'class', 'mag1', 'mag2', 'mag3',
       'mag4', 'mag5'],
      dtype='object')

# Lamost

In [83]:
from astropy import units as u
from astropy.coordinates import SkyCoord
from astropy.cosmology import WMAP9 as cosmo

radec = SkyCoord(ra=lamost['ra']*u.degree, dec=lamost['dec']*u.degree, frame='icrs')
#radec.ra.value
#radec.dec.value
galactic = radec.galactic

lamost['l'] = galactic.l.value
lamost['b'] = galactic.b.value


r = cosmo.comoving_distance(lamost['redshift'])
lamost['distance']= r.value

lamost.head()

Unnamed: 0,obsid,ra,dec,redshift,class,mag1,mag2,mag3,mag4,mag5,l,b,distance
12581,6914095,120.78038,30.000975,0.141658,GALAXY,,,17.92,17.24,,191.497204,27.720556,593.448188
1210,134806173,156.175532,-2.306705,0.519072,QSO,20.06,20.08,19.96,19.65,19.57,246.837642,44.010379,1979.342226
22426,511101,81.27087,31.021007,-0.00037,STAR,,20.02,18.23,16.74,,175.842206,-2.615629,-1.599727
23190,109015187,163.850162,42.917264,0.683907,QSO,20.74,19.76,19.13,18.78,18.55,171.451666,61.702672,2497.945821
24235,205032,44.574356,-1.729952,-0.0001,STAR,20.35,17.66,16.35,15.82,15.53,178.452993,-50.39978,-0.431188


In [84]:
def cartesian(dist,alpha,delta):
    x = dist*np.cos(np.deg2rad(delta))*np.cos(np.deg2rad(alpha))
    y = dist*np.cos(np.deg2rad(delta))*np.sin(np.deg2rad(alpha))
    z = dist*np.sin(np.deg2rad(delta))
    return x,y,z

cart = cartesian(lamost['distance'],lamost['ra'],lamost['dec'])
lamost['x_coord'] = cart[0]
lamost['y_coord'] = cart[1]
lamost['z_coord'] = cart[2]

lamost.head()

Unnamed: 0,obsid,ra,dec,redshift,class,mag1,mag2,mag3,mag4,mag5,l,b,distance,x_coord,y_coord,z_coord
12581,6914095,120.78038,30.000975,0.141658,GALAXY,,,17.92,17.24,,191.497204,27.720556,593.448188,-263.006159,441.540637,296.73284
1210,134806173,156.175532,-2.306705,0.519072,QSO,20.06,20.08,19.96,19.65,19.57,246.837642,44.010379,1979.342226,-1809.209826,798.879705,-79.665989
22426,511101,81.27087,31.021007,-0.00037,STAR,,20.02,18.23,16.74,,175.842206,-2.615629,-1.599727,-0.208057,-1.355052,-0.824423
23190,109015187,163.850162,42.917264,0.683907,QSO,20.74,19.76,19.13,18.78,18.55,171.451666,61.702672,2497.945821,-1757.149827,508.831432,1700.955138
24235,205032,44.574356,-1.729952,-0.0001,STAR,20.35,17.66,16.35,15.82,15.53,178.452993,-50.39978,-0.431188,-0.307013,-0.302485,0.013017


In [85]:
lamost['u-r'] = lamost['mag1']-lamost['mag4']

In [86]:
lamost['class'] = lamost['class'].astype('category').cat.codes

In [87]:
lamost = lamost.dropna()

In [88]:
lamost.columns

Index(['obsid', 'ra', 'dec', 'redshift', 'class', 'mag1', 'mag2', 'mag3',
       'mag4', 'mag5', 'l', 'b', 'distance', 'x_coord', 'y_coord', 'z_coord',
       'u-r'],
      dtype='object')

In [89]:
columns_titles = ["mag1","mag2","mag4","mag3","mag5","redshift","l","b","distance","x_coord","y_coord","z_coord","u-r","class"]
data_lamost=lamost.reindex(columns=columns_titles)

In [90]:
data_lamost.rename(columns = {'mag1':'u','mag2':'g','mag3':'i','mag4':'r','mag5':'z'}, inplace = True)

In [91]:
data_lamost

Unnamed: 0,u,g,r,i,z,redshift,l,b,distance,x_coord,y_coord,z_coord,u-r,class
1210,20.06,20.08,19.65,19.96,19.57,0.519072,246.837642,44.010379,1979.342226,-1809.209826,798.879705,-79.665989,0.41,1
23190,20.74,19.76,18.78,19.13,18.55,0.683907,171.451666,61.702672,2497.945821,-1757.149827,508.831432,1700.955138,1.96,1
24235,20.35,17.66,15.82,16.35,15.53,-0.000100,178.452993,-50.399780,-0.431188,-0.307013,-0.302485,0.013017,4.53,2
18213,19.36,16.85,15.23,15.67,14.95,1.064190,201.692465,65.839783,3520.270168,-2976.542146,770.028962,1714.512848,4.13,1
24520,18.85,18.09,17.65,17.90,17.64,0.089774,102.650682,-55.255145,380.616363,378.552269,9.292540,38.479145,1.20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27550,21.62,19.38,17.33,17.85,17.02,0.260848,234.326905,40.329634,1061.995227,-895.410554,568.786435,50.554860,4.29,0
27671,20.66,18.89,17.40,17.88,17.04,0.126548,122.857269,-61.866317,532.014695,518.663948,118.068622,9.335163,3.26,0
17611,18.46,17.04,16.03,16.44,15.89,0.055701,163.453755,-48.904738,237.973331,185.847776,146.675073,24.048565,2.43,0
3317,19.65,19.28,19.09,19.27,19.20,0.582793,83.864469,-48.251838,2185.747500,2121.407602,-456.550866,262.074851,0.56,1


# SDSS

In [92]:
from astropy import units as u
from astropy.coordinates import SkyCoord
from astropy.cosmology import WMAP9 as cosmo

radec = SkyCoord(ra=sdss['ra']*u.degree, dec=sdss['dec']*u.degree, frame='icrs')
#radec.ra.value
#radec.dec.value
galactic = radec.galactic

sdss['l'] = galactic.l.value
sdss['b'] = galactic.b.value


r = cosmo.comoving_distance(sdss['redshift'])
sdss['distance']= r.value

sdss.head()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,...,field,specobjid,class,redshift,plate,mjd,fiberid,l,b,distance
6126,1.24e+18,132.64813,0.513992,23.63103,21.7563,19.86714,19.027,18.59907,1239,301,...,82,4.3e+18,GALAXY,0.423029,3815,55537,515,227.066523,26.501462,1653.564621
22996,1.24e+18,28.47987,14.842659,26.41812,21.94439,20.76235,19.54128,18.87422,1035,301,...,149,5.77e+18,GALAXY,0.617852,5123,55841,42,144.679447,-45.374307,2296.091051
13415,1.24e+18,154.029671,0.22016,23.22405,22.70523,21.39361,19.95633,19.61973,756,301,...,255,4.31e+18,GALAXY,0.498464,3831,55543,573,242.227468,44.031861,1910.942298
7398,1.24e+18,134.210356,54.04265,18.37194,16.43596,15.51614,15.06265,14.75283,1345,301,...,181,5.06e+17,STAR,-0.000124,449,51900,497,163.723918,39.865513,-0.53817
22562,1.24e+18,168.679603,-3.000609,23.04556,21.63496,20.55599,19.69412,19.15873,1140,301,...,105,4.26e+18,STAR,0.000349,3788,55246,207,261.410904,51.898771,1.511196


In [93]:
def cartesian(dist,alpha,delta):
    x = dist*np.cos(np.deg2rad(delta))*np.cos(np.deg2rad(alpha))
    y = dist*np.cos(np.deg2rad(delta))*np.sin(np.deg2rad(alpha))
    z = dist*np.sin(np.deg2rad(delta))
    return x,y,z

cart = cartesian(sdss['distance'],sdss['ra'],sdss['dec'])
sdss['x_coord'] = cart[0]
sdss['y_coord'] = cart[1]
sdss['z_coord'] = cart[2]

sdss.head()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,...,redshift,plate,mjd,fiberid,l,b,distance,x_coord,y_coord,z_coord
6126,1.24e+18,132.64813,0.513992,23.63103,21.7563,19.86714,19.027,18.59907,1239,301,...,0.423029,3815,55537,515,227.066523,26.501462,1653.564621,-1120.235149,1216.194527,14.833685
22996,1.24e+18,28.47987,14.842659,26.41812,21.94439,20.76235,19.54128,18.87422,1035,301,...,0.617852,5123,55841,42,144.679447,-45.374307,2296.091051,1950.886521,1058.357652,588.179373
13415,1.24e+18,154.029671,0.22016,23.22405,22.70523,21.39361,19.95633,19.61973,756,301,...,0.498464,3831,55543,573,242.227468,44.031861,1910.942298,-1717.96445,836.80625,7.342801
7398,1.24e+18,134.210356,54.04265,18.37194,16.43596,15.51614,15.06265,14.75283,1345,301,...,-0.000124,449,51900,497,163.723918,39.865513,-0.53817,0.220348,-0.226507,-0.435624
22562,1.24e+18,168.679603,-3.000609,23.04556,21.63496,20.55599,19.69412,19.15873,1140,301,...,0.000349,3788,55246,207,261.410904,51.898771,1.511196,-1.479764,0.296234,-0.079106


In [94]:
sdss['u-r'] = sdss['u']-sdss['r']
sdss['class'] = sdss['class'].astype('category').cat.codes
sdss = sdss.dropna()

In [95]:
columns_titles = ["u","g","r","i","z","redshift","l","b","distance","x_coord","y_coord","z_coord","u-r","class"]
data_sdss=sdss.reindex(columns=columns_titles)

In [96]:
data_sdss

Unnamed: 0,u,g,r,i,z,redshift,l,b,distance,x_coord,y_coord,z_coord,u-r,class
6126,23.63103,21.75630,19.86714,19.02700,18.59907,0.423029,227.066523,26.501462,1653.564621,-1120.235149,1216.194527,14.833685,3.76389,0
22996,26.41812,21.94439,20.76235,19.54128,18.87422,0.617852,144.679447,-45.374307,2296.091051,1950.886521,1058.357652,588.179373,5.65577,0
13415,23.22405,22.70523,21.39361,19.95633,19.61973,0.498464,242.227468,44.031861,1910.942298,-1717.964450,836.806250,7.342801,1.83044,0
7398,18.37194,16.43596,15.51614,15.06265,14.75283,-0.000124,163.723918,39.865513,-0.538170,0.220348,-0.226507,-0.435624,2.85580,2
22562,23.04556,21.63496,20.55599,19.69412,19.15873,0.000349,261.410904,51.898771,1.511196,-1.479764,0.296234,-0.079106,2.48957,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6836,21.79394,20.98301,20.70273,20.73454,20.59291,-0.000209,272.440548,57.374510,-0.901905,0.900258,-0.047562,0.026552,1.09121,2
24942,23.26712,21.52811,21.35841,21.39918,21.56435,2.890654,257.431448,50.199194,6393.431889,-6186.745059,1581.796244,-313.173340,1.90871,1
11968,24.96220,22.09758,21.99999,21.91905,21.26641,3.039737,250.282850,46.431248,6542.973301,-6127.436228,2278.233410,-273.271695,2.96221,1
27364,19.23483,18.39306,18.15089,18.05317,18.07109,0.000053,151.947598,45.548241,0.227480,-0.096163,0.057307,0.198030,1.08394,2


# Join

In [97]:
data_lamost.head(15000)['class'].value_counts()

1    6101
0    5836
2    3063
Name: class, dtype: int64

In [98]:
data_sdss.head(15000)['class'].value_counts()

0    5009
1    5003
2    4988
Name: class, dtype: int64

In [99]:
final_lamost = data_lamost.head(15000)
final_sdss = data_sdss.head(15000)

In [100]:
dataframes = [final_sdss,final_lamost]
df = pd.concat(dataframes)

In [101]:
df.head()

Unnamed: 0,u,g,r,i,z,redshift,l,b,distance,x_coord,y_coord,z_coord,u-r,class
6126,23.63103,21.7563,19.86714,19.027,18.59907,0.423029,227.066523,26.501462,1653.564621,-1120.235149,1216.194527,14.833685,3.76389,0
22996,26.41812,21.94439,20.76235,19.54128,18.87422,0.617852,144.679447,-45.374307,2296.091051,1950.886521,1058.357652,588.179373,5.65577,0
13415,23.22405,22.70523,21.39361,19.95633,19.61973,0.498464,242.227468,44.031861,1910.942298,-1717.96445,836.80625,7.342801,1.83044,0
7398,18.37194,16.43596,15.51614,15.06265,14.75283,-0.000124,163.723918,39.865513,-0.53817,0.220348,-0.226507,-0.435624,2.8558,2
22562,23.04556,21.63496,20.55599,19.69412,19.15873,0.000349,261.410904,51.898771,1.511196,-1.479764,0.296234,-0.079106,2.48957,2


In [102]:
df = df.sample(frac=1)

In [104]:
df.to_csv('input/Hybrid_final.csv',index= False)