# Load Modules

In [7]:
import pandas as pd 
from matplotlib import pyplot as plt 
import numpy as np 
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier 
import sklearn.impute._iterative as itimp
from sklearn.model_selection import train_test_split 
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from IPython.display import display

In [11]:
# Read the photometry table and keep only the rows whose var_flag equals 1,
# renumbering the surviving rows from zero.
# The streak_src_flag / pileup_flag cuts and the merging of CV / LMXB / HMXB /
# PULSAR labels into a single 'XRB' class are intentionally not applied here.
df = pd.read_csv('unid_sources_phot.csv')
df = df.loc[df['var_flag'] == 1].reset_index(drop=True)

# Per-class row counts after the var_flag cut.
display(df['class'].value_counts())



U    40522
Name: class, dtype: int64

In [10]:
# For every column of df, compute the percentage of rows that are null and
# tabulate the complementary "percent available", sparsest columns first.
percent_missing = df.isna().sum() * 100 / len(df)
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_available': 100 - percent_missing,
    }
).sort_values(by='percent_available', ascending=True)
missing_value_df

Unnamed: 0,column_name,percent_available
2-10 keV (XMM),2-10 keV (XMM),2.884853
FUV (GALEX) AB,FUV (GALEX) AB,4.367998
NUV (GALEX) AB,NUV (GALEX) AB,4.496323
0.5-8 keV (Chandra),0.5-8 keV (Chandra),5.35018
0.5-2 keV (Chandra),0.5-2 keV (Chandra),5.480973
u (SDSS PSF) AB,u (SDSS PSF) AB,7.988253
z (SDSS PSF) AB,z (SDSS PSF) AB,8.040077
r (SDSS PSF) AB,r (SDSS PSF) AB,8.252307
i (SDSS PSF) AB,i (SDSS PSF) AB,8.351019
g (SDSS PSF) AB,g (SDSS PSF) AB,8.536104


In [6]:
# Derive difference columns from pairs of 2MASS/CTIO and WISE band columns.
#
# Naming convention: a column called "<A><B>" stores band A minus band B, so
# the band named first is always the minuend. Driving the computation from a
# table keeps the column name and the operand order in lock-step; W1W3 is
# therefore W1 - W3, like every other column here (not W3 - W1).
color_pairs = {
    'JH': ('J (2MASS/CTIO)', 'H (2MASS/CTIO)'),
    'JK': ('J (2MASS/CTIO)', 'K_s (2MASS/CTIO)'),
    'HK': ('H (2MASS/CTIO)', 'K_s (2MASS/CTIO)'),
    'W1W4': ('W1 (WISE)', 'W4 (WISE)'),
    'W1W2': ('W1 (WISE)', 'W2 (WISE)'),
    'W1W3': ('W1 (WISE)', 'W3 (WISE)'),
    'W2W4': ('W2 (WISE)', 'W4 (WISE)'),
    'W2W3': ('W2 (WISE)', 'W3 (WISE)'),
    'W3W4': ('W3 (WISE)', 'W4 (WISE)'),
}
# Columns are added to df in the order listed above; re-running the cell
# simply overwrites them with the same values.
for color_name, (first_band, second_band) in color_pairs.items():
    df[color_name] = df[first_band] - df[second_band]

In [11]:
# Columns selected as model features, in the order they are used downstream.
# Not included: the W1W2 / W1W3 / W2W3 / W2W4 / W3W4 difference columns,
# '2-10 keV (XMM)' and '1-2 keV (Chandra)'.
feat_to_use = [
    # hard_* / var_* / ks_* / kp_* catalogue columns
    'hard_hm',
    'var_inter_hard_prob_hm',
    'var_inter_hard_sigma_hm',
    'hard_hs',
    'var_inter_hard_prob_hs',
    'hard_ms',
    'var_inter_hard_prob_ms',
    'var_intra_index_b',
    'var_intra_prob_b',
    'ks_intra_prob_b',
    'kp_intra_prob_b',
    'var_inter_index_b',
    'var_inter_prob_b',
    'var_inter_sigma_b',
    # gal_* columns
    'gal_l2',
    'gal_b2',
    # difference columns derived from the 2MASS/CTIO and WISE bands
    'JK',
    'JH',
    'HK',
    'W1W4',
    # Chandra bands
    '0.5-7 keV Chandra',
    '1.2-2 keV Chandra',
    '0.5-1.2 keV Chandra',
    '0.2-0.5 keV Chandra',
    '0.3-8 keV (Chandra)',
    '0.5-2 keV (Chandra)',
    '2-7 keV Chandra',
    # GALEX bands
    'FUV (GALEX) AB',
    'NUV (GALEX) AB',
    # SDSS PSF bands
    'u (SDSS PSF) AB',
    'g (SDSS PSF) AB',
    'r (SDSS PSF) AB',
    'i (SDSS PSF) AB',
    'z (SDSS PSF) AB',
    # WISE bands
    'W1 (WISE)',
    'W2 (WISE)',
    'W3 (WISE)',
    'W4 (WISE)',
    # 2MASS/CTIO bands
    'J (2MASS/CTIO)',
    'H (2MASS/CTIO)',
    'K_s (2MASS/CTIO)',
    # IRAC / MIPS bands
    '4.5 microns (IRAC)',
    '8.0 microns (IRAC)',
    '24 microns (MIPS)',
    '3.6 microns (IRAC)',
    '5.8 microns (IRAC)',
]


In [12]:
# Restrict df to the chosen feature columns, then keep only those features
# that are non-null in more than 10 percent of the rows.
df_small = df[feat_to_use]
percent_missing = df_small.isna().sum() * 100 / len(df_small)
missing_value_df = (
    pd.DataFrame(
        {
            'column_name': df_small.columns,
            'percent_available': 100 - percent_missing,
        }
    )
    .sort_values(by='percent_available', ascending=True)
    .loc[lambda table: table['percent_available'] > 10]
)
display(missing_value_df)

# Surviving columns, ordered from least to most populated.
df_small = df_small[missing_value_df['column_name']]
df_small

Unnamed: 0,column_name,percent_available
8.0 microns (IRAC),8.0 microns (IRAC),13.306092
5.8 microns (IRAC),5.8 microns (IRAC),13.606688
24 microns (MIPS),24 microns (MIPS),13.771077
4.5 microns (IRAC),4.5 microns (IRAC),13.836832
3.6 microns (IRAC),3.6 microns (IRAC),14.015312
W1W4,W1W4,16.598563
W1 (WISE),W1 (WISE),16.607956
W2 (WISE),W2 (WISE),16.61735
W3 (WISE),W3 (WISE),16.781739
W4 (WISE),W4 (WISE),16.795829


Unnamed: 0,8.0 microns (IRAC),5.8 microns (IRAC),24 microns (MIPS),4.5 microns (IRAC),3.6 microns (IRAC),W1W4,W1 (WISE),W2 (WISE),W3 (WISE),W4 (WISE),...,var_intra_prob_b,hard_ms,hard_hs,1.2-2 keV Chandra,hard_hm,0.5-1.2 keV Chandra,0.2-0.5 keV Chandra,0.5-7 keV Chandra,gal_b2,gal_l2
0,,,,17.19,17.580,,,,,,...,0.024076,0.164897,0.301062,3.757750e-15,0.121174,5.734200e-15,3.659900e-14,4.185450e-14,-43.608193,303.306304
1,,,,,,,,,,,...,0.236336,-0.056215,-0.296065,2.411850e-15,-0.233604,3.675450e-15,5.000950e-15,3.730250e-15,-43.626698,303.220840
2,,,,,,,,,,,...,0.090023,0.036227,0.438476,1.618950e-15,0.382261,3.623200e-15,1.997550e-14,2.067000e-14,-44.267982,302.584181
3,,,,,,,,,,,...,0.269411,0.174891,-0.999375,2.021900e-15,-0.999375,2.819350e-15,8.615850e-15,6.643300e-15,-43.853723,301.995228
4,,,,,,,,,,,...,0.309891,0.999375,0.999375,1.037885e-15,0.404747,2.669750e-15,1.453750e-14,1.599900e-14,-43.931660,301.988531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21286,,,,,,,,,,,...,0.159907,-0.999375,0.081199,7.680750e-17,0.999375,7.414100e-17,5.067800e-16,3.240550e-16,19.413367,309.497154
21287,124.7065,66.1155,140.165,22.23,35.563,,,,,,...,0.672005,-0.181137,-0.197377,2.714800e-16,-0.011243,3.012400e-16,7.017600e-16,1.130950e-15,19.375066,309.556783
21288,,,,,,,,,,,...,0.999555,-0.121174,0.052467,3.404575e-14,0.172392,3.505325e-14,1.229457e-13,1.869300e-13,57.634609,233.490377
21289,,,,,,,,,,,...,0.677867,-0.039975,-0.227358,1.913725e-13,-0.188632,2.970125e-13,4.986100e-13,9.463225e-13,-4.143069,35.718416


In [13]:

# Feature matrix: the feat_to_use columns of df, shown as the cell output.
x = df.loc[:, feat_to_use]
x

Unnamed: 0,hard_hm,var_inter_hard_prob_hm,var_inter_hard_sigma_hm,hard_hs,var_inter_hard_prob_hs,hard_ms,var_inter_hard_prob_ms,var_intra_index_b,var_intra_prob_b,ks_intra_prob_b,...,W3 (WISE),W4 (WISE),J (2MASS/CTIO),H (2MASS/CTIO),K_s (2MASS/CTIO),4.5 microns (IRAC),8.0 microns (IRAC),24 microns (MIPS),3.6 microns (IRAC),5.8 microns (IRAC)
0,0.121174,,,0.301062,,0.164897,,0.0,0.024076,0.965033,...,,,,,,17.19,,,17.580,
1,-0.233604,,,-0.296065,,-0.056215,,0.0,0.236336,0.984416,...,,,,,,,,,,
2,0.382261,,,0.438476,,0.036227,,0.0,0.090023,0.419304,...,,,,,,,,,,
3,-0.999375,,,-0.999375,,0.174891,,0.0,0.269411,0.348119,...,,,,,,,,,,
4,0.404747,,,0.999375,,0.999375,,0.0,0.309891,0.454695,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21286,0.999375,0.215726,0.368378,0.081199,0.614123,-0.999375,0.899631,0.0,0.159907,0.809765,...,,,,,,,,,,
21287,-0.011243,0.380301,0.382474,-0.197377,0.651394,-0.181137,0.640034,2.0,0.672005,0.727189,...,,,,,,22.23,124.7065,140.165,35.563,66.1155
21288,0.172392,0.999963,0.199800,0.052467,1.000000,-0.121174,0.999999,7.0,0.999555,0.999938,...,,,,,,,,,,
21289,-0.188632,1.000000,0.198482,-0.227358,1.000000,-0.039975,0.916848,2.0,0.677867,0.998727,...,,,15.845,15.258,15.323,,,,,


In [15]:
# Standardise the feature matrix x, fill its missing values, and rebuild it as
# a DataFrame that keeps the original row labels and column names.
# Also defines id (bookkeeping columns) and y (the 'class' column) from df.

# IterativeImputer is experimental in scikit-learn: the enabling import has to
# run before it can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import Normalizer

# Capture the row labels and column names before x is turned into an ndarray.
row_labels = x.index
index = row_labels.to_list()
cols = x.columns.to_list()
display(x)

# Bookkeeping columns kept alongside the features, and the target column.
info_col = ['name', 'catalog', 'class', 'var_flag',
            'streak_src_flag', 'pileup_flag', 'ra', 'dec']
# NOTE(review): `id` shadows the Python builtin; the name is kept because
# cells outside this one may refer to it.
id = df[info_col]
y = df['class']

scalar = StandardScaler()
imputer = IterativeImputer()
# norm and eps are not used in this cell; they stay defined in case later
# cells rely on them.
norm = Normalizer()
eps = 1e-20

# Scaling is applied first, then imputation; both steps return plain arrays.
x = scalar.fit_transform(x)
x = imputer.fit_transform(x)

# Re-attach the original row labels as well as the column names so that x
# stays row-aligned with y and id even if df does not carry a default index.
x = pd.DataFrame(x, columns=cols, index=row_labels)
x

Unnamed: 0,hard_hm,var_inter_hard_prob_hm,var_inter_hard_sigma_hm,hard_hs,var_inter_hard_prob_hs,hard_ms,var_inter_hard_prob_ms,var_intra_index_b,var_intra_prob_b,ks_intra_prob_b,...,W3 (WISE),W4 (WISE),J (2MASS/CTIO),H (2MASS/CTIO),K_s (2MASS/CTIO),4.5 microns (IRAC),8.0 microns (IRAC),24 microns (MIPS),3.6 microns (IRAC),5.8 microns (IRAC)
0,0.121174,,,0.301062,,0.164897,,0.0,0.024076,0.965033,...,,,,,,17.19,,,17.580,
1,-0.233604,,,-0.296065,,-0.056215,,0.0,0.236336,0.984416,...,,,,,,,,,,
2,0.382261,,,0.438476,,0.036227,,0.0,0.090023,0.419304,...,,,,,,,,,,
3,-0.999375,,,-0.999375,,0.174891,,0.0,0.269411,0.348119,...,,,,,,,,,,
4,0.404747,,,0.999375,,0.999375,,0.0,0.309891,0.454695,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21286,0.999375,0.215726,0.368378,0.081199,0.614123,-0.999375,0.899631,0.0,0.159907,0.809765,...,,,,,,,,,,
21287,-0.011243,0.380301,0.382474,-0.197377,0.651394,-0.181137,0.640034,2.0,0.672005,0.727189,...,,,,,,22.23,124.7065,140.165,35.563,66.1155
21288,0.172392,0.999963,0.199800,0.052467,1.000000,-0.121174,0.999999,7.0,0.999555,0.999938,...,,,,,,,,,,
21289,-0.188632,1.000000,0.198482,-0.227358,1.000000,-0.039975,0.916848,2.0,0.677867,0.998727,...,,,15.845,15.258,15.323,,,,,


Unnamed: 0,hard_hm,var_inter_hard_prob_hm,var_inter_hard_sigma_hm,hard_hs,var_inter_hard_prob_hs,hard_ms,var_inter_hard_prob_ms,var_intra_index_b,var_intra_prob_b,ks_intra_prob_b,...,W3 (WISE),W4 (WISE),J (2MASS/CTIO),H (2MASS/CTIO),K_s (2MASS/CTIO),4.5 microns (IRAC),8.0 microns (IRAC),24 microns (MIPS),3.6 microns (IRAC),5.8 microns (IRAC)
0,-0.206443,-0.153255,0.027063,0.324878,0.065071,0.554602,-0.057715,-1.113900,-1.961016,0.676989,...,-0.043402,0.005605,0.014311,0.023037,0.022078,-0.083028,0.022727,0.007897,-0.085207,-0.072883
1,-0.898602,0.016518,0.159653,-0.572280,0.227013,0.151933,0.027128,-1.113900,-1.313384,0.745567,...,-0.006566,0.001283,0.012460,0.020057,0.018243,0.014691,0.234840,-0.013982,-0.002608,-0.001272
2,0.302928,-0.454240,0.157916,0.531337,-0.047134,0.320280,-0.184002,-1.113900,-1.759803,-1.253865,...,-0.014620,0.020793,0.027259,0.043880,0.048907,0.010104,0.185020,-0.023191,0.007589,0.001269
3,-2.392594,0.232351,0.508174,-1.628975,0.457433,0.572801,0.238621,-1.113900,-1.212467,-1.505727,...,0.039520,0.000906,0.014502,0.023344,0.022474,0.011414,0.245760,0.011290,-0.009351,-0.000162
4,0.346797,-0.394855,0.072628,1.374065,-0.285957,2.074277,-0.205358,-1.113900,-1.088958,-1.128648,...,-0.039189,0.019534,0.022286,0.035874,0.038602,0.014104,0.270867,0.002096,-0.004907,0.002134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21286,1.506895,-0.490250,-0.154790,-0.005456,0.762726,-1.565665,1.527433,-1.113900,-1.546578,0.127631,...,-0.728620,0.146206,0.012917,0.020792,0.019189,-0.019421,-0.395052,-0.205666,-0.034900,-0.075807
21287,-0.464784,0.008326,-0.108214,-0.424005,0.872227,-0.075563,0.768399,-0.482114,0.015893,-0.164533,...,-0.059077,-0.110495,0.010444,0.016812,0.014065,-0.083010,-0.283308,-0.086964,-0.085162,-0.115038
21288,-0.106518,1.885573,-0.711816,-0.048625,1.896424,0.033635,1.820899,1.097353,1.015289,0.800486,...,-1.046381,0.319560,-0.003816,-0.006142,-0.015481,-0.069150,0.520758,0.435548,-0.170218,0.023222
21289,-0.810864,1.885686,-0.716173,-0.469051,1.896424,0.181508,1.577774,-0.482114,0.033781,0.796200,...,-0.063931,0.007338,0.956662,1.310840,1.618715,-0.325667,-0.028498,0.169525,-0.508762,-0.183600
