# Imports

In [59]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt 
from sklearn.decomposition import PCA
from tqdm import tqdm
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 

from IPython.display import display
import joblib
sns.set_style('whitegrid')

## Load Data

In [60]:
# Load the labelled observation table (indexed by obs_id), keep only the four
# classes used in this notebook, and shuffle the rows.
# A fixed seed makes the shuffled order -- and therefore every displayed
# output below -- reproducible under Restart & Run All.
SHUFFLE_SEED = 42
data_all = (
    pd.read_csv('current_data/train_data.csv', index_col='obs_id')
    .loc[lambda frame: frame['class'].isin(['CV', 'NS', 'BH', 'PULSAR'])]
    .sample(frac=1, random_state=SHUFFLE_SEED)
)
display(data_all)

Unnamed: 0_level_0,class,src_n,src_id,significance,photflux_aper_lolim_m,photflux_aper_lolim_h,photflux_aper_hilim_s,photflux_aper_hilim_b,photflux_aper_lolim_u,photflux_aper_hilim_m,...,var_index,hard_ms_hilim,hard_hm_lolim,hard_hs,hard_ms_lolim,hard_hs_hilim,hard_hm_hilim,hard_ms,hard_hs_lolim,hard_hm
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NS_OBS_341,NS,J1748-2446,NS0057,18.31,-6.689094,-6.103639,-6.656591,-5.465339,-6.425096,-6.044072,...,1.0000,1.0000,0.1568,0.9994,0.4316,1.0000,0.7302,0.9994,0.7302,0.4735
CV_OBS_392,CV,CXOGLB J002402.5-72051,CV0070,22.52,-6.526659,-7.352128,-5.474307,-5.081760,-5.960567,-5.661344,...,0.0000,0.1543,-0.4478,-0.2886,-0.6615,0.1855,0.4816,-0.2961,-0.6927,0.0350
PULSAR_OBS_185,PULSAR,PSR J1635-4735,PL0065,15.74,-6.399982,-5.301725,-6.011441,-4.941195,-4.150653,-6.375202,...,0.0000,-0.0356,0.8751,0.9994,-1.0000,1.0000,1.0000,-0.9994,0.7352,0.9994
PULSAR_OBS_314,PULSAR,PSR J1824-2452,PL0094,5.53,-6.782253,-7.020770,-6.770574,-6.034046,-7.030167,-6.504733,...,1.0000,0.6340,-0.3991,0.2236,0.0593,0.5715,0.1305,0.3385,-0.1230,-0.1312
CV_OBS_572,CV,CXOGLB J002408.2-72043,CV0086,12.82,-6.133713,-5.934047,-5.702021,-5.166917,-6.513939,-5.885056,...,2.5000,-0.0306,0.0531,0.0137,-0.3991,0.2030,0.4054,-0.2236,-0.1718,0.2349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CV_OBS_486,CV,CXOGLB J002405.3-72042,CV0079,10.20,-7.570408,-7.282457,-6.127727,-5.689307,-6.185817,-6.346209,...,0.2535,0.1418,0.3417,0.9994,-1.0000,1.0000,1.0000,-0.9994,0.1168,0.9994
PULSAR_OBS_6,PULSAR,PSR J0100-7211,PL0003,65.51,-4.218819,-4.734475,-3.952725,-3.504456,-4.393726,-4.149170,...,0.0000,-0.1718,-0.5590,-0.6371,-0.2755,-0.5753,-0.4016,-0.2236,-0.6977,-0.4797
BH_OBS_119,BH,J1047+1234,BH0024,56.53,-4.960586,-5.075411,-4.654038,-4.185819,-4.810791,-4.896196,...,0.6000,-0.2355,-0.1768,-0.3798,-0.3217,-0.3279,-0.0493,-0.2786,-0.4316,-0.1124
NS_OBS_165,NS,XMMU J004245.2+41172,NS0044,49.26,-5.010684,-5.050220,-4.959002,-4.299902,-5.288217,-4.856985,...,0.7500,0.2979,-0.1530,0.1237,-0.0081,0.2879,0.1118,0.1412,-0.0394,-0.0200


In [61]:
# Bookkeeping columns (label, source identifiers, significance) that are kept
# out of the feature matrix handed to the classifiers.
info_col = ['class', 'src_n', 'src_id', 'significance']

x_train = data_all.drop(info_col, axis=1)   # everything except the bookkeeping columns
y_train = data_all.loc[:, 'class']          # ground-truth label per observation
i_train = data_all.loc[:, info_col]         # bookkeeping columns, reused for the result table

## Load Classifiers

In [62]:
# Load the pre-trained classifiers from disk.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code --
# only load model files that come from a trusted source.
all_clf = joblib.load('forest/v2/all-clf.joblib')
cv_xrb = joblib.load('forest/v2/cv-xrb.joblib')
cv_pulsar = joblib.load('forest/v2/cv-pulsar.joblib')
xrb_pulsar = joblib.load('forest/v2/xrb-pulsar.joblib')
ns_bh = joblib.load('forest/v2/ns-bh.joblib')

# `cv_pl` used to be a second, independent load of the very same file as
# `cv_pulsar`; keep the name available but reuse the already-loaded model.
cv_pl = cv_pulsar

In [63]:
# Labels in the column order of all_clf.predict_proba(); the cascade below
# indexes this list with a column position, so the order must match the model.
classes = ['CV', 'PULSAR', 'XRB']
# Fail loudly instead of silently mis-routing samples if the loaded model was
# trained with a different label set or order.
assert list(all_clf.classes_) == classes, (
    'all_clf.classes_ = {} does not match the expected order {}'.format(
        list(all_clf.classes_), classes))

In [64]:
# Level-A class probabilities for every row of x_train (one column per class).
# NOTE(review): the cascade cell further down reuses the name `pred_prob_a`
# for a single-sample result, so this full array is overwritten there.
pred_prob_a = all_clf.predict_proba(x_train)
# Preview the first five rows only.
pred_prob_a[:5]

array([[0.01423621, 0.04839492, 0.93736887],
       [0.9680979 , 0.01865407, 0.01324803],
       [0.02968263, 0.86310369, 0.10721367],
       [0.06189056, 0.87366753, 0.0644419 ],
       [0.94409504, 0.01723488, 0.03867008]])

In [65]:
# Level-A predicted labels for every row of x_train.
# Despite its name, `pred_prob_cl` holds class labels, not probabilities.
pred_prob_cl = all_clf.predict(x_train)
# Preview the first five labels only.
pred_prob_cl[:5]

array(['XRB', 'CV', 'PULSAR', 'PULSAR', 'CV'], dtype=object)

In [66]:
def classify_two_level(features):
    """Run the two-level classifier cascade on every row of ``features``.

    Level A (``all_clf``) scores each row against the labels in ``classes``.
    The label with the *lowest* level-A probability is ruled out and the row
    is handed to the classifier trained on the two remaining labels:

    * 'XRB' ruled out       -> ``cv_pulsar``
    * 'CV' ruled out        -> ``xrb_pulsar``
    * anything else (PULSAR) -> ``cv_xrb``

    In the last two cases a row labelled 'XRB' is passed on to ``ns_bh`` and
    that classifier's label and probability replace the 'XRB' result.

    Predictions are made in batches (one call per classifier) rather than one
    row at a time.
    NOTE(review): this assumes each classifier predicts rows independently of
    one another, as scikit-learn estimators do -- confirm for custom models.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix with the columns the classifiers expect.

    Returns
    -------
    (initial_class, initial_prob, final_class, final_prob) : four lists
        Per-row level-A label, level-A top probability, final label and the
        top probability reported by the classifier that produced it.
    """
    n_rows = len(features)
    if n_rows == 0:
        # Nothing to classify; the estimators reject empty input.
        return [], [], [], []

    proba_a = all_clf.predict_proba(features)
    # Plain labels, one per row (the old loop stored a length-1 array per row).
    class_a = np.asarray(all_clf.predict(features), dtype=object)
    # Label each row is least likely to belong to according to level A.
    ruled_out = np.asarray(classes, dtype=object)[np.argmin(proba_a, axis=1)]

    class_b = np.empty(n_rows, dtype=object)
    prob_b = np.empty(n_rows, dtype=float)

    def refine(clf, mask):
        """Overwrite the level-B result of the rows selected by ``mask``."""
        if mask.any():
            subset = features.loc[mask]
            class_b[mask] = clf.predict(subset)
            prob_b[mask] = np.amax(clf.predict_proba(subset), axis=1)

    no_xrb = ruled_out == 'XRB'
    no_cv = ruled_out == 'CV'
    refine(cv_pulsar, no_xrb)
    refine(xrb_pulsar, no_cv)
    refine(cv_xrb, ~(no_xrb | no_cv))
    # Only the branches that can yield 'XRB' are split further into NS / BH.
    refine(ns_bh, ~no_xrb & (class_b == 'XRB'))

    return (class_a.tolist(), np.amax(proba_a, axis=1).tolist(),
            class_b.tolist(), prob_b.tolist())


initial_class, initial_prob, final_class, final_prob = classify_two_level(x_train)

In [72]:
# Side-by-side dump, one line per observation:
# final (level-B) predicted label first, ground-truth label second.
for predicted_label, true_label in zip(final_class, y_train):
    print(predicted_label, true_label)

NS NS
CV CV
PULSAR PULSAR
PULSAR PULSAR
CV CV
NS NS
CV CV
CV CV
NS NS
NS NS
CV CV
CV CV
BH BH
CV CV
BH BH
CV CV
BH BH
NS NS
PULSAR PULSAR
NS NS
BH BH
NS NS
NS NS
CV CV
CV CV
NS NS
PULSAR PULSAR
CV CV
PULSAR PULSAR
NS NS
PULSAR PULSAR
CV CV
PULSAR PULSAR
NS NS
BH BH
CV CV
CV CV
NS NS
BH BH
NS NS
BH BH
CV CV
BH BH
PULSAR CV
PULSAR PULSAR
CV CV
PULSAR PULSAR
PULSAR PULSAR
NS NS
PULSAR PULSAR
CV CV
CV CV
NS NS
BH BH
CV CV
CV CV
PULSAR PULSAR
NS NS
CV CV
CV CV
CV CV
NS NS
CV CV
PULSAR PULSAR
CV CV
NS NS
CV CV
CV CV
BH BH
NS NS
NS NS
CV CV
NS NS
BH BH
NS NS
NS NS
BH BH
CV CV
CV CV
CV NS
BH BH
BH CV
NS NS
CV CV
PULSAR PULSAR
CV CV
CV CV
BH BH
NS NS
BH BH
CV CV
NS NS
CV CV
CV CV
CV CV
BH BH
NS NS
NS NS
NS NS
CV CV
NS NS
NS NS
NS NS
CV CV
PULSAR PULSAR
NS NS
PULSAR PULSAR
PULSAR PULSAR
BH BH
PULSAR PULSAR
NS NS
PULSAR PULSAR
BH BH
NS NS
CV CV
NS NS
PULSAR PULSAR
PULSAR PULSAR
NS NS
BH BH
CV CV
PULSAR BH
CV CV
CV CV
PULSAR PULSAR
PULSAR PULSAR
BH BH
CV CV
CV CV
CV CV
CV CV
PULSAR PULSAR
CV CV
PU

In [73]:
# Minimum level-B probability required to accept a prediction; anything at or
# below it is reported as ambiguous ('X').
CONFIDENCE_THRESHOLD = 0.60

# Result table: bookkeeping columns plus both levels of prediction.
res = i_train.copy()
res.insert(0, 'true_class', y_train)
res.insert(1, 'level_a_prob', initial_prob)
res.insert(2, 'level_a_class', initial_class)
res.insert(3, 'level_b_prob', final_prob)
res.insert(4, 'level_b_class', final_class)

# Row-wise flags, computed for the whole table at once.
is_match = (res['true_class'] == res['level_b_class']).to_numpy()
is_confident = (res['level_b_prob'] > CONFIDENCE_THRESHOLD).to_numpy()

# is_ok:    1 if the final label equals the true label, else 0 (threshold ignored).
# th_class: final label when confident, otherwise the placeholder 'X'.
# truth:    1 = confident and correct, 0 = confident and wrong, 2 = not confident.
is_ok = is_match.astype(int).tolist()
th_class = np.where(is_confident,
                    res['level_b_class'].to_numpy(dtype=object), 'X').tolist()
truth = np.where(is_confident, is_match.astype(int), 2).tolist()

res.insert(5, 'th_class', th_class)
res.insert(6, 'truth', truth)
res.insert(7, 'is_ok', is_ok)

In [74]:
# Take an independent copy of the result table for the summaries below and
# write the same content to disk.
res_data = res.copy()
res_data.to_csv('result_train.csv')

In [75]:
# For every true class, report how many distinct sources (src_id) and how many
# individual observations (rows) the result table contains.
for class_label in ['CV', 'NS', 'BH', 'PULSAR']:
    class_obs = res_data[res_data['true_class'] == class_label]
    class_sources = np.unique(class_obs['src_id'])
    print('Num of {} sources :'.format(class_label), len(class_sources))
    print('Num of {} obs :'.format(class_label), len(class_obs))
    # Blank line between classes is intentionally not printed, matching the
    # original compact output.

Num of CV sources : 64
Num of CV obs : 463
Num of NS sources : 45
Num of NS obs : 271
Num of BH sources : 26
Num of BH obs : 142
Num of PULSAR sources : 112
Num of PULSAR obs : 287


In [76]:
# Overall summary. A sample counts as correct only when it was classified
# confidently (th_class != 'X') AND the confident label is right (truth != 0);
# ambiguous samples therefore lower the reported accuracy.
data = res_data.copy()
total = data.shape[0]
print('Total samples' , total)

is_ambiguous = data['th_class'] == 'X'
am_clf = int(is_ambiguous.sum())
print('Ambiguous calssification : ' , am_clf)

is_wrong = data['truth'] == 0
wrong_clf = int(is_wrong.sum())
print('Wrong calssification : ' , wrong_clf)

n_correct = total - am_clf - wrong_clf
acc = n_correct / total
print(f'Classification accuracy :{acc:.3f}')

Total samples 1163
Ambiguous calssification :  11
Wrong calssification :  17
Classification accuracy :0.976
