In [5]:
import numpy as np 
from matplotlib import pyplot as plt 
import seaborn as sns 
from tqdm import tqdm 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier  , AdaBoostClassifier , GradientBoostingClassifier
from sklearn.metrics import precision_score , recall_score 
import xgboost as xgb 
import pandas as pd 
import sklearn.neighbors._base
from os import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest 
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay
sns.set_style('whitegrid')
%load_ext autoreload
%autoreload 2
sns.set_style('whitegrid')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
def details(data_sent , comments = ''):
    if(comments):
        print(comments)
    sp = (data_sent.isna().sum().sum())/ data.size
    print('________________________________________________')
    print('Sparsity in the data : {:.2f}'.format(sp))
    num_rows = data_sent.shape
    print('Data shape' , num_rows)
    #display(data_sent['class'].value_counts())
    print('Number of sources : ')
    cl = data_sent['class'].unique()
    for c in cl:
        num_src = len(data_sent[data_sent['class']==c]['src_id'].unique())
        num_obs = len(data_sent[data_sent['class']==c])
        print(c ,' \t ' , num_src , '\t' , num_obs)

In [7]:
from features import phot_flux , en_flux , hard, powlaw_fit , bb_fit , brems_fit , intra_obs_var , inter_ob_var , info_col , phot_flux_hilim , phot_flux_lolim , en_flux_hilim , en_flux_lolim

In [8]:
feat_to_use = info_col + phot_flux + phot_flux_hilim + phot_flux_lolim + en_flux + en_flux_hilim + en_flux_lolim + powlaw_fit +bb_fit +hard+ intra_obs_var +inter_ob_var 
data_cv = pd.read_csv('filtered_data/cv_new_data.csv' , index_col='obs_id')[feat_to_use]
data_cv.insert(0 , 'class' , ['CV']*len(data_cv))
data_pl  =  pd.read_csv('filtered_data/pl_new_data.csv' , index_col = 'obs_id')[feat_to_use]
data_pl.insert(0 , 'class' , ['PL']*len(data_pl))
data_lx  =  pd.read_csv('filtered_data/lmxb_data.csv' , index_col = 'obs_id')[feat_to_use]
data_lx.insert(0 , 'class' , ['LX']*len(data_lx))

### Add old pulsar data also

data_pl  =  pd.read_csv('filtered_data/pl_new_data.csv' , index_col = 'obs_id')[feat_to_use]
data_pl.insert(0 , 'class' , ['PL']*len(data_pl))

In [5]:
data = pd.concat([data_cv , data_pl , data_lx] , axis=0)
data

Unnamed: 0_level_0,class,src_id,num_obs,src_n,name,ra,dec,livetime,significance,likelihood,...,var_inter_sigma_h,var_inter_index_m,var_inter_prob_m,var_inter_sigma_m,var_inter_index_s,var_inter_prob_s,var_inter_sigma_s,var_inter_index_u,var_inter_prob_u,var_inter_sigma_u
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CV_0_obs_0,CV,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,4903.5,11.31,304.46460,...,,,,,,,,,,
CV_0_obs_1,CV,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3974.2,11.31,304.46460,...,,,,,,,,,,
CV_0_obs_2,CV,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,...,,,,,,,,,,
CV_0_obs_3,CV,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,...,,,,,,,,,,
CV_0_obs_4,CV,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LMXB_86_obs_0,LX,LMXB_86,1,SAX_J1810.8-2609_____,2CXO J181044.4-260901,272.68530,-26.150370,31871.5,9.29,512.97550,...,,,,,,,,,,
LMXB_90_obs_0,LX,LMXB_90,1,XTE_J2123-058________,2CXO J212314.5-054753,320.81060,-5.798128,17705.1,4.06,132.76700,...,,,,,,,,,,
LMXB_91_obs_0,LX,LMXB_91,1,XB_1732-304__________,2CXO J173547.0-302858,263.94590,-30.482820,18855.8,4.89,114.24570,...,8.934000e-09,5.0,0.8,1.257000e-07,,0.247,5.992000e-09,,,
LMXB_92_obs_0,LX,LMXB_92,1,BW_ANT_______________,2CXO J092920.1-312303,142.33410,-31.384270,24445.9,3.70,75.53454,...,,,,,,,,,,


In [6]:
sp = [] 
for f in feat_to_use:
    #print(f)
    na = data[feat_to_use][f].isna().value_counts()
    try:
        sp.append(float(1-na[0]/(na[0]+na[1])))
    except:
        sp.append(0)
sp =  np.asarray(sp)
sparsity = pd.DataFrame(
    {
        "feat" : feat_to_use ,
        "sp_val" :sp
    }
).sort_values(by='sp_val' , ascending=False).reset_index(drop=True)
sparsity

Unnamed: 0,feat,sp_val
0,var_index_u,0.953091
1,var_index_s,0.878587
2,var_index_h,0.876380
3,kp_prob_u,0.871413
4,ks_prob_u,0.870861
...,...,...
101,ra,0.000000
102,num_obs,0.000000
103,name,0.000000
104,src_n,0.000000


In [7]:
sparse_feat = sparsity[sparsity['sp_val']>0.5].sort_values(by='sp_val').reset_index(drop=True)
sparse_feat = sparse_feat['feat'].to_list()
#sparse_feat

In [8]:
dense_feat = sparsity[sparsity['sp_val']<0.5].sort_values(by='sp_val').reset_index(drop=True)
dense_feat = dense_feat['feat'].to_list()
#dense_feat

In [9]:
data_use = data[dense_feat+['class']]
details(data , 'Before removing sparse columns')
details(data_use , 'After removing sparse columns')

Before removing sparse columns
________________________________________________
Sparsity in the data : 0.44
Data shape (1812, 107)
Number of sources : 
CV  	  60 	 994
PL  	  92 	 297
LX  	  58 	 521
After removing sparse columns
________________________________________________
Sparsity in the data : 0.13
Data shape (1812, 58)
Number of sources : 
CV  	  60 	 994
PL  	  92 	 297
LX  	  58 	 521


In [10]:
flux_feat = phot_flux + phot_flux_lolim + phot_flux_hilim + en_flux + en_flux_lolim + en_flux_hilim
flux_feat_avail = list(set(flux_feat) & set(dense_feat))
def take_log(data_sent):
    x_train_log = data_sent.copy()
    for f in flux_feat_avail[:]:
        x_train_log.loc[:,f] = -(np.log10(x_train_log.loc[:,f]))
    x_train_log = x_train_log.replace(-np.inf , np.nan)
    x_train_log = x_train_log.replace(np.inf , np.nan)
    return x_train_log


In [11]:
df_log = take_log(data_use)
df_log

Unnamed: 0_level_0,src_id,livetime,dec,significance,src_n,ra,num_obs,name,likelihood,photflux_aper_hilim_b,...,photflux_aper_s,var_inter_index_h,var_inter_index_m,flux_aper_lolim_m,photflux_aper_lolim_m,flux_aper_s,photflux_aper_lolim_s,flux_aper_lolim_s,var_inter_index_s,class
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CV_0_obs_0,CV_0,4903.5,41.255820,11.31,[HPH2013]_176__________,10.68773,33,2CXO J004245.0+411520,304.46460,,...,,,,,,,,,,CV
CV_0_obs_1,CV_0,3974.2,41.255820,11.31,[HPH2013]_176__________,10.68773,33,2CXO J004245.0+411520,304.46460,5.909742,...,,,,,,,,,,CV
CV_0_obs_2,CV_0,3907.5,41.255820,11.31,[HPH2013]_176__________,10.68773,33,2CXO J004245.0+411520,304.46460,5.705754,...,5.808270,,,,,14.532244,,15.762708,,CV
CV_0_obs_3,CV_0,3907.5,41.255820,11.31,[HPH2013]_176__________,10.68773,33,2CXO J004245.0+411520,304.46460,5.847100,...,,,,,,,,,,CV
CV_0_obs_4,CV_0,3907.5,41.255820,11.31,[HPH2013]_176__________,10.68773,33,2CXO J004245.0+411520,304.46460,5.394264,...,,,,,,,,,,CV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LMXB_86_obs_0,LMXB_86,31871.5,-26.150370,9.29,SAX_J1810.8-2609_____,272.68530,1,2CXO J181044.4-260901,512.97550,4.904831,...,5.338755,,,14.444543,5.786217,14.189364,5.405497,14.260507,,LX
LMXB_90_obs_0,LMXB_90,17705.1,-5.798128,4.06,XTE_J2123-058________,320.81060,1,2CXO J212314.5-054753,132.76700,5.360115,...,5.737786,,,15.478078,6.884057,14.499626,5.889410,14.651500,,LX
LMXB_91_obs_0,LMXB_91,18855.8,-30.482820,4.89,XB_1732-304__________,263.94590,1,2CXO J173547.0-302858,114.24570,5.235226,...,6.466864,,5.0,14.373147,5.747875,15.291664,7.068949,15.893809,,LX
LMXB_92_obs_0,LMXB_92,24445.9,-31.384270,3.70,BW_ANT_______________,142.33410,1,2CXO J092920.1-312303,75.53454,5.521289,...,6.135904,,,15.357437,6.734946,14.902049,6.357733,15.133535,,LX


# All obs Imputation

In [12]:
info_col_cl = info_col + ['class']

In [13]:
data_id = df_log[info_col_cl]
data_val = df_log.drop(columns = info_col_cl)
rf_imputer = MissForest(verbose=0 , decreasing=True)
new_data = rf_imputer.fit_transform(data_val)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


In [14]:
new_imp_data =  pd.DataFrame(new_data , columns = data_val.columns.to_list()  , index=data_val.index.to_list())
new_imp_data.index.name = 'obs_id'

In [15]:
data_imp_v2 = pd.concat([data_id , new_imp_data] , axis=1)
data_imp_v2

Unnamed: 0_level_0,src_id,num_obs,src_n,name,ra,dec,livetime,significance,likelihood,class,...,var_prob_b,photflux_aper_s,var_inter_index_h,var_inter_index_m,flux_aper_lolim_m,photflux_aper_lolim_m,flux_aper_s,photflux_aper_lolim_s,flux_aper_lolim_s,var_inter_index_s
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CV_0_obs_0,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,4903.5,11.31,304.46460,CV,...,0.33815,6.261572,5.04,5.24,14.852762,6.218768,15.057965,6.500526,15.294337,4.17
CV_0_obs_1,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3974.2,11.31,304.46460,CV,...,0.52456,5.998752,5.73,5.14,14.740725,6.062481,14.871647,6.646402,15.525210,4.25
CV_0_obs_2,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,CV,...,0.52956,5.808270,5.69,5.29,14.837430,6.186248,14.532244,6.994177,15.762708,4.33
CV_0_obs_3,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,CV,...,0.52428,6.074041,5.69,5.26,14.849548,6.201443,14.901801,6.657812,15.507985,4.27
CV_0_obs_4,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,CV,...,0.53020,6.099741,5.69,5.20,14.818570,6.173513,14.904197,6.664845,15.545160,4.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LMXB_86_obs_0,LMXB_86,1,SAX_J1810.8-2609_____,2CXO J181044.4-260901,272.68530,-26.150370,31871.5,9.29,512.97550,LX,...,0.08400,5.338755,5.00,5.04,14.444543,5.786217,14.189364,5.405497,14.260507,4.32
LMXB_90_obs_0,LMXB_90,1,XTE_J2123-058________,2CXO J212314.5-054753,320.81060,-5.798128,17705.1,4.06,132.76700,LX,...,0.80700,5.737786,5.00,5.08,15.478078,6.884057,14.499626,5.889410,14.651500,4.33
LMXB_91_obs_0,LMXB_91,1,XB_1732-304__________,2CXO J173547.0-302858,263.94590,-30.482820,18855.8,4.89,114.24570,LX,...,0.45100,6.466864,3.00,5.00,14.373147,5.747875,15.291664,7.068949,15.893809,3.00
LMXB_92_obs_0,LMXB_92,1,BW_ANT_______________,2CXO J092920.1-312303,142.33410,-31.384270,24445.9,3.70,75.53454,LX,...,0.60700,6.135904,5.00,5.09,15.357437,6.734946,14.902049,6.357733,15.133535,4.31


In [16]:
data_imp_v2.to_csv('imputed_data/old-data-all-at-once-imp.csv')