In [2]:
import numpy as np 
from matplotlib import pyplot as plt 
import seaborn as sns 
from tqdm import tqdm 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier  , AdaBoostClassifier , GradientBoostingClassifier
from sklearn.metrics import precision_score , recall_score 
import xgboost as xgb 
import pandas as pd 
import sklearn.neighbors._base
from os import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest 
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay
sns.set_style('whitegrid')
%load_ext autoreload
%autoreload 2
sns.set_style('whitegrid')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
def details(data_sent , comments = ''):
    """Print a short summary of a labelled observation table.

    Parameters
    ----------
    data_sent : pandas.DataFrame
        Table to summarise. Must contain the columns 'class' and 'src_id'.
    comments : str, optional
        Free-text heading printed before the summary; skipped when empty.

    Prints
    ------
    - the overall sparsity: number of missing cells divided by the total number
      of cells of `data_sent` itself,
    - the shape of `data_sent`,
    - one line per class value: class, number of distinct 'src_id' values,
      number of rows.

    Returns nothing.
    """
    if comments:
        print(comments)
    # Normalise by the size of the frame that was passed in, not by a
    # notebook-level global, so the figure is correct for any subset.
    n_cells = data_sent.size
    n_missing = data_sent.isna().sum().sum()
    # An empty frame has no cells; report 0 instead of dividing by zero.
    sp = n_missing / n_cells if n_cells else 0.0
    print('________________________________________________')
    print('Sparsity in the data : {:.2f}'.format(sp))
    num_rows = data_sent.shape
    print('Data shape' , num_rows)
    print('Number of sources : ')
    for c in data_sent['class'].unique():
        rows_of_class = data_sent[data_sent['class'] == c]
        num_src = len(rows_of_class['src_id'].unique())
        num_obs = len(rows_of_class)
        print(c ,' \t ' , num_src , '\t' , num_obs)

In [4]:
from features import phot_flux , en_flux , hard, powlaw_fit , bb_fit , brems_fit , intra_obs_var , inter_ob_var , info_col , phot_flux_hilim , phot_flux_lolim , en_flux_hilim , en_flux_lolim

In [19]:
from features import small_ind_subset


def _load_labelled(csv_path, label, columns):
    """Read one filtered CSV (indexed by 'obs_id'), keep `columns`, and put a
    constant 'class' column holding `label` in first position."""
    frame = pd.read_csv(csv_path , index_col='obs_id')[columns]
    frame.insert(0 , 'class' , [label]*len(frame))
    return frame


# Use only the small subset of independent features (plus the info columns).
# The wider flux / limit / spectral-fit / hardness / variability lists imported
# from `features` are not part of the selection.
feat_to_use = info_col + small_ind_subset
data_cv = _load_labelled('filtered_data/cv_new_data.csv' , 'CV' , feat_to_use)
data_lx = _load_labelled('filtered_data/lmxb_data.csv' , 'LX' , feat_to_use)

### Also load the old pulsar data

In [20]:
# Pulsar observations from the "old" and "new" files; both get the class label 'PL'
# as their first column and are restricted to the selected features.
data_pl_old = pd.read_csv('filtered_data/pl_old_data.csv' , index_col='obs_id').loc[:, feat_to_use]
data_pl_old.insert(loc=0 , column='class' , value='PL')

data_pl_new = pd.read_csv('filtered_data/pl_new_data.csv' , index_col='obs_id').loc[:, feat_to_use]
data_pl_new.insert(loc=0 , column='class' , value='PL')

In [21]:
# Stack the per-class frames row-wise into one table and display it.
# NOTE(review): data_pl_old is loaded in the previous cell but is not part of this
# concat — confirm that leaving it out is intended.
data = pd.concat([data_cv , data_pl_new , data_lx] , axis=0)
data

Unnamed: 0_level_0,class,src_id,num_obs,src_n,name,ra,dec,livetime,significance,likelihood,...,flux_aper_lolim_h,var_inter_index_h,flux_aper_lolim_s,var_mean_b,var_sigma_b,ks_prob_b,var_prob_b,var_inter_index_s,var_inter_index_m,var_inter_sigma_u
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CV_0_obs_0,CV,CV_0,25,CSS120424:125906+242634,2CXO J004247.1+411619,10.69654,41.272150,4103.7,52.93,16618.73000,...,3.537000e-16,,0.000000e+00,,,,,5.0,,
CV_0_obs_1,CV,CV_0,25,CSS120424:125906+242634,2CXO J004247.1+411619,10.69654,41.272150,3907.5,52.93,16618.73000,...,0.000000e+00,,0.000000e+00,,,,,5.0,,
CV_0_obs_2,CV,CV_0,25,CSS120424:125906+242634,2CXO J004247.1+411619,10.69654,41.272150,3907.5,52.93,16618.73000,...,3.499000e-16,,0.000000e+00,,,,,5.0,,
CV_0_obs_3,CV,CV_0,25,CSS120424:125906+242634,2CXO J004247.1+411619,10.69654,41.272150,3904.6,52.93,16618.73000,...,2.078000e-15,,0.000000e+00,,,,,5.0,,
CV_0_obs_4,CV,CV_0,25,CSS120424:125906+242634,2CXO J004247.1+411619,10.69654,41.272150,3907.5,52.93,16618.73000,...,0.000000e+00,,0.000000e+00,,,,,5.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LMXB_86_obs_0,LX,LMXB_86,1,SAX_J1810.8-2609_____,2CXO J181044.4-260901,272.68530,-26.150370,31871.5,9.29,512.97550,...,4.687000e-15,,5.489000e-15,0.003456,0.000029,0.829,0.084,,,
LMXB_90_obs_0,LX,LMXB_90,1,XTE_J2123-058________,2CXO J212314.5-054753,320.81060,-5.798128,17705.1,4.06,132.76700,...,9.107000e-16,,2.231000e-15,0.001171,0.000255,0.265,0.807,,,
LMXB_91_obs_0,LX,LMXB_91,1,XB_1732-304__________,2CXO J173547.0-302858,263.94590,-30.482820,18855.8,4.89,114.24570,...,7.144000e-16,,1.277000e-16,0.001623,0.000211,0.735,0.451,,5.0,
LMXB_92_obs_0,LX,LMXB_92,1,BW_ANT_______________,2CXO J092920.1-312303,142.33410,-31.384270,24445.9,3.70,75.53454,...,2.083000e-15,,7.353000e-16,0.000740,0.000201,0.940,0.607,,,


In [22]:
# Number of observations (rows) per class label in the combined table.
data['class'].value_counts()

CV    1582
PL    1000
LX     521
Name: class, dtype: int64

In [23]:
# Per-feature fraction of missing values, in the order of feat_to_use.
# isna().mean() handles the edge cases directly: a column without any NaN gives 0.0
# and a column that is entirely NaN gives 1.0 (the previous try/except reported 0
# for both, so a fully-missing feature looked perfectly dense).
sp = data[feat_to_use].isna().mean().to_numpy(dtype=float)

# One row per feature, most sparse first.
sparsity = (
    pd.DataFrame({"feat" : feat_to_use , "sp_val" : sp})
    .sort_values(by='sp_val' , ascending=False)
    .reset_index(drop=True)
)
sparsity

Unnamed: 0,feat,sp_val
0,var_inter_sigma_u,0.476636
1,var_prob_b,0.399291
2,ks_prob_b,0.398646
3,var_sigma_b,0.398646
4,var_mean_b,0.398646
5,flux_aper_hilim_u,0.353851
6,var_inter_index_h,0.212053
7,var_inter_index_s,0.194328
8,var_inter_index_m,0.185305
9,var_inter_sigma_s,0.184015


In [24]:
# Features treated as sparse: at least half of their values are missing.
# The bound is inclusive so that, together with the strict `< 0.5` used for the
# dense list, every feature lands in exactly one of the two lists (a feature at
# exactly 0.5 is dropped from the dense set and is therefore reported here).
sparse_feat = (
    sparsity[sparsity['sp_val'] >= 0.5]
    .sort_values(by='sp_val')
    .reset_index(drop=True)['feat']
    .to_list()
)

In [25]:
# Names of the features with less than half of their values missing,
# ordered from least to most sparse.
dense_feat = (
    sparsity.loc[sparsity['sp_val'] < 0.5]
    .sort_values(by='sp_val')
    .reset_index(drop=True)['feat']
    .to_list()
)

In [26]:
# Keep only the dense features plus the class label, then print the summary
# of the table before and after the column selection.
data_use = data.loc[:, [*dense_feat, 'class']]
details(data_sent=data , comments='Before removing sparse columns')
details(data_sent=data_use , comments='After removing sparse columns')

Before removing sparse columns
________________________________________________
Sparsity in the data : 0.15
Data shape (3103, 29)
Number of sources : 
CV  	  184 	 1582
PL  	  178 	 1000
LX  	  58 	 521
After removing sparse columns
________________________________________________
Sparsity in the data : 0.15
Data shape (3103, 29)
Number of sources : 
CV  	  184 	 1582
PL  	  178 	 1000
LX  	  58 	 521


In [27]:
# Every flux-like feature name: photon and energy fluxes with their lower/upper limits.
flux_feat = phot_flux + phot_flux_lolim + phot_flux_hilim + en_flux + en_flux_lolim + en_flux_hilim
# Flux features that are actually present among the dense features.
# Built by filtering flux_feat in order (dict.fromkeys drops repeated names), so the
# list is the same on every run instead of depending on set iteration order.
flux_feat_avail = [f for f in dict.fromkeys(flux_feat) if f in dense_feat]
def take_log(data_sent):
    """Return a copy of `data_sent` with the available flux columns log-transformed.

    Each column named in the notebook-level list `flux_feat_avail` is replaced by
    -log10(value); all other columns are copied unchanged. Afterwards every
    +inf / -inf in the whole frame is replaced by NaN. The input frame itself
    is not modified.
    """
    logged = data_sent.copy()
    for col in flux_feat_avail:
        logged.loc[:, col] = -np.log10(logged.loc[:, col])
    return logged.replace([np.inf, -np.inf], np.nan)


In [28]:
# Apply the -log10 transform to the flux columns of the dense-feature table.
# The RuntimeWarning shown in the cell output comes from the log10 call.
df_log = take_log(data_use)
df_log

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,src_id,significance,livetime,likelihood,ra,name,src_n,dec,num_obs,hard_hm,...,var_inter_index_m,var_inter_index_s,var_inter_index_h,flux_aper_hilim_u,var_mean_b,var_sigma_b,ks_prob_b,var_prob_b,var_inter_sigma_u,class
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CV_0_obs_0,CV_0,52.93,4103.7,16618.73000,10.69654,2CXO J004247.1+411619,CSS120424:125906+242634,41.272150,25,0.9994,...,,5.0,,13.177505,,,,,,CV
CV_0_obs_1,CV_0,52.93,3907.5,16618.73000,10.69654,2CXO J004247.1+411619,CSS120424:125906+242634,41.272150,25,0.9994,...,,5.0,,12.648397,,,,,,CV
CV_0_obs_2,CV_0,52.93,3907.5,16618.73000,10.69654,2CXO J004247.1+411619,CSS120424:125906+242634,41.272150,25,0.9994,...,,5.0,,12.947306,,,,,,CV
CV_0_obs_3,CV_0,52.93,3904.6,16618.73000,10.69654,2CXO J004247.1+411619,CSS120424:125906+242634,41.272150,25,0.9994,...,,5.0,,13.064140,,,,,,CV
CV_0_obs_4,CV_0,52.93,3907.5,16618.73000,10.69654,2CXO J004247.1+411619,CSS120424:125906+242634,41.272150,25,-0.9994,...,,5.0,,13.072989,,,,,,CV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LMXB_86_obs_0,LMXB_86,9.29,31871.5,512.97550,272.68530,2CXO J181044.4-260901,SAX_J1810.8-2609_____,-26.150370,1,-0.2124,...,,,,14.159831,0.003456,0.000029,0.829,0.084,,LX
LMXB_90_obs_0,LMXB_90,4.06,17705.1,132.76700,320.81060,2CXO J212314.5-054753,XTE_J2123-058________,-5.798128,1,0.2711,...,,,,14.612610,0.001171,0.000255,0.265,0.807,,LX
LMXB_91_obs_0,LMXB_91,4.89,18855.8,114.24570,263.94590,2CXO J173547.0-302858,XB_1732-304__________,-30.482820,1,-0.6034,...,5.0,,,15.247567,0.001623,0.000211,0.735,0.451,,LX
LMXB_92_obs_0,LMXB_92,3.70,24445.9,75.53454,142.33410,2CXO J092920.1-312303,BW_ANT_______________,-31.384270,1,0.3310,...,,,,14.335264,0.000740,0.000201,0.940,0.607,,LX


# Imputation using all observations

In [29]:
# Identifier/metadata columns plus the class label: these are kept aside and
# are not passed to the imputer.
info_col_cl = info_col + ['class']

In [30]:
# Split the table: identifier/label columns are set aside, everything else is imputed.
# The class label is part of info_col_cl, so it is not seen by the imputer.
data_id = df_log[info_col_cl]
data_val = df_log.drop(columns = info_col_cl)

# MissForest fits random forests internally; fix the seed so the imputed values
# (and the CSV written from them) are identical after Restart & Run All.
rf_imputer = MissForest(verbose=0 , decreasing=True , random_state=42)
# fit_transform returns the imputed values as an array-like without the
# DataFrame labels; they are re-attached in the next cell.
new_data = rf_imputer.fit_transform(data_val)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


In [31]:
# Re-attach the row and column labels of data_val to the imputed values
# and name the index 'obs_id'.
new_imp_data = pd.DataFrame(
    data=new_data,
    index=list(data_val.index),
    columns=list(data_val.columns),
).rename_axis('obs_id')

In [32]:
# Put the identifier/label columns back next to the imputed feature columns
# (column-wise concat, aligned on the obs_id index) and display the result.
data_imp_v2 = pd.concat([data_id , new_imp_data] , axis=1)
data_imp_v2

Unnamed: 0_level_0,src_id,num_obs,src_n,name,ra,dec,livetime,significance,likelihood,class,...,var_inter_sigma_s,var_inter_index_m,var_inter_index_s,var_inter_index_h,flux_aper_hilim_u,var_mean_b,var_sigma_b,ks_prob_b,var_prob_b,var_inter_sigma_u
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CV_0_obs_0,CV_0,25,CSS120424:125906+242634,2CXO J004247.1+411619,10.69654,41.272150,4103.7,52.93,16618.73000,CV,...,2.546000e-05,5.00,5.00,5.00,13.177505,0.001794,0.000458,0.74518,0.66810,9.663672e-05
CV_0_obs_1,CV_0,25,CSS120424:125906+242634,2CXO J004247.1+411619,10.69654,41.272150,3907.5,52.93,16618.73000,CV,...,2.546000e-05,4.95,5.00,4.96,12.648397,0.008160,0.002223,0.48396,0.46334,1.006484e-04
CV_0_obs_2,CV_0,25,CSS120424:125906+242634,2CXO J004247.1+411619,10.69654,41.272150,3907.5,52.93,16618.73000,CV,...,2.546000e-05,5.00,5.00,5.00,12.947306,0.002150,0.000759,0.73545,0.61051,9.534336e-05
CV_0_obs_3,CV_0,25,CSS120424:125906+242634,2CXO J004247.1+411619,10.69654,41.272150,3904.6,52.93,16618.73000,CV,...,2.546000e-05,5.00,5.00,5.00,13.064140,0.002382,0.000756,0.72550,0.55735,9.695378e-05
CV_0_obs_4,CV_0,25,CSS120424:125906+242634,2CXO J004247.1+411619,10.69654,41.272150,3907.5,52.93,16618.73000,CV,...,2.546000e-05,5.00,5.00,5.00,13.072989,0.001926,0.000656,0.76392,0.68356,7.274384e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LMXB_86_obs_0,LMXB_86,1,SAX_J1810.8-2609_____,2CXO J181044.4-260901,272.68530,-26.150370,31871.5,9.29,512.97550,LX,...,3.385778e-06,2.25,4.96,1.61,14.159831,0.003456,0.000029,0.82900,0.08400,3.013343e-06
LMXB_90_obs_0,LMXB_90,1,XTE_J2123-058________,2CXO J212314.5-054753,320.81060,-5.798128,17705.1,4.06,132.76700,LX,...,2.082304e-07,2.33,3.43,1.81,14.612610,0.001171,0.000255,0.26500,0.80700,6.861258e-07
LMXB_91_obs_0,LMXB_91,1,XB_1732-304__________,2CXO J173547.0-302858,263.94590,-30.482820,18855.8,4.89,114.24570,LX,...,5.992000e-09,5.00,3.26,1.44,15.247567,0.001623,0.000211,0.73500,0.45100,8.933566e-07
LMXB_92_obs_0,LMXB_92,1,BW_ANT_______________,2CXO J092920.1-312303,142.33410,-31.384270,24445.9,3.70,75.53454,LX,...,1.579263e-07,2.90,2.94,2.10,14.335264,0.000740,0.000201,0.94000,0.60700,6.592942e-07


In [33]:
# Persist the imputed table (index included) for downstream use.
# NOTE(review): assumes the 'imputed_data/' directory already exists — confirm.
data_imp_v2.to_csv('imputed_data/small-feat-subset-new-data.csv')