# Import Modules

In [1]:
import numpy as np 
from matplotlib import pyplot as plt 
import seaborn as sns 
from tqdm import tqdm 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier  , AdaBoostClassifier , GradientBoostingClassifier
from sklearn.metrics import precision_score , recall_score 
import xgboost as xgb 
import pandas as pd 
import sklearn.neighbors._base
from os import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest 
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay
sns.set_style('whitegrid')
%load_ext autoreload
%autoreload 2
sns.set_style('whitegrid')
from IPython.display import display

def details(data_sent , comments = ''):
    """Print a short summary of a labelled observation table.

    Prints, in order: the optional ``comments`` line, a separator, the
    fraction of NaN cells in the whole frame, the frame's shape, and one
    line per distinct value of the ``class`` column giving the number of
    distinct ``src_id`` values and the number of rows with that class.

    Parameters
    ----------
    data_sent : pandas.DataFrame
        Table that must contain the columns ``class`` and ``src_id``.
    comments : str, optional
        Heading printed before the summary; nothing is printed when empty.

    Returns
    -------
    None
    """
    if comments:
        print(comments)

    # Share of cells that are NaN across the whole frame.
    missing_fraction = data_sent.isna().sum().sum() / data_sent.size
    print('________________________________________________')
    print('Sparsity in the data : {:.2f}'.format(missing_fraction))
    print('Data shape' , data_sent.shape)

    print('Number of sources : ')
    class_column = data_sent['class']
    # Classes are reported in order of first appearance.
    for label in class_column.unique():
        members = data_sent[class_column == label]
        n_sources = len(members['src_id'].unique())
        print(label , ' \t ' , n_sources , '\t' , len(members))

# Load raw data

In [2]:
from features import phot_flux , en_flux , hard, powlaw_fit , bb_fit , brems_fit , intra_obs_var , inter_ob_var , info_col , phot_flux_hilim , phot_flux_lolim , en_flux_hilim , en_flux_lolim

# Identifier/metadata columns plus the class label: everything that is not a numeric feature.
info_col_cl = [*info_col , 'class']

In [3]:
# Every column read from the per-class CSV files, in the order used downstream.
feat_to_use = (
    info_col
    + phot_flux + phot_flux_hilim + phot_flux_lolim
    + en_flux + en_flux_hilim + en_flux_lolim
    + powlaw_fit + bb_fit + hard
    + intra_obs_var + inter_ob_var
)


def _read_labelled(csv_path , label):
    """Read one per-class CSV (indexed by ``obs_id``), keep ``feat_to_use`` and prepend a constant ``class`` column."""
    frame = pd.read_csv(csv_path , index_col='obs_id')[feat_to_use]
    frame.insert(0 , 'class' , [label]*len(frame))
    return frame


data_cv = _read_labelled('filtered_data/cv_old_data.csv' , 'CV')
data_pl = _read_labelled('filtered_data/pl_old_data.csv' , 'PL')
data_lx = _read_labelled('filtered_data/lmxb_data.csv' , 'LX')

# Stack the three classes row-wise into one table.
data = pd.concat([data_cv , data_pl , data_lx] , axis=0)
data

Unnamed: 0_level_0,class,src_id,num_obs,src_n,name,ra,dec,livetime,significance,likelihood,...,var_inter_sigma_h,var_inter_index_m,var_inter_prob_m,var_inter_sigma_m,var_inter_index_s,var_inter_prob_s,var_inter_sigma_s,var_inter_index_u,var_inter_prob_u,var_inter_sigma_u
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CV_0_obs_0,CV,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,4903.5,11.31,304.46460,...,,,,,,,,,,
CV_0_obs_1,CV,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3974.2,11.31,304.46460,...,,,,,,,,,,
CV_0_obs_2,CV,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,...,,,,,,,,,,
CV_0_obs_3,CV,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,...,,,,,,,,,,
CV_0_obs_4,CV,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LMXB_86_obs_0,LX,LMXB_86,1,SAX_J1810.8-2609_____,2CXO J181044.4-260901,272.68530,-26.150370,31871.5,9.29,512.97550,...,,,,,,,,,,
LMXB_90_obs_0,LX,LMXB_90,1,XTE_J2123-058________,2CXO J212314.5-054753,320.81060,-5.798128,17705.1,4.06,132.76700,...,,,,,,,,,,
LMXB_91_obs_0,LX,LMXB_91,1,XB_1732-304__________,2CXO J173547.0-302858,263.94590,-30.482820,18855.8,4.89,114.24570,...,8.934000e-09,5.0,0.8,1.257000e-07,0.0,0.247,5.992000e-09,,,
LMXB_92_obs_0,LX,LMXB_92,1,BW_ANT_______________,2CXO J092920.1-312303,142.33410,-31.384270,24445.9,3.70,75.53454,...,,,,,,,,,,


In [4]:
# Print sparsity, shape and per-class source/observation counts for the combined raw table.
details(data)

________________________________________________
Sparsity in the data : 0.36
Data shape (1812, 107)
Number of sources : 
CV  	  60 	 994
PL  	  92 	 297
LX  	  58 	 521


## Remove sparse columns

In [5]:
# Per-feature sparsity: the fraction of rows in which each feature is NaN.
# `isna().mean()` is well defined for every column: 0.0 when nothing is
# missing and 1.0 when everything is missing. It does not depend on how
# indexing a boolean-labelled value_counts() with 0/1 is resolved, and it
# needs no exception handling (which previously reported a fully-missing
# column as 0 and so let it through as "dense").
sp = data[feat_to_use].isna().mean().to_numpy()

# One row per feature, most sparse first.
sparsity = pd.DataFrame(
    {
        "feat" : feat_to_use ,
        "sp_val" : sp
    }
).sort_values(by='sp_val' , ascending=False).reset_index(drop=True)
sparsity

Unnamed: 0,feat,sp_val
0,ks_prob_u,0.870309
1,var_max_u,0.870309
2,var_min_u,0.870309
3,var_mean_u,0.870309
4,var_sigma_u,0.870309
...,...,...
101,likelihood,0.000000
102,num_obs,0.000000
103,name,0.000000
104,src_n,0.000000


In [6]:
# Names of features missing in more than half of the rows, least sparse first.
sparse_feat = (
    sparsity.loc[sparsity['sp_val'] > 0.5]
    .sort_values(by='sp_val')
    .reset_index(drop=True)['feat']
    .to_list()
)
#sparse_feat

In [7]:
# Names of features missing in fewer than half of the rows, most complete first.
# NOTE(review): the comparison is strict, so a feature with sp_val exactly 0.5 is not selected here — confirm that is intended.
dense_feat = (
    sparsity.loc[sparsity['sp_val'] < 0.5]
    .sort_values(by='sp_val')
    .reset_index(drop=True)['feat']
    .to_list()
)
#dense_feat

In [8]:
# Keep only the dense features, plus the class label, and compare summaries before/after.
kept_columns = [*dense_feat , 'class']
data_use = data[kept_columns]
details(data , comments='Before removing sparse columns')
details(data_use , comments='After removing sparse columns')

Before removing sparse columns
________________________________________________
Sparsity in the data : 0.36
Data shape (1812, 107)
Number of sources : 
CV  	  60 	 994
PL  	  92 	 297
LX  	  58 	 521
After removing sparse columns
________________________________________________
Sparsity in the data : 0.14
Data shape (1812, 62)
Number of sources : 
CV  	  60 	 994
PL  	  92 	 297
LX  	  58 	 521


## Remove sparse rows

In [9]:
# Fraction of NaN cells in each row of the column-filtered table, one entry
# per row in row order. Computed in a single vectorised pass instead of a
# Python loop over `iloc`, and without reusing the name `sp`, which holds
# the per-feature sparsity array computed earlier.
row_sp = data_use.isna().mean(axis=1).tolist()

In [10]:
# Attach the per-row sparsity as a helper column (second position) on a copy.
data_sp_row_rem = data_use.copy()
data_sp_row_rem.insert(1 , 'sparsity' , row_sp)

# Keep rows with less than 40% of their cells missing, then drop the helper column.
keep_row = data_sp_row_rem['sparsity'] < 0.4
data_dense = data_sp_row_rem.loc[keep_row].drop(columns=['sparsity'])
details(data_dense)
#data_dense = data_use.copy()

________________________________________________
Sparsity in the data : 0.12
Data shape (1728, 62)
Number of sources : 
CV  	  60 	 947
PL  	  92 	 293
LX  	  58 	 488


## Take log10 of the flux features

In [11]:
# All flux-like feature names (photon and energy fluxes with their lower/upper limits).
flux_feat = [
    *phot_flux , *phot_flux_lolim , *phot_flux_hilim ,
    *en_flux , *en_flux_lolim , *en_flux_hilim ,
]
# Flux features that survived the sparse-column filter (unordered, de-duplicated).
flux_feat_avail = list(set(flux_feat).intersection(dense_feat))
def take_log(data_sent , flux_cols = None , floor = -20):
    """Return a copy of ``data_sent`` with selected columns replaced by their log10.

    Parameters
    ----------
    data_sent : pandas.DataFrame
        Input table; it is not modified.
    flux_cols : iterable of str, optional
        Columns to transform. Defaults to the module-level
        ``flux_feat_avail`` list, which is the original behaviour.
    floor : number, optional
        Value substituted for every ``-inf`` and ``+inf`` found anywhere in
        the resulting frame (default ``-20``, as before). ``-inf`` is what
        ``log10`` produces for a zero input.

    Returns
    -------
    pandas.DataFrame
        The transformed copy. NaN values are left untouched.
    """
    if flux_cols is None:
        flux_cols = flux_feat_avail

    logged = data_sent.copy()
    # log10(0) -> -inf is expected and replaced below, so silence only the
    # divide-by-zero warning; invalid-value warnings (e.g. negative inputs)
    # still surface.
    with np.errstate(divide='ignore'):
        for col in flux_cols:
            logged[col] = np.log10(logged[col])

    # As in the original, infinities are replaced across the whole frame,
    # not just in the transformed columns.
    return logged.replace([-np.inf , np.inf] , floor)
# Log-transform the row-filtered table and print its summary.
df_log = take_log(data_dense)
details(df_log)

________________________________________________
Sparsity in the data : 0.12
Data shape (1728, 62)
Number of sources : 
CV  	  60 	 947
PL  	  92 	 293
LX  	  58 	 488


  result = getattr(ufunc, method)(*inputs, **kwargs)


# Imputation

In [12]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

## Imputation using all observations

In [13]:
# Split the log-scaled table into identifier/label columns and the feature columns to impute.
df_id = df_log[info_col_cl]
df_val = df_log.drop(columns=info_col_cl)

# NOTE(review): `imputer` is constructed but not used in this cell; the imputation below is done by MissForest.
imputer = IterativeImputer(estimator=RandomForestRegressor() , verbose=1)

# Impute all observations together.
rf_imputer = MissForest(decreasing=True , verbose=0)
new_data = rf_imputer.fit_transform(df_val)

Iteration: 0
Iteration: 1
Iteration: 2


In [14]:
# Wrap the imputed array in a frame carrying the original row labels and column names.
new_imp_data = pd.DataFrame(
    new_data ,
    index = df_val.index.to_list() ,
    columns = df_val.columns.to_list() ,
)
new_imp_data.index.name = 'obs_id'

# Re-attach the identifier/label columns next to the imputed features.
data_imp_all_obs = pd.concat([df_id , new_imp_data] , axis='columns')
data_imp_all_obs

Unnamed: 0_level_0,src_id,num_obs,src_n,name,ra,dec,livetime,significance,likelihood,class,...,var_inter_index_h,var_inter_prob_h,var_inter_sigma_h,var_prob_b,var_mean_b,var_sigma_b,var_min_b,var_max_b,ks_prob_b,kp_prob_b
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CV_0_obs_0,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,4903.5,11.31,304.46460,CV,...,5.11,0.92642,1.297709e-04,0.33888,0.010027,0.001208,0.009997,0.013863,0.48352,0.46924
CV_0_obs_2,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,CV,...,5.53,0.92046,1.221554e-04,0.52573,0.009557,0.003772,0.009127,0.022759,0.44175,0.30052
CV_0_obs_3,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,CV,...,5.54,0.92625,1.233281e-04,0.52619,0.009652,0.004017,0.009138,0.022634,0.43288,0.31813
CV_0_obs_4,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,3907.5,11.31,304.46460,CV,...,5.55,0.92668,1.225652e-04,0.51449,0.009405,0.003731,0.009243,0.022964,0.41625,0.36038
CV_0_obs_5,CV_0,33,[HPH2013]_176__________,2CXO J004245.0+411520,10.68773,41.255820,39875.5,11.31,304.46460,CV,...,5.11,0.92604,1.480959e-04,0.36039,0.009999,0.001231,0.009982,0.013934,0.43457,0.46669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LMXB_86_obs_0,LMXB_86,1,SAX_J1810.8-2609_____,2CXO J181044.4-260901,272.68530,-26.150370,31871.5,9.29,512.97550,LX,...,5.08,0.83550,1.225941e-04,0.08400,0.003456,0.000029,0.003420,0.003494,0.82900,0.53900
LMXB_90_obs_0,LMXB_90,1,XTE_J2123-058________,2CXO J212314.5-054753,320.81060,-5.798128,17705.1,4.06,132.76700,LX,...,5.08,0.90091,9.671024e-05,0.80700,0.001171,0.000255,0.001060,0.003517,0.26500,0.21100
LMXB_91_obs_0,LMXB_91,1,XB_1732-304__________,2CXO J173547.0-302858,263.94590,-30.482820,18855.8,4.89,114.24570,LX,...,0.00,0.23400,8.934000e-09,0.45100,0.001623,0.000211,0.001488,0.002756,0.73500,0.54700
LMXB_92_obs_0,LMXB_92,1,BW_ANT_______________,2CXO J092920.1-312303,142.33410,-31.384270,24445.9,3.70,75.53454,LX,...,5.08,0.91627,1.348919e-04,0.60700,0.000740,0.000201,0.000511,0.001005,0.94000,0.78900


In [15]:
# Write the all-observation imputed table to CSV.
# NOTE(review): assumes the 'report/' directory already exists — to_csv does not create parent directories.
data_imp_all_obs.to_csv('report/data-imp-all-obs.csv')

## Source-wise imputation

In [16]:
# Impute each source separately: every source's observations are fitted on
# their own, and the per-source results are stacked back into one table.
info_col_cl = info_col+['class']
src_list = np.unique(df_log['src_id'])

# Collect per-source frames and concatenate once at the end. Growing a frame
# with DataFrame.append inside the loop copies everything on each iteration,
# and the method no longer exists in pandas 2.x.
src_frames = []
for s in tqdm(src_list):
    print('----------------------------------')
    print(s)
    temp = df_log[df_log['src_id']==s]
    temp_val = temp.drop(columns=info_col_cl)
    temp_id = temp[info_col_cl]
    rf_imputer = MissForest(verbose=0 , decreasing=True)
    try:
        new_data = rf_imputer.fit_transform(temp_val)
        imp_data = pd.DataFrame(new_data , columns = temp_val.columns.to_list() , index=temp_val.index.to_list())
        imp_data.index.name = 'obs_id'
    except Exception as exc:
        # Best effort: when this source cannot be imputed on its own, keep
        # its original (un-imputed) values. Catching Exception rather than
        # everything lets Ctrl-C still interrupt the loop, and the actual
        # reason is reported next to the original message.
        print('all col missing' , '({}: {})'.format(type(exc).__name__ , exc))
        imp_data = temp_val
    temp_imp = pd.concat([temp_id , imp_data] , axis = 1)
    src_frames.append(temp_imp)

# pd.concat rejects an empty list, so fall back to an empty frame as before.
df_src_imp = pd.concat(src_frames) if src_frames else pd.DataFrame()
display(df_src_imp)



----------------------------------
CV_0
all col missing
----------------------------------
CV_1
----------------------------------
CV_10
all col missing
----------------------------------
CV_11
all col missing
----------------------------------
CV_12
all col missing
----------------------------------
CV_13
