In [213]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt 
from IPython.display import display
import seaborn as sns

## Filter data

In [214]:
def filter_data(data_sent , max_flux= -12, min_significance=2):
    """Keep the rows that pass the flux and significance cuts and drop observation-info columns.

    Parameters
    ----------
    data_sent : pandas.DataFrame
        Table containing at least 'flux_aper', 'significance' and every column
        listed in ``obs_info_params`` below. It is not modified.
    max_flux : float, default -12
        Rows are kept only when ``flux_aper < max_flux``.
    min_significance : float, default 2
        Rows are kept only when ``significance > min_significance``.

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving rows (original index preserved) without
        the observation-info columns.
    """
    obs_info_params = [
        'livetime', 'likelihood', 'pileup_flag', 'mstr_sat_src_flag',
        'mstr_streak_src_flag', 'gti_obs', 'flux_significance_b',
        'flux_significance_m', 'flux_significance_s', 'flux_significance_h',
        'flux_significance_u',
    ]
    # Both cuts are strict comparisons, so rows with a missing flux or
    # significance compare False and are dropped.
    keep = (data_sent['flux_aper'] < max_flux) & (data_sent['significance'] > min_significance)
    return data_sent[keep].drop(columns=obs_info_params)

def norm_data(data_sent):
    """Return a copy of ``data_sent`` with every column min-max rescaled.

    Each column is mapped through ``(x - min) / (max - min)`` using that
    column's own extrema; the input frame is left untouched. A column whose
    maximum equals its minimum produces a zero denominator.
    """
    scaled = data_sent.copy()
    for column in scaled.columns:
        values = scaled[column]
        lowest = np.amin(values)
        highest = np.amax(values)
        scaled[column] = (values - lowest) / (highest - lowest)
    return scaled
def std_data(data_sent):
    """Return a copy of ``data_sent`` with every column centred and scaled.

    Each column becomes ``(x - mean) / sqrt(var)``, with the mean and variance
    taken from ``np.mean`` / ``np.var`` on that column. The input frame is
    left untouched.
    """
    standardized = data_sent.copy()
    for column in standardized.columns:
        values = standardized[column]
        center = np.mean(values)
        spread = np.sqrt(np.var(values))
        standardized[column] = (values - center) / spread
    return standardized
def do_nothing(data_sent):
    """Identity transform: hand back the very same object that was passed in."""
    unchanged = data_sent
    return unchanged

In [215]:
def extract_data(data_sent , impute_fn = '',reduce_fn = ' ' , rf_impute=False):
    """Shuffle, filter and split a table into feature values and identifier columns.

    Parameters
    ----------
    data_sent : pandas.DataFrame
        Table accepted by ``filter_data`` that also contains the columns
        'class', 'src_n', 'src_id' and 'significance'. It is not modified.
    impute_fn : object, optional
        Accepted for backward compatibility; it is not used here.
    reduce_fn : callable, optional
        Applied to the feature-value frame before it is returned. The historic
        default is a non-callable string placeholder; any non-callable value is
        treated as "leave the features unchanged".
    rf_impute : bool, optional
        Accepted for backward compatibility; it is not used here.

    Returns
    -------
    (data_val, data_id) : tuple of pandas.DataFrame
        ``data_val`` holds the feature columns (after ``reduce_fn``) and
        ``data_id`` holds 'class', 'src_n', 'src_id' and 'significance'.
        Both keep the shuffled, filtered row index.
    """
    id_columns = ['class', 'src_n', 'src_id', 'significance']
    # sample(frac=1) returns a shuffled copy; no random_state is passed, so the
    # row order differs between calls.
    data = filter_data(data_sent.sample(frac=1))
    data_id = data[id_columns]
    data_val = data.drop(columns=id_columns)
    if callable(reduce_fn):
        data_val = reduce_fn(data_val)
    return data_val , data_id

## Load data

Here we will impute each class separately;
for testing we can pass a given sample through each imputer and then combine the results.

In [287]:
# NOTE(review): the next cell reassigns obj_class ('BH', 'NS', ...) before it is read,
# so this value looks like a leftover from an earlier run — confirm it is still needed.
obj_class = 'TUC'

In [307]:
# Read one CSV per class (indexed by obs_id) and shuffle its rows.
# No random_state is passed to sample(), so the row order is not fixed by this cell.
class_tables = {}
for obj_class in ('BH', 'NS', 'PULSAR', 'CV'):
    train = pd.read_csv('../processed_data/'+obj_class+'_.csv' , index_col='obs_id')
    class_tables[obj_class] = train.sample(frac=1)

train_bh = class_tables['BH']
train_ns = class_tables['NS']
train_plsr = class_tables['PULSAR']
train_cv = class_tables['CV']

# Stack the per-class tables in the order BH, NS, CV, PULSAR.
train = pd.concat([train_bh , train_ns , train_cv , train_plsr])

train

Unnamed: 0_level_0,index,class,src_n,src_id,livetime,significance,likelihood,pileup_flag,mstr_sat_src_flag,mstr_streak_src_flag,...,bb_ampl_lolim,bb_ampl_hilim,bb_stat,brems_kt,brems_kt_hilim,brems_kt_lolim,brems_nh,brems_nh_hilim,brems_nh_lolim,brems_stat
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BH_OBS_68,68,BH,CXOU J100514.2-07423,BH0018,119479.4,16.29,1.238383e+03,False,False,False,...,,,,,,,,,,
BH_OBS_126,126,BH,J1745-2900,BH0027,13503.6,337.00,4.620114e+05,False,False,False,...,,,,,,,,,,
BH_OBS_111,111,BH,CXOU J100517.1-07421,BH0022,160215.7,11.12,4.582053e+02,False,False,False,...,,,,,,,,,,
BH_OBS_15,15,BH,GX 339-4,BH0006,28302.6,39.86,5.937078e+03,False,False,False,...,0.000068,0.000093,1.672,4.786,5.718,4.075,46.01,50.10,42.20,1.036
BH_OBS_195,195,BH,GS 1354-645,BH0031,19807.2,2.44,2.026300e+01,False,False,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PULSAR_OBS_365,365,PULSAR,PSR J1833-103,PL0099,7225.7,121.78,2.008029e+04,False,False,False,...,0.000216,0.000294,1.312,18.970,33.810,12.310,224.00,239.00,209.80,1.147
PULSAR_OBS_107,107,PULSAR,PSR J0540-6919,PL0017,58860.3,1181.21,1.485192e+06,False,False,False,...,0.012430,0.012490,14.730,4.771,4.836,4.709,30.67,30.94,30.39,1.865
PULSAR_OBS_443,443,PULSAR,PSR J2256-1024,PL0136,19799.7,10.40,6.907692e+02,False,False,False,...,,,,,,,,,,
PULSAR_OBS_132,132,PULSAR,PSR J1044-5737,PL0032,9939.9,4.57,9.772061e+01,False,False,False,...,,,,,,,,,,


In [308]:
train['class'].value_counts()

CV        786
PULSAR    445
NS        440
BH        199
Name: class, dtype: int64

# RF imputer Missingpy

In [309]:
# Compatibility shim: register sklearn.neighbors._base in sys.modules under the
# name 'sklearn.neighbors.base', so an import of that name resolves to it.
# Presumably this is what lets the missingpy import in the next cell succeed — TODO confirm.
import sklearn.neighbors._base
# NOTE(review): this import only works because the os module exposes the sys
# module it imported itself; a plain `import sys` is the conventional spelling.
from os import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [310]:
from missingpy import MissForest 

In [322]:
def rf_impute(d, i ):
    """Impute the missing values of ``d`` with a MissForest imputer.

    Parameters
    ----------
    d : pandas.DataFrame or array-like
        Feature values to impute; passed unchanged to ``fit`` and ``transform``.
    i : pandas.DataFrame
        Identifier columns. Kept in the signature for existing callers; it does
        not influence the imputation.

    Returns
    -------
    (new_data, rf_imputer) : tuple
        ``new_data`` is whatever ``MissForest.transform`` returns for ``d``;
        ``rf_imputer`` is the imputer object itself.
    """
    rf_imputer = MissForest(n_estimators=300 , n_jobs=-1 , class_weight="balanced")
    rf_imputer.fit(d)
    new_data = rf_imputer.transform(d)
    return new_data , rf_imputer

# Choose Features

In [293]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [313]:
from features import features as feat 
# Flatten every feature group into one list, keeping the groups in this order.
_feature_groups = [
    feat['info'],
    feat['flux']['photon'],
    feat['flux']['energy'],
    feat['variability'],
    feat['hardness'],
    feat['model_fit']['powerlaw'],
    feat['model_fit']['bb'],
    feat['model_fit']['brems'],
    feat['info_pre_filter'],
]
all_feat = [name for group in _feature_groups for name in group]

In [314]:
# Leave out the 'powerlaw', 'bb' and 'brems' model-fit groups.
# (Set feat_to_drop = [] instead to keep every feature.)
feat_to_drop = [
    name
    for group in ('powerlaw', 'bb', 'brems')
    for name in feat['model_fit'][group]
]
# Keep the remaining features in their all_feat order.
feat_used = list(filter(lambda name: name not in feat_to_drop, all_feat))

In [315]:
train[feat_used]

Unnamed: 0_level_0,significance,src_n,src_id,class,photflux_aper_hilim_b,photflux_aper_lolim_h,photflux_aper_lolim_s,photflux_aper_lolim_m,photflux_aper_s,photflux_aper_lolim_u,...,likelihood,flux_significance_h,flux_significance_m,mstr_streak_src_flag,gti_obs,flux_significance_b,flux_significance_s,flux_significance_u,livetime,mstr_sat_src_flag
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BH_OBS_68,16.29,CXOU J100514.2-07423,BH0018,BH,-6.024293,-8.377682,-6.431916,-7.139243,-6.306977,-6.479910,...,1.238383e+03,1.13,2.41,False,2012-04-04T03:00:56,4.76,4.00,1.45,119479.4,False
BH_OBS_126,337.00,J1745-2900,BH0027,BH,-5.515700,-6.581367,,,,,...,4.620114e+05,1.32,0.00,False,2001-07-14T02:05:23,1.40,0.00,0.00,13503.6,False
BH_OBS_111,11.12,CXOU J100517.1-07421,BH0022,BH,-6.255160,,-6.651890,-7.542724,-6.512720,-5.883392,...,4.582053e+02,1.02,1.95,False,2012-01-21T08:54:47,4.26,3.76,2.58,160215.7,False
BH_OBS_15,39.86,GX 339-4,BH0006,BH,-3.960189,-4.472370,-4.854493,-4.598255,-4.818728,-6.004409,...,5.937078e+03,17.52,18.04,False,2003-09-29T12:00:20,28.33,12.61,1.94,28302.6,False
BH_OBS_195,2.44,GS 1354-645,BH0031,BH,-6.262569,,,-7.349790,,,...,2.026300e+01,0.00,1.30,False,2012-10-31T17:20:25,1.15,0.00,0.00,19807.2,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PULSAR_OBS_365,121.78,PSR J1833-103,PL0099,PULSAR,-3.092965,-3.258770,-5.252977,-4.081236,-5.146789,-5.481091,...,2.008029e+04,33.15,16.15,False,2000-07-05T06:08:23,37.12,4.61,2.26,7225.7,False
PULSAR_OBS_107,1181.21,PSR J0540-6919,PL0017,PULSAR,-1.898941,-2.377682,-2.576754,-2.479779,-2.574303,-2.901356,...,1.485192e+06,225.58,248.34,False,2014-05-31T01:47:17,380.85,174.42,18.93,58860.3,False
PULSAR_OBS_443,10.40,PSR J2256-1024,PL0136,PULSAR,-4.689944,-5.926282,-5.028585,-5.700929,-4.967784,-4.840132,...,6.907692e+02,3.21,4.74,False,2011-08-15T11:03:01,10.40,7.86,3.12,19799.7,False
PULSAR_OBS_132,4.57,PSR J1044-5737,PL0032,PULSAR,-4.870955,-5.992679,-5.240786,-6.095934,-5.090123,,...,9.772061e+01,2.17,2.17,False,2013-08-25T17:07:54,4.57,3.52,,9939.9,False


In [316]:
# Fraction of missing values per selected feature.
# isna().mean() is 0.0 for a complete column and 1.0 for an all-missing one,
# and the fractions stay numeric so the sort below is numeric.
missing_fraction = train[feat_used].isna().mean()
sp = [[name, fraction] for name, fraction in missing_fraction.items()]
sparsity = pd.DataFrame(sp , columns=['feat' , 'sparsity']).sort_values(by='sparsity' , ascending=False)


In [317]:
sparsity

Unnamed: 0,feat,sparsity
31,flux_aper_lolim_u,0.7240641711229947
22,flux_aper_u,0.7026737967914438
9,photflux_aper_lolim_u,0.6545454545454545
16,photflux_aper_u,0.6315508021390375
39,flux_aper_lolim_s,0.4080213903743316
...,...,...
60,pileup_flag,0
1,src_n,0
3,class,0
2,src_id,0


In [318]:
train_set = train[feat_used]

In [319]:
# Shuffle and filter the selected features, then split them into feature values
# (data_val) and identifier columns (data_id).
# NOTE(review): extract_data returns before it uses impute_fn or rf_impute, so no
# imputation happens in this call; data_val still contains missing values.
data_val , data_id   = extract_data(train_set ,  impute_fn= rf_impute , reduce_fn= do_nothing , rf_impute=True )

In [320]:
data_val

Unnamed: 0_level_0,photflux_aper_hilim_b,photflux_aper_lolim_h,photflux_aper_lolim_s,photflux_aper_lolim_m,photflux_aper_s,photflux_aper_lolim_u,photflux_aper_h,photflux_aper_hilim_u,photflux_aper_hilim_s,photflux_aper_m,...,kp_prob,hard_hs_lolim,hard_ms_lolim,hard_ms,hard_hs,hard_hs_hilim,hard_hm,hard_hm_hilim,hard_hm_lolim,hard_ms_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NS_OBS_320,-4.417028,,-4.844664,,-4.730020,-3.831208,-5.598944,-3.697020,-4.643783,-5.950394,...,0.224667,-0.9126,-0.9725,-0.8832,-0.7320,-0.5003,0.5846,0.9051,0.0818,-0.7601
NS_OBS_194,-4.092535,-4.719194,-5.168002,-4.978811,-4.960586,,-4.606249,-4.332454,-4.820448,-4.854182,...,0.581750,0.2017,-0.1118,0.0987,0.3848,0.5603,0.2911,0.4366,0.1368,0.3167
NS_OBS_341,-5.465339,-6.103639,,-6.689094,,,-5.838932,-5.487316,-6.656591,-6.249877,...,0.391000,0.7302,0.4316,0.9994,0.9994,1.0000,0.4735,0.7302,0.1568,1.0000
BH_OBS_136,-5.424581,-5.804100,,-6.806875,,,-5.690370,-5.880085,-7.066715,-6.614215,...,0.462500,0.9188,0.4766,0.9994,0.9994,1.0000,0.7983,0.8676,0.7127,1.0000
CV_OBS_622,-6.190710,-8.156892,-7.191047,,-6.621239,,-6.926282,-5.751781,-6.392759,,...,,-0.7689,-1.0000,-0.9994,-0.3948,0.0906,0.9994,1.0000,0.3079,-0.5540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PULSAR_OBS_391,-4.027196,-4.561616,-5.515131,-4.645507,-5.417028,,-4.531357,-5.156145,-5.341130,-4.618163,...,0.543000,0.7264,0.6765,0.7245,0.7683,0.8089,0.0987,0.1405,0.0543,0.7714
CV_OBS_516,-4.854493,-6.016148,-5.632458,-5.866461,-5.433563,-5.995249,-5.660747,-5.130123,-5.297656,-5.615467,...,0.562000,-0.5053,-0.4541,-0.2099,-0.2261,0.0656,-0.0075,0.2967,-0.3279,0.0518
CV_OBS_674,-4.696588,-5.406492,-5.304081,-5.445148,-5.265760,-5.693575,-5.356054,-5.468138,-5.230549,-5.403733,...,0.753800,-0.1693,-0.2167,-0.1574,-0.1037,-0.0381,0.0550,0.1230,-0.0131,-0.0993
NS_OBS_378,-6.138406,-6.801068,,,,,-6.458670,,,,...,,,,,,,0.9994,1.0000,0.6677,


In [321]:
data_id 

Unnamed: 0_level_0,class,src_n,src_id,significance
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NS_OBS_320,NS,XMMU J004414.0+41220,NS0047,6.53
NS_OBS_194,NS,XMMU J004245.2+41172,NS0044,49.26
NS_OBS_341,NS,J1748-2446,NS0057,18.31
BH_OBS_136,BH,J1745-2900,BH0027,337.00
CV_OBS_622,CV,CXOGLB J002410.6-72051,CV0090,6.98
...,...,...,...,...
PULSAR_OBS_391,PULSAR,PSR J1852+0040,PL0107,23.63
CV_OBS_516,CV,CXOGLB J002406.0-72045,CV0081,25.99
CV_OBS_674,CV,CXOGLB J002416.9-72042,CV0094,38.28
NS_OBS_378,NS,SAXWFC J1747.0-2853.,NS0070,4.44


In [330]:
data_val

Unnamed: 0_level_0,photflux_aper_hilim_b,photflux_aper_lolim_h,photflux_aper_lolim_s,photflux_aper_lolim_m,photflux_aper_s,photflux_aper_lolim_u,photflux_aper_h,photflux_aper_hilim_u,photflux_aper_hilim_s,photflux_aper_m,...,kp_prob,hard_hs_lolim,hard_ms_lolim,hard_ms,hard_hs,hard_hs_hilim,hard_hm,hard_hm_hilim,hard_hm_lolim,hard_ms_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NS_OBS_320,-4.417028,,-4.844664,,-4.730020,-3.831208,-5.598944,-3.697020,-4.643783,-5.950394,...,0.224667,-0.9126,-0.9725,-0.8832,-0.7320,-0.5003,0.5846,0.9051,0.0818,-0.7601
NS_OBS_194,-4.092535,-4.719194,-5.168002,-4.978811,-4.960586,,-4.606249,-4.332454,-4.820448,-4.854182,...,0.581750,0.2017,-0.1118,0.0987,0.3848,0.5603,0.2911,0.4366,0.1368,0.3167
NS_OBS_341,-5.465339,-6.103639,,-6.689094,,,-5.838932,-5.487316,-6.656591,-6.249877,...,0.391000,0.7302,0.4316,0.9994,0.9994,1.0000,0.4735,0.7302,0.1568,1.0000
BH_OBS_136,-5.424581,-5.804100,,-6.806875,,,-5.690370,-5.880085,-7.066715,-6.614215,...,0.462500,0.9188,0.4766,0.9994,0.9994,1.0000,0.7983,0.8676,0.7127,1.0000
CV_OBS_622,-6.190710,-8.156892,-7.191047,,-6.621239,,-6.926282,-5.751781,-6.392759,,...,,-0.7689,-1.0000,-0.9994,-0.3948,0.0906,0.9994,1.0000,0.3079,-0.5540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PULSAR_OBS_391,-4.027196,-4.561616,-5.515131,-4.645507,-5.417028,,-4.531357,-5.156145,-5.341130,-4.618163,...,0.543000,0.7264,0.6765,0.7245,0.7683,0.8089,0.0987,0.1405,0.0543,0.7714
CV_OBS_516,-4.854493,-6.016148,-5.632458,-5.866461,-5.433563,-5.995249,-5.660747,-5.130123,-5.297656,-5.615467,...,0.562000,-0.5053,-0.4541,-0.2099,-0.2261,0.0656,-0.0075,0.2967,-0.3279,0.0518
CV_OBS_674,-4.696588,-5.406492,-5.304081,-5.445148,-5.265760,-5.693575,-5.356054,-5.468138,-5.230549,-5.403733,...,0.753800,-0.1693,-0.2167,-0.1574,-0.1037,-0.0381,0.0550,0.1230,-0.0131,-0.0993
NS_OBS_378,-6.138406,-6.801068,,,,,-6.458670,,,,...,,,,,,,0.9994,1.0000,0.6677,


In [323]:
# Impute the missing feature values; the imputer object is kept because it is
# applied again to the 47 TUC table further down.
new_data , random_forest_imputer = rf_impute(data_val, data_id)
#train_data.index.name = 'index'
#display(train_data.describe())
#train_data.to_csv('../processed_data/train_norm_rf_impute')

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6


In [331]:
random_forest_imputer.transform(data_val)

KeyboardInterrupt: 

In [324]:
data_val

Unnamed: 0_level_0,photflux_aper_hilim_b,photflux_aper_lolim_h,photflux_aper_lolim_s,photflux_aper_lolim_m,photflux_aper_s,photflux_aper_lolim_u,photflux_aper_h,photflux_aper_hilim_u,photflux_aper_hilim_s,photflux_aper_m,...,kp_prob,hard_hs_lolim,hard_ms_lolim,hard_ms,hard_hs,hard_hs_hilim,hard_hm,hard_hm_hilim,hard_hm_lolim,hard_ms_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NS_OBS_320,-4.417028,,-4.844664,,-4.730020,-3.831208,-5.598944,-3.697020,-4.643783,-5.950394,...,0.224667,-0.9126,-0.9725,-0.8832,-0.7320,-0.5003,0.5846,0.9051,0.0818,-0.7601
NS_OBS_194,-4.092535,-4.719194,-5.168002,-4.978811,-4.960586,,-4.606249,-4.332454,-4.820448,-4.854182,...,0.581750,0.2017,-0.1118,0.0987,0.3848,0.5603,0.2911,0.4366,0.1368,0.3167
NS_OBS_341,-5.465339,-6.103639,,-6.689094,,,-5.838932,-5.487316,-6.656591,-6.249877,...,0.391000,0.7302,0.4316,0.9994,0.9994,1.0000,0.4735,0.7302,0.1568,1.0000
BH_OBS_136,-5.424581,-5.804100,,-6.806875,,,-5.690370,-5.880085,-7.066715,-6.614215,...,0.462500,0.9188,0.4766,0.9994,0.9994,1.0000,0.7983,0.8676,0.7127,1.0000
CV_OBS_622,-6.190710,-8.156892,-7.191047,,-6.621239,,-6.926282,-5.751781,-6.392759,,...,,-0.7689,-1.0000,-0.9994,-0.3948,0.0906,0.9994,1.0000,0.3079,-0.5540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PULSAR_OBS_391,-4.027196,-4.561616,-5.515131,-4.645507,-5.417028,,-4.531357,-5.156145,-5.341130,-4.618163,...,0.543000,0.7264,0.6765,0.7245,0.7683,0.8089,0.0987,0.1405,0.0543,0.7714
CV_OBS_516,-4.854493,-6.016148,-5.632458,-5.866461,-5.433563,-5.995249,-5.660747,-5.130123,-5.297656,-5.615467,...,0.562000,-0.5053,-0.4541,-0.2099,-0.2261,0.0656,-0.0075,0.2967,-0.3279,0.0518
CV_OBS_674,-4.696588,-5.406492,-5.304081,-5.445148,-5.265760,-5.693575,-5.356054,-5.468138,-5.230549,-5.403733,...,0.753800,-0.1693,-0.2167,-0.1574,-0.1037,-0.0381,0.0550,0.1230,-0.0131,-0.0993
NS_OBS_378,-6.138406,-6.801068,,,,,-6.458670,,,,...,,,,,,,0.9994,1.0000,0.6677,


In [325]:
# Put the imputed array back into a frame carrying data_val's row and column labels.
imp_data = pd.DataFrame(
    new_data,
    index=data_val.index.to_list(),
    columns=data_val.columns.to_list(),
)
imp_data.index.name = 'obs_id'

# Standardise each column with its own mean and (pandas default) standard deviation.
column_mean = imp_data.mean()
column_std = imp_data.std()
normalized_df = (imp_data - column_mean) / column_std
normalized_df.describe()

Unnamed: 0,photflux_aper_hilim_b,photflux_aper_lolim_h,photflux_aper_lolim_s,photflux_aper_lolim_m,photflux_aper_s,photflux_aper_lolim_u,photflux_aper_h,photflux_aper_hilim_u,photflux_aper_hilim_s,photflux_aper_m,...,kp_prob,hard_hs_lolim,hard_ms_lolim,hard_ms,hard_hs,hard_hs_hilim,hard_hm,hard_hm_hilim,hard_hm_lolim,hard_ms_hilim
count,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,...,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0
mean,6.671634e-16,1.864214e-15,-4.810166e-15,7.316833e-16,-2.643944e-15,5.787574e-15,6.946187e-15,1.724192e-15,4.694853e-15,7.4445e-15,...,-2.399592e-15,-8.236585e-18,-1.276671e-16,2.745528e-18,-3.569187e-17,-2.965171e-16,-6.232349e-16,-1.059774e-15,-1.510041e-17,-2.512158e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.647121,-2.15813,-2.424409,-2.169504,-2.046728,-2.018644,-2.16498,-2.067852,-2.358549,-1.90834,...,-2.505795,-1.315784,-1.339282,-1.466049,-1.543961,-2.140976,-2.156266,-2.869539,-1.86928,-2.300178
25%,-0.7248062,-0.8747303,-0.7150908,-0.9183833,-0.7087289,-0.7756393,-0.7364086,-0.833634,-0.7286205,-0.709546,...,-0.5826479,-0.8079989,-0.7642277,-0.5750897,-0.7745542,-0.7830629,-0.5394026,-0.6510178,-0.6137017,-0.6715287
50%,-0.09306287,0.05640825,-0.182638,-0.1607415,-0.1899384,-0.08642085,-0.05089882,-0.07832914,-0.1504349,-0.206269,...,-0.1302948,-0.1067304,-0.06603773,-0.04365256,-0.06732976,-0.08632344,-0.09139516,-0.08963646,0.02694378,-0.2029096
75%,0.6480359,0.8061492,0.7699167,0.7744492,0.7167122,0.7781285,0.7429751,0.8253933,0.6877308,0.7034146,...,0.5850739,0.7467355,0.5671907,0.4637235,0.9558221,1.078321,0.9324347,1.048856,0.5663088,0.7150083
max,2.285062,1.983422,2.213118,2.121802,2.424505,3.018864,2.197176,2.576292,2.431893,2.263839,...,2.286586,1.837891,2.651314,2.018979,1.425943,1.297444,1.336235,1.283155,1.968985,1.817786


In [326]:
data_id

Unnamed: 0_level_0,class,src_n,src_id,significance
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NS_OBS_320,NS,XMMU J004414.0+41220,NS0047,6.53
NS_OBS_194,NS,XMMU J004245.2+41172,NS0044,49.26
NS_OBS_341,NS,J1748-2446,NS0057,18.31
BH_OBS_136,BH,J1745-2900,BH0027,337.00
CV_OBS_622,CV,CXOGLB J002410.6-72051,CV0090,6.98
...,...,...,...,...
PULSAR_OBS_391,PULSAR,PSR J1852+0040,PL0107,23.63
CV_OBS_516,CV,CXOGLB J002406.0-72045,CV0081,25.99
CV_OBS_674,CV,CXOGLB J002416.9-72042,CV0094,38.28
NS_OBS_378,NS,SAXWFC J1747.0-2853.,NS0070,4.44


In [327]:
# Join identifiers and standardised features side by side (aligned on the obs_id index)
# and write the combined table to disk.
all_output_path = '../processed_data/v4/'+'all'+'_rf_impute_std_no_model.csv'
processed_data_all = pd.concat([data_id , normalized_df] , axis=1)
processed_data_all.to_csv(all_output_path)

In [328]:
processed_data_all

Unnamed: 0_level_0,class,src_n,src_id,significance,photflux_aper_hilim_b,photflux_aper_lolim_h,photflux_aper_lolim_s,photflux_aper_lolim_m,photflux_aper_s,photflux_aper_lolim_u,...,kp_prob,hard_hs_lolim,hard_ms_lolim,hard_ms,hard_hs,hard_hs_hilim,hard_hm,hard_hm_hilim,hard_hm_lolim,hard_ms_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NS_OBS_320,NS,XMMU J004414.0+41220,NS0047,6.53,0.615725,0.197494,1.077237,-0.399899,1.081690,1.955740,...,-1.444029,-1.177796,-1.280950,-1.263447,-1.146647,-1.327829,0.611456,1.074376,0.213293,-1.889646
NS_OBS_194,NS,XMMU J004245.2+41172,NS0044,49.26,0.995125,1.022972,0.802394,1.066059,0.832256,0.494114,...,0.274123,0.581473,0.544759,0.448555,0.512744,0.528043,0.098624,0.043683,0.319174,0.378500
NS_OBS_341,NS,J1748-2446,NS0057,18.31,-0.609967,-0.127760,-1.006257,-0.421786,-1.255528,-0.635317,...,-0.643695,1.415875,1.697414,2.018979,1.425943,1.297444,0.417331,0.689599,0.357676,1.817786
BH_OBS_136,BH,J1745-2900,BH0027,337.00,-0.562312,0.121213,-1.633314,-0.524249,-1.585227,-0.853567,...,-0.299664,1.713639,1.792868,2.018979,1.425943,1.297444,0.984854,0.991877,1.427839,1.817786
CV_OBS_622,CV,CXOGLB J002410.6-72051,CV0090,6.98,-1.458075,-1.834396,-0.917226,-1.187908,-0.964301,-1.017967,...,-0.012137,-0.950921,-1.339282,-1.466049,-0.645620,-0.293854,1.336235,1.283155,0.648559,-1.455522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PULSAR_OBS_391,PULSAR,PSR J1852+0040,PL0107,23.63,1.071520,1.153949,0.507329,1.356013,0.338459,-0.344890,...,0.087672,1.409876,2.216894,1.539675,1.082565,0.963051,-0.237557,-0.607733,0.160353,1.336268
CV_OBS_516,CV,CXOGLB J002406.0-72045,CV0081,25.99,0.104239,-0.055039,0.407600,0.293856,0.320572,-0.226115,...,0.179093,-0.534746,-0.181324,-0.089508,-0.394958,-0.337599,-0.423120,-0.264095,-0.575420,-0.179479
CV_OBS_674,CV,CXOGLB J002416.9-72042,CV0094,38.28,0.288862,0.451699,0.686725,0.660373,0.502106,0.078042,...,1.101963,-0.004266,0.322246,0.002029,-0.213091,-0.519057,-0.313914,-0.646233,0.030601,-0.497753
NS_OBS_378,NS,SAXWFC J1747.0-2853.,NS0070,4.44,-1.396921,-0.707453,-0.596720,-1.215326,-0.625726,-1.680173,...,-0.075750,-0.403383,-1.061346,-0.520157,-0.033387,0.275872,1.336235,1.283155,1.341210,-0.855976


In [329]:
# Save the imputer object to disk (compression level 3).
# NOTE(review): the CSV outputs of this notebook go under '../processed_data/v4/', while
# this file goes under 'forest/v3/' — confirm the version folder is intended.
import joblib 
joblib.dump(random_forest_imputer , 'forest/v3/all-imputer.joblib' , compress=3)

['forest/v3/all-imputer.joblib']

## Fill 47 TUC

In [332]:
# Load the 47 TUC table, indexed by obs_id; it still contains missing values
# (see the NaN cells in the output below).
tuc = pd.read_csv('../processed_data/v4/TUC_no_impute_std_no_model.csv' , index_col='obs_id')
tuc 

Unnamed: 0_level_0,class,src_n,src_id,significance,photflux_aper_hilim_b,photflux_aper_lolim_h,photflux_aper_lolim_s,photflux_aper_lolim_m,photflux_aper_s,photflux_aper_lolim_u,...,kp_prob,hard_hs_lolim,hard_ms_lolim,hard_ms,hard_hs,hard_hs_hilim,hard_hm,hard_hm_hilim,hard_hm_lolim,hard_ms_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TUC_OBS_2089,TUC,CXOGLB J002408.5-720708,TC0229,3.95,-6.346981,,-7.272703,-7.575445,-6.904831,,...,0.790000,-1.0000,-0.6015,-0.2498,-0.9994,-0.2555,-0.5721,-0.0306,-1.0000,0.1355
TUC_OBS_1548,TUC,CXOGLB J002359.3-720448,TC0168,3.42,-6.027797,,-7.331241,-7.079251,-6.650140,,...,0.847000,-1.0000,-0.4878,-0.0962,-0.9994,-0.3766,-0.9994,-0.3604,-1.0000,0.3616
TUC_OBS_1615,TUC,CXOGLB J002400.4-720448,TC0176,5.03,-6.049976,,-6.510745,-8.014349,-6.345920,,...,0.886000,-1.0000,-0.9275,-0.7883,-0.9994,-0.8176,-0.9994,-0.1318,-1.0000,-0.6065
TUC_OBS_217,TUC,CXOGLB J002409.1-720428,TC0023,5.64,-5.948847,,-6.937794,-7.087884,-6.487716,,...,0.858000,-1.0000,-0.6065,-0.2623,-0.9994,-0.5278,-0.9994,-0.3716,-1.0000,0.1480
TUC_OBS_931,TUC,CXOGLB J002404.3-720501,TC0095,12.45,-5.480041,-6.171920,-6.233810,-6.422853,-6.112777,-6.604848,...,0.431250,-0.0793,-0.3741,-0.2186,0.0849,0.2430,0.2986,0.4478,0.1368,-0.0581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TUC_OBS_2095,TUC,CXOGLB J002408.6-720449,TC0230,3.40,-5.995249,-7.194567,-6.566070,-7.836242,-6.389872,,...,0.362000,-0.6864,-0.8801,-0.7083,-0.4372,-0.1668,0.4110,0.7352,0.0119,-0.4866
TUC_OBS_1193,TUC,CXOGLB J002415.1-720443,TC0120,10.66,-5.206140,-6.955852,,,,,...,,0.0081,-1.0000,-0.9994,0.9994,1.0000,0.9994,1.0000,0.3254,0.0468
TUC_OBS_166,TUC,CXOGLB J002406.0-720501,TC0017,7.63,-5.586030,,-6.807711,,-6.108853,,...,,-1.0000,-1.0000,-0.7146,-0.9994,-0.2355,-0.9994,0.1693,-1.0000,-0.0518
TUC_OBS_989,TUC,CXOGLB J002406.5-720430,TC0102,3.41,-6.056951,-7.981716,-6.960586,-6.891773,-6.691863,,...,0.715667,-0.7464,-0.2804,-0.0012,-0.4260,-0.0643,-0.4172,-0.0806,-0.7314,0.2904


In [333]:
# Identifier columns that are set aside before imputation.
info_col = ['src_n' , 'src_id' , 'significance' , 'class']
inter_obs_params = ['var_inter_prob' , 'var_inter_index' , 'var_inter_sigma']

# Split the TUC table: identifiers, class labels, and the remaining feature columns.
id_data = tuc[info_col]
y_val = tuc['class'].copy()
x_val = tuc.drop(columns=info_col)
x_val

Unnamed: 0_level_0,photflux_aper_hilim_b,photflux_aper_lolim_h,photflux_aper_lolim_s,photflux_aper_lolim_m,photflux_aper_s,photflux_aper_lolim_u,photflux_aper_h,photflux_aper_hilim_u,photflux_aper_hilim_s,photflux_aper_m,...,kp_prob,hard_hs_lolim,hard_ms_lolim,hard_ms,hard_hs,hard_hs_hilim,hard_hm,hard_hm_hilim,hard_hm_lolim,hard_ms_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TUC_OBS_2089,-6.346981,,-7.272703,-7.575445,-6.904831,,,-6.754734,-6.708409,-7.098324,...,0.790000,-1.0000,-0.6015,-0.2498,-0.9994,-0.2555,-0.5721,-0.0306,-1.0000,0.1355
TUC_OBS_1548,-6.027797,,-7.331241,-7.079251,-6.650140,,,-5.742321,-6.396747,-6.653256,...,0.847000,-1.0000,-0.4878,-0.0962,-0.9994,-0.3766,-0.9994,-0.3604,-1.0000,0.3616
TUC_OBS_1615,-6.049976,,-6.510745,-8.014349,-6.345920,,,-6.485984,-6.232547,-7.254691,...,0.886000,-1.0000,-0.9275,-0.7883,-0.9994,-0.8176,-0.9994,-0.1318,-1.0000,-0.6065
TUC_OBS_217,-5.948847,,-6.937794,-7.087884,-6.487716,,,-5.733063,-6.280172,-6.661942,...,0.858000,-1.0000,-0.6065,-0.2623,-0.9994,-0.5278,-0.9994,-0.3716,-1.0000,0.1480
TUC_OBS_931,-5.480041,-6.171920,-6.233810,-6.422853,-6.112777,-6.604848,-6.048905,-6.010105,-6.022963,-6.295935,...,0.431250,-0.0793,-0.3741,-0.2186,0.0849,0.2430,0.2986,0.4478,0.1368,-0.0581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TUC_OBS_2095,-5.995249,-7.194567,-6.566070,-7.836242,-6.389872,,-6.803271,-6.550985,-6.264960,-7.155026,...,0.362000,-0.6864,-0.8801,-0.7083,-0.4372,-0.1668,0.4110,0.7352,0.0119,-0.4866
TUC_OBS_1193,-5.206140,-6.955852,,,,,-5.725380,-5.238222,-5.690796,,...,,0.0081,-1.0000,-0.9994,0.9994,1.0000,0.9994,1.0000,0.3254,0.0468
TUC_OBS_166,-5.586030,,-6.807711,,-6.108853,,,-5.709743,-5.853562,,...,,-1.0000,-1.0000,-0.7146,-0.9994,-0.2355,-0.9994,0.1693,-1.0000,-0.0518
TUC_OBS_989,-6.056951,-7.981716,-6.960586,-6.891773,-6.691863,,-7.116566,-6.510463,-6.526951,-6.678816,...,0.715667,-0.7464,-0.2804,-0.0012,-0.4260,-0.0643,-0.4172,-0.0806,-0.7314,0.2904


In [335]:
# Fill the missing TUC feature values with the imputer object created for the training set.
# The "Iteration: N" lines below are printed during this transform call.
tuc_filled = random_forest_imputer.transform(x_val)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6


In [336]:
tuc_filled

array([[-6.34698055, -8.52903388, -7.2727028 , ..., -0.0306    ,
        -1.        ,  0.1355    ],
       [-6.02779716, -8.52981619, -7.33124146, ..., -0.3604    ,
        -1.        ,  0.3616    ],
       [-6.04997609, -8.52790598, -6.51074483, ..., -0.1318    ,
        -1.        , -0.6065    ],
       ...,
       [-5.58603003, -7.35492122, -6.80771139, ...,  0.1693    ,
        -1.        , -0.0518    ],
       [-6.05695089, -7.98171569, -6.96058588, ..., -0.0806    ,
        -0.7314    ,  0.2904    ],
       [-6.24488773, -7.35078299, -7.97135522, ...,  0.7639    ,
        -0.3579    ,  1.        ]])

In [339]:
# Put the imputed TUC array back into a frame carrying x_val's row and column labels.
imp_data =  pd.DataFrame(tuc_filled , columns = x_val.columns.to_list()  , index=x_val.index.to_list())
imp_data.index.name = 'obs_id'

# NOTE(review): the TUC columns are standardised with their own mean/std here, not with
# the statistics of the training table — confirm this is what downstream code expects.
normalized_df=(imp_data-imp_data.mean())/imp_data.std()

# Re-attach the identifier columns, save the result, then display it as the cell output.
processed_data_all = pd.concat([id_data, normalized_df] , axis=1)
processed_data_all.to_csv('../processed_data/v4/'+'tuc'+'_rf_impute_std_no_model.csv')
processed_data_all