In [65]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt 
from IPython.display import display
import seaborn as sns

## Filter data

In [66]:
def filter_data(data_sent , max_flux= -12):
    """Filter rows on flux and significance, then drop observation-level columns.

    Parameters
    ----------
    data_sent : pandas.DataFrame
        Input table. Must contain 'flux_aper', 'significance' and every
        column named in ``obs_info_params`` below. It is not modified.
    max_flux : float, default -12
        Exclusive upper bound on 'flux_aper'; only rows with
        ``flux_aper < max_flux`` are kept.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows without the
        observation-level columns.

    Raises
    ------
    KeyError
        If a required column is missing from ``data_sent``.
    """
    obs_info_params = [
        'livetime', 'likelihood', 'pileup_flag', 'mstr_sat_src_flag',
        'mstr_streak_src_flag', 'gti_obs', 'flux_significance_b',
        'flux_significance_m', 'flux_significance_s', 'flux_significance_h',
        'flux_significance_u',
    ]
    data = data_sent.copy()
    # Comparisons against NaN evaluate to False, so rows with a missing
    # flux or significance are removed by these two masks as well.
    data = data[data['flux_aper'] < max_flux]
    data = data[data['significance'] > 2]
    return data.drop(columns=obs_info_params)

def norm_data(data_sent):
    """Min-max scale every column of ``data_sent`` and return the scaled copy.

    Each column is mapped with ``(x - min) / (max - min)`` using that
    column's own minimum and maximum. The input frame is left untouched.
    """
    scaled = data_sent.copy()
    for column_name in scaled.columns:
        column = scaled[column_name]
        lowest = np.amin(column)
        highest = np.amax(column)
        span = highest - lowest
        scaled[column_name] = (column - lowest) / span
    return scaled
def std_data(data_sent):
    """Standardise every column of ``data_sent`` and return the result as a copy.

    Each column has its mean subtracted and is divided by the square root
    of ``np.var`` of that column. The input frame is left untouched.
    """
    standardised = data_sent.copy()
    for column_name in standardised.columns:
        column = standardised[column_name]
        centre = np.mean(column)
        spread = np.sqrt(np.var(column))
        standardised[column_name] = (column - centre) / spread
    return standardised
def do_nothing(data_sent):
    """Identity transform: hand back the very object that was passed in.

    Usable wherever a reduce/scale callable is expected but no
    transformation is wanted. No copy is made.
    """
    unchanged = data_sent
    return unchanged

In [67]:
def extract_data(data_sent , impute_fn = '',reduce_fn = ' ' , rf_impute=False , random_state=None):
    """Shuffle and filter a table, then split it into feature and identifier frames.

    Parameters
    ----------
    data_sent : pandas.DataFrame
        Input table; must satisfy the column requirements of ``filter_data``
        and contain 'class', 'src_n', 'src_id' and 'significance'.
        It is not modified.
    impute_fn : object, optional
        Accepted for call compatibility but not used: no imputation is
        performed here, the caller applies an imputer to the returned
        feature frame.
    reduce_fn : callable, optional
        Applied to the feature frame before it is returned. A non-callable
        value (such as the default placeholder string) means "leave the
        features as they are".
    rf_impute : bool, optional
        Accepted for call compatibility but not used.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the row shuffle can be made
        reproducible. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    (data_val, data_id) : tuple of pandas.DataFrame
        ``data_val`` holds the feature columns, ``data_id`` the four
        identifier columns; both share the same (shuffled, filtered) index.
    """
    id_columns = ['class', 'src_n', 'src_id', 'significance']

    # sample() already returns a new frame, so the input is never mutated.
    data = data_sent.sample(frac=1, random_state=random_state)
    data = filter_data(data)

    data_id = data[id_columns]
    data_val = data.drop(columns=id_columns)
    if callable(reduce_fn):
        data_val = reduce_fn(data_val)
    return data_val , data_id

## Load data

Here we will impute each class separately.
For testing, we can pass a given sample through each imputer and then combine the results.

In [68]:
obj_class = 'TUC'

In [69]:
# Read one CSV per class (in the order BH, NS, PULSAR, CV) and shuffle its rows.
shuffled_by_class = {}
for obj_class in ['BH', 'NS', 'PULSAR', 'CV']:
    class_frame = pd.read_csv('../processed_data/'+obj_class+'_.csv' , index_col='obs_id')
    shuffled_by_class[obj_class] = class_frame.sample(frac=1)

train_bh = shuffled_by_class['BH']
train_ns = shuffled_by_class['NS']
train_plsr = shuffled_by_class['PULSAR']
train_cv = shuffled_by_class['CV']

# Rows are stacked as BH, NS, CV, PULSAR, which differs from the read order.
train = pd.concat([train_bh , train_ns , train_cv , train_plsr])

train

Unnamed: 0_level_0,index,class,src_n,src_id,livetime,significance,likelihood,pileup_flag,mstr_sat_src_flag,mstr_streak_src_flag,...,bb_ampl_lolim,bb_ampl_hilim,bb_stat,brems_kt,brems_kt_hilim,brems_kt_lolim,brems_nh,brems_nh_hilim,brems_nh_lolim,brems_stat
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BH_OBS_170,170,BH,J1745-2900,BH0027,17827.6,337.00,462011.40000,False,False,False,...,,,,,,,,,,
BH_OBS_182,182,BH,J1745-2900,BH0027,44955.8,337.00,462011.40000,False,False,False,...,0.002207,0.003062,0.931,1.607,1.688,1.531,1.530000e+03,1579.00000,1483.0,0.888
BH_OBS_66,66,BH,CXOU J100514.2-07423,BH0018,158013.8,16.29,1238.38300,False,False,False,...,,,,,,,,,,
BH_OBS_4,4,BH,1E 1740.7-2942,BH0002,11611.1,166.94,151803.00000,False,False,False,...,,,,,,,,,,
BH_OBS_61,61,BH,CXOU J100514.2-07423,BH0018,40580.4,16.29,1238.38300,False,False,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PULSAR_OBS_124,124,PULSAR,PSR J0908-4913,PL0027,34605.8,3.24,28.19767,False,False,False,...,,,,,,,,,,
PULSAR_OBS_176,176,PULSAR,PSR J1550-5418,PL0060,96515.8,122.14,80599.58000,False,False,False,...,0.000374,0.000418,3.345,6.400,6.874,5.986,3.129000e+02,318.10000,307.8,2.120
PULSAR_OBS_211,211,PULSAR,PSR J1741-205,PL0079,48783.7,98.07,57964.79000,False,False,False,...,0.003282,0.003774,4.820,1.387,1.422,1.342,2.083000e-07,0.08261,,1.707
PULSAR_OBS_244,244,PULSAR,PSR J1745-2900,PL0081,5106.3,107.91,21810.19000,False,False,False,...,,,,,,,,,,


In [70]:
train['class'].value_counts()

CV        786
PULSAR    445
NS        440
BH        199
Name: class, dtype: int64

# RF imputer Missingpy

In [71]:
import sys

import sklearn.neighbors._base

# Register the private module under the name 'sklearn.neighbors.base' so that
# any later `import sklearn.neighbors.base` resolves to it.
# NOTE(review): presumably needed by the missingpy import that follows - confirm.
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [72]:
from missingpy import MissForest 

In [73]:
def rf_impute(d, i ):
    """Fit a MissForest imputer on ``d`` and return the imputed values with the imputer.

    Parameters
    ----------
    d : pandas.DataFrame
        Feature table containing the missing values to fill. Both the fit
        and the transform use this table only.
    i : pandas.DataFrame
        Identifier table. Not used by the imputation; kept so the signature
        matches ``zero_impute``.

    Returns
    -------
    (new_data, rf_imputer) : tuple
        ``new_data`` is the result of ``rf_imputer.transform(d)`` and
        ``rf_imputer`` is the fitted ``MissForest`` instance, which can be
        reused to transform other tables.
    """
    rf_imputer = MissForest(n_estimators=300 , n_jobs=-1 , class_weight="balanced")
    rf_imputer.fit(d)
    new_data = rf_imputer.transform(d)
    return new_data , rf_imputer

## Zero imputer

In [74]:
def zero_impute(d , i ):
    """Join identifiers and features, drop three id columns and fill NaN with 0.

    ``i`` and ``d`` are concatenated column-wise, the columns 'src_n',
    'src_id' and 'significance' are removed, and every NaN is replaced by 0.
    Any other column of ``i`` stays in the result.

    Returns a 2-tuple: the filled frame and a placeholder string that stands
    in for the imputer object returned by ``rf_impute``.
    """
    removed_id_columns = ['src_n' , 'src_id' , 'significance']
    combined = pd.concat([i , d] , axis=1).drop(columns=removed_id_columns)
    filled = combined.replace(np.nan , 0 )
    return filled , 'ignore this string'

# Choose Features

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
from features import features as feat 
all_feat = list(feat['info'])+list(feat['flux']['photon'])+list(feat['flux']['energy'])+list(feat['variability'])+list(feat['hardness'])+list(feat['model_fit']['powerlaw'])+list(feat['model_fit']['bb'])+list(feat['model_fit']['brems'])+list(feat['info_pre_filter'])

In [77]:
feat_to_drop = list(feat['model_fit']['powerlaw']) + list(feat['model_fit']['bb']) + list(feat['model_fit']['brems']) 
#feat_to_drop = []
feat_used = [item for item in all_feat if item not in feat_to_drop]

In [78]:
train[feat_used]

Unnamed: 0_level_0,src_id,significance,class,src_n,photflux_aper_lolim_m,photflux_aper_hilim_s,photflux_aper_b,photflux_aper_lolim_s,photflux_aper_hilim_h,photflux_aper_u,...,flux_significance_u,likelihood,pileup_flag,flux_significance_b,flux_significance_m,flux_significance_h,mstr_streak_src_flag,mstr_sat_src_flag,livetime,flux_significance_s
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BH_OBS_170,BH0027,337.00,BH,J1745-2900,,-6.213746,-5.550985,,-5.587036,,...,0.00,462011.40000,False,2.05,0.90,1.73,False,False,17827.6,0.00
BH_OBS_182,BH0027,337.00,BH,J1745-2900,-5.837734,-6.102758,-3.432151,-6.476904,-3.482672,-5.171018,...,1.64,462011.40000,False,66.22,5.69,65.92,False,False,44955.8,2.49
BH_OBS_66,BH0018,16.29,BH,CXOU J100514.2-07423,-6.971429,-6.332547,-6.138824,-6.547447,-6.988853,-6.348141,...,1.13,1238.38300,False,5.41,3.17,1.58,False,False,158013.8,4.13
BH_OBS_4,BH0002,166.94,BH,1E 1740.7-2942,,,,,-1.865823,,...,,151803.00000,False,,,166.39,False,False,11611.1,
BH_OBS_61,BH0018,16.29,BH,CXOU J100514.2-07423,-7.075566,-5.900665,-5.814458,-6.154778,-6.800519,,...,0.00,1238.38300,False,4.06,1.79,1.03,False,False,40580.4,3.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PULSAR_OBS_124,PL0027,3.24,PULSAR,PSR J0908-4913,-6.915781,-6.485984,-5.809388,,-6.007093,,...,0.00,28.19767,False,3.24,1.79,2.54,False,False,34605.8,0.00
PULSAR_OBS_176,PL0060,122.14,PULSAR,PSR J1550-5418,-4.253521,-5.400554,-3.213675,-5.483994,-3.327810,-4.904831,...,5.14,80599.58000,False,122.14,47.92,111.88,False,False,96515.8,10.43
PULSAR_OBS_211,PL0079,98.07,PULSAR,PSR J1741-205,-4.540608,-4.060381,-3.763211,-4.084442,-4.704433,-3.574466,...,19.24,57964.79000,False,46.83,25.00,16.38,False,False,48783.7,36.16
PULSAR_OBS_244,PL0081,107.91,PULSAR,PSR J1745-2900,,-6.077223,,,,,...,0.00,21810.19000,False,,0.00,,False,False,5106.3,0.00


In [79]:
# Fraction of missing values per selected feature (0.0 = complete, 1.0 = all NaN).
# isna().mean() handles the "no NaN" and "all NaN" columns without special
# cases and keeps the values numeric, so the sort below is a numeric sort.
na_fraction = train[feat_used].isna().mean()
sparsity = (
    pd.DataFrame({'feat': na_fraction.index, 'sparsity': na_fraction.to_numpy()})
    .sort_values(by='sparsity' , ascending=False)
)


## drop highly sparse features

In [80]:
sparsity

Unnamed: 0,feat,sparsity
21,flux_aper_lolim_u,0.7240641711229947
29,flux_aper_u,0.7026737967914438
15,photflux_aper_lolim_u,0.6545454545454545
9,photflux_aper_u,0.6315508021390375
27,flux_aper_lolim_s,0.4080213903743316
...,...,...
63,pileup_flag,0
1,significance,0
3,src_n,0
60,gti_obs,0


In [81]:
sparsity['sparsity'].astype(float)

21    0.724064
29    0.702674
15    0.654545
9     0.631551
27    0.408021
        ...   
63    0.000000
1     0.000000
3     0.000000
60    0.000000
0     0.000000
Name: sparsity, Length: 71, dtype: float64

In [83]:
feat_low_sparse = sparsity[sparsity['sparsity'].astype(float)<0.35]['feat'].to_list()
feat_low_sparse

['photflux_aper_lolim_h',
 'photflux_aper_h',
 'flux_aper_lolim_b',
 'photflux_aper_lolim_b',
 'flux_aper_b',
 'flux_aper_hilim_u',
 'photflux_aper_b',
 'flux_aper_lolim',
 'photflux_aper_lolim',
 'flux_aper',
 'photflux_aper',
 'var_inter_prob',
 'var_inter_index',
 'var_inter_sigma',
 'flux_significance_u',
 'photflux_aper_hilim_u',
 'flux_aper_hilim_m',
 'flux_aper_hilim_h',
 'flux_aper_hilim_s',
 'hard_ms_lolim',
 'hard_ms_hilim',
 'hard_ms',
 'hard_hm_lolim',
 'hard_hm',
 'hard_hm_hilim',
 'hard_hs',
 'hard_hs_hilim',
 'hard_hs_lolim',
 'photflux_aper_hilim_m',
 'flux_significance_m',
 'flux_significance_s',
 'photflux_aper_hilim_s',
 'flux_aper_hilim_b',
 'photflux_aper_hilim_h',
 'flux_significance_h',
 'flux_aper_hilim',
 'photflux_aper_hilim_b',
 'flux_significance_b',
 'photflux_aper_hilim',
 'class',
 'livetime',
 'mstr_sat_src_flag',
 'mstr_streak_src_flag',
 'likelihood',
 'pileup_flag',
 'significance',
 'src_n',
 'gti_obs',
 'src_id']

In [84]:
train_set = train[feat_low_sparse]
train_set

Unnamed: 0_level_0,photflux_aper_lolim_h,photflux_aper_h,flux_aper_lolim_b,photflux_aper_lolim_b,flux_aper_b,flux_aper_hilim_u,photflux_aper_b,flux_aper_lolim,photflux_aper_lolim,flux_aper,...,class,livetime,mstr_sat_src_flag,mstr_streak_src_flag,likelihood,pileup_flag,significance,src_n,gti_obs,src_id
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BH_OBS_170,-6.161025,-5.785421,-14.157765,-5.852015,-13.835647,-13.913640,-5.550985,-14.610196,-6.377537,-14.256333,...,BH,17827.6,False,False,462011.40000,False,337.00,J1745-2900,2011-03-31T10:44:54,BH0027
BH_OBS_182,-3.495937,-3.489053,-11.600153,-3.439018,-11.593290,-14.895513,-3.432151,-11.994612,-3.861691,-11.987612,...,BH,44955.8,False,False,462011.40000,False,337.00,J1745-2900,2014-04-28T03:04:50,BH0027
BH_OBS_66,-7.637518,-7.201833,-14.961380,-6.227532,-14.863279,-16.286762,-6.138824,-15.404535,-6.679305,-15.260538,...,BH,158013.8,False,False,1238.38300,False,16.29,CXOU J100514.2-07423,2012-02-03T10:00:00,BH0018
BH_OBS_4,-1.870955,-1.868381,,,,,,-9.830620,-1.870955,-9.827981,...,BH,11611.1,False,False,151803.00000,False,166.94,1E 1740.7-2942,2001-07-21T11:42:23,BH0002
BH_OBS_61,,-7.090070,-14.825940,-5.937042,-14.686766,-15.295935,-5.814458,-15.239804,-6.411117,-15.078522,...,BH,40580.4,False,False,1238.38300,False,16.29,CXOU J100514.2-07423,2010-01-27T01:02:52,BH0018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PULSAR_OBS_124,-6.372839,-6.147947,-14.352128,-5.970211,-14.191586,,-5.809388,-14.685906,-6.490475,-14.487556,...,PULSAR,34605.8,False,False,28.19767,False,3.24,PSR J0908-4913,2010-09-06T12:26:29,PL0027
PULSAR_OBS_176,-3.335546,-3.331707,-11.404283,-3.217241,-11.400663,-14.334607,-3.213675,-11.799216,-3.643125,-11.795263,...,PULSAR,96515.8,False,False,80599.58000,False,122.14,PSR J1550-5418,2011-06-02T17:44:53,PL0060
PULSAR_OBS_211,-4.757459,-4.730254,-12.472241,-3.772370,-12.462559,-12.318849,-3.763211,-12.657258,-3.958599,-12.639664,...,PULSAR,48783.7,False,False,57964.79000,False,98.07,PSR J1741-205,2010-05-21T02:44:36,PL0079
PULSAR_OBS_244,,,,,,-13.663340,,,,,...,PULSAR,5106.3,False,False,21810.19000,False,107.91,PSR J1745-2900,2006-07-04T11:20:29,PL0081


In [85]:
data_val , data_id   = extract_data(train_set ,  impute_fn= rf_impute , reduce_fn= do_nothing , rf_impute=True )

In [86]:
data_val

Unnamed: 0_level_0,photflux_aper_lolim_h,photflux_aper_h,flux_aper_lolim_b,photflux_aper_lolim_b,flux_aper_b,flux_aper_hilim_u,photflux_aper_b,flux_aper_lolim,photflux_aper_lolim,flux_aper,...,hard_hs,hard_hs_hilim,hard_hs_lolim,photflux_aper_hilim_m,photflux_aper_hilim_s,flux_aper_hilim_b,photflux_aper_hilim_h,flux_aper_hilim,photflux_aper_hilim_b,photflux_aper_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NS_OBS_359,-4.252821,-4.243668,-12.282996,-3.812761,-12.277202,-13.917574,-3.807154,-12.675468,-4.253844,-12.667238,...,0.3360,0.3504,0.3217,-4.454569,-4.536704,-12.271890,-4.235301,-12.659647,-3.801618,-4.235704
BH_OBS_48,,,,-7.246111,-15.823619,-15.898597,-6.854804,-16.699242,-7.535570,-16.053548,...,-0.9806,-0.2692,-1.0000,-7.722849,-6.737786,-15.522589,-7.170375,-15.618750,-6.652475,-6.392335
BH_OBS_147,-6.217671,-5.585695,-14.399463,-6.007446,-13.700493,-13.549135,-5.437588,-14.792797,-6.497901,-14.093881,...,0.9994,1.0000,0.4291,-6.182567,-5.967381,-13.453334,-5.338471,-13.687484,-5.199352,-4.899624
PULSAR_OBS_178,-3.612432,-3.600153,-11.690157,-3.452841,-11.677988,-12.650334,-3.440812,-12.080142,-3.911181,-12.058595,...,0.9944,0.9975,0.9913,-4.734239,-5.915781,-11.666351,-3.588044,-12.038278,-3.429107,-3.880113
BH_OBS_17,-5.755970,-5.630970,-13.882397,-5.571703,-13.771856,-15.361910,-5.460924,-14.283990,-6.019642,-14.162462,...,0.9994,1.0000,0.9126,-6.090658,-6.976748,-13.688034,-5.534023,-14.063395,-5.377268,-5.753935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NS_OBS_425,-4.201073,-4.166470,-12.048323,-3.439018,-12.031424,-13.886057,-3.422049,-12.451856,-3.904694,-12.427523,...,-0.2274,-0.1780,-0.2755,-3.979225,-3.938171,-12.015113,-4.134422,-12.401757,-3.405718,-3.824088
PULSAR_OBS_31,-5.761201,-5.619246,-13.793444,-5.298518,-13.695294,,-5.200384,-14.118061,-5.792398,-13.985690,...,0.5122,0.7289,0.2730,-5.709743,-5.928486,-13.619246,-5.518128,-13.887294,-5.124476,-5.130405
NS_OBS_143,-7.347754,-6.666553,-14.778586,-6.008198,-14.606425,,-5.843451,-15.158747,-6.525551,-14.893801,...,-0.1037,0.3117,-0.5166,-6.108240,-6.331428,-14.483465,-6.413188,-14.732412,-5.730020,-6.046395
CV_OBS_2,-6.363913,-6.118387,-14.238749,-5.483994,-14.058538,,-5.388277,-14.583817,-5.952865,-14.373968,...,-0.3648,-0.0893,-0.6015,-5.776764,-5.686766,-13.937794,-5.969400,-14.206070,-5.313900,-4.436386


In [87]:
data_id 

Unnamed: 0_level_0,class,src_n,src_id,significance
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NS_OBS_359,NS,J1824-2452,NS0062,92.00
BH_OBS_48,BH,CXOU J100506.7-07443,BH0014,11.41
BH_OBS_147,BH,J1745-2900,BH0027,337.00
PULSAR_OBS_178,PULSAR,PSR J1617-5055,PL0062,85.97
BH_OBS_17,BH,IGR J17464-3213,BH0008,6.90
...,...,...,...,...
NS_OBS_425,NS,1WGA J1911.2+0035,NS0078,44.36
PULSAR_OBS_31,PULSAR,PSR J0358+5413,PL0009,19.44
NS_OBS_143,NS,KS 1731-260,NS0033,14.81
CV_OBS_2,CV,[DSH2013] 24,CV0001,9.81


In [88]:
data_val

Unnamed: 0_level_0,photflux_aper_lolim_h,photflux_aper_h,flux_aper_lolim_b,photflux_aper_lolim_b,flux_aper_b,flux_aper_hilim_u,photflux_aper_b,flux_aper_lolim,photflux_aper_lolim,flux_aper,...,hard_hs,hard_hs_hilim,hard_hs_lolim,photflux_aper_hilim_m,photflux_aper_hilim_s,flux_aper_hilim_b,photflux_aper_hilim_h,flux_aper_hilim,photflux_aper_hilim_b,photflux_aper_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NS_OBS_359,-4.252821,-4.243668,-12.282996,-3.812761,-12.277202,-13.917574,-3.807154,-12.675468,-4.253844,-12.667238,...,0.3360,0.3504,0.3217,-4.454569,-4.536704,-12.271890,-4.235301,-12.659647,-3.801618,-4.235704
BH_OBS_48,,,,-7.246111,-15.823619,-15.898597,-6.854804,-16.699242,-7.535570,-16.053548,...,-0.9806,-0.2692,-1.0000,-7.722849,-6.737786,-15.522589,-7.170375,-15.618750,-6.652475,-6.392335
BH_OBS_147,-6.217671,-5.585695,-14.399463,-6.007446,-13.700493,-13.549135,-5.437588,-14.792797,-6.497901,-14.093881,...,0.9994,1.0000,0.4291,-6.182567,-5.967381,-13.453334,-5.338471,-13.687484,-5.199352,-4.899624
PULSAR_OBS_178,-3.612432,-3.600153,-11.690157,-3.452841,-11.677988,-12.650334,-3.440812,-12.080142,-3.911181,-12.058595,...,0.9944,0.9975,0.9913,-4.734239,-5.915781,-11.666351,-3.588044,-12.038278,-3.429107,-3.880113
BH_OBS_17,-5.755970,-5.630970,-13.882397,-5.571703,-13.771856,-15.361910,-5.460924,-14.283990,-6.019642,-14.162462,...,0.9994,1.0000,0.9126,-6.090658,-6.976748,-13.688034,-5.534023,-14.063395,-5.377268,-5.753935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NS_OBS_425,-4.201073,-4.166470,-12.048323,-3.439018,-12.031424,-13.886057,-3.422049,-12.451856,-3.904694,-12.427523,...,-0.2274,-0.1780,-0.2755,-3.979225,-3.938171,-12.015113,-4.134422,-12.401757,-3.405718,-3.824088
PULSAR_OBS_31,-5.761201,-5.619246,-13.793444,-5.298518,-13.695294,,-5.200384,-14.118061,-5.792398,-13.985690,...,0.5122,0.7289,0.2730,-5.709743,-5.928486,-13.619246,-5.518128,-13.887294,-5.124476,-5.130405
NS_OBS_143,-7.347754,-6.666553,-14.778586,-6.008198,-14.606425,,-5.843451,-15.158747,-6.525551,-14.893801,...,-0.1037,0.3117,-0.5166,-6.108240,-6.331428,-14.483465,-6.413188,-14.732412,-5.730020,-6.046395
CV_OBS_2,-6.363913,-6.118387,-14.238749,-5.483994,-14.058538,,-5.388277,-14.583817,-5.952865,-14.373968,...,-0.3648,-0.0893,-0.6015,-5.776764,-5.686766,-13.937794,-5.969400,-14.206070,-5.313900,-4.436386


In [90]:
# zero_impute returns (filled frame, placeholder string), so after this cell
# `random_forest_imputer` is bound to the string 'ignore this string' and is
# not a fitted imputer.
new_data , random_forest_imputer = zero_impute(data_val, data_id)
#train_data.index.name = 'index'
#display(train_data.describe())
#train_data.to_csv('../processed_data/train_norm_rf_impute')

In [331]:
random_forest_imputer.transform(data_val)

KeyboardInterrupt: 

In [91]:
data_val

Unnamed: 0_level_0,photflux_aper_lolim_h,photflux_aper_h,flux_aper_lolim_b,photflux_aper_lolim_b,flux_aper_b,flux_aper_hilim_u,photflux_aper_b,flux_aper_lolim,photflux_aper_lolim,flux_aper,...,hard_hs,hard_hs_hilim,hard_hs_lolim,photflux_aper_hilim_m,photflux_aper_hilim_s,flux_aper_hilim_b,photflux_aper_hilim_h,flux_aper_hilim,photflux_aper_hilim_b,photflux_aper_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NS_OBS_359,-4.252821,-4.243668,-12.282996,-3.812761,-12.277202,-13.917574,-3.807154,-12.675468,-4.253844,-12.667238,...,0.3360,0.3504,0.3217,-4.454569,-4.536704,-12.271890,-4.235301,-12.659647,-3.801618,-4.235704
BH_OBS_48,,,,-7.246111,-15.823619,-15.898597,-6.854804,-16.699242,-7.535570,-16.053548,...,-0.9806,-0.2692,-1.0000,-7.722849,-6.737786,-15.522589,-7.170375,-15.618750,-6.652475,-6.392335
BH_OBS_147,-6.217671,-5.585695,-14.399463,-6.007446,-13.700493,-13.549135,-5.437588,-14.792797,-6.497901,-14.093881,...,0.9994,1.0000,0.4291,-6.182567,-5.967381,-13.453334,-5.338471,-13.687484,-5.199352,-4.899624
PULSAR_OBS_178,-3.612432,-3.600153,-11.690157,-3.452841,-11.677988,-12.650334,-3.440812,-12.080142,-3.911181,-12.058595,...,0.9944,0.9975,0.9913,-4.734239,-5.915781,-11.666351,-3.588044,-12.038278,-3.429107,-3.880113
BH_OBS_17,-5.755970,-5.630970,-13.882397,-5.571703,-13.771856,-15.361910,-5.460924,-14.283990,-6.019642,-14.162462,...,0.9994,1.0000,0.9126,-6.090658,-6.976748,-13.688034,-5.534023,-14.063395,-5.377268,-5.753935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NS_OBS_425,-4.201073,-4.166470,-12.048323,-3.439018,-12.031424,-13.886057,-3.422049,-12.451856,-3.904694,-12.427523,...,-0.2274,-0.1780,-0.2755,-3.979225,-3.938171,-12.015113,-4.134422,-12.401757,-3.405718,-3.824088
PULSAR_OBS_31,-5.761201,-5.619246,-13.793444,-5.298518,-13.695294,,-5.200384,-14.118061,-5.792398,-13.985690,...,0.5122,0.7289,0.2730,-5.709743,-5.928486,-13.619246,-5.518128,-13.887294,-5.124476,-5.130405
NS_OBS_143,-7.347754,-6.666553,-14.778586,-6.008198,-14.606425,,-5.843451,-15.158747,-6.525551,-14.893801,...,-0.1037,0.3117,-0.5166,-6.108240,-6.331428,-14.483465,-6.413188,-14.732412,-5.730020,-6.046395
CV_OBS_2,-6.363913,-6.118387,-14.238749,-5.483994,-14.058538,,-5.388277,-14.583817,-5.952865,-14.373968,...,-0.3648,-0.0893,-0.6015,-5.776764,-5.686766,-13.937794,-5.969400,-14.206070,-5.313900,-4.436386


In [93]:
imp_data =  pd.DataFrame(new_data , columns = data_val.columns.to_list()  , index=data_val.index.to_list())
imp_data.index.name = 'obs_id'
#normalized_df=(imp_data-imp_data.mean())/imp_data.std()
#normalized_df.describe()
imp_data.describe()

Unnamed: 0,photflux_aper_lolim_h,photflux_aper_h,flux_aper_lolim_b,photflux_aper_lolim_b,flux_aper_b,flux_aper_hilim_u,photflux_aper_b,flux_aper_lolim,photflux_aper_lolim,flux_aper,...,hard_hs,hard_hs_hilim,hard_hs_lolim,photflux_aper_hilim_m,photflux_aper_hilim_s,flux_aper_hilim_b,photflux_aper_hilim_h,flux_aper_hilim,photflux_aper_hilim_b,photflux_aper_hilim
count,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,...,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0
mean,-4.477548,-4.423483,-12.235548,-4.650319,-12.465371,-10.149399,-4.647645,-13.698346,-5.477593,-13.87999,...,0.03911,0.241775,-0.150744,-5.457383,-5.413043,-13.377132,-5.339389,-13.65361,-4.896096,-5.051458
std,2.429665,2.199707,4.2615,1.789903,3.675043,6.30184,1.51222,2.60742,1.439307,0.949797,...,0.67197,0.57145,0.630215,1.371327,1.258837,1.490269,1.116952,0.816286,0.976932,0.804173
min,-8.546376,-7.583193,-16.767766,-8.401647,-15.890759,-16.286762,-7.462055,-17.980053,-9.56623,-16.724746,...,-0.9994,-0.965,-1.0,-7.85855,-7.620694,-16.001523,-7.593119,-16.031949,-7.207678,-7.414567
25%,-6.141805,-5.85334,-14.328403,-5.800314,-14.106779,-14.514529,-5.614528,-14.883787,-6.389387,-14.538914,...,-0.481575,-0.188975,-0.67555,-6.258416,-6.202076,-14.092256,-6.033919,-14.200186,-5.563201,-5.667575
50%,-5.198391,-5.189746,-13.456677,-4.948664,-13.438814,-13.373113,-4.95684,-14.019252,-5.559793,-13.911521,...,0.0,0.16615,-0.1874,-5.767894,-5.665346,-13.500591,-5.472118,-13.67851,-5.015812,-5.06471
75%,-4.093414,-4.126898,-12.381534,-3.835574,-12.47554,0.0,-4.063514,-13.166267,-4.676824,-13.194663,...,0.683,0.874775,0.302625,-4.941574,-4.873869,-12.850012,-4.772049,-13.108763,-4.36019,-4.429442
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12.009448,...,0.9994,1.0,0.9975,0.0,0.0,0.0,0.0,-11.886136,0.0,0.0


In [94]:
data_id

Unnamed: 0_level_0,class,src_n,src_id,significance
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NS_OBS_359,NS,J1824-2452,NS0062,92.00
BH_OBS_48,BH,CXOU J100506.7-07443,BH0014,11.41
BH_OBS_147,BH,J1745-2900,BH0027,337.00
PULSAR_OBS_178,PULSAR,PSR J1617-5055,PL0062,85.97
BH_OBS_17,BH,IGR J17464-3213,BH0008,6.90
...,...,...,...,...
NS_OBS_425,NS,1WGA J1911.2+0035,NS0078,44.36
PULSAR_OBS_31,PULSAR,PSR J0358+5413,PL0009,19.44
NS_OBS_143,NS,KS 1731-260,NS0033,14.81
CV_OBS_2,CV,[DSH2013] 24,CV0001,9.81


In [95]:
# Join the identifier columns back onto the imputed features and write them out.
# NOTE(review): the file name contains 'std', but the standardisation line in
# the cell that builds imp_data is commented out - confirm the intended contents.
processed_data_all = pd.concat([data_id , imp_data] , axis=1)
processed_data_all.to_csv('../processed_data/v4/'+'all'+'_zero_impute_std_no_model.csv')

In [328]:
processed_data_all

Unnamed: 0_level_0,class,src_n,src_id,significance,photflux_aper_hilim_b,photflux_aper_lolim_h,photflux_aper_lolim_s,photflux_aper_lolim_m,photflux_aper_s,photflux_aper_lolim_u,...,kp_prob,hard_hs_lolim,hard_ms_lolim,hard_ms,hard_hs,hard_hs_hilim,hard_hm,hard_hm_hilim,hard_hm_lolim,hard_ms_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NS_OBS_320,NS,XMMU J004414.0+41220,NS0047,6.53,0.615725,0.197494,1.077237,-0.399899,1.081690,1.955740,...,-1.444029,-1.177796,-1.280950,-1.263447,-1.146647,-1.327829,0.611456,1.074376,0.213293,-1.889646
NS_OBS_194,NS,XMMU J004245.2+41172,NS0044,49.26,0.995125,1.022972,0.802394,1.066059,0.832256,0.494114,...,0.274123,0.581473,0.544759,0.448555,0.512744,0.528043,0.098624,0.043683,0.319174,0.378500
NS_OBS_341,NS,J1748-2446,NS0057,18.31,-0.609967,-0.127760,-1.006257,-0.421786,-1.255528,-0.635317,...,-0.643695,1.415875,1.697414,2.018979,1.425943,1.297444,0.417331,0.689599,0.357676,1.817786
BH_OBS_136,BH,J1745-2900,BH0027,337.00,-0.562312,0.121213,-1.633314,-0.524249,-1.585227,-0.853567,...,-0.299664,1.713639,1.792868,2.018979,1.425943,1.297444,0.984854,0.991877,1.427839,1.817786
CV_OBS_622,CV,CXOGLB J002410.6-72051,CV0090,6.98,-1.458075,-1.834396,-0.917226,-1.187908,-0.964301,-1.017967,...,-0.012137,-0.950921,-1.339282,-1.466049,-0.645620,-0.293854,1.336235,1.283155,0.648559,-1.455522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PULSAR_OBS_391,PULSAR,PSR J1852+0040,PL0107,23.63,1.071520,1.153949,0.507329,1.356013,0.338459,-0.344890,...,0.087672,1.409876,2.216894,1.539675,1.082565,0.963051,-0.237557,-0.607733,0.160353,1.336268
CV_OBS_516,CV,CXOGLB J002406.0-72045,CV0081,25.99,0.104239,-0.055039,0.407600,0.293856,0.320572,-0.226115,...,0.179093,-0.534746,-0.181324,-0.089508,-0.394958,-0.337599,-0.423120,-0.264095,-0.575420,-0.179479
CV_OBS_674,CV,CXOGLB J002416.9-72042,CV0094,38.28,0.288862,0.451699,0.686725,0.660373,0.502106,0.078042,...,1.101963,-0.004266,0.322246,0.002029,-0.213091,-0.519057,-0.313914,-0.646233,0.030601,-0.497753
NS_OBS_378,NS,SAXWFC J1747.0-2853.,NS0070,4.44,-1.396921,-0.707453,-0.596720,-1.215326,-0.625726,-1.680173,...,-0.075750,-0.403383,-1.061346,-0.520157,-0.033387,0.275872,1.336235,1.283155,1.341210,-0.855976


In [329]:
import joblib 
# Persist whatever `random_forest_imputer` currently refers to.
# NOTE(review): when the notebook is run top to bottom this name holds the
# placeholder string returned by zero_impute, not a fitted MissForest -
# confirm which object is meant to be saved here.
joblib.dump(random_forest_imputer , 'forest/v3/all-imputer.joblib' , compress=3)

['forest/v3/all-imputer.joblib']

## Fill 47 TUC

In [332]:
tuc = pd.read_csv('../processed_data/v4/TUC_no_impute_std_no_model.csv' , index_col='obs_id')
tuc 

Unnamed: 0_level_0,class,src_n,src_id,significance,photflux_aper_hilim_b,photflux_aper_lolim_h,photflux_aper_lolim_s,photflux_aper_lolim_m,photflux_aper_s,photflux_aper_lolim_u,...,kp_prob,hard_hs_lolim,hard_ms_lolim,hard_ms,hard_hs,hard_hs_hilim,hard_hm,hard_hm_hilim,hard_hm_lolim,hard_ms_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TUC_OBS_2089,TUC,CXOGLB J002408.5-720708,TC0229,3.95,-6.346981,,-7.272703,-7.575445,-6.904831,,...,0.790000,-1.0000,-0.6015,-0.2498,-0.9994,-0.2555,-0.5721,-0.0306,-1.0000,0.1355
TUC_OBS_1548,TUC,CXOGLB J002359.3-720448,TC0168,3.42,-6.027797,,-7.331241,-7.079251,-6.650140,,...,0.847000,-1.0000,-0.4878,-0.0962,-0.9994,-0.3766,-0.9994,-0.3604,-1.0000,0.3616
TUC_OBS_1615,TUC,CXOGLB J002400.4-720448,TC0176,5.03,-6.049976,,-6.510745,-8.014349,-6.345920,,...,0.886000,-1.0000,-0.9275,-0.7883,-0.9994,-0.8176,-0.9994,-0.1318,-1.0000,-0.6065
TUC_OBS_217,TUC,CXOGLB J002409.1-720428,TC0023,5.64,-5.948847,,-6.937794,-7.087884,-6.487716,,...,0.858000,-1.0000,-0.6065,-0.2623,-0.9994,-0.5278,-0.9994,-0.3716,-1.0000,0.1480
TUC_OBS_931,TUC,CXOGLB J002404.3-720501,TC0095,12.45,-5.480041,-6.171920,-6.233810,-6.422853,-6.112777,-6.604848,...,0.431250,-0.0793,-0.3741,-0.2186,0.0849,0.2430,0.2986,0.4478,0.1368,-0.0581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TUC_OBS_2095,TUC,CXOGLB J002408.6-720449,TC0230,3.40,-5.995249,-7.194567,-6.566070,-7.836242,-6.389872,,...,0.362000,-0.6864,-0.8801,-0.7083,-0.4372,-0.1668,0.4110,0.7352,0.0119,-0.4866
TUC_OBS_1193,TUC,CXOGLB J002415.1-720443,TC0120,10.66,-5.206140,-6.955852,,,,,...,,0.0081,-1.0000,-0.9994,0.9994,1.0000,0.9994,1.0000,0.3254,0.0468
TUC_OBS_166,TUC,CXOGLB J002406.0-720501,TC0017,7.63,-5.586030,,-6.807711,,-6.108853,,...,,-1.0000,-1.0000,-0.7146,-0.9994,-0.2355,-0.9994,0.1693,-1.0000,-0.0518
TUC_OBS_989,TUC,CXOGLB J002406.5-720430,TC0102,3.41,-6.056951,-7.981716,-6.960586,-6.891773,-6.691863,,...,0.715667,-0.7464,-0.2804,-0.0012,-0.4260,-0.0643,-0.4172,-0.0806,-0.7314,0.2904


In [333]:
info_col = ['src_n' , 'src_id' , 'significance' , 'class']
inter_obs_params = ['var_inter_prob' , 'var_inter_index' , 'var_inter_sigma']
id_data = tuc[info_col]
x_val = tuc.copy()
y_val = x_val['class']
x_val = x_val.drop(columns=info_col)
x_val

Unnamed: 0_level_0,photflux_aper_hilim_b,photflux_aper_lolim_h,photflux_aper_lolim_s,photflux_aper_lolim_m,photflux_aper_s,photflux_aper_lolim_u,photflux_aper_h,photflux_aper_hilim_u,photflux_aper_hilim_s,photflux_aper_m,...,kp_prob,hard_hs_lolim,hard_ms_lolim,hard_ms,hard_hs,hard_hs_hilim,hard_hm,hard_hm_hilim,hard_hm_lolim,hard_ms_hilim
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TUC_OBS_2089,-6.346981,,-7.272703,-7.575445,-6.904831,,,-6.754734,-6.708409,-7.098324,...,0.790000,-1.0000,-0.6015,-0.2498,-0.9994,-0.2555,-0.5721,-0.0306,-1.0000,0.1355
TUC_OBS_1548,-6.027797,,-7.331241,-7.079251,-6.650140,,,-5.742321,-6.396747,-6.653256,...,0.847000,-1.0000,-0.4878,-0.0962,-0.9994,-0.3766,-0.9994,-0.3604,-1.0000,0.3616
TUC_OBS_1615,-6.049976,,-6.510745,-8.014349,-6.345920,,,-6.485984,-6.232547,-7.254691,...,0.886000,-1.0000,-0.9275,-0.7883,-0.9994,-0.8176,-0.9994,-0.1318,-1.0000,-0.6065
TUC_OBS_217,-5.948847,,-6.937794,-7.087884,-6.487716,,,-5.733063,-6.280172,-6.661942,...,0.858000,-1.0000,-0.6065,-0.2623,-0.9994,-0.5278,-0.9994,-0.3716,-1.0000,0.1480
TUC_OBS_931,-5.480041,-6.171920,-6.233810,-6.422853,-6.112777,-6.604848,-6.048905,-6.010105,-6.022963,-6.295935,...,0.431250,-0.0793,-0.3741,-0.2186,0.0849,0.2430,0.2986,0.4478,0.1368,-0.0581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TUC_OBS_2095,-5.995249,-7.194567,-6.566070,-7.836242,-6.389872,,-6.803271,-6.550985,-6.264960,-7.155026,...,0.362000,-0.6864,-0.8801,-0.7083,-0.4372,-0.1668,0.4110,0.7352,0.0119,-0.4866
TUC_OBS_1193,-5.206140,-6.955852,,,,,-5.725380,-5.238222,-5.690796,,...,,0.0081,-1.0000,-0.9994,0.9994,1.0000,0.9994,1.0000,0.3254,0.0468
TUC_OBS_166,-5.586030,,-6.807711,,-6.108853,,,-5.709743,-5.853562,,...,,-1.0000,-1.0000,-0.7146,-0.9994,-0.2355,-0.9994,0.1693,-1.0000,-0.0518
TUC_OBS_989,-6.056951,-7.981716,-6.960586,-6.891773,-6.691863,,-7.116566,-6.510463,-6.526951,-6.678816,...,0.715667,-0.7464,-0.2804,-0.0012,-0.4260,-0.0643,-0.4172,-0.0806,-0.7314,0.2904


In [335]:
tuc_filled = random_forest_imputer.transform(x_val)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6


In [336]:
tuc_filled

array([[-6.34698055, -8.52903388, -7.2727028 , ..., -0.0306    ,
        -1.        ,  0.1355    ],
       [-6.02779716, -8.52981619, -7.33124146, ..., -0.3604    ,
        -1.        ,  0.3616    ],
       [-6.04997609, -8.52790598, -6.51074483, ..., -0.1318    ,
        -1.        , -0.6065    ],
       ...,
       [-5.58603003, -7.35492122, -6.80771139, ...,  0.1693    ,
        -1.        , -0.0518    ],
       [-6.05695089, -7.98171569, -6.96058588, ..., -0.0806    ,
        -0.7314    ,  0.2904    ],
       [-6.24488773, -7.35078299, -7.97135522, ...,  0.7639    ,
        -0.3579    ,  1.        ]])

In [339]:
# Wrap the imputed array in a frame that carries the feature names and obs_id index.
imp_data =  pd.DataFrame(tuc_filled , columns = x_val.columns.to_list()  , index=x_val.index.to_list())
imp_data.index.name = 'obs_id'

# Z-score each column using the mean and std of this table itself.
# NOTE(review): confirm that the training-set statistics are not wanted here.
normalized_df=(imp_data-imp_data.mean())/imp_data.std()

# Re-attach the identifier columns and write the result to disk.
processed_data_all = pd.concat([id_data, normalized_df] , axis=1)
processed_data_all.to_csv('../processed_data/v4/'+'tuc'+'_rf_impute_std_no_model.csv')