In [1]:
import numpy as np 
from matplotlib import pyplot as plt 
import seaborn as sns 
from tqdm import tqdm 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier  , AdaBoostClassifier , GradientBoostingClassifier
from sklearn.metrics import precision_score , recall_score 
import xgboost as xgb 
import pandas as pd 
import sklearn.neighbors._base
from os import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest 
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay
sns.set_style('whitegrid')
%load_ext autoreload
%autoreload 2
sns.set_style('whitegrid')
from IPython.display import display

In [2]:
def details(data_sent , comments = ''):
    if(comments):
        print(comments)
    sp = (data_sent.isna().sum().sum())/ data_sent.size
    print('________________________________________________')
    print('Sparsity in the data : {:.2f}'.format(sp))
    num_rows = data_sent.shape
    print('Data shape' , num_rows)
    #display(data_sent['class'].value_counts())
    print('Number of sources : ')
    cl = data_sent['class'].unique()
    for c in cl:
        num_src = len(data_sent[data_sent['class']==c]['src_id'].unique())
        num_obs = len(data_sent[data_sent['class']==c])
        print(c ,' \t ' , num_src , '\t' , num_obs)

In [3]:
from features import phot_flux , en_flux , hard, powlaw_fit , bb_fit , brems_fit , intra_obs_var , inter_ob_var , info_col , phot_flux_hilim , phot_flux_lolim , en_flux_hilim , en_flux_lolim

In [4]:
feat_to_use = info_col + phot_flux + phot_flux_hilim + phot_flux_lolim + en_flux + en_flux_hilim + en_flux_lolim + powlaw_fit +bb_fit +hard+ intra_obs_var +inter_ob_var 

In [6]:
df = pd.read_csv('report/data-imp-all-obs.csv' , index_col='obs_id')
details(df)

________________________________________________
Sparsity in the data : 0.00
Data shape (1728, 62)
Number of sources : 
CV  	  60 	 947
PL  	  92 	 293
LX  	  58 	 488


In [7]:
info_col_cl = info_col +['class']

# Combine Obs

In [8]:

df_comb = pd.DataFrame()
for s in df['src_id'].unique()[:]:
    temp = df[df['src_id']==s]
    temp_id =  temp[info_col_cl]
    temp_val = temp.drop(columns=info_col_cl)
    temp_mean = temp_val.mean().to_frame().T
    temp_id_top = temp_id.iloc[0:1].reset_index()
    temp_final = pd.concat([temp_id_top , temp_mean] , axis=1).set_index('obs_id')
    df_comb = df_comb.append(temp_final)
details(df_comb)


________________________________________________
Sparsity in the data : 0.00
Data shape (210, 62)
Number of sources : 
CV  	  60 	 60
PL  	  92 	 92
LX  	  58 	 58


In [24]:
data_id = df_comb[info_col]
data_label = df_comb['class']
data_val = df_comb.drop(columns=['class']+info_col)
data_imp_norm = df_comb.copy()
for c in data_val.columns.to_list():
    data_imp_norm.loc[:,c] = (data_imp_norm.loc[:,c] - data_imp_norm.loc[:,c].min()) / (data_imp_norm.loc[:,c].max() - data_imp_norm.loc[:,c].min())
    #data_imp_norm.loc[:,c] = (data_imp_norm.loc[:,c] - data_imp_norm.loc[:,c].mean()) / (data_imp_norm.loc[:,c].std())

# Cross validation

In [25]:
verbose = 0
val_acc = []
test_acc = []
for i in tqdm(range(10)):
    src_list = pd.DataFrame()

    x = data_imp_norm.drop(columns = info_col_cl)
    y = data_imp_norm['class']
    info = data_imp_norm[info_col_cl] 
    x_train , x_val , y_train , y_val , i_train , i_val = train_test_split(x , y , info , test_size=0.2 , stratify=y, random_state=9)

    clf = RandomForestClassifier(n_estimators=400)
    clf.fit(x_train , y_train , sample_weight=i_train['significance'])
    v_sc = clf.score(x_val , y_val)
    test_sc = clf.score(x_train, y_train)
    val_acc.append(v_sc)
    test_acc.append(test_sc)
    #if(verbose):
    print(test_sc , v_sc)

 10%|█         | 1/10 [00:01<00:09,  1.04s/it]

0.9702380952380952 0.6666666666666666


 20%|██        | 2/10 [00:01<00:07,  1.12it/s]

0.9702380952380952 0.6904761904761905


 30%|███       | 3/10 [00:02<00:05,  1.18it/s]

0.9702380952380952 0.6666666666666666


 40%|████      | 4/10 [00:03<00:04,  1.22it/s]

0.9702380952380952 0.6904761904761905


 50%|█████     | 5/10 [00:04<00:04,  1.23it/s]

0.9702380952380952 0.6666666666666666


 60%|██████    | 6/10 [00:05<00:03,  1.21it/s]

0.9702380952380952 0.6428571428571429


 70%|███████   | 7/10 [00:05<00:02,  1.20it/s]

0.9702380952380952 0.6666666666666666


 80%|████████  | 8/10 [00:06<00:01,  1.17it/s]

0.9702380952380952 0.6904761904761905


 90%|█████████ | 9/10 [00:07<00:00,  1.16it/s]

0.9702380952380952 0.6428571428571429


100%|██████████| 10/10 [00:08<00:00,  1.17it/s]

0.9702380952380952 0.6904761904761905



