In [1]:
import pandas as pd 
from matplotlib import pyplot as plt 
import numpy as np 
import seaborn as sns

# Data Analysis

## Load Data

In [44]:
def get_data(offset , significance):
    """Assemble the per-source feature table from the CSV catalogues on disk.

    Parameters
    ----------
    offset : float
        Threshold applied to the ``offset`` column of ``id_frame.csv``. A
        source keeps its catalogue ``class`` only when that value is below
        ``offset + 0.1``; every other source is labelled ``'X'``.
    significance : float
        Rows of the Chandra source table are kept only when their
        ``significance`` is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``name``. Holds ``class``, the renamed ``*-csc`` flux
        columns, whatever columns the SDSS / 2MASS / WISE / GALEX / Gaia CSVs
        provide (left-joined, so absent matches are NaN), and nine derived
        colour columns.
    """
    sources = pd.read_csv('../not_on_git/mw_cat/chandra_filtered_sources.csv', index_col='name')
    identified = pd.read_csv('compiled_data_v3/id_frame.csv', index_col='name')[['offset', 'class']]

    # Right join: every Chandra source survives, but only identifications
    # inside the padded offset threshold contribute a class label.
    within_offset = identified[identified['offset'] < offset + 0.1]
    df = within_offset.merge(sources, left_index=True, right_index=True, how='right')

    df = df[df['significance'] > significance]
    df = df.drop(columns=[
        'significance', 'offset', 'ra', 'dec', 'var_inter_hard_flag', 'likelihood',
    ])
    df = df.rename(columns={
        'flux_aper_b': 'b-csc',
        'flux_aper_h': 'h-csc',
        'flux_aper_m': 'm-csc',
        'flux_aper_s': 's-csc',
        'flux_aper_u': 'u-csc',
    })

    # Sources without an accepted identification get the placeholder class 'X'.
    df['class'] = df['class'].replace(np.nan, 'X')

    # Attach each catalogue in turn; the order fixes the column order.
    catalogue_files = (
        'mw_cat/sdss.csv',
        'mw_cat/2mass_v2.csv',
        'mw_cat/wise_combined.csv',
        'mw_cat/galex_combined.csv',
        'mw_cat/gaia.csv',
    )
    for csv_path in catalogue_files:
        catalogue = pd.read_csv(csv_path, index_col='name')
        df = df.merge(catalogue, left_index=True, right_index=True, how='left')

    # Derived colours: new column -> (minuend column, subtrahend column).
    colour_terms = {
        'B-R': ('bp_mag', 'rp_mag'),
        'G-J': ('g_mag', 'Jmag'),
        'G-W2': ('g_mag', 'W2mag'),
        'Bp-H': ('bp_mag', 'Hmag'),
        'Bp-W3': ('bp_mag', 'W3mag'),
        'Rp-K': ('rp_mag', 'Kmag'),
        'J-H': ('Jmag', 'Hmag'),
        'J-W1': ('Jmag', 'W1mag'),
        'W1-W2': ('W1mag', 'W2mag'),
    }
    for colour, (first, second) in colour_terms.items():
        df[colour] = df[first] - df[second]

    return df

In [45]:
from utilities import deets
# Build the working table with the default cuts: get_data keeps a catalogue
# class only where the id-frame offset is below offset + 0.1 (here 1.1), and
# keeps rows whose significance is strictly greater than the given value (0).
df = get_data(offset=1.0 , significance=0)
# Summary printout from the project's utilities module; the recorded output
# lists the object count, column count and per-class counts.
# NOTE(review): the meaning of the second argument (1) is not visible here —
# confirm against utilities.deets.
deets(df , 1)

_____________________________________________________
------------------------------
Number of Objects : 277596
Number of Columns : 45
------------------------------


X         269486
STAR        2853
AGN         2667
YSO         1200
HMXB         759
ULX          214
CV           169
LMXB         143
PULSAR       105
Name: class, dtype: int64

_____________________________________________________


In [46]:
# Labelled sources: every row whose class is not the placeholder 'X'.
# `x` deliberately keeps the 'class' column; cells that need pure features
# drop it themselves.
x = df.loc[df['class'] != 'X']
y = x['class']

# Unlabelled sources: placeholder class 'X', restricted to rows whose
# var_flag equals 1, with the (uninformative) class column removed.
unlabelled = df.loc[df['class'] == 'X']
u = unlabelled.loc[unlabelled['var_flag'] == 1].drop(columns=['class'])

## Missing values

In [77]:
# Despite the name, `miss` holds the fraction of NON-missing values
# (1 - NaN fraction) for every feature: one column per class plus 'overall'.
per_class_fill = []
for label in y.unique():
    members = x.loc[x['class'] == label].drop(columns=['class'])
    filled = 1 - members.isna().sum(axis=0) / len(members)
    per_class_fill.append(filled.to_frame(name=label))
miss = pd.concat(per_class_fill, axis=1)

# Same statistic over all labelled sources, inserted as the first column.
xd = x.drop(columns=['class'])
overall_fill = 1 - xd.isna().sum(axis=0) / len(xd)
miss.insert(0, 'overall', overall_fill.to_list())

# Fixed column order for display.
miss = miss[['overall', 'AGN', 'STAR', 'YSO', 'CV', 'PULSAR', 'HMXB', 'LMXB', 'ULX']]

# Feature groups that are averaged into a single row each.
param_dict = {
    'SDSS': ['umag', 'gmag', 'imag', 'zmag', 'rmag'],
    'WISE': ['W1mag', 'W2mag', 'W3mag', 'W4mag'],
    'GAIA': ['g_mag', 'bp_mag', 'rp_mag'],
    '2MASS': ['Jmag', 'Hmag', 'Kmag'],
    'GALEX': ['fuv_mag', 'nuv_mag'],
    'inter-obs-var': ['var_inter_prob_b', 'var_inter_sigma_b', 'var_inter_index_b'],
    'intra-obs-var': ['var_intra_prob_b', 'kp_intra_prob_b', 'var_intra_index_b'],
}
comb = ['WISE', '2MASS', 'GAIA', 'SDSS', 'GALEX', 'inter-obs-var', 'intra-obs-var']
comb_df = pd.concat(
    [miss.loc[param_dict[group]].mean().to_frame(name=group).T for group in comb]
)

# Features reported individually rather than as part of a group.
# NOTE(review): 's-csc' is produced by get_data but is not listed here —
# confirm the omission is intentional.
sep_col = [
    'h-csc', 'b-csc', 'm-csc', 'u-csc',
    'B-R', 'G-J', 'G-W2', 'Bp-H', 'Bp-W3', 'Rp-K', 'J-H', 'J-W1', 'W1-W2',
]
sep_df = miss.loc[sep_col]

# Individual and grouped rows together, most complete first.
miss_comb_df = pd.concat([sep_df, comb_df]).sort_values(by='overall', ascending=False)
miss_comb_df

Unnamed: 0,overall,AGN,STAR,YSO,CV,PULSAR,HMXB,LMXB,ULX
b-csc,0.967818,0.981627,0.952331,0.983333,0.781065,0.990476,0.986825,0.965035,0.985981
intra-obs-var,0.949322,0.985752,0.921837,0.945833,0.798817,0.952381,0.965744,0.874126,0.990654
m-csc,0.940074,0.968504,0.921486,0.949167,0.763314,0.942857,0.948617,0.804196,0.981308
h-csc,0.933909,0.972628,0.87592,0.979167,0.769231,0.904762,0.971014,0.958042,0.96729
GAIA,0.728113,0.728909,0.967403,0.715,0.544379,0.371429,0.197628,0.223776,0.140187
B-R,0.728113,0.728909,0.967403,0.715,0.544379,0.371429,0.197628,0.223776,0.140187
u-csc,0.688779,0.734158,0.662461,0.561667,0.597633,0.628571,0.799736,0.853147,0.785047
W1-W2,0.681628,0.869516,0.801963,0.55,0.147929,0.085714,0.229249,0.13986,0.154206
WISE,0.678052,0.869516,0.793901,0.545,0.147929,0.085714,0.229249,0.13986,0.154206
2MASS,0.582244,0.262842,0.959692,0.91,0.301775,0.209524,0.115942,0.090909,0.079439


In [78]:
import seaborn as sns
# Heatmap of feature completeness: each cell is the percentage of
# non-missing values for a feature (row) within a class (column).
sns.set(
    style="ticks",
    font_scale=1.3,
    rc={'axes.facecolor':'white', 'figure.facecolor':'white' , 'axes.grid':True},
)
fig, ax = plt.subplots(figsize=(12, 12))
completeness_pct = miss_comb_df * 100
sns.heatmap(completeness_pct, annot=True, fmt='.0f', cmap='mako', ax=ax)
plt.xticks(rotation=90)
# Uncomment to export the figure:
#plt.savefig('final_plots/missing_features.eps' , format='eps')
#plt.savefig('final_plots/missing_features.jpg' , format='jpg')
plt.show()

In [37]:
from utilities import simple_cv
# Training parameter dictionary. The key names match LightGBM parameter names;
# presumably this dict is handed to LightGBM through utilities.simple_cv —
# confirm, since the consumer is not visible in this cell.
param = {
    # NOTE(review): 'binaryclass' is not among LightGBM's documented objective
    # names ('binary', 'multiclass', 'multiclassova', ...). Together with
    # 'num_class' and the multi-class 'auc_mu' metric this looks like it was
    # meant to be a multi-class objective — confirm which one and whether
    # simple_cv overrides this value.
    'objective': 'binaryclass',
    # One class per distinct label in y.
    'num_class':len(np.unique(y)),
    'metric': ['auc_mu' , ] , 
    'verbosity' : 2 , 
    'deterministic' : True  , 
    'early_stopping_round' : 20 , 
    'sparse' : True , 
    # NOTE(review): LightGBM documents is_unbalance as applying only to the
    # 'binary' and 'multiclassova' objectives — verify it has the intended effect.
    'is_unbalance' : True
    }
def calc_weight(gamma , y):
    """Compute an exponential weight for every class present in ``y``.

    Each class with ``count`` occurrences among the ``len(y)`` labels is
    assigned ``exp((len(y) / count) * gamma)``, so rarer classes receive
    larger weights when ``gamma`` is positive and all weights equal 1 when
    ``gamma`` is 0.

    Parameters
    ----------
    gamma : float
        Scale factor applied to the inverse class frequency before
        exponentiation.
    y : pandas.Series
        Class labels; must provide ``value_counts()``.

    Returns
    -------
    dict
        Maps class label to weight, in ``value_counts()`` order.
    """
    n_total = len(y)
    class_counts = y.value_counts().to_dict()
    #print(cl_weight)
    return {
        label: np.exp((n_total / count) * gamma)
        for label, count in class_counts.items()
    }