# Pipeline

> What we have

* A list of Chandra source names
* All photometric data corresponding to these sources, in a separate folder

> What we have to do

* Classify them

## Load Modules

In [1]:
import pandas as pd 
from matplotlib import pyplot as plt 
import numpy as np 
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier 
import sklearn.impute._iterative as itimp
from sklearn.model_selection import train_test_split 
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from IPython.display import display
from tqdm import tqdm

## Read source files

Provide a file containing the list of data files from which we want to build the dataframe.
For example, 'flist_v3' contains the data corresponding to all identified sources.

First, get the list of file names we want to extract data from.

In [2]:
import os

# Build the list of per-source photometry files in pure Python instead of
# shelling out to `ls > file`: the shell version is platform dependent and its
# exit status was ignored, so a missing directory went unnoticed.
listing_path = 'fnames/chandra_all_mw'
# Skip dot-files to mirror what a plain `ls` would list.
src_files = sorted(fn for fn in os.listdir('csc_all_mw') if not fn.startswith('.'))

# Keep writing the listing file (one file name per line), as before.
os.makedirs(os.path.dirname(listing_path), exist_ok=True)
with open(listing_path, 'w') as listing:
    listing.writelines(fn + '\n' for fn in src_files)

flist_all = pd.DataFrame({'name': src_files})
flist_all 

Unnamed: 0,name
0,2CXO J000001.7-500850.csv
1,2CXO J000003.0-501423.csv
2,2CXO J000004.5-552604.csv
3,2CXO J000005.1+622636.csv
4,2CXO J000008.4+135002.csv
...,...
53282,2CXO J235945.7-574830.csv
53283,2CXO J235946.6-605601.csv
53284,2CXO J235950.2-501347.csv
53285,2CXO J235950.2-501539.csv


Next, we remove the already classified sources from this list.
'data_v2/all_final' is the Chandra source catalogue table cross-matched with source identifications; we can use that data to eliminate the identified sources from this list. (Note: the cell below reads `all_mw_data_v4_phot.csv`.)

In [3]:
# Load the cross-matched source table and keep only rows whose var_flag is 1.
df_ch = (
    pd.read_csv('all_mw_data_v4_phot.csv')
    .loc[lambda tbl: tbl['var_flag'] == 1]
)


In [4]:
# Count how often each source name occurs. The counts column is addressed by
# position because its label differs between pandas versions
# ('name' before 2.0, 'count' from 2.0 on), so df_num['name'] is not portable.
df_num = df_ch['name'].value_counts().to_frame()
name_counts = df_num.iloc[:, 0]

# Names appearing more than once are written out as "confused" sources.
rep_src = name_counts[name_counts > 1].index.to_list()
df_ch[df_ch['name'].isin(rep_src)].to_csv('cat/confused_src.csv')


In [5]:
# Turn the source names into the matching photometry file names.
# Note: df_ch is rebound from a DataFrame to a plain list here.
df_ch = [src_name + '.csv' for src_name in df_ch['name'].to_list()]
len(df_ch)

3194

In [6]:
# Keep only the files whose name is not in the df_ch list built above.
in_df_ch = flist_all['name'].isin(df_ch)
flist = flist_all[~in_df_ch]
flist

Unnamed: 0,name
0,2CXO J000001.7-500850.csv
1,2CXO J000003.0-501423.csv
2,2CXO J000004.5-552604.csv
3,2CXO J000005.1+622636.csv
4,2CXO J000008.4+135002.csv
...,...
53282,2CXO J235945.7-574830.csv
53283,2CXO J235946.6-605601.csv
53284,2CXO J235950.2-501347.csv
53285,2CXO J235950.2-501539.csv


Average the data files row-wise: for a given frequency we may have more than one measurement (obtained with different methods), so we take the average of these measurements. The averaged data are saved (as a separate file for each source) in a different folder.

In [7]:
cols = ['Frequency' , 'Flux Density' , 'Upper limit of Flux Density', 'Lower limit of Flux Density' , 'Photometry Measurement' , 'Uncertainty']


def _parse_uncertainty(raw):
    """Convert one raw 'Uncertainty' entry to a float without using eval().

    * missing entries (NaN/None) and entries containing '+/-' count as 0,
      which the caller turns into "no value";
    * '<', '>' and '%' markers are stripped and the remainder parsed as a number;
    * bare numbers are passed through.

    Raises ValueError for text that is not a number after stripping, so the
    enclosing loop reports the file as failed instead of executing it as code.
    """
    if not isinstance(raw, str):
        return 0.0 if pd.isna(raw) else float(raw)
    if '+/-' in raw:
        return 0.0
    text = raw
    for marker in ('<', '>', '%'):
        text = text.replace(marker, '')
    text = text.strip()
    return float(text) if text else 0.0


def _average_photometry(raw):
    """Collapse one source's photometry table to a single row per frequency.

    Returns a frame with columns 'freq', 'flux_density' and 'photometry',
    in order of first appearance of each frequency.
    """
    # Row-wise mean of the flux density and its upper/lower limits.
    mean_fd = raw[['Flux Density' , 'Upper limit of Flux Density', 'Lower limit of Flux Density']].mean(axis=1)

    # Row-wise mean of measurement and parsed uncertainty; zeros are treated as
    # missing, so a row with only one usable value keeps that value.
    mean_phot = raw[['Photometry Measurement' , 'Uncertainty']].replace('NaN', 0 , regex=False)
    mean_phot['Uncertainty'] = [_parse_uncertainty(el) for el in mean_phot['Uncertainty']]
    mean_phot = mean_phot.replace(0 , np.nan).mean(axis=1)

    per_row = pd.DataFrame({
        "freq" : raw['Frequency'] ,
        "flux_density" : mean_fd.to_list(),
        "photometry" : mean_phot.to_list()
    })
    # Group on the frequency instead of averaging it with the other columns:
    # the key stays bit-identical to the input value, which matters because the
    # combining step looks frequencies up by value. Rows with a missing
    # frequency are dropped (they could never be matched anyway).
    return per_row.groupby('freq', sort=False, as_index=False).mean()


failed_files = []
for f in tqdm(flist['name']):
    try:
        temp = pd.read_csv('csc_all_mw/'+f)[cols]
        # Keep the default index column in the CSV: the combining cell drops
        # the first column of these files by position.
        _average_photometry(temp).to_csv('csc_all_mw_avg/'+f)
    except Exception as e:
        # Best-effort batch run: report the file, remember it, and carry on.
        print("some error with " + f)
        failed_files.append(f)
        print(e)

100%|██████████| 50197/50197 [27:31<00:00, 30.39it/s] 


Take care of any files that failed.

OK, no files failed.

## Combine data in one file

From the training sample we have the list of frequencies and their corresponding sparsity. We prepared a list of frequencies with minimum sparsity, or density > 15. 

In [8]:
# Load the table of frequency bands to fill. Its 'band' and 'freq' columns
# drive the column layout of the combined table built in the next cell.
# The first CSV column is used as the row index.
freq_list_fill = pd.read_csv('dense_freq.csv', index_col=0)
freq_list_fill

Unnamed: 0,band,freq,unit
0,2-7 keV Chandra,9.19e+17,erg/cm^2/s
1,0.5-7 keV Chandra,5.56e+17,erg/cm^2/s
2,1.2-2 keV Chandra,3.77e+17,erg/cm^2/s
3,0.5-1.2 keV Chandra,2.22e+17,erg/cm^2/s
4,0.2-0.5 keV Chandra,9.67e+16,erg/cm^2/s
5,0.3-8 keV (Chandra),1e+18,erg/cm^2^/s
6,0.5-2 keV (Chandra),3.02e+17,erg/cm^2^/s
7,FUV (GALEX) AB,1950000000000000.0,mag
8,NUV (GALEX) AB,1290000000000000.0,mag
9,u (SDSS PSF) AB,836000000000000.0,asinh mag


In [9]:
# Assemble one wide row per source: the averaged photometry at every band
# listed in freq_list_fill (NaN where the source has no value at that frequency).

# Frequencies have been through a CSV round-trip (and an averaging step), so
# compare with a tight relative tolerance instead of exact float equality.
FREQ_RTOL = 1e-9

band_names = list(freq_list_fill['band'])
band_freqs = list(freq_list_fill['freq'])

df_mw_list = []
missing_avg_files = []
for f in tqdm(flist['name']):
    try:
        # The first CSV column is the index written by the averaging step.
        fd = pd.read_csv('csc_all_mw_avg/'+f).iloc[:,1:]
    except FileNotFoundError:
        # The averaging step writes no file for sources it failed on.
        missing_avg_files.append(f)
        continue
    temp_dict = {k : np.nan for k in ['name']+band_names }
    temp_dict['name'] =  f[:-4]  # strip the '.csv' suffix
    freqs = fd['freq'].to_numpy(dtype=float)
    for fqn , fqv in zip(band_names , band_freqs):
        hits = np.flatnonzero(np.isclose(freqs, fqv, rtol=FREQ_RTOL, atol=0.0))
        if hits.size:
            # First matching row wins, as before.
            temp_dict[fqn] = fd['photometry'].iloc[hits[0]]
    df_mw_list.append(temp_dict)

if missing_avg_files:
    print("skipped %d source(s) without an averaged file" % len(missing_avg_files))

df_mw_final = pd.DataFrame(df_mw_list)
display(df_mw_final)
df_mw_final.to_csv('temp_files/csc_all_mw_final.csv')

100%|██████████| 50197/50197 [26:14<00:00, 31.88it/s]  


Unnamed: 0,name,2-7 keV Chandra,0.5-7 keV Chandra,1.2-2 keV Chandra,0.5-1.2 keV Chandra,0.2-0.5 keV Chandra,0.3-8 keV (Chandra),0.5-2 keV (Chandra),FUV (GALEX) AB,NUV (GALEX) AB,...,4.5 microns (IRAC),8.0 microns (IRAC),24 microns (MIPS),J (2MASS/CTIO),H (2MASS/CTIO),K_s (2MASS/CTIO),0.5-8 keV (Chandra),3.6 microns (IRAC),5.8 microns (IRAC),1-2 keV (Chandra)
0,2CXO J000001.7-500850,,4.758600e-15,9.657575e-16,9.285650e-16,2.439525e-15,1.370000e-14,,,,...,,,,,,,,,,
1,2CXO J000003.0-501423,2.909700e-15,2.178765e-15,5.012800e-16,5.363925e-16,2.138375e-15,9.740000e-15,,,,...,,,,,,,,,,
2,2CXO J000004.5-552604,5.453900e-15,3.680550e-14,6.201400e-15,3.940600e-15,2.713850e-14,4.450000e-14,,,,...,,,,,,,,,,
3,2CXO J000005.1+622636,,1.541300e-15,3.406350e-16,3.060200e-16,9.307550e-16,7.050000e-15,,,,...,,,,,,,,,,
4,2CXO J000008.4+135002,5.792350e-14,5.650550e-15,1.205350e-15,3.826500e-15,4.412150e-15,2.690000e-14,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50192,2CXO J235945.7-574830,9.039150e-14,1.735050e-14,5.930800e-15,3.100050e-15,8.399600e-15,3.890000e-14,,21.9072,21.74725,...,,,,,,,,,,
50193,2CXO J235946.6-605601,,4.198100e-15,3.828950e-16,1.318600e-15,2.799550e-15,,,,,...,,,,,,,,,,
50194,2CXO J235950.2-501347,,1.459575e-14,7.917100e-17,6.060075e-16,1.441050e-14,9.370000e-15,,,,...,,,,,,,,,,
50195,2CXO J235950.2-501539,2.521000e-15,9.508500e-16,2.668450e-16,2.214400e-16,9.629900e-16,,,,,...,,,,,,,,,,


Next, from the Chandra CSC database (cat/chandra_all_sources_v2.csv), where we downloaded all sources, select only the sources for which we have data from NED (i.e. a separate photometry file is available), and then merge them with the dataframe created above.

From the 'all-chandra' data we will use only selected columns.

In [10]:
# Read the full Chandra source table, then narrow it in two steps:
#   1. rows with var_flag == 1,
#   2. rows whose name has an entry in df_mw_final.
# The index is renumbered after each step.
df_ch = (
    pd.read_csv('cat/chandra_all_sources_v2.csv')
    .loc[lambda cat: cat['var_flag'] == 1]
    .reset_index(drop=True)
    .loc[lambda cat: cat['name'].isin(df_mw_final['name'].to_list())]
    .reset_index(drop=True)
)
df_ch

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,name,ra,dec,conf_flag,significance,extent_flag,sat_src_flag,gal_l,gal_b,err_ellipse_r0,...,var_inter_hard_prob_ms,var_intra_index_b,var_intra_prob_b,ks_intra_prob_b,kp_intra_prob_b,var_inter_index_b,var_inter_prob_b,var_inter_sigma_b,gal_l2,gal_b2
0,2CXO J004736.9-733102,11.904005,-73.517280,0,3.894737,1,0,303.306304,-43.608193,7.553030,...,,0.0,0.024076,0.965033,0.977357,,,,303.306304,-43.608193
1,2CXO J004829.5-732959,12.123114,-73.499901,0,5.085714,0,0,303.220840,-43.626698,3.944494,...,,0.0,0.236336,0.984416,0.994941,,,,303.220840,-43.626698
2,2CXO J005449.0-725128,13.704345,-72.857987,0,3.600000,0,0,302.584181,-44.267982,5.496063,...,,0.0,0.090023,0.419304,0.122221,,,,302.584181,-44.267982
3,2CXO J010049.1-731526,15.204787,-73.257463,0,2.111111,0,0,301.995228,-43.853723,2.752145,...,,0.0,0.269411,0.348119,0.433439,,,,301.995228,-43.853723
4,2CXO J010049.8-731045,15.207831,-73.179381,0,2.777778,0,0,301.988531,-43.931660,6.993985,...,,0.0,0.309891,0.454695,0.368123,,,,301.988531,-43.931660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50192,2CXO J220508.3-495935,331.284642,-49.993301,0,4.965517,0,0,345.626202,-51.013233,3.120299,...,0.968215,2.0,0.789776,0.546391,0.937311,5.0,0.723271,0.000008,345.626202,-51.013233
50193,2CXO J220544.0-495505,331.433541,-49.918280,0,2.857143,0,0,345.685626,-51.129040,5.492403,...,0.736086,,,,,5.0,0.790599,0.000008,345.685626,-51.129040
50194,2CXO J220554.7-500709,331.478292,-50.119278,0,8.945591,1,0,345.369571,-51.086038,0.728901,...,0.099500,2.0,0.788288,0.974595,0.966695,0.0,0.264667,0.000002,345.369571,-51.086038
50195,2CXO J220555.6-500837,331.481942,-50.143735,0,9.940400,0,0,345.331805,-51.079654,0.734517,...,0.077784,0.0,0.243979,0.716067,0.874332,8.0,1.000000,0.000051,345.331805,-51.079654


In [51]:
# Columns selected from the Chandra catalogue frame, in output order.
# 'name' later becomes the index; 'catalog' and 'class' are not read from the
# catalogue file but inserted as placeholder columns before the selection.
chandra_feat = [
    'name',
    'catalog',
    'class',
    'var_flag',
    'streak_src_flag',
    'pileup_flag',
    'ra',
    'dec',
    'gal_b2',
    'gal_l2',
    'hard_hs',
    'hard_hm',
    'hard_ms',
    'var_inter_prob_b',
    'var_inter_sigma_b',
    'var_intra_prob_b',
    'var_inter_index_b',
    'kp_intra_prob_b',
    'var_intra_index_b',
    'var_inter_hard_prob_hs',
    'ks_intra_prob_b',
    'var_inter_hard_sigma_hm',
    'var_inter_hard_prob_ms',
    'var_inter_hard_prob_hm',
]

In [52]:
# Add constant placeholder columns 'class' and 'catalog' (value 'U';
# presumably "unidentified" - TODO confirm) at the front of df_ch.
# DataFrame.insert raises if the column already exists, so re-running the cell
# used to fail; an existing column is now overwritten instead.
for placeholder in ('class', 'catalog'):
    if placeholder in df_ch.columns:
        df_ch[placeholder] = 'U'
    else:
        df_ch.insert(0, placeholder, 'U')

# Keep only the selected catalogue columns, indexed by source name.
df_ch_small = df_ch[chandra_feat].set_index('name')
df_ch_small

Unnamed: 0_level_0,catalog,class,var_flag,streak_src_flag,pileup_flag,ra,dec,gal_b2,gal_l2,hard_hs,...,var_inter_sigma_b,var_intra_prob_b,var_inter_index_b,kp_intra_prob_b,var_intra_index_b,var_inter_hard_prob_hs,ks_intra_prob_b,var_inter_hard_sigma_hm,var_inter_hard_prob_ms,var_inter_hard_prob_hm
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2CXO J004736.9-733102,U,U,1,0,0,11.904005,-73.517280,-43.608193,303.306304,0.301062,...,,0.024076,,0.977357,0.0,,0.965033,,,
2CXO J004829.5-732959,U,U,1,0,0,12.123114,-73.499901,-43.626698,303.220840,-0.296065,...,,0.236336,,0.994941,0.0,,0.984416,,,
2CXO J005449.0-725128,U,U,1,0,0,13.704345,-72.857987,-44.267982,302.584181,0.438476,...,,0.090023,,0.122221,0.0,,0.419304,,,
2CXO J010049.1-731526,U,U,1,0,0,15.204787,-73.257463,-43.853723,301.995228,-0.999375,...,,0.269411,,0.433439,0.0,,0.348119,,,
2CXO J010049.8-731045,U,U,1,0,0,15.207831,-73.179381,-43.931660,301.988531,0.999375,...,,0.309891,,0.368123,0.0,,0.454695,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2CXO J035901.9+541600,U,U,1,0,0,59.758153,54.266748,0.859038,148.175189,-0.999375,...,4.820432e-07,0.990507,5.0,0.999661,7.0,0.892757,0.995178,0.224022,0.657599,0.726478
2CXO J035901.9+542018,U,U,1,0,0,59.758232,54.338477,0.913583,148.128602,0.823235,...,3.713218e-06,0.907353,3.0,0.968807,6.0,0.378791,0.821507,0.414468,0.688566,0.697826
2CXO J035902.2+535814,U,U,1,0,0,59.759235,53.970753,0.634484,148.368050,0.999375,...,1.019987e-06,0.053695,0.0,0.579636,0.0,,0.667998,0.160197,0.004074,0.137839
2CXO J035904.1+541220,U,U,1,0,0,59.767388,54.205690,0.816143,148.218979,0.999375,...,1.401591e-06,0.769089,6.0,0.931578,3.0,0.062591,0.869352,0.472997,0.041788,0.459760


In [53]:
# Index the photometry table by source name for the merge below.
# df_mw_final is no longer reassigned to its own reset_index(): that made the
# cell non-idempotent (a second run adds 'level_0', a third run raises).
# NOTE(review): reset_index() still carries the old row counter along as an
# 'index' column, exactly as before, so the saved table keeps its layout.
# It is not a photometric quantity - consider reset_index(drop=True) once
# downstream consumers are confirmed not to depend on it.
df_mw_flux = df_mw_final.reset_index().set_index('name')
df_mw_flux

Unnamed: 0_level_0,index,2-7 keV Chandra,0.5-7 keV Chandra,1.2-2 keV Chandra,0.5-1.2 keV Chandra,0.2-0.5 keV Chandra,0.3-8 keV (Chandra),0.5-2 keV (Chandra),FUV (GALEX) AB,NUV (GALEX) AB,...,4.5 microns (IRAC),8.0 microns (IRAC),24 microns (MIPS),J (2MASS/CTIO),H (2MASS/CTIO),K_s (2MASS/CTIO),0.5-8 keV (Chandra),3.6 microns (IRAC),5.8 microns (IRAC),1-2 keV (Chandra)
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2CXO J000005.1+622636,0,,1.541300e-15,3.406350e-16,3.060200e-16,9.307550e-16,7.050000e-15,,,,...,,,,,,,,,,
2CXO J000008.4+135002,1,5.792350e-14,5.650550e-15,1.205350e-15,3.826500e-15,4.412150e-15,2.690000e-14,,,,...,,,,,,,,,,
2CXO J000020.5-245301,2,4.628930e-13,1.373335e-14,5.200350e-15,5.625800e-15,1.146370e-14,,,,,...,,,,,,,,,,
2CXO J000035.3-574509,3,,2.882100e-15,5.576000e-16,3.386950e-16,2.264350e-15,,,,,...,,,,,,,,,,
2CXO J000050.1+004709,4,,3.315350e-14,3.196050e-15,1.725300e-15,2.992300e-14,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2CXO J235936.7+005100,40517,2.022700e-14,2.588200e-14,5.484600e-15,2.619950e-15,1.903650e-14,2.630000e-14,,23.48555,22.78655,...,,,,,,,,,,
2CXO J235936.9+181703,40518,,6.761900e-15,3.735950e-16,6.992350e-16,5.756800e-15,9.600000e-15,,,,...,,,,,,,,,,
2CXO J235943.5+005924,40519,6.422300e-13,1.937650e-14,1.813450e-14,1.180600e-15,2.482400e-14,,,,,...,,,,,,,,,,
2CXO J235945.7-574830,40520,9.039150e-14,1.735050e-14,5.930800e-15,3.100050e-15,8.399600e-15,3.890000e-14,,21.90720,21.74725,...,,,,,,,,,,


In [54]:
# Join catalogue properties and photometry on the shared 'name' index.
# The former df_ch_small.iloc[df_mw_final.index] selected rows by *position*
# using the other table's row numbers: it raised IndexError whenever
# df_ch_small was the shorter table and silently truncated it when longer.
# The index-on-index merge already aligns the two tables by source name.
df_final = pd.merge(df_ch_small, df_mw_flux, left_index=True, right_index=True)
df_final

Unnamed: 0_level_0,catalog,class,var_flag,streak_src_flag,pileup_flag,ra,dec,gal_b2,gal_l2,hard_hs,...,4.5 microns (IRAC),8.0 microns (IRAC),24 microns (MIPS),J (2MASS/CTIO),H (2MASS/CTIO),K_s (2MASS/CTIO),0.5-8 keV (Chandra),3.6 microns (IRAC),5.8 microns (IRAC),1-2 keV (Chandra)
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2CXO J004736.9-733102,U,U,1,0,0,11.904005,-73.517280,-43.608193,303.306304,0.301062,...,17.19,,,,,,,17.58,,
2CXO J004829.5-732959,U,U,1,0,0,12.123114,-73.499901,-43.626698,303.220840,-0.296065,...,,,,,,,,,,
2CXO J005449.0-725128,U,U,1,0,0,13.704345,-72.857987,-44.267982,302.584181,0.438476,...,,,,,,,,,,
2CXO J010049.1-731526,U,U,1,0,0,15.204787,-73.257463,-43.853723,301.995228,-0.999375,...,,,,,,,,,,
2CXO J010049.8-731045,U,U,1,0,0,15.207831,-73.179381,-43.931660,301.988531,0.999375,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2CXO J035901.9+541600,U,U,1,0,0,59.758153,54.266748,0.859038,148.175189,-0.999375,...,,,,,,,,,,
2CXO J035901.9+542018,U,U,1,0,0,59.758232,54.338477,0.913583,148.128602,0.823235,...,,,,,,,,,,
2CXO J035902.2+535814,U,U,1,0,0,59.759235,53.970753,0.634484,148.368050,0.999375,...,,,,,,,,,,
2CXO J035904.1+541220,U,U,1,0,0,59.767388,54.205690,0.816143,148.218979,0.999375,...,,,,,,,,,,


In [55]:
# Persist the merged table; the 'name' index is written as the first CSV column.
df_final.to_csv('unid_sources_phot.csv')

> Next, move to the implementation file for the classification pipeline.