In [2]:
import numpy as np
from tqdm import tqdm
from astropy.io import ascii
import pandas as pd

table = ascii.read('..\Fermi-LAT Data\hdu1.txt')
fermi_lat = table.to_pandas()
fermi_lat.to_csv('..\Fermi-LAT Data\hdu1.csv')
fl = fermi_lat.drop(columns=['ROI_num','RA_Counterpart','DEC_Counterpart','Conf_68_SemiMajor', 'Conf_68_SemiMinor', 'Conf_68_PosAng',
       'Conf_95_SemiMajor', 'Conf_95_SemiMinor', 'Conf_95_PosAng','RAJ2000', 'DEJ2000','DataRelease','GLON','GLAT','Source_Name',
       'Extended_Source_Name','SpectrumType','ASSOC_4FGL','ASSOC_FGL','ASSOC_FHL','ASSOC_GAM1','ASSOC_GAM2','ASSOC_GAM3','TEVCAT_FLAG',
       'ASSOC_TEV','CLASS1','CLASS2','ASSOC1','ASSOC2','Flags'])

'''
The above columns are dropped because they are either positional data or non-numerical data.

Uncertainty data is also dropped below.

'''


#fl = fermi_lat.drop(columns=[])

fl_classes = fermi_lat['CLASS1']
fl_classes.to_csv('fl_classes.csv')

uncertainties = np.array([])
for x in fl.columns:
    if 'Unc' in x:
        uncertainties = np.append(uncertainties,x)
fl = fl.drop(columns=uncertainties)
print(fl.dtypes.value_counts())
fl.columns

float64    26
object      5
Name: count, dtype: int64


Index(['Signif_Avg', 'Pivot_Energy', 'Flux1000', 'Energy_Flux100',
       'PL_Flux_Density', 'PL_Index', 'LP_Flux_Density', 'LP_Index', 'LP_beta',
       'LP_SigCurv', 'LP_EPeak', 'PLEC_Flux_Density', 'PLEC_IndexS',
       'PLEC_ExpfactorS', 'PLEC_Exp_Index', 'PLEC_SigCurv', 'PLEC_EPeak',
       'Npred', 'Flux_Band', 'nuFnu_Band', 'Sqrt_TS_Band', 'Variability_Index',
       'Frac_Variability', 'Signif_Peak', 'Flux_Peak', 'Time_Peak',
       'Peak_Interval', 'Flux_History', 'Sqrt_TS_History', 'ASSOC_PROB_BAY',
       'ASSOC_PROB_LR'],
      dtype='object')

In [2]:
col0 = np.zeros(len(fl))
col1 = np.zeros(len(fl))
col2 = np.zeros(len(fl))
col3 = np.zeros(len(fl))
col4 = np.zeros(len(fl))
col5 = np.zeros(len(fl))
col6 = np.zeros(len(fl))
col7 = np.zeros(len(fl))

columnlist = ['Flux_Band', 'nuFnu_Band', 'Sqrt_TS_Band',
       'Flux_History','Sqrt_TS_History']

for name in columnlist:    
    for i in range(0,len(fl)):
        obj_data = fl[name][i]
        obj_data = obj_data.replace('[','') 
        obj_data = obj_data.replace(']','') 
        obj_data = obj_data.replace('null','0')
        float_data = [float(idx) for idx in obj_data.split(',')]
        for x in range(0,len(float_data)):
            if float_data[x] == 0:
                float_data[x] = np.nan
        col0[i] = float_data[0]
        col1[i] = float_data[1]
        col2[i] = float_data[2]
        col3[i] = float_data[3]
        col4[i] = float_data[4]
        col5[i] = float_data[5]
        col6[i] = float_data[6]
        col7[i] = float_data[7]
    fl[name + '_0'] = col0
    fl[name+'_1'] = col1
    fl[name+'_2'] = col2
    fl[name+'_3'] = col3
    fl[name+'_4'] = col4
    fl[name+'_5'] = col5
    fl[name+'_6'] = col6
    fl[name+'_7'] = col7

fl = fl.drop(columns=columnlist)

fl.to_csv('..\Fermi-LAT Data\FL_numericalonly_nopositional.csv')

cl = pd.read_csv('..\\Fermi-LAT Data\\fl_classes_numerical.csv',index_col=0)


In [3]:
column_list = fl.columns
nan_count = np.zeros(len(column_list))


fl = fl.replace(np.inf,np.nan)
fl = fl.replace(-np.inf,np.nan)

'''
REPLACING ALL INFINITE VALUES WITH NaN BECAUSE SCIKIT LEARN CANNOT READ THEM
'''

for i in range(0,len(column_list)):
    n_nans = (1 - (fl[column_list[i]].count()/len(fl[column_list[i]]))) * 100
    #print('nans in ' + x + ': ' + str(n_nans))
    nan_count[i] = n_nans


nan_data = np.stack((column_list,nan_count),axis=1)
nc = pd.DataFrame(nan_data,columns=['Parameter','Percentage NaNs'])
nc = nc.drop([0])
#nc.to_csv('NaN_percentage_by_column.csv')

drop_cols = np.array([])

for i in range(1,len(nc)):
    if nc['Parameter'][i] == 'CLASS1':
        pass
    elif nc['Percentage NaNs'][i] >= 20 :
        nc_sub20 = nc.drop([i])
        drop_cols = np.append(drop_cols,nc['Parameter'][i])

#nc_sub20.to_csv('NaN_percentage_by_column_sub20.csv')

fl_sub20 = fl.drop(columns=drop_cols)

fl = fl_sub20.dropna()

In [4]:
fl_cl = pd.merge(fl,cl,left_index=True,right_index=True)

fl_cl.dropna()

fl_cl.to_csv('..\\Fermi-LAT Data\\fl_numericalonly_nopositional_withclasses.csv')


mask1 = fl_cl['CLASS1'] == 4
mask2 = fl_cl['CLASS1'] == 0

fl_cl = fl_cl[~mask1]
fl_cl = fl_cl[~mask2]


fl_cl.head()


  fl_cl = fl_cl[~mask2]


Unnamed: 0,Signif_Avg,Pivot_Energy,Flux1000,Energy_Flux100,PL_Flux_Density,PL_Index,LP_Flux_Density,LP_Index,LP_beta,LP_SigCurv,...,Flux_History_7,Sqrt_TS_History_0,Sqrt_TS_History_1,Sqrt_TS_History_2,Sqrt_TS_History_3,Sqrt_TS_History_4,Sqrt_TS_History_5,Sqrt_TS_History_6,Sqrt_TS_History_7,CLASS1
4,24.49722,2065.1467,7.024454e-10,7.807578e-12,1.690367e-13,2.081634,1.778363e-13,2.05247,0.037899,1.283201,...,1.039027e-08,7.926063,2.934573,5.890346,10.227452,7.208447,5.825856,6.605715,6.77131,1
10,16.672523,3402.819,2.461732e-10,2.441939e-12,2.144947e-14,1.817964,2.620644e-14,1.599259,0.160069,3.092949,...,2.305279e-10,4.456632,5.560203,3.472792,2.216244,5.653224,6.638917,3.212988,2.189012,1
25,13.404971,1446.4923,3.228662e-10,2.490286e-12,1.497786e-13,2.16073,1.797058e-13,2.042221,0.153955,2.639367,...,2.286408e-09,5.38886,3.738754,5.641364,1.812777,3.114181,4.748802,3.76637,2.456795,1
27,6.878327,2795.2131,1.67021e-10,1.111306e-12,1.933776e-14,2.053279,2.674631e-14,1.782085,0.303313,2.515318,...,2.298051e-10,4.283851,2.647704,1.020642,4.712511,0.972565,3.475557,0.09789,0.814137,1
29,23.338678,691.1044,4.611794e-10,5.242116e-12,1.507037e-12,2.404988,1.628805e-12,2.322214,0.079294,2.371903,...,3.127961e-09,11.69871,15.035206,7.581436,3.281046,3.570668,4.137671,2.708634,2.464151,2


Glossary of Terms and Acronyms:

PLEC = Power Law Exponential Cutoff
TS = Test Statistic: Used as threshold for inclusion of new sources.
ROI = Region of Interest??
Unc = Uncertainty