In [46]:
import sqlite3
import re
import numpy as np
import pandas as pd
from astropy.table import Table
from astropy import units as u
from astropy.coordinates import SkyCoord
from lsst.afw.table import BaseCatalog
import data_process_utils as dpu

#### Flag Analysis

In [47]:
def get_confm(flux_df, art_df, flag):
    """Count set / unset values of one flag column in two samples.

    Parameters
    ----------
    flux_df : table-like
        Sample whose ``flag`` column yields the FP (== 1) and TN (== 0) counts.
    art_df : table-like
        Sample whose ``flag`` column yields the TP (== 1) and FN (== 0) counts.
    flag : str
        Name of the column to inspect in both samples.

    Returns
    -------
    numpy.ndarray
        The four counts ordered as ``[tp, fp, tn, fn]``.
    """
    art_flag = art_df[flag]
    flux_flag = flux_df[flag]
    counts = [
        (art_flag == 1).sum(),   # TP: flag set in art_df
        (flux_flag == 1).sum(),  # FP: flag set in flux_df
        (flux_flag == 0).sum(),  # TN: flag unset in flux_df
        (art_flag == 0).sum(),   # FN: flag unset in art_df
    ]
    return np.array(counts)
def get_confusion_matrix(flux_df, art_df, flag_cols):
    """Stack the per-flag ``[tp, fp, tn, fn]`` counts into one matrix.

    Parameters
    ----------
    flux_df, art_df : table-like
        Passed unchanged to ``get_confm`` for every flag.
    flag_cols : iterable of str
        Flag column names to evaluate, in output order.

    Returns
    -------
    confusion_matrix : numpy.ndarray
        Shape ``(len(flag_cols), 4)``; row ``k`` is the ``get_confm`` result
        for the ``k``-th flag. An empty ``flag_cols`` gives a ``(0, 4)`` array
        (the previous implementation raised ``IndexError`` in that case).
    flag_list : list of str
        The flag names, one per matrix row.
    """
    flag_list = list(flag_cols)
    rows = [get_confm(flux_df, art_df, flag) for flag in flag_list]
    if not rows:
        # np.vstack rejects an empty sequence, so build the empty matrix directly.
        return np.empty((0, 4), dtype=int), flag_list
    # Stack once instead of growing the array inside the loop.
    return np.vstack(rows), flag_list

In [48]:
# Load the fake-source and artifact tables for one configuration from the
# detection database. Both queries use the same selection and the same file,
# so a single connection is opened and closed once both reads are done.
config = 'al_default'
db = f'./data/patch_0to6/diff/{config}/detection/detection.sqlite'
selection = "WHERE host_mag = '20_21' and fake_mag = 20"

conn = sqlite3.connect(db)
try:
    flux = pd.read_sql_query(f"SELECT * FROM fake_src {selection}", conn)
    art = pd.read_sql_query(f"SELECT * FROM artifact {selection}", conn)
finally:
    # `with conn:` would only manage the transaction, so close explicitly.
    conn.close()

# Keep the fake_src rows whose matched_status is truthy.
# NOTE(review): if matched_status is stored as text ('True'/'False'),
# astype(bool) is True for every non-empty string -- confirm the column type.
flux_det = flux.loc[flux.matched_status.astype(bool)]

In [49]:
# Read one diaSrc catalog only to get its schema; the schema supplies the doc
# string of each flag column when the LaTeX tables are written.
schema_fits = './data/patch_0to6/diff/al_default/00_20_21_1013665_79_i/diff_20/schema/deepDiff_diaSrc.fits'
t = BaseCatalog.readFits(schema_fits)
schema = t.schema

In [50]:
# Row counts: artifact query, fake_src query, and the fake_src rows with a
# truthy matched_status.
len(art), len(flux), len(flux_det)

(1862, 1184, 1184)

In [51]:
# Hand-picked flag columns evaluated for the "selected flags" table.
FLAG_LIST = [
    'base_PixelFlags_flag_saturated',
    'base_PixelFlags_flag_saturatedCenter',
    'base_PixelFlags_flag_suspect',
    'base_PixelFlags_flag_suspectCenter',
    'base_PixelFlags_flag_offimage',
    'base_PixelFlags_flag_edge',
    'base_PixelFlags_flag_bad',
    'ip_diffim_DipoleFit_flag_classification',
    'ip_diffim_DipoleFit_flag_classificationAttempted',
    'base_SdssShape_flag',
    'base_GaussianFlux_flag_badShape',
    'slot_Shape_flag',
]
confusion_matrix, flag_list = get_confusion_matrix(flux_det, art, FLAG_LIST)

# Every column of flux_det whose name contains 'flag'.
FULL_FLAGS = [col for col in flux_det.columns if re.search('flag', col)]
confusion_matrix_full, flag_list_full = get_confusion_matrix(flux_det, art, FULL_FLAGS)

In [52]:
# selected flags
# Write a LaTeX deluxetable* with one row per entry of FLAG_LIST: the escaped
# flag name, its doc string from the schema, and the TP / FP / TN / FN counts
# (columns 0-3 of confusion_matrix).
with open('flag_table.txt', "w") as file:
    file.write("\\begin{deluxetable*}{llrrrr}\n")
    file.write("\\tablecaption{Confusion Matrices of Selected Flags "
               "\\footnote{Description of each flag is documented in "
               "the schema data product (deepDiff\\_diaSrc.fits) of the DIA pipeline.} "
               "\\label{tab:flag_selected}}\n")
    file.write("\\tablewidth{0pt}\n")
    file.write("\\tablehead{\n")
    file.write("\\colhead{Flag} & \\colhead{Description} & \\colhead{TP} & \\colhead{FP} & \\colhead{TN} & \\colhead{FN}}\n")
    file.write("\\startdata\n")
    for i, flag in enumerate(FLAG_LIST):
        # Escape underscores so LaTeX typesets them literally.
        flag_name = flag.replace('_', '\\_')
        dscp = schema.extract(flag)[flag].getField().getDoc()
        dscp = dscp.replace('_', '\\_')
        file.write(f"{flag_name} & {dscp} & {confusion_matrix[i, 0]} & {confusion_matrix[i, 1]} & "
                   f"{confusion_matrix[i, 2]}  &{confusion_matrix[i, 3]} \\\\\n")
    file.write("\\enddata\n")
    file.write("\\tablecomments{For a specific flag, "
               "TP represents the number of artifacts which have the flag set to True. "
               "TN is the number of synthetic transients which do not have the flag set to True. "
               "FP represents the number of simulated transients which have the flag set to True. "
               "FN is the number of artifacts which do not have the flag set to True. "
               "A flag with high TP, TN and low FP, FN is helpful for filtering out artifacts.}\n")
    file.write("\\end{deluxetable*}\n")

In [53]:
# all flags
# Write a rotated LaTeX deluxetable* with one row per entry of FULL_FLAGS: the
# escaped flag name, its doc string from the schema, and the TP / FP / TN / FN
# counts (columns 0-3 of confusion_matrix_full).
with open('flag_table_full.txt', "w") as file:
    file.write("\\begin{longrotatetable}\n")
    file.write("\\begin{deluxetable*}{llrrrr}\n")
    file.write("\\tablecaption{Confusion Matrices of All Flags "
               "\\footnote{Description of each flag is documented in "
               "the schema data product (deepDiff\\_diaSrc.fits) of the DIA pipeline.} "
               "\\label{tab:flag_full}}\n")
    file.write("\\tablewidth{700pt} \n")
    file.write("\\tabletypesize{\\scriptsize}\n")
    file.write("\\tablehead{\n")
    file.write("\\colhead{Flag} & \\colhead{Description} & \\colhead{TP} & \\colhead{FP} & \\colhead{TN} & \\colhead{FN}}\n")
    file.write("\\startdata\n")
    for i, flag in enumerate(FULL_FLAGS):
        # Escape underscores so LaTeX typesets them literally.
        flag_name = flag.replace('_', '\\_')
        dscp = schema.extract(flag)[flag].getField().getDoc()
        dscp = dscp.replace('_', '\\_')
        file.write(f"{flag_name} & {dscp} & {confusion_matrix_full[i, 0]} & {confusion_matrix_full[i, 1]} & "
                   f"{confusion_matrix_full[i, 2]}  &{confusion_matrix_full[i, 3]} \\\\\n")
    file.write("\\enddata\n")
    file.write("\\tablecomments{For a specific flag, "
               "TP represents the number of artifacts which have the flag set to True. "
               "TN is the number of synthetic transients which do not have the flag set to True. "
               "FP represents the number of simulated transients which have the flag set to True. "
               "FN is the number of artifacts which do not have the flag set to True. "
               "A flag with high TP, TN and low FP, FN is helpful for filtering out artifacts.}\n")
    file.write("\\end{deluxetable*}\n")
    file.write("\\end{longrotatetable}\n")

In [59]:
# Pass both samples through dpu.remove_flag_astropy with FLAG_LIST, then report
#   eff    = surviving fakes / all detected fakes
#   purity = surviving fakes / (surviving fakes + surviving artifacts)
# remove_flag_astropy presumably drops rows with any listed flag set -- confirm
# in data_process_utils.
flux_det_astropy = Table.from_pandas(flux_det)
flux_det_astropy_flag_applied = dpu.remove_flag_astropy(flux_det_astropy, FLAG_LIST)

art_astropy = Table.from_pandas(art)
art_astropy_flag_applied = dpu.remove_flag_astropy(art_astropy, FLAG_LIST)

n_flux_kept = len(flux_det_astropy_flag_applied)
n_art_kept = len(art_astropy_flag_applied)

print('eff: ', n_flux_kept / len(flux_det_astropy))

purity = n_flux_kept / (n_flux_kept + n_art_kept)
print('purity ', purity)

eff:  0.9991554054054054
purity  0.8901429646350639


#### Artifact Statistics

In [13]:
# PSF table: psf_ratio is the fractional difference of the calexp FWHM
# relative to the coadd FWHM.
psf_table = Table.read('./data/table/gal_4639/psf_fwhm.csv', format='ascii.csv')
psf_table['psf_ratio'] = (
    (psf_table['calexp_fwhm'] - psf_table['coadd_fwhm']) / psf_table['coadd_fwhm']
)

base_flags = [
    'base_PixelFlags_flag_saturated',
    'base_PixelFlags_flag_saturatedCenter',
    'base_PixelFlags_flag_suspect',
    'base_PixelFlags_flag_suspectCenter',
    'base_PixelFlags_flag_offimage',
    'base_PixelFlags_flag_edge',
    'base_PixelFlags_flag_bad',
    'base_NaiveCentroid_flag',
]
dipole_flags = [
    'ip_diffim_DipoleFit_flag_classification',
    'ip_diffim_DipoleFit_flag_classificationAttempted',
]

full_detections = Table.read('./data/table/gal_4639/full_detections.csv', format='ascii.csv')
# Turn each flag column into booleans by comparing its values with the string 'True'.
for flag in [*base_flags, *dipole_flags]:
    is_set = full_detections[flag].data == 'True'
    full_detections[flag] = is_set
full_detections['diaSrc_snr'] = (
    full_detections['base_PsfFlux_instFlux'] / full_detections['base_PsfFlux_instFluxErr']
)
full_detections['psf_ratio'] = (
    (full_detections['calexp_fwhm'] - full_detections['coadd_fwhm'])
    / full_detections['coadd_fwhm']
)

In [14]:
# Sky-match the artifacts to the full detection table within 2 arcsec.
# coord_ra / coord_dec are converted from radians to degrees before the
# SkyCoord objects are built.
art_astropy = Table.from_pandas(art)

art_ra_deg = np.rad2deg(art_astropy['coord_ra'].data)
art_dec_deg = np.rad2deg(art_astropy['coord_dec'].data)
art_coord = SkyCoord(art_ra_deg, art_dec_deg, frame="icrs", unit="deg")

det_ra_deg = np.rad2deg(full_detections['coord_ra'].data)
det_dec_deg = np.rad2deg(full_detections['coord_dec'].data)
det_coord = SkyCoord(det_ra_deg, det_dec_deg, frame="icrs", unit="deg")

matched_status, matched_idx = dpu.two_direction_skymatch(
    art_coord, det_coord, radius=2 * u.arcsec
)

In [15]:
# Number of artifacts vs. number with matched_status True from the sky match.
# The difference (four in the recorded output) is the artifacts left unmatched.
len(art_coord), np.sum(matched_status)

(1862, 1858)

In [16]:
# 4 artifacts have matched_status = False; however, the detections that
# matched_idx points to for them have the same coordinates as the artifacts.
# Probably the artifact table contains duplicated artifacts from different
# visits, which get matched to the same detection in the full_detections table.
art_coord[~matched_status], det_coord[matched_idx[~matched_status]]

(<SkyCoord (ICRS): (ra, dec) in deg
     [(56.94192337, -30.93836375), (56.64995067, -30.73225229),
      (55.80279134, -30.20938777), (55.79787475, -30.21251826)]>,
 <SkyCoord (ICRS): (ra, dec) in deg
     [(56.94192337, -30.93836375), (56.64995067, -30.73225229),
      (55.80279134, -30.20938777), (55.79787475, -30.21251826)]>)

In [28]:
# One full_detections row per artifact, selected through matched_idx. This
# includes the rows whose matched_status is False, since no mask is applied.
artifacts = full_detections[matched_idx]
# Artifacts after dpu.remove_flag_astropy with base_flags (presumably drops
# rows with any listed flag set -- confirm in data_process_utils).
base_rm = dpu.remove_flag_astropy(artifacts, flag_list=base_flags)

In [18]:
# Split the base-flag-cleaned artifacts into three psf_ratio bins:
#   broad:  psf_ratio >= 0
#   near:   -0.05 <= psf_ratio < 0
#   narrow: psf_ratio < -0.05
base_rm_psf_ratio = base_rm['psf_ratio']
broad = base_rm[base_rm_psf_ratio >= 0]
near = base_rm[np.logical_and(base_rm_psf_ratio < 0, base_rm_psf_ratio >= -0.05)]
narrow = base_rm[base_rm_psf_ratio < -0.05]

In [19]:
# Row counts: all artifacts, artifacts left after the base-flag cut, and the
# three psf_ratio bins of the latter.
len(artifacts), len(base_rm), len(broad), len(near), len(narrow)

(1862, 772, 261, 164, 347)

In [20]:
# Number of psf_table rows in each psf_ratio bin (same bin edges as the
# broad / near / narrow artifact split).
table_psf_ratio = psf_table['psf_ratio']
n_broad = np.sum(table_psf_ratio >= 0)
n_near = np.sum(np.logical_and(table_psf_ratio < 0, table_psf_ratio >= -0.05))
n_narrow = np.sum(table_psf_ratio < -0.05)

In [21]:
# psf_table row counts per psf_ratio bin: broad, near, narrow.
n_broad, n_near, n_narrow

(30, 16, 24)

In [22]:
# artifact per detector
# Divide by the psf_table bin counts computed above rather than hard-coded
# 30 / 16 / 24, so the ratios follow the table if it changes. int() keeps the
# result a plain Python float.
# NOTE(review): assumes each psf_table row is one detector -- confirm.
len(broad) / int(n_broad), len(near) / int(n_near), len(narrow) / int(n_narrow)

(8.7, 10.25, 14.458333333333334)

In [23]:
# Same per-bin ratios as a one-row table. The denominators are the psf_table
# bin counts computed above rather than hard-coded 30 / 16 / 24.
# NOTE(review): the row label 'fraction' holds artifacts per psf_table row, and
# the 'sharp' column is the bin called `narrow` elsewhere in this notebook.
morph = {
    'broad': [len(broad) / int(n_broad)],
    'near': [len(near) / int(n_near)],
    'sharp': [len(narrow) / int(n_narrow)],
}
pd.DataFrame(data=morph, index=['fraction'])

Unnamed: 0,broad,near,sharp
fraction,8.7,10.25,14.458333


In [24]:
# artifact fraction
def get_fraction(all_det, base_flags, dipole_flags):
    """Split ``all_det`` into three groups and return each group's share.

    The groups are built with ``dpu.keep_flag_astropy`` and
    ``dpu.remove_flag_astropy``:

    * ``sat``: rows kept by ``keep_flag_astropy`` with ``base_flags``.
    * ``dipole``: rows of the base-flag-removed table kept with ``dipole_flags``.
    * ``remaining``: rows left after removing both flag sets.

    Parameters
    ----------
    all_det : table
        Detections to classify; its length is the denominator of every share.
    base_flags, dipole_flags : list of str
        Flag column names passed as ``flag_list`` to the ``dpu`` helpers.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``'fraction'`` with columns ``sat``, ``dipole`` and
        ``remaining``.
    """
    base_keep = dpu.keep_flag_astropy(all_det, flag_list=base_flags)
    base_rm = dpu.remove_flag_astropy(all_det, flag_list=base_flags)
    dipole_keep = dpu.keep_flag_astropy(base_rm, flag_list=dipole_flags)
    dipole_rm = dpu.remove_flag_astropy(base_rm, flag_list=dipole_flags)

    n_total = len(all_det)
    shares = {
        'sat': [len(base_keep) / n_total],
        'dipole': [len(dipole_keep) / n_total],
        'remaining': [len(dipole_rm) / n_total],
    }
    return pd.DataFrame(shares, index=['fraction'])
    
# Same three psf_ratio bins as before, applied to all artifacts (no flag cut).
artifacts_psf_ratio = artifacts['psf_ratio']
all_broad = artifacts[artifacts_psf_ratio >= 0]
all_near = artifacts[np.logical_and(artifacts_psf_ratio < 0, artifacts_psf_ratio >= -0.05)]
all_narrow = artifacts[artifacts_psf_ratio < -0.05]

In [25]:
# broad: sat / dipole / remaining shares of the artifacts with psf_ratio >= 0
get_fraction(all_broad, base_flags, dipole_flags)

Unnamed: 0,sat,dipole,remaining
fraction,0.61674,0.140969,0.242291


In [26]:
# near: sat / dipole / remaining shares of the artifacts with -0.05 <= psf_ratio < 0
get_fraction(all_near, base_flags, dipole_flags)

Unnamed: 0,sat,dipole,remaining
fraction,0.549451,0.159341,0.291209


In [27]:
# sharp (the `narrow` bin): sat / dipole / remaining shares of the artifacts
# with psf_ratio < -0.05
get_fraction(all_narrow, base_flags, dipole_flags)

Unnamed: 0,sat,dipole,remaining
fraction,0.575275,0.181151,0.243574


#### Matched Artifact Fraction

In [29]:
# Total number of artifact rows taken from full_detections.
len(artifacts)

1862

In [36]:
# Raw matched_status values of the artifact rows (kept as read from the CSV).
d = artifacts['matched_status'].data

In [39]:
# Inspect the values: they are the strings 'True' / 'False', not booleans.
d

array(['True', 'True', 'True', ..., 'False', 'True', 'True'], dtype='<U5')

In [40]:
# Fraction of artifact rows whose matched_status string equals 'True'.
np.sum(artifacts['matched_status'].data=='True') / len(artifacts)

0.8952738990332976

89.53% of detected artifacts are from background sources.

#### Unmatched Artifacts

In [45]:
# Number of rows with matched_status == 'False' in each psf_ratio bin of the
# base-flag-cleaned artifacts: broad, near, narrow.
(
    np.sum(broad['matched_status'].data == 'False'),
    np.sum(near['matched_status'].data == 'False'),
    np.sum(narrow['matched_status'].data == 'False'),
)

(9, 3, 22)