In [None]:
import os, sys
from itertools import product

import numpy as np
import pandas as pd
from astropy.io import fits
from astropy.table import Table

%load_ext autoreload
%autoreload 2

# Preprocessing the SDSS (non-lens) catalog

Authors: Mike Baumer (mbaumer), Ji Won Park (jiwoncpark)

This notebook documents the decisions behind preprocessing the SDSS catalog queried from the [CFHTLenS catalog query page](http://www.cadc-ccda.hia-iha.nrc-cnrc.gc.ca/en/community/CFHTLens/query.html) before feeding it into SDSSRealizer. Because the purpose of this set is to serve as non-lenses that resemble lenses for a binary classifier, some bold cuts were made.

In [None]:
# All inputs and outputs live in the 'data' directory under the location
# named by the SLREALIZERDIR environment variable (KeyError if it is unset).
data_path = os.path.join(os.environ['SLREALIZERDIR'], 'data')
# Put the data directory at the front of the module search path.
sys.path.insert(0, data_path)
# The raw SDSS catalog that this notebook preprocesses.
sdss_f = os.path.join(data_path, 'sdss_original.fits')
# Band suffixes used to build the per-band column names.
bands = list('ugriz')

In [None]:
%%time
# Read the raw SDSS catalog from the FITS file into an astropy Table.
# The %%time cell magic reports how long the read takes.
data = Table.read(sdss_f, format='fits')

Let's take a look at the columns. This is more information than would make sense for making the LSST-like source table!

In [None]:
# Display the name of every column in the loaded table.
data.colnames

First, we remove rows whose values fall outside reasonable ranges, band by band.

In [None]:
# Quality cuts, applied band by band. `data` is narrowed after every cut, so
# a row survives only if it passes every cut in every band.
for b in bands:
    # Reported error in this band must be below 3.
    data = data[data['err_' + b] < 3]
    # Both ellipticity-error columns must be strictly positive.
    data = data[data['mE1E1Err_' + b] > 0]
    # The selection has to be assigned back to `data`; evaluating it as a
    # bare expression would leave the table unchanged and skip this cut.
    data = data[data['mE2E2Err_' + b] > 0]
    # Model flux must lie strictly between 0.001 and 1e9.
    flux = data['modelFlux_' + b]
    data = data[(flux > 0.001) & (flux < 1.e9)]
    # Offsets in RA and Dec must lie strictly within (-4, 4).
    data = data[np.abs(data['offsetRa_' + b]) < 4]
    data = data[np.abs(data['offsetDec_' + b]) < 4]
    # Ellipticity components must be non-NaN and strictly within (-15, 15).
    e1 = data['mE1_' + b]
    data = data[(~np.isnan(e1)) & (np.abs(e1) < 15)]
    e2 = data['mE2_' + b]
    data = data[(~np.isnan(e2)) & (np.abs(e2) < 15)]
    # mRrCc magnitude must satisfy 1 < |mRrCc| < 300.
    rrcc_abs = np.abs(data['mRrCc_' + b])
    data = data[(rrcc_abs > 1) & (rrcc_abs < 300)]

We only keep the columns we want.

In [None]:
# Column-name prefixes to keep; each one exists once per band.
# (`product` is imported with the other imports at the top of the notebook.)
col_prefixes = ['mE1_', 'mE2_', 'offsetRa_', 'offsetDec_', 'modelFlux_', 'mRrCc_']
# Expand to concrete column names ('mE1_u', 'mE1_g', ...), ordered with the
# prefix varying slowest and the band varying fastest.
keep_cols = [prefix + band for prefix, band in product(col_prefixes, bands)]

Since fits does not let us select a list of columns, just 1 or all (claim not yet confirmed; please prove me wrong!), we will first export to a Pandas dataframe.

I've used astropy's Table instead of the native fits because of a byte-order (endianness) mismatch between fits and pandas: fits stores data as big-endian while pandas uses little-endian, so going through fits directly would call for a byteswap.

In [None]:
# Move from the astropy Table to pandas so that a list of columns can be selected.
df = data.to_pandas()
# Keep only the per-band columns chosen above.
df_processed = df.loc[:, keep_cols]
# First 78 rows only: a small sample used as toy data.
df_small = df_processed.iloc[:78]

Lastly, we will add a column representing (fake) object IDs for these non-lenses. To prevent double assignments, the ID number will have to start with the maximum existing ID in the OM10 lens catalog, which happens to be `maxLensId = 222232634`.

In [None]:
# Sanity check: print the largest LENSID in the mock lens catalog
# (presumably the OM10 catalog the hard-coded starting ID refers to -- confirm).
# `fits` is already imported at the top of the notebook, so no re-import here.
f = os.path.join(data_path, 'qso_mock.fits')
d = fits.getdata(f)
print(d['LENSID'].max())

In [None]:
def add_objectId_column(df, maxLensId=222232634):
    """Insert a unique 'objectId' column as the first column of `df`.

    IDs are consecutive integers starting at ``maxLensId + 1``, so none of
    them coincides with an ID that is already in use (every ID up to and
    including ``maxLensId`` is treated as taken).

    Parameters
    ----------
    df : pandas.DataFrame
        Table to label. It is modified in place.
    maxLensId : int, optional
        Largest ID already assigned elsewhere. Defaults to 222232634, the
        maximum lens ID quoted in this notebook for the OM10 lens catalog.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with 'objectId' at column position 0.

    Raises
    ------
    ValueError
        If `df` already has an 'objectId' column (raised by
        ``DataFrame.insert``), e.g. when called twice on the same frame.
    """
    numRows = len(df)
    # Start one past the largest existing ID so the first new ID is unused.
    firstId = maxLensId + 1
    idCol = np.arange(firstId, firstId + numRows)
    df.insert(loc=0, column='objectId', value=idCol)
    return df

In [None]:
# Label the toy subset with object IDs.
# NOTE(review): re-running this cell on an already-labelled frame is expected
# to fail, since DataFrame.insert rejects an existing column name by default -- confirm.
df_small = add_objectId_column(df_small)

Save the DataFrame to file, and we're done!

In [None]:
# Export of the full processed catalog is currently disabled; uncomment the
# next two lines to write it out as well.
#save_filename = os.path.join(data_path, 'sdss_object_processed.csv')
#df_processed.to_csv(save_filename)
# Write the small toy subset as CSV inside the data directory.
toy_fname = os.path.join(data_path, 'sdss_toy_processed.csv')
# to_csv is called with its defaults, so the DataFrame index is written too.
df_small.to_csv(toy_fname)