In [1]:
from ml_collections.config_dict import ConfigDict
import pandas as pd
import numpy as np
from astropy.io import fits
import utils.helpers as hp
from configuration import get_config

In [2]:
def extract_data(config: ConfigDict) -> dict:
    """
    Read the configured catalogue columns from the FITS file and keep only the objects
    whose magnitude is below the magnitude limit in every band.

    Args:
        config (ConfigDict): the main configuration. The fields read here are
            ``paths.fitsfile`` (path of the FITS catalogue), ``colnames`` (quantity
            name -> list of catalogue column names) and ``dtypes`` (quantity name ->
            dtype used for the resulting DataFrame). ``colnames`` must contain the
            quantities 'mag' and 'maglim', with the same number of columns each.

    Returns:
        dict: quantity name -> DataFrame with one column per extracted catalogue
        column. All DataFrames contain the same selected rows and a fresh 0..N-1 index.
    """
    quantities = dict(config.colnames)
    record = dict()

    # Read the columns while the file is open; the context manager also guarantees the
    # file is closed if a column is missing or a conversion fails.
    with fits.open(config.paths.fitsfile) as fits_image:
        data = fits_image[1].data

        # extract all the data points (np.asarray copies them out of the FITS record)
        for qname, columns in quantities.items():
            data_extracted = np.asarray([data[column] for column in columns]).T
            record[qname] = pd.DataFrame(data_extracted, columns=columns, dtype=config.dtypes[qname])

    # choose rows for which the magnitude is within the magnitude limit in *every* band
    # (no hard-coded band count, so this works for any number of magnitude columns)
    within_limits = (record['mag'].values < record['maglim'].values).all(axis=1)

    # apply the cuts
    for qname in quantities:
        record[qname] = record[qname][within_limits].reset_index(drop=True)
    return record

In [63]:
def assign_binlabel(config, dataframe, edge_tolerance=0.01):
    """
    Assign a redshift-bin label ('BIN_0', 'BIN_1', ...) to every object.

    An object belongs to bin ``k`` when ``lower_k < Z_B <= upper_k``, where
    ``(lower_k, upper_k) = config.redshift.bounds[k]``. The upper edge of the last bin
    is widened by ``edge_tolerance`` because of an edge effect (the maximum redshift in
    the catalogue is 1.200195, not 1.2).

    Objects that fall in no bin are left out of the result; an object that falls in
    several (overlapping) bins appears once per matching bin, in bin order.

    Args:
        config: configuration exposing ``redshift.bounds``, a sequence of
            ``(lower, upper)`` pairs.
        dataframe (pd.DataFrame): must contain the column 'Z_B'.
        edge_tolerance (float): amount added to the upper bound of the last bin.

    Returns:
        pd.DataFrame: single column 'BINLABEL', indexed by the index labels of the
        matched rows of ``dataframe`` (in the original row order).
    """
    bounds = config.redshift.bounds
    nbins = len(bounds)
    binlabels = [f'BIN_{i}' for i in range(nbins)]

    # Compare in double precision whatever the storage dtype of the column is, so the
    # outcome at the bin edges does not depend on numpy's scalar-promotion rules.
    redshifts = dataframe['Z_B'].to_numpy(dtype=np.float64)

    # positions (row numbers) of the matched objects and the bin each match belongs to
    hit_positions = []
    hit_bins = []
    for index in range(nbins):
        lower = bounds[index][0]
        upper = bounds[index][1]

        # some edge effect (the maximum redshift in the catalogue is 1.200195, not 1.2)
        if index == nbins - 1:
            upper = upper + edge_tolerance

        inside = np.flatnonzero((redshifts > lower) & (redshifts <= upper))
        hit_positions.append(inside)
        hit_bins.append(np.full(inside.size, index, dtype=np.intp))

    if hit_positions:
        positions = np.concatenate(hit_positions)
        bin_ids = np.concatenate(hit_bins)
        # A stable sort restores the original row order and keeps, for a given row,
        # the matches in increasing bin order.
        order = np.argsort(positions, kind='stable')
        positions = positions[order]
        recordbin = [binlabels[bin_id] for bin_id in bin_ids[order]]
    else:
        positions = np.empty(0, dtype=np.intp)
        recordbin = []

    # this is the index of the object found in the catalogue
    recordindex = dataframe.index.to_numpy()[positions]

    df_binlabel = pd.DataFrame(recordbin, columns=['BINLABEL'], index=recordindex)
    return df_binlabel

In [64]:
def correct_data(config: ConfigDict, data: dict) -> dict:
    """
    Split the extracted catalogue into tiles and correct magnitudes, fluxes and flux
    errors tile by tile.

    For each of the first ``config.ntiles`` tile names (in the sorted order returned by
    ``np.unique``) the objects of that tile are selected and:

    - the extinction is subtracted from the magnitudes;
    - fluxes and flux errors are multiplied by
      ``10**(0.4 * extinction) * 10**(-0.4 * m0)``, where ``m0`` is the per-column
      median over the tile of ``mag + 2.5 * log10(flux)``;
    - a redshift-bin label is assigned with ``assign_binlabel``.

    Every tile is also written out with ``hp.pickle_save`` and a short summary is
    printed.

    Args:
        config (ConfigDict): the main configuration; ``ntiles`` and ``paths.tiles`` are
            read here, and ``config`` is forwarded to ``assign_binlabel``.
        data (dict): quantity name -> DataFrame, with the keys 'theliname', 'mag',
            'flux', 'fluxerr', 'magerr', 'extinction', 'maglim' and 'redshift'.

    Returns:
        dict: tile name -> dictionary with the keys 'mag', 'flux', 'fluxerr', 'magerr',
        'redshift', 'theliname', 'maglim', 'extinction' and 'binlabel'.

    Raises:
        AssertionError: if ``config.ntiles`` is larger than the number of distinct tiles.
    """
    unique_names = np.unique(data['theliname'])
    nunique = len(unique_names)
    print(f'Number of tiles is: {nunique}')

    # Asking for every available tile is valid; only ntiles > nunique is an error.
    assert config.ntiles <= nunique, 'The number of tiles is greater than the number of available tiles.'
    tiles = dict()
    for i in range(config.ntiles):
        tile_name = unique_names[i]
        record_tile = dict()

        # find the objects within a particular tile
        # (assumes data['theliname'] holds a single column - TODO confirm; the mask is
        # flattened to 1-D so it is a plain row selector and its sum is a scalar)
        tile = np.ravel((data['theliname'] == tile_name).values)

        tile_mag = data['mag'][tile]
        tile_flux = data['flux'][tile]
        tile_extinction = data['extinction'][tile]

        # calculate the correction term
        # NOTE(review): log10 of a non-positive flux gives nan/-inf, which would
        # propagate into the median - confirm the catalogue contains no such fluxes.
        scaled_magnitude = tile_mag + 2.5 * np.log10(tile_flux.values)
        median_magnitude = np.median(scaled_magnitude.values, axis=0)
        correction = 10**(0.4 * tile_extinction) * 10**(-0.4 * median_magnitude)

        # correct for magnitude, flux and flux error
        # (.values: the frames have different column names, so combine by position)
        record_tile['mag'] = tile_mag - tile_extinction.values
        record_tile['flux'] = tile_flux * correction.values
        record_tile['fluxerr'] = data['fluxerr'][tile] * correction.values

        # record other important quantities
        record_tile['magerr'] = data['magerr'][tile]
        record_tile['redshift'] = data['redshift'][tile]
        record_tile['theliname'] = data['theliname'][tile]
        record_tile['maglim'] = data['maglim'][tile]
        record_tile['extinction'] = tile_extinction

        # assign bin labels
        record_tile['binlabel'] = assign_binlabel(config, record_tile['redshift'])

        # record that specific tile
        tiles[tile_name] = record_tile

        # save the tiles
        hp.pickle_save(record_tile, config.paths.tiles, tile_name)

        print(f'Number of objects in tile {tile_name} is : {int(tile.sum())}')
    return tiles

In [65]:
# Run the pipeline defined above: build the 'KiDS-1000' configuration, extract the
# catalogue columns, then split them into corrected tiles.
# NOTE(review): get_config is project code; presumably it returns a ConfigDict.
config = get_config('KiDS-1000')
record = extract_data(config)
tiles = correct_data(config, record)

Number of tiles is: 988
Number of objects in tile KIDS_0p0_m28p2 is : [17337]
Number of objects in tile KIDS_0p0_m29p2 is : [14137]
Number of objects in tile KIDS_0p0_m30p2 is : [15246]
Number of objects in tile KIDS_0p0_m31p2 is : [15334]
Number of objects in tile KIDS_0p0_m32p1 is : [19782]


# Extracted Data

In [66]:
# Names of the quantities held in the extracted record.
record.keys()

dict_keys(['extinction', 'flux', 'fluxerr', 'mag', 'magerr', 'maglim', 'redshift', 'theliname'])

In [67]:
# First rows of the extracted magnitudes (after the magnitude-limit cut).
record['mag'].head()

Unnamed: 0,MAG_GAAP_u,MAG_GAAP_g,MAG_GAAP_r,MAG_GAAP_i,MAG_GAAP_Z,MAG_GAAP_Y,MAG_GAAP_J,MAG_GAAP_H,MAG_GAAP_Ks
0,24.40625,23.5,22.9375,22.625,21.75,21.984375,21.703125,21.65625,20.703125
1,24.8125,24.03125,23.0625,22.265625,21.640625,21.453125,20.96875,20.5,20.09375
2,23.40625,23.796875,23.5,23.015625,22.3125,22.0625,21.78125,21.484375,21.703125
3,25.203125,25.96875,24.0,22.84375,21.921875,21.484375,21.15625,20.796875,20.46875
4,22.078125,21.3125,20.609375,20.375,20.015625,20.015625,19.703125,19.515625,19.3125


# Clean Data

In [68]:
# Names of the tiles that were processed.
tiles.keys()

dict_keys(['KIDS_0p0_m28p2', 'KIDS_0p0_m29p2', 'KIDS_0p0_m30p2', 'KIDS_0p0_m31p2', 'KIDS_0p0_m32p1'])

In [69]:
# Quantities stored for a single tile.
tiles['KIDS_0p0_m28p2'].keys()

dict_keys(['mag', 'magerr', 'flux', 'fluxerr', 'redshift', 'theliname', 'maglim', 'extinction', 'binlabel'])

In [70]:
# First rows of the extinction-corrected magnitudes of this tile.
tiles['KIDS_0p0_m28p2']['mag'].head()

Unnamed: 0,MAG_GAAP_u,MAG_GAAP_g,MAG_GAAP_r,MAG_GAAP_i,MAG_GAAP_Z,MAG_GAAP_Y,MAG_GAAP_J,MAG_GAAP_H,MAG_GAAP_Ks
8123868,24.09375,24.171875,22.65625,21.53125,20.8125,20.40625,20.015625,19.578125,19.171875
8123869,23.796875,23.609375,23.046875,22.765625,22.265625,22.140625,21.921875,22.046875,21.5625
8974042,24.0,24.71875,24.125,22.984375,23.765625,22.828125,23.421875,23.296875,23.09375
8974043,22.9375,22.671875,22.125,21.6875,21.734375,21.671875,21.5625,21.65625,21.03125
8974044,23.859375,24.1875,23.75,23.234375,22.625,22.546875,22.28125,22.40625,22.265625


In [77]:
# Bin labels assigned to the first objects of this tile.
tiles['KIDS_0p0_m30p2']['binlabel'].head()

Unnamed: 0,BINLABEL
6826357,BIN_2
6826358,BIN_0
6826359,BIN_4
6826360,BIN_3
6826361,BIN_3


In [78]:
# Redshifts of the same objects; the bin labels are derived from the Z_B column.
tiles['KIDS_0p0_m30p2']['redshift'].head()

Unnamed: 0,Z_B,Z_ML
6826357,0.629883,0.640137
6826358,0.219971,0.010002
6826359,0.97998,1.05957
6826360,0.75,0.75
6826361,0.700195,0.700195


# Tiles to Bins

Now that we have the corrected data for each tile, we aggregate the objects of all tiles by their redshift-bin label.

In [87]:
# One label per configured redshift bin: BIN_0, BIN_1, ...
nbins = len(config.redshift.bounds)
binlabels = [f'BIN_{number}' for number in range(nbins)]

# One accumulator per quantity: bin label -> list that collects the per-tile frames.
# Each comprehension creates its own fresh lists, so no list is shared between
# quantities or between bins.
specificbins_redshift = {label: [] for label in binlabels}
specificbins_binlabel = {label: [] for label in binlabels}
specificbins_flux = {label: [] for label in binlabels}
specificbins_fluxerr = {label: [] for label in binlabels}
specificbins_mag = {label: [] for label in binlabels}
specificbins_magerr = {label: [] for label in binlabels}

In [89]:
# Quantity name (key inside each tile) -> accumulator that collects it per bin.
accumulators = {
    'redshift': specificbins_redshift,
    'binlabel': specificbins_binlabel,
    'flux': specificbins_flux,
    'fluxerr': specificbins_fluxerr,
    'mag': specificbins_mag,
    'magerr': specificbins_magerr,
}

# Start from empty lists so that re-running this cell does not append every tile a
# second time (which would duplicate rows once the lists are concatenated).
for store in accumulators.values():
    for binlabel in binlabels:
        store[binlabel] = []

for tile in tiles:
    tile_labels = tiles[tile]['binlabel']['BINLABEL']
    for binlabel in binlabels:
        # Index labels of the objects of this tile that carry this bin label. Selecting
        # by label (rather than with a positional boolean mask) stays correct even when
        # the bin-label frame has fewer rows than the quantity frames.
        members = tile_labels.index[(tile_labels == binlabel).to_numpy()]

        # record all the quantities we need
        for quantity, store in accumulators.items():
            store[binlabel].append(tiles[tile][quantity].loc[members])

In [97]:
def data_per_bin(redshift, mag, magerr, flux, fluxerr, binlabel, binnumber=None):
    """
    Concatenate the per-tile frames of each quantity for one redshift bin or for all.

    Every positional argument is a dictionary ``bin label -> list of DataFrames`` (one
    DataFrame per tile), with bin labels of the form 'BIN_<number>'.

    Args:
        redshift (dict): per-bin lists of redshift frames.
        mag (dict): per-bin lists of magnitude frames.
        magerr (dict): per-bin lists of magnitude-error frames.
        flux (dict): per-bin lists of flux frames.
        fluxerr (dict): per-bin lists of flux-error frames.
        binlabel (dict): per-bin lists of bin-label frames.
        binnumber (int, optional): number of the bin to aggregate. When omitted, every
            bin label found in ``redshift`` is aggregated.

    Returns:
        dict: when ``binnumber`` is given, a dictionary with the keys 'redshift',
        'flux', 'mag', 'fluxerr', 'magerr' and 'binlabel', each holding the concatenated
        DataFrame for that bin. Otherwise a dictionary ``bin label -> such a dictionary``.

    Raises:
        KeyError: if the requested bin label is missing from one of the inputs.
        ValueError: raised by ``pd.concat`` when a bin has no frames at all.
    """
    # Output key -> input dictionary. A fresh local mapping is built on every call, so
    # no module-level dictionary is modified.
    sources = {
        'redshift': redshift,
        'flux': flux,
        'mag': mag,
        'fluxerr': fluxerr,
        'magerr': magerr,
        'binlabel': binlabel,
    }

    def _collect(label):
        # Stack the frames of all tiles for one bin, quantity by quantity.
        return {name: pd.concat(per_bin[label]) for name, per_bin in sources.items()}

    if binnumber is not None:
        return _collect(f'BIN_{binnumber}')

    record_bins = {label: _collect(label) for label in redshift}
    return record_bins

In [98]:
# Gather, across all tiles, the objects assigned to redshift bin 0 (label 'BIN_0').
info_bin_0 = data_per_bin(
    specificbins_redshift,
    specificbins_mag,
    specificbins_magerr,
    specificbins_flux,
    specificbins_fluxerr,
    specificbins_binlabel,
    binnumber=0,
)

In [99]:
# Quantities available for the aggregated bin.
info_bin_0.keys()

dict_keys(['redshift', 'flux', 'mag', 'fluxerr', 'magerr', 'binlabel'])

In [100]:
# Redshifts of all objects in bin 0, concatenated over the tiles.
info_bin_0['redshift']

Unnamed: 0,Z_B,Z_ML
8974045,0.239990,0.090027
8974048,0.280029,0.260010
9017070,0.219971,0.219971
9058622,0.180054,0.180054
9058627,0.270020,0.250000
...,...,...
10505702,0.209961,0.209961
10505714,0.140015,0.010002
10505724,0.130005,0.130005
10505739,0.270020,0.260010


In [101]:
# Corrected magnitudes of all objects in bin 0, concatenated over the tiles.
info_bin_0['mag']

Unnamed: 0,MAG_GAAP_u,MAG_GAAP_g,MAG_GAAP_r,MAG_GAAP_i,MAG_GAAP_Z,MAG_GAAP_Y,MAG_GAAP_J,MAG_GAAP_H,MAG_GAAP_Ks
8974045,24.500000,22.765625,22.046875,21.625000,21.437500,21.265625,21.203125,21.140625,20.890625
8974048,23.281250,22.671875,22.093750,21.906250,21.796875,21.765625,21.531250,21.468750,21.921875
9017070,23.468750,22.484375,21.765625,21.453125,21.234375,21.296875,20.906250,20.718750,20.906250
9058622,22.421875,21.484375,21.000000,20.796875,20.656250,20.546875,20.390625,20.312500,20.187500
9058627,24.578125,23.375000,22.609375,22.421875,22.125000,22.171875,21.875000,21.781250,21.984375
...,...,...,...,...,...,...,...,...,...
10505702,21.765625,21.078125,20.640625,20.531250,20.562500,20.453125,20.140625,20.000000,20.140625
10505714,22.968750,21.968750,21.375000,21.218750,20.968750,20.765625,20.718750,20.593750,20.828125
10505724,22.140625,21.531250,21.265625,21.093750,21.031250,20.937500,20.968750,21.015625,21.031250
10505739,23.156250,22.531250,21.718750,21.484375,21.078125,20.906250,20.750000,20.296875,20.062500
