In [None]:
#!/usr/bin/env python3
"""feature_vectors.ipynb
James Gardner 2019

reads in TGSS and NVSS sources in a 20° patch of sky
and computes positional matches within 10'
adds labels based off of positional matching

requires the unzipped catalogues to be present in cwd
and expects names: TGSSADR1_7sigma_catalog.tsv and CATALOG.FIT

saves feature vectors (individuals and matches) as:
tgss.csv, nvss.csv, and patch_catalogue.csv
"""

import pandas as pd
import numpy as np
from astropy.io import fits
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

# side length of the square sky patch, in degrees (applied to both ra and dec)
PATCH_SIZE = 20
# largest separation accepted as a positional match: 10 arcminutes, in degrees
SEPARATION_LIMIT = 10*1/60

In [None]:
def geodesic_dist(p1,p2):
    """great-circle distance between two points on the unit sphere,
    computed with the haversine formula, see:
    https://en.wikipedia.org/wiki/Great-circle_distance#Formulae

    p1, p2 -- indexable as (ra, dec), both in radians;
              scalars or equally-shaped numpy arrays are accepted
    returns the angular separation in radians (unit sphere, so r = 1)"""
    ra1,dec1 = p1[0],p1[1]
    ra2,dec2 = p2[0],p2[1]
    half_ddec = (dec1-dec2)/2
    half_dra  = (ra1-ra2)/2
    haversine = (np.sin(half_ddec)**2
                 + np.cos(dec1)*np.cos(dec2) * np.sin(half_dra)**2)
    # floating-point rounding can push the haversine term marginally
    # outside [0,1] for (near-)antipodal points, which would make
    # arcsin return NaN; clamp it back into the valid domain
    haversine = np.clip(haversine,0,1)
    return 2*np.arcsin(np.sqrt(haversine))

def degdist(p1,p2):
    """angular separation, in degrees, between two (ra, dec) points
    that are themselves given in degrees;
    converts to radians, defers to geodesic_dist, and converts back"""
    p1_rad = [coord*np.pi/180 for coord in p1]
    p2_rad = [coord*np.pi/180 for coord in p2]
    rad_to_deg = 180/np.pi
    return rad_to_deg*geodesic_dist(p1_rad,p2_rad)

In [None]:
def deci_deg_to_deg_min_sec(deci_deg):
    """split an angle in decimal degrees into (degrees, minutes, seconds);
    minutes and seconds are always non-negative,
    the sign of the input is carried by the degrees component"""
    non_negative = (deci_deg >= 0)
    total_seconds = abs(deci_deg)*3600
    # peel off the seconds first, then split what is left into deg/min
    whole_minutes,seconds = divmod(total_seconds,60)
    degrees,minutes = divmod(whole_minutes,60)
    if not non_negative:
        degrees = -degrees
    return (degrees,minutes,seconds)

def deci_deg_to_hr_min_sec(deci_deg):
    """split an angle in decimal degrees into (hours, minutes, seconds)
    of time, using 15 degrees per hour;
    the input is assumed to be positive"""
    total_seconds = (deci_deg/15.)*3600
    whole_minutes,seconds = divmod(total_seconds,60)
    hours,minutes = divmod(whole_minutes,60)
    return (hours,minutes,seconds)

def iau_designation(ra,dec):
    """generate NVSS names as per:
    https://heasarc.gsfc.nasa.gov/W3Browse/all/nvss.html

    ra, dec -- position in decimal degrees (ra assumed non-negative)
    returns a string 'NVSS JHHMMSS+DDMMSS' (or '-DDMMSS'),
    with every field truncated, not rounded

    There are four cases where there are pairs of sources which are
    so close together that their names would be identical according
    to this schema (see below), and the HEASARC has added suffixes
    of 'a' (for the source with smaller RA) and 'b' (for the source
    with the larger RA) in such cases in order to differentiate them.
    It was easier just to hard-code this in,
    should really check if the designation already exists and compare
    """
    hr,schmin,schmec = deci_deg_to_hr_min_sec(ra)
    deg,minu,sec = deci_deg_to_deg_min_sec(dec)

    # the sign must come from dec itself: for -1 < dec < 0 the degrees
    # component is -0.0, and -0.0 >= 0 is True, which would wrongly
    # label a southern source with '+'
    sgn = '-' if dec < 0 else '+'

    # int() truncates towards zero, which for these non-negative
    # fields is the truncation the naming scheme asks for
    designation = 'NVSS J{:02d}{:02d}{:02d}{}{:02d}{:02d}{:02d}'.format(
        int(hr),int(schmin),int(schmec),
        sgn,int(abs(deg)),int(minu),int(sec))

    # designation -> ra (degrees) separating the 'a' and 'b' members
    close_pairs = {'NVSS J093731-102001':144.382,
                   'NVSS J133156-121336':202.987,
                   'NVSS J160612+000027':241.553,
                   'NVSS J215552+380029':328.968}
    if designation in close_pairs:
        suffix = 'a' if ra < close_pairs[designation] else 'b'
        designation = designation + suffix

    return designation

In [None]:
# choice of patch is arbitrary but must be within both surveys
# testing shows no discernible difference between patches
# PATCH_DEC and PATCH_RA are the lower edges of the patch, in degrees;
# df_in_patch keeps sources strictly between each edge and edge+PATCH_SIZE
PATCH_DEC = -35
PATCH_RA  = 149
def df_in_patch(df_ra,df_dec,):
    """boolean mask selecting the sources that lie strictly inside the patch
    spanning PATCH_SIZE in both ra and dec from (PATCH_RA, PATCH_DEC);
    df_ra and df_dec are the coordinate columns to test"""
    ra_inside  = (PATCH_RA  < df_ra)  & (df_ra  < PATCH_RA+PATCH_SIZE)
    dec_inside = (PATCH_DEC < df_dec) & (df_dec < PATCH_DEC+PATCH_SIZE)
    return ra_inside & dec_inside

In [None]:
# load the TGSS catalogue, keep the sources inside the patch,
# and write their feature vectors to tgss.csv
tgss_df = (pd.read_csv('TGSSADR1_7sigma_catalog.tsv',delimiter='\t',
                       index_col=0,usecols=(0,1,3,5,7,9,11,13))
             .sort_values(by=['DEC']))

# rescale both flux columns by 1e-3
# (presumably mJy -> Jy to match the NVSS units -- TODO confirm)
flux_columns = ['Total_flux','Peak_flux']
tgss_df[flux_columns] = tgss_df[flux_columns]*1e-3

tgss_df = tgss_df.loc[df_in_patch(tgss_df['RA'],tgss_df['DEC'])]

# rename everything with a _TGSS suffix so that the columns stay
# distinguishable once combined with the NVSS columns
tgss_df.index.name = 'name_TGSS'
tgss_df.columns = ['ra_TGSS','dec_TGSS','integrated_TGSS','peak_TGSS',
                   'major_ax_TGSS','minor_ax_TGSS','posangle_TGSS']

tgss_df.to_csv('tgss.csv')

In [None]:
# import NVSS and save feature vectors
nvss_columns = ['RA(2000)','DEC(2000)','PEAK INT','MAJOR AX','MINOR AX','POSANGLE',
                'Q CENTER','U CENTER','P FLUX','RES PEAK','RES FLUX']

# column_stack copies the selected fields into one array,
# so nothing below needs the FITS file to stay open
with fits.open('CATALOG.FIT') as hdulist:
    data = hdulist[1].data
    nvss_data = np.column_stack([data[column] for column in nvss_columns])

nvss_df = pd.DataFrame(data = nvss_data, columns = nvss_columns)
# sorted by declination: the positional matching relies on this ordering
nvss_df = nvss_df.sort_values(by=['DEC(2000)']).reset_index(drop = True)

# take an explicit copy of the filtered rows: adding the name column to a
# boolean-indexed slice otherwise raises pandas' SettingWithCopyWarning
nvss_df = nvss_df[df_in_patch(nvss_df['RA(2000)'],nvss_df['DEC(2000)'])].copy()

# NVSS rows carry no name here, so build the IAU designation from the position
nvss_labels = np.array([iau_designation(ra,dec) for ra,dec in
                        zip(nvss_df['RA(2000)'].values,nvss_df['DEC(2000)'].values)])
nvss_df['name_NVSS'] = nvss_labels
nvss_df = nvss_df.set_index('name_NVSS')

nvss_df.columns = ['ra_NVSS','dec_NVSS','peak_NVSS','major_ax_NVSS','minor_ax_NVSS','posangle_NVSS',
                  'q_centre_NVSS','u_centre_NVSS','polarised_NVSS','res_peak_NVSS','res_flux_NVSS']

nvss_df.to_csv('nvss.csv')

In [None]:
# positional matching the two surveys
tgss = tgss_df[['ra_TGSS','dec_TGSS']].values
nvss = nvss_df[['ra_NVSS','dec_NVSS']].values

nvss_dec_min = round(nvss[:,1].min(),1)
nvss_dec_max = round(nvss[:,1].max(),1)

# rough filtering using the declination ordering of NVSS
# this is a small scale of the process in sky_positional_matching.ipynb
nvss_ra,nvss_dec = nvss[:,0],nvss[:,1]
# the declination window below is only valid on sorted declinations
assert np.all(np.diff(nvss_dec) >= 0), 'nvss must be sorted by declination'

patch_matches = []
tqdmbar = tqdm(total=len(tgss))
for i1,(ra1,dec1) in enumerate(tgss):
    # binary-search the slice of NVSS with |dec - dec1| <= SEPARATION_LIMIT
    # rather than walking NVSS from the start for every TGSS source
    lo = np.searchsorted(nvss_dec,dec1 - SEPARATION_LIMIT,side='left')
    hi = np.searchsorted(nvss_dec,dec1 + SEPARATION_LIMIT,side='right')

    # flat-sky box test inside the window; the ra offset is scaled by
    # cos(dec) to account for the convergence of meridians
    ra_close  = (np.abs((ra1 - nvss_ra[lo:hi])*np.cos(dec1*np.pi/180))
                 < SEPARATION_LIMIT)
    dec_close = np.abs(dec1 - nvss_dec[lo:hi]) < SEPARATION_LIMIT
    patch_matches.extend((i1,lo + offset)
                         for offset in np.flatnonzero(ra_close & dec_close))

    tqdmbar.postfix = 'matches = {}'.format(len(patch_matches))
    tqdmbar.update(1)
tqdmbar.close()
patch_matches = np.array(patch_matches)

In [None]:
# refine the rough candidates: keep only the pairs whose true
# great-circle separation is below the limit
tmp_patch_matches = [[i1,i2] for i1,i2 in tqdm(patch_matches)
                     if degdist(tgss[i1],nvss[i2]) < SEPARATION_LIMIT]
patch_matches = np.array(tmp_patch_matches)

# empty catalogue whose columns are the TGSS fields (name included)
# followed by the NVSS fields (name included)
patch_cat_columns = np.concatenate([frame.reset_index().columns.values
                                    for frame in (tgss_df,nvss_df)])
patch_cat = pd.DataFrame(columns=patch_cat_columns)

In [None]:
# construction of combined match catalogue
# built in one vectorised pass: DataFrame.append was removed in pandas 2.0
# and growing a frame row-by-row is quadratic in the number of matches
FREQ_TGSS,FREQ_NVSS = 150e6,1.4e9

# reset the indices once, so that the integer positions stored in
# patch_matches line up with .iloc and the names become ordinary columns
tgss_flat = tgss_df.reset_index()
nvss_flat = nvss_df.reset_index()

# reshape so that an empty match list still yields a (0,2) index array
match_idx = np.asarray(patch_matches,dtype=int).reshape(-1,2)
matched_tgss = tgss_flat.iloc[match_idx[:,0]].reset_index(drop=True)
matched_nvss = nvss_flat.iloc[match_idx[:,1]].reset_index(drop=True)

# one row per match: TGSS columns followed by NVSS columns
patch_cat = pd.concat([matched_tgss,matched_nvss],axis=1)

# separation in degrees between the two members of each match
patch_cat['separation'] = degdist(
    (patch_cat['ra_TGSS'].values,patch_cat['dec_TGSS'].values),
    (patch_cat['ra_NVSS'].values,patch_cat['dec_NVSS'].values))

# adding derived feature of spectral index will prove useful
patch_cat['spectral_alpha'] = (np.log(patch_cat['peak_TGSS']/patch_cat['peak_NVSS'])
                               /np.log(FREQ_NVSS/FREQ_TGSS))

patch_cat = patch_cat.set_index(['name_TGSS','name_NVSS'])

In [None]:
def separation_scorer(name_TGSS,name_NVSS,threshold=40):
    """given a name in each catalogue s.t. match is in patch_catalogue
    returns a score based off of the separation between sources

    name_TGSS, name_NVSS -- the two index levels of patch_cat
    threshold -- separation in arcseconds at which the score reaches 0

    the score falls linearly from 1 at zero separation to 0 at threshold,
    and is 0 for pairs that are not in patch_cat at all
    note the default threshold of 40'' cuts off the second mode
    of separations as seen in hist_angle.pdf from sky_positional_matching.ipynb"""
    match_key = (name_TGSS,name_NVSS)
    if match_key not in patch_cat.index:
        return 0
    # explicit (row tuple, column) lookup, so that the second name can
    # never be interpreted as a column label; degrees -> arcseconds
    sep_arcsec = patch_cat.loc[match_key,'separation']*3600
    return max(0,(threshold-sep_arcsec)/threshold)

In [None]:
# labels to train against are the rounded separation scores;
# score every match, order the catalogue by score, and write it to disk
scores = [separation_scorer(*name_pair)
          for name_pair in tqdm(patch_cat.index.values)]
patch_cat['score'] = scores
patch_cat.sort_values(by='score',inplace=True)
patch_cat.to_csv('patch_catalogue.csv')

In [None]:
# histogram of the non-zero separation scores, saved as a pdf;
# observe 0.8 cut-off decision
scores = patch_cat['score'].values

# the font size is set before the axes exist so that it applies to all text
plt.rcParams.update({'font.size': 18})
fig,ax = plt.subplots(figsize=(14,7))
ax.hist(scores[scores>0],bins=80,density=True,color='slateblue')
ax.set_title('scores from separation for matches inside patch')
ax.set_xlabel('scores')
ax.set_ylabel('pdf')
fig.savefig('patch_scores_dist.pdf')