In [1]:
import random
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Circle
import pandas as pd

import photutils
from photutils import detect_threshold
from astropy.io import fits
from astropy.wcs import WCS
from astropy.modeling.fitting import LevMarLSQFitter
from astropy.modeling.functional_models import Gaussian2D

from astropy.convolution import Gaussian2DKernel
from astropy.stats import gaussian_fwhm_to_sigma
from photutils import detect_sources
import time

import shutil
from math import pi, log, sqrt
from tqdm.auto import tqdm
import fnmatch
import random
import warnings
warnings.filterwarnings("ignore")



In [2]:
# Data locations: all three directories live under one shared root
_DATA_ROOT = "/data/astronomy-big-data/bc96e9620e41b9aba98292d37b5eec24/"

DR2_path = _DATA_ROOT + "LoTSS_DR2/"
mosaic_path = _DATA_ROOT + "LoTSS_DR2_mosaic/"      # directory listed for mosaic FITS files
writeable = _DATA_ROOT + "LoTSS_DR2_writable/"      # catalogue CSVs are read from / written to here

# Calculate mosaic radius

In [3]:
def mosaic(file):
    """Return an approximate circular footprint for one mosaic FITS file.

    Only the primary HDU header is read (``BMAJ``, ``NAXIS1``, ``NAXIS2`` and
    the WCS keywords); the image data is never loaded.

    Parameters
    ----------
    file : str
        Path of the FITS file to inspect.

    Returns
    -------
    numpy.ndarray
        Array ``(radius, ra, dec)`` where ``radius`` is
        ``max(NAXIS1, NAXIS2) / 2 * BMAJ * 3/8`` and ``ra``/``dec`` are the
        world coordinates of the pixel ``(NAXIS1/2, NAXIS2/2)``.

    Raises
    ------
    KeyError
        If one of the required header keywords is missing.
    """
    # The context manager closes the file even when a keyword lookup or the
    # WCS construction fails; the original left one open handle per mosaic.
    with fits.open(file) as hdul:
        header = hdul[0].header

        # NOTE(review): the original comment here read "Should be 1/4 * BMAJ";
        # the scale actually applied is BMAJ * 3/8 (see below) — confirm which
        # pixel-to-angle conversion is intended.
        conv = header["BMAJ"]
        x = header["NAXIS1"]
        y = header["NAXIS2"]

        # World coordinates of the image centre (0-based pixel origin)
        w = WCS(header)
        ra, dec = w.all_pix2world(x / 2, y / 2, 0, ra_dec_order=True)

    radius = (max(x, y) / 2) * conv * (3 / 8)

    return np.array((radius, ra, dec))

In [4]:
# Every entry of the mosaic directory is treated as a FITS file; row k of
# `info` holds (radius, ra, dec) for fits_files[k].
fits_files = [mosaic_path + name for name in os.listdir(mosaic_path)]
info = np.array([mosaic(path) for path in fits_files])

# Find overlapping mosaics

In [5]:
def _angular_separation_deg(ra1, dec1, ra2, dec2):
    """Great-circle separation in degrees between two positions given in degrees."""
    ra1, dec1, ra2, dec2 = np.radians((ra1, dec1, ra2, dec2))
    # Haversine formula: accounts for the cos(dec) compression of RA and is
    # insensitive to the 0/360 degree RA wrap-around.
    sin_half_ddec = np.sin((dec2 - dec1) / 2)
    sin_half_dra = np.sin((ra2 - ra1) / 2)
    hav = sin_half_ddec ** 2 + np.cos(dec1) * np.cos(dec2) * sin_half_dra ** 2
    # Clip guards against rounding pushing the value marginally outside [0, 1]
    return np.degrees(2 * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0))))


# Map every mosaic file name to the file names of the mosaics it overlaps with
overlap_dict = {k.split('/')[-1]: [] for k in fits_files}

for i, f1 in enumerate(fits_files):
    f1 = f1.split('/')[-1]
    # Visit each unordered pair once (j > i) and record it in both directions
    for j in range(i + 1, len(fits_files)):
        f2 = fits_files[j].split('/')[-1]
        # The previous flat Euclidean distance on (RA, Dec) overestimated the
        # separation away from the equator and broke across RA = 0/360.
        distance = _angular_separation_deg(info[i][1], info[i][2],
                                           info[j][1], info[j][2])
        # Two circular footprints touch when the centres are no further apart
        # than the sum of the radii.
        # NOTE(review): assumes the radii are in degrees, as the original
        # comparison against RA/Dec differences already did — confirm.
        radius = info[i][0] + info[j][0]
        if distance <= radius:
            overlap_dict[f1].append(f2)
            overlap_dict[f2].append(f1)

# Pair mosaics

In [6]:
# Collapse the per-mosaic overlap lists into unique pairs of mosaic names
# (the file-name part before the first underscore); sorting the two names
# makes (a, b) and (b, a) the same pair.
pairs = []
for filename, neighbours in overlap_dict.items():
    own_name = filename.split('_')[0]
    for neighbour in neighbours:
        pair = tuple(sorted((own_name, neighbour.split('_')[0])))
        if pair in pairs:
            continue
        pairs.append(pair)
print(pairs[:5])

[('P223+50', 'P223+52'), ('P219+50', 'P223+52'), ('P223+52', 'P227+50'), ('P219+52', 'P223+52'), ('P218+55', 'P223+52')]


# Prepare catalogue

In [7]:
# Column names applied to the catalogue CSV (passed via `names=`, so the
# file's first line is read as data, not as a header row)
header = ["label", "total_pixels", "x_pixels", "y_pixels",
          "integrated_intensity", "brightest_pixel", "brightest_pixel_x", "brightest_pixel_y",
          "brightest_pixel_RA", "brightest_pixel_DEC", "center_of_mass_x", "center_of_mass_y",
          "center_of_mass_RA", "center_of_mass_DEC", "center_of_gaus_fit_x", "center_of_gaus_fit_y",
          "center_of_gaus_fit_RA", "center_of_gaus_fit_DEC", "fit_x_axis", "fit_y_axis", "fit_theta",
          "deconv_x", "deconv_y", "integrated_intensity_fit", "ratio_residual"
         ]
catalogue = pd.read_csv(writeable + "catalogue_v5.csv", sep=",", names=header)

# Filter nans: report the share of rows with at least one missing value
has_nan = catalogue.isnull().any(axis=1)
rows = len(catalogue)
nan_rows = catalogue[has_nan]
ratio_nan = len(nan_rows) / rows if rows else 0.0  # an empty file has no NaN rows
print('{0:.2f}% nan rows'.format(ratio_nan * 100))

# Keep complete rows only. A boolean mask is used instead of matching on index
# labels so duplicated index values cannot drop extra rows; `.copy()` replaces
# the argument-less `.reindex()`, which did not change the index either.
catalogue = catalogue[~has_nan].copy()
catalogue['overlap'] = 0

# Add mosaic: split "<mosaic>_<object>" at the first underscore.
# `expand=True` with keyword `n` is used because tuple-unpacking the `.str`
# accessor and the positional `n` argument are rejected by recent pandas.
label_parts = catalogue['label'].str.split('_', n=1, expand=True)
catalogue['mosaic'] = label_parts[0]
catalogue['object'] = label_parts[1].astype(int)
catalogue.head()

0.15% nan rows


Unnamed: 0,label,total_pixels,x_pixels,y_pixels,integrated_intensity,brightest_pixel,brightest_pixel_x,brightest_pixel_y,brightest_pixel_RA,brightest_pixel_DEC,...,fit_x_axis,fit_y_axis,fit_theta,deconv_x,deconv_y,integrated_intensity_fit,ratio_residual,overlap,mosaic,object
0,P191+55_0,43,7,7,0.961113,0.001262,4195,13,191.526076,53.102602,...,1.719762,1.896624,-0.532125,0.632697,1.986707,0.971715,0.020838,0,P191+55,0
1,P191+55_1,20,7,5,0.326739,0.000757,4605,27,191.241494,53.108014,...,1.519698,1.080085,0.057949,0.0,0.0,0.330021,0.193748,0,P191+55,1
2,P191+55_2,45,8,7,0.745168,0.000793,3745,32,191.838427,53.11019,...,2.20038,1.81005,-0.030505,3.29362,1.472263,0.761026,0.051991,0,P191+55,2
3,P191+55_3,43,10,7,0.70281,0.000809,3775,33,191.817606,53.110655,...,1.720535,2.116901,0.055498,0.644249,2.9748,0.705155,0.082425,0,P191+55,3
4,P191+55_4,74,14,9,1.104583,0.000829,4346,35,191.421253,53.111697,...,3.113873,2.061817,-0.772092,6.145499,2.751915,1.080358,0.16109,0,P191+55,4


# Finding overlapping sources

In [8]:
# Number the mosaics by their position in `overlap_dict` and keep the lookup
# in both directions (name -> id and id -> name).
mosaic_names = [filename.split('_')[0] for filename in overlap_dict]
mosaic_dict = {name: idx for idx, name in enumerate(mosaic_names)}
mosaic_dict_rev = dict(enumerate(mosaic_names))
mosaic_dict

{'P223+52': 0,
 'P174+57': 1,
 'P4Hetdex16': 2,
 'P022+34': 3,
 'P18Hetdex03': 4,
 'P205+42': 5,
 'P236+48': 6,
 'P245+55': 7,
 'P10Hetdex': 8,
 'P32Hetdex08': 9,
 'P223+50': 10,
 'P233+60': 11,
 'P121+32': 12,
 'P178+55': 13,
 'P235+50': 14,
 'P12Hetdex11': 15,
 'P228+60': 16,
 'P15Hetdex13': 17,
 'P143+52': 18,
 'P202+42': 19,
 'P219+50': 20,
 'P181+60': 21,
 'P145+57': 22,
 'P217+47': 23,
 'P141+45': 24,
 'P227+50': 25,
 'P226+42': 26,
 'P156+42': 27,
 'P155+60': 28,
 'P163+45': 29,
 'P207+45': 30,
 'P176+60': 31,
 'P238+60': 32,
 'P229+45': 33,
 'P11Hetdex12': 34,
 'P202+60': 35,
 'P35Hetdex10': 36,
 'P236+53': 37,
 'P155+52': 38,
 'P027+31': 39,
 'P345+33': 40,
 'P000+38': 41,
 'P030+39': 42,
 'P145+45': 43,
 'P215+50': 44,
 'P222+60': 45,
 'P5Hetdex': 46,
 'P181+42': 47,
 'P013+26': 48,
 'P34Hetdex06': 49,
 'P151+55': 50,
 'P229+48': 51,
 'P36Hetdex10': 52,
 'P163+42': 53,
 'P017+39': 54,
 'P146+42': 55,
 'P191+55': 56,
 'P193+57': 57,
 'P035+34': 58,
 'P209+42': 59,
 'P8Hetdex':

In [9]:
# Attach the integer mosaic id to each source; a mosaic name that is missing
# from `mosaic_dict` raises KeyError.
catalogue["mosaic_id"] = catalogue["mosaic"].map(mosaic_dict.__getitem__)
catalogue[["mosaic", "mosaic_id"]]

Unnamed: 0,mosaic,mosaic_id
0,P191+55,56
1,P191+55,56
2,P191+55,56
3,P191+55,56
4,P191+55,56
...,...,...
2291390,P198+57,139
2291391,P198+57,139
2291392,P198+57,139
2291393,P198+57,139


In [10]:
def detectOverlap(data, labels, eps=0.0016):
    """Pair up neighbouring rows that come from different mosaics and lie close together.

    Only consecutive rows are compared, so ``data`` is expected to be sorted
    such that candidate matches are adjacent.

    Parameters
    ----------
    data : sequence
        Row ``k`` holds the two coordinates of source ``k`` (``data[k][0]``,
        ``data[k][1]``).
    labels : sequence
        Row ``k`` identifies source ``k``; ``labels[k][0]`` is the mosaic id
        used to decide whether two rows belong to the same mosaic.
    eps : float, optional
        Largest allowed absolute difference per coordinate for two rows to
        count as the same source. Defaults to 0.0016, the previously
        hard-coded value.

    Returns
    -------
    numpy.ndarray
        One entry per detected pair holding the two label rows cast to int and
        ordered by their first element; an empty array when nothing matches.
    """
    obj_rows = []
    i = 0

    while i < len(data) - 1:
        # Same mosaic: a source cannot overlap with one from its own mosaic
        if labels[i][0] == labels[i + 1][0]:
            i += 1
            continue

        close_in_first = abs(data[i + 1][0] - data[i][0]) <= eps
        close_in_second = abs(data[i + 1][1] - data[i][1]) <= eps
        if close_in_first and close_in_second:
            # Store the pair with the lower mosaic id first
            pair = sorted([labels[i], labels[i + 1]], key=lambda x: x[0])
            obj_rows.append(np.array(pair).astype(int))
            # Only 2 sources can overlap: skip past both rows of the pair
            i += 2
        else:
            i += 1

    return np.array(obj_rows)

In [85]:
# Preview of the ordering used for matching: RA first, Dec as tie-breaker
catalogue.sort_values(["brightest_pixel_RA", "brightest_pixel_DEC"])[
    ["brightest_pixel_RA", "brightest_pixel_DEC", "mosaic_id", "object", "total_pixels"]
].head()

Unnamed: 0,brightest_pixel_RA,brightest_pixel_DEC,mosaic_id,object,total_pixels
693520,0.000432,38.996926,41,3985,20
693512,0.001013,38.993597,41,3977,32
690411,0.001123,37.227152,41,876,182
692946,0.001379,38.68441,41,3411,57
694254,0.001981,39.440752,41,4719,110


In [82]:
# Sort the sources by position and pull the five needed columns into one array
position_sorted = np.array(
    catalogue.sort_values(by=["brightest_pixel_RA", "brightest_pixel_DEC"])
    .loc[:, ["brightest_pixel_RA", "brightest_pixel_DEC", "mosaic_id", "object", "total_pixels"]]
)

# Columns 0-1: position, columns 2-3: (mosaic_id, object), column 4: total_pixels
data = position_sorted[:, 0:2]
labels = position_sorted[:, 2:4].astype(int)
sizes = position_sorted[:, 4:5].astype(int)

In [86]:
# Show the first five rows of each of the three aligned arrays
for preview in (data, labels, sizes):
    print(preview[:5])

[[4.32460168e-04 3.89969258e+01]
 [1.01317936e-03 3.89935968e+01]
 [1.12254014e-03 3.72271517e+01]
 [1.37903447e-03 3.86844096e+01]
 [1.98145617e-03 3.94407516e+01]]
[[  41 3985]
 [  41 3977]
 [  41  876]
 [  41 3411]
 [  41 4719]]
[[ 20]
 [ 32]
 [182]
 [ 57]
 [110]]


In [122]:
print("Finding overlaps between pairs")
eps = 0.0016 # Define distance between brightest pixel


def _label_group(group, group_sizes):
    """Stack the (mosaic_id, object) rows of one group and append an overlap flag.

    The member with the largest size gets flag 2, every other member flag 1.
    """
    overlap_labs = np.ones((len(group), 1))
    overlap_labs[np.argmax(group_sizes)] += 1
    return np.append(np.vstack(group), overlap_labs, axis=1)


# Each entry of overlap_rows is one finished group: rows of
# (mosaic_id, object, overlap flag).
overlap_rows = []
# Labels, positions and sizes of the group currently being built
group, group_loc, group_sizes = ([], [], [])

# Walk over neighbouring rows (i, i+1) of the position-sorted arrays.
# NOTE(review): distances are plain Euclidean distances on the two position
# columns and only rows that are adjacent in the sort order (or adjacent to an
# open group) are compared — confirm this is acceptable for the catalogue.
for i in tqdm(range(len(data) - 1)):
    if labels[i][0] == labels[i+1][0]:
        # Same mosaic: the next source cannot extend the current group
        matched = False
    elif sqrt(np.sum(np.power(data[i] - data[i+1], 2))) <= eps:
        # Neighbours from different mosaics within eps: start a group with the
        # current row if none is open yet
        if len(group) == 0:
            group.append(labels[i])
            group_loc.append(data[i])
            group_sizes.append(sizes[i])
        matched = True
    elif len(group) > 0:
        # Not close to row i, but maybe close to an earlier group member;
        # the last member is row i itself, which was just ruled out
        earlier = np.asarray(group_loc[:-1])
        matched = np.min(np.sqrt(np.sum(np.power(earlier - data[i+1], 2), axis=1))) <= eps
    else:
        matched = False

    if matched:
        group.append(labels[i+1])
        group_loc.append(data[i+1])
        group_sizes.append(sizes[i+1])
    elif len(group) > 0:
        # The group cannot grow any further: store it and start afresh
        overlap_rows.append(_label_group(group, group_sizes))
        group, group_loc, group_sizes = ([], [], [])

# A group that is still open after the last row used to be dropped silently
if len(group) > 0:
    overlap_rows.append(_label_group(group, group_sizes))
    group, group_loc, group_sizes = ([], [], [])

Finding overlaps between pairs


HBox(children=(FloatProgress(value=0.0, max=2287877.0), HTML(value='')))




# Label overlapping sources

In [134]:
# Unroll the per-group matrices into one long list of
# (mosaic_id, object, overlap flag) rows and cast everything to int
source_rows = []
for group_matrix in overlap_rows:
    source_rows.extend(group_matrix)
flatten_source_labels = np.array(source_rows, dtype=int)
print(flatten_source_labels.shape)
flatten_source_labels[:5]

(224446, 3)


array([[ 149, 2529,    2],
       [  41, 3031,    1],
       [  41, 2920,    1],
       [ 149, 2401,    2],
       [ 149, 2189,    2]])

In [136]:
# Spot check: rows whose (mosaic_id, object) equals (149, 2529)
flatten_source_labels[np.all(flatten_source_labels[:, :2] == (149, 2529), axis=1)]

array([[ 149, 2529,    2]])

In [137]:
# Wrap the flat label array in a frame so it can be merged into the catalogue
df_source_labels = pd.DataFrame(flatten_source_labels)
df_source_labels.columns = ["mosaic_id", "object", "overlap"]
df_source_labels.head()

Unnamed: 0,mosaic_id,object,overlap
0,149,2529,2
1,41,3031,1
2,41,2920,1
3,149,2401,2
4,149,2189,2


In [140]:
# Spot check: the same source, now looked up in the frame
df_source_labels[df_source_labels["mosaic_id"].eq(149) & df_source_labels["object"].eq(2529)]

Unnamed: 0,mosaic_id,object,overlap
0,149,2529,2


In [143]:
# Left join keeps every catalogue row; sources without an overlap entry get
# NaN in the joined `overlap_y` column
new_cat = catalogue.merge(df_source_labels, how='left', on=['mosaic_id', 'object'])
new_cat.head()

Unnamed: 0,label,total_pixels,x_pixels,y_pixels,integrated_intensity,brightest_pixel,brightest_pixel_x,brightest_pixel_y,brightest_pixel_RA,brightest_pixel_DEC,...,fit_theta,deconv_x,deconv_y,integrated_intensity_fit,ratio_residual,overlap_x,mosaic,object,mosaic_id,overlap_y
0,P191+55_0,43,7,7,0.961113,0.001262,4195,13,191.526076,53.102602,...,-0.532125,0.632697,1.986707,0.971715,0.020838,0,P191+55,0,56,1.0
1,P191+55_1,20,7,5,0.326739,0.000757,4605,27,191.241494,53.108014,...,0.057949,0.0,0.0,0.330021,0.193748,0,P191+55,1,56,
2,P191+55_2,45,8,7,0.745168,0.000793,3745,32,191.838427,53.11019,...,-0.030505,3.29362,1.472263,0.761026,0.051991,0,P191+55,2,56,
3,P191+55_3,43,10,7,0.70281,0.000809,3775,33,191.817606,53.110655,...,0.055498,0.644249,2.9748,0.705155,0.082425,0,P191+55,3,56,
4,P191+55_4,74,14,9,1.104583,0.000829,4346,35,191.421253,53.111697,...,-0.772092,6.145499,2.751915,1.080358,0.16109,0,P191+55,4,56,


In [144]:
# Collapse the two overlap columns produced by the merge into one: the row-wise
# maximum of the placeholder (`overlap_x`) and the detected flag (`overlap_y`)
new_cat['overlap'] = new_cat[['overlap_x', 'overlap_y']].max(axis=1).astype(int)
new_cat = new_cat.drop(columns=['overlap_x', 'overlap_y'])
new_cat.head()

Unnamed: 0,label,total_pixels,x_pixels,y_pixels,integrated_intensity,brightest_pixel,brightest_pixel_x,brightest_pixel_y,brightest_pixel_RA,brightest_pixel_DEC,...,fit_y_axis,fit_theta,deconv_x,deconv_y,integrated_intensity_fit,ratio_residual,mosaic,object,mosaic_id,overlap
0,P191+55_0,43,7,7,0.961113,0.001262,4195,13,191.526076,53.102602,...,1.896624,-0.532125,0.632697,1.986707,0.971715,0.020838,P191+55,0,56,1
1,P191+55_1,20,7,5,0.326739,0.000757,4605,27,191.241494,53.108014,...,1.080085,0.057949,0.0,0.0,0.330021,0.193748,P191+55,1,56,0
2,P191+55_2,45,8,7,0.745168,0.000793,3745,32,191.838427,53.11019,...,1.81005,-0.030505,3.29362,1.472263,0.761026,0.051991,P191+55,2,56,0
3,P191+55_3,43,10,7,0.70281,0.000809,3775,33,191.817606,53.110655,...,2.116901,0.055498,0.644249,2.9748,0.705155,0.082425,P191+55,3,56,0
4,P191+55_4,74,14,9,1.104583,0.000829,4346,35,191.421253,53.111697,...,2.061817,-0.772092,6.145499,2.751915,1.080358,0.16109,P191+55,4,56,0


In [145]:
# Spot check a single source in the merged catalogue
new_cat[new_cat["mosaic_id"].eq(148) & new_cat["object"].eq(7047)]

Unnamed: 0,label,total_pixels,x_pixels,y_pixels,integrated_intensity,brightest_pixel,brightest_pixel_x,brightest_pixel_y,brightest_pixel_RA,brightest_pixel_DEC,...,fit_y_axis,fit_theta,deconv_x,deconv_y,integrated_intensity_fit,ratio_residual,mosaic,object,mosaic_id,overlap
1540850,P1Hetdex15_7047,19,7,5,0.154293,0.000271,4820,5864,205.955078,48.287714,...,0.916413,0.611163,5.412476,0.0,0.16019,0.116676,P1Hetdex15,7047,148,0


In [146]:
# Number of sources that take part in any overlap group
int((new_cat["overlap"] > 0).sum())

224446

In [147]:
# Number of overlapping sources flagged 1 (not the largest of their group)
int((new_cat["overlap"] == 1).sum())

113645

In [148]:
# Number of overlapping sources flagged 2 (the largest of their group)
int((new_cat["overlap"] == 2).sum())

110801

In [149]:
# Number of sources flagged 0 or 2, i.e. everything except the flag-1 sources
int(new_cat["overlap"].isin([0, 2]).sum())

2174234

# Save overlapping sources

In [51]:
# Write the overlap data into a fresh "overlapping_pairs" directory; output
# left over from an earlier run is removed first.
out = "overlapping_pairs"
if os.path.isdir(out):
    shutil.rmtree(out)

# NOTE(review): `flattened_grouped_overlaps` is not defined in any cell visible
# in this notebook. The coalesce/saveAsTextFile calls suggest a Spark RDD left
# over from an earlier version — confirm; as shown, this cell cannot run on a
# fresh kernel.
flattened_grouped_overlaps.map(lambda g: g).coalesce(1, shuffle = True).saveAsTextFile(out)

In [52]:
# Move the single part file to overlap_pairs.csv, then delete the output
# directory (`out` is assigned in the preceding cell).
os.replace("overlapping_pairs/part-00000", "overlap_pairs.csv")
shutil.rmtree(out)

In [54]:
# Save the catalogue with its overlap flags as catalogue_v5.4.csv in the
# writable directory.
# NOTE(review): `index` is left at the pandas default, so the frame index is
# presumably written as an extra first column — confirm downstream readers
# expect it.
cat_out = os.path.join(writeable, 'catalogue_v5.4.csv')
new_cat.to_csv(cat_out, sep=',')