In [12]:
import pandas as pd
import numpy as np
import os
from PIL import Image, ImageDraw
from matplotlib import pyplot as plt

from astropy.io import fits
from astropy.wcs import WCS
from photutils import detect_threshold
from astropy.stats import gaussian_fwhm_to_sigma
from photutils import detect_sources
from astropy.convolution import Gaussian2DKernel

from pyspark import SparkContext
from pyspark.sql import SparkSession

import os, fnmatch
import warnings
import shutil

In [13]:
try:
    sc = SparkContext(appName="SDDM") #, master='spark://fs.dslc.liacs.nl:7078')
except ValueError:
    warnings.warn("SparkContext already exists in this scope")
    

  after removing the cwd from sys.path.


In [14]:
IMAGE_PATH = '/data/astronomy-big-data/bc96e9620e41b9aba98292d37b5eec24/LoTSS_DR2_mosaic/' # './images_upper'
WRITE_PATH = './temp'

In [15]:
# Take two fits files
fits_files = [IMAGE_PATH + f for f in os.listdir(IMAGE_PATH)]

In [16]:
# Put the paths in an RDD and determine number of partitions
# More partitions == more cpu and faster (can crash when partitions are too high)
file_paths = sc.parallelize(fits_files, 8) # , len(fits_files)
file_paths.collect()

['/data/astronomy-big-data/bc96e9620e41b9aba98292d37b5eec24/LoTSS_DR2_mosaic/P223+52_mosaic-blanked.fits',
 '/data/astronomy-big-data/bc96e9620e41b9aba98292d37b5eec24/LoTSS_DR2_mosaic/P174+57_mosaic-blanked.fits',
 '/data/astronomy-big-data/bc96e9620e41b9aba98292d37b5eec24/LoTSS_DR2_mosaic/P4Hetdex16_mosaic-blanked.fits',
 '/data/astronomy-big-data/bc96e9620e41b9aba98292d37b5eec24/LoTSS_DR2_mosaic/P022+34_mosaic-blanked.fits',
 '/data/astronomy-big-data/bc96e9620e41b9aba98292d37b5eec24/LoTSS_DR2_mosaic/P18Hetdex03_mosaic-blanked.fits',
 '/data/astronomy-big-data/bc96e9620e41b9aba98292d37b5eec24/LoTSS_DR2_mosaic/P205+42_mosaic-blanked.fits',
 '/data/astronomy-big-data/bc96e9620e41b9aba98292d37b5eec24/LoTSS_DR2_mosaic/P236+48_mosaic-blanked.fits',
 '/data/astronomy-big-data/bc96e9620e41b9aba98292d37b5eec24/LoTSS_DR2_mosaic/P245+55_mosaic-blanked.fits',
 '/data/astronomy-big-data/bc96e9620e41b9aba98292d37b5eec24/LoTSS_DR2_mosaic/P10Hetdex_mosaic-blanked.fits',
 '/data/astronomy-big-data/b

In [17]:
# Map the RDD with paths to the fits.open() function to get the content in the .fits
fits_content = file_paths.map(lambda file: fits.open(file)[0])
fits_content.getNumPartitions()

8

In [18]:
# Save pair of fits content: threshold matrix in a RDD.
fits_thresh = fits_content.map(lambda content: (content, detect_threshold(content.data , nsigma=3.)))

In [19]:
# Sigma?
sigma = 3.0 * gaussian_fwhm_to_sigma  # FWHM = 3.

# Use kernel (3x3) to find borders of sources
kernel = Gaussian2DKernel(sigma, x_size=3, y_size=3)
kernel.normalize()

# sources = object with label of sources for each pixel in the fits file
# Save pair of fits content: source matrix in a RDD.
fits_sources = fits_thresh.map(lambda ft: (ft[0].header["OBJECT"],
                                           detect_sources(ft[0].data, ft[1], npixels=16, filter_kernel=kernel),
                                           )
                                     )
fits_sources.getNumPartitions()

8

In [20]:
def getCombs(obj, source, df, num_pxl=5):
    rows = df[df["mosaic"] == obj]
    info = []
    
    for i in rows.index:
        r = rows.loc[i]
        
        w, h = source.data.shape
        x = r['brightest_pixel_x']
        y = r['brightest_pixel_y']
        num_pxl = max(r['x_pixels'], r['y_pixels']) + 5

        left = x - num_pxl if x - num_pxl > 0 else 0
        top = y - num_pxl if y - num_pxl > 0 else 0
        right = x + num_pxl if x + num_pxl < w else w
        bottom = y + num_pxl if y + num_pxl < h else h

        r_crop = source.data[top:bottom, left:right]
        cmap = source.make_cmap(random_state=12345)

        if i % 2 != 0:
            i -= 1
        
        info.append( (i, cmap, r_crop) )
    
    return (obj, info)

In [21]:
# Combine 2 rdds by key using fits_sources
df = pd.read_csv('./my_csv_0016.csv')
df.head()
r = df[df["mosaic"] == "P244+48"]
r.index

Int64Index([ 6004,  6006,  6008,  6010,  6012,  6014,  6016,  6018,  6020,
             6022,  6024,  6026,  6028,  6030,  6032,  6034,  6036,  6038,
             6040,  6721,  6723,  6725,  6727,  7881,  7883,  7885,  7887,
             7889,  7891, 19313, 19315, 19317, 19319, 19321, 19323, 19325,
            19327, 19329],
           dtype='int64')

In [24]:
df[df.index % 2 == 0]

Unnamed: 0,label,total_pixels,x_pixels,y_pixels,integrated_intensity,brightest_pixel,brightest_pixel_x,brightest_pixel_y,brightest_pixel_RA,brightest_pixel_DEC,...,center_of_gaus_fit_DEC,fit_x_axis,fit_y_axis,fit_theta,deconv_x,deconv_y,integrated_intensity_fit,ratio_residual,mosaic,comp
0,P219+42_6784,120,14,12,3.461160,0.001688,6330,3292,218.536494,42.111153,...,42.111861,2.562108,3.227591,0.185748,4.516719,6.462664,3.557838,0.042170,P219+42,6784
2,P219+42_6939,401,22,32,27.160097,0.006599,6502,3323,218.439605,42.122944,...,42.118790,11.189520,2.361998,0.293541,26.043924,3.864808,26.501162,0.234669,P219+42,6939
4,P219+42_7067,17,3,9,0.293204,0.000570,6650,3803,218.351818,42.321883,...,42.319909,0.771921,9.000000,-1.027259,0.000000,20.812481,0.271111,0.200686,P219+42,7067
6,P219+42_7533,70,9,10,2.778333,0.003007,7209,3903,218.035617,42.359081,...,42.358853,1.985885,1.821843,-0.054129,2.422545,1.550828,2.780752,0.035453,P219+42,7533
8,P219+42_7698,37,6,8,0.649759,0.000741,7443,2263,217.924149,41.673772,...,41.673523,1.905148,2.001231,-0.966779,2.031430,2.491592,0.662973,0.071871,P219+42,7698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22718,P154+50_5534,76,11,10,2.331464,0.002480,5005,6728,154.601074,50.518442,...,50.519388,1.983631,1.854266,0.376929,2.412287,1.750998,2.310770,0.048106,P154+50,5534
22720,P154+50_6098,30,6,7,0.461031,0.000660,5415,8445,154.323710,51.233045,...,51.232600,1.585476,1.921116,5.536534,0.000000,2.113174,0.473917,0.082271,P154+50,6098
22722,P154+50_6307,23,6,6,0.334681,0.000573,5587,7003,154.218049,50.631477,...,50.631207,1.533286,1.454660,0.537088,0.000000,0.000000,0.349570,0.113788,P154+50,6307
22724,P154+50_6492,27,6,6,0.342309,0.000507,5706,6251,154.144863,50.317652,...,50.317719,1.820558,1.673504,0.564605,1.542437,0.000000,0.350588,0.078688,P154+50,6492


In [22]:
# Combine 2 rdds by key using fits_sources
df = pd.read_csv('./my_csv_0016.csv')

combs = fits_sources.map(lambda fs: getCombs(fs[0], fs[1], df)).flatMap(lambda c: c[1])

In [38]:
def getTwoImgs(r1_crop, r1_cmap, r2_crop, r2_cmap):
    # num_pxl = max(r1['total_pixels'], r2['total_pixels'])
    # r1_crop, rect1, cmap1 = crop(r1, num_pxl, r1_source)
    # r2_crop, rect2, cmap2 = crop(r2, num_pxl, r2_source)

    fig, ax = plt.subplots(1,2)
    # ax[0].add_artist(rect1)
    ax[0].imshow(r1_crop, cmap=r1_cmap)
    ax[0].scatter(r1_crop.shape[0]// 2, r1_crop.shape[1] // 2, s=4, c='red')
    ax[0].set_title("%s;#pxl:%s"%(r1['label'], r1['total_pixels']))
    
    # ax[1].add_artist(rect2)
    ax[1].imshow(r2_crop, cmap=r2_cmap)
    ax[1].scatter(r1_crop.shape[0]// 2, r1_crop.shape[1] // 2, s=4, c='red')
    ax[1].set_title("%s;#pxl:%s"%(r2['label'], r2['total_pixels']))

    # plt.savefig(os.path.join(WRITE_PATH, '%s.png'%(name)))
    return (fig, ax)

def crop(r, num_pxl, r_source):
    r_img = r_source._data
    
    w, h = r_img.shape

    x = r['brightest_pixel_x']
    y = r['brightest_pixel_y']
    # x = r['center_of_mass_x']
    # y = r['center_of_mass_y']

    # draw a 5x5 rectangle around the brightest pixel
    # draw = ImageDraw.Draw(r_source._data)
    # draw.rectangle(((x-5, y-5), (x+5, y+5)), outline='red')
    rect = plt.Rectangle((x, y), 5, 5, color='red', fill=False)

    # crop cout a sub-image with a size of num_pxl by num_pxl
    # the briestest pixel should be at the center of the sub-image
    left = x - num_pxl if x - num_pxl > 0 else 0
    top = y - num_pxl if y - num_pxl > 0 else 0
    right = x + num_pxl if x + num_pxl < w else w
    bottom = y + num_pxl if y + num_pxl < h else h

    print(left, top, right, bottom)
    print('Ra:', r['brightest_pixel_RA'])
    print('Dec:', r['brightest_pixel_DEC'])
    print('Noise Threshold:', threshold)
    r_crop = r_img[top:bottom, left:right]
    cmap = r_source.make_cmap(random_state=12345)
    return r_crop, rect, cmap


In [None]:
# test = combs.reduceByKey(lambda r1, r2: getTwoImgs(r1[0], r1[1], r2[0], r2[1])).collect()
# test = combs.collect()
if os.path.isdir('source_matrices'):
    shutil.rmtree('source_matrices')
    

combs.coalesce(1).saveAsTextFile("source_matrices")

In [None]:

overlapping_sources = {}
for t in test:
    k = t[0]
    if k in overlapping_sources.keys():
        overlapping_sources[k][1] = t[1:]
    else:
        overlapping_sources[k] = [t[1:], None]

print(len(overlapping_sources.keys()))

In [None]:
for k, v in overlapping_sources.items():
    if v[1] is not None:
        r1_cmap, r1_crop, r2_cmap, r2_crop = v

        fig, ax = plt.subplots(1,2)

        ax[0].imshow(r1_crop, cmap=r1_cmap)
        ax[0].scatter(r1_crop.shape[0]// 2, r1_crop.shape[1] // 2, s=4, c='red')
        ax[0].set_title("%s;#pxl:%s"%(r1['label'], r1['total_pixels']))

        ax[1].imshow(r2_crop, cmap=r2_cmap)
        ax[1].scatter(r1_crop.shape[0]// 2, r1_crop.shape[1] // 2, s=4, c='red')
        ax[1].set_title("%s;#pxl:%s"%(r2['label'], r2['total_pixels']))
    else:
        print(k)

# Other function

In [20]:
getSource = lambda s: os.path.join(IMAGE_PATH, ''.join(fnmatch.filter(os.listdir(IMAGE_PATH), s['mosaic'] + '*.fits')))

df = pd.read_csv('./my_csv_0016.csv')

r1 = df.iloc[0]
r1_source = fits_sources.lookup(getSource(r1))

for i in range(0, len(df), 2):
    r2 = df.iloc[i+1]
    
    print(r1.equals(df.iloc[i]))
    
    if r1.equals(df.iloc[i]):
        r1 = df.iloc[i]
    else:
        r1 = df.iloc[i]
        r1_source = fits_sources.lookup(getSource(r1))
    
    r2_source = fits_sources.lookup(getSource(r2))
    
    getTwoImgs(r1, r2, i, r1_source, r2_source)
    

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



KeyboardInterrupt: 