In [None]:
import glob
import warnings

# Silence library warnings before the third-party imports so import-time
# warnings are suppressed as well.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from astropy import units as u
from astropy.coordinates import SkyCoord
from pyarrow import ArrowInvalid
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN
from tqdm import tqdm

In [None]:
def indexify(cat,band='NUV'):
    """Build a per-row identifier string for every source in ``cat``.

    The identifier has the form ``<ECLIPSE>_<LEG>_<row label>_<band>``, where
    the eclipse is left-padded with zeros to 5 characters, the leg to 2 and
    the row label (taken from ``cat.index``) to 5. Values that are already
    longer than the pad width are kept as they are.

    Parameters
    ----------
    cat : pandas.DataFrame
        Table with ``ECLIPSE`` and ``LEG`` columns.
    band : str, optional
        Suffix appended to every identifier (default ``'NUV'``).

    Returns
    -------
    pandas.Series
        One identifier per row, carrying the same index as ``cat``.
    """
    eclipse_part = cat['ECLIPSE'].astype(str).str.zfill(5)
    leg_part = cat['LEG'].astype(str).str.zfill(2)
    # The row label comes from the frame's index, not from a column.
    row_part = cat.index.astype(str).str.zfill(5)
    return eclipse_part + '_' + leg_part + '_' + row_part + '_' + band

In [None]:
# Pure-Python globbing instead of `!ls`: works outside IPython and yields an
# empty list (rather than an error message) when nothing matches.
nd_catfiles = sorted(glob.glob('data/*/*nd*catalog*'))
fd_catfiles = sorted(glob.glob('data/*/*fd*catalog*'))


def load_band_catalog(catfiles, band):
    """Read the catalog files of one band and stack them into one DataFrame.

    Each file is read with ``pd.read_parquet`` and gets a ``GLCAT_VISIT_ID``
    column from ``indexify``. Files that raise ``ArrowInvalid`` are skipped
    (best effort) and a single summary line reports how many were skipped.

    Parameters
    ----------
    catfiles : list of str
        Paths of the parquet catalog files to read.
    band : str
        Band label passed to ``indexify`` (``'NUV'`` or ``'FUV'`` here).

    Returns
    -------
    pandas.DataFrame
        All readable tables concatenated in file order. Row labels are kept
        as read from each file (they are not reset), so labels repeat across
        files. An empty DataFrame is returned when no file could be read.
    """
    tables = []
    skipped = []
    for f in tqdm(catfiles):
        try:
            tbl = pd.read_parquet(f)
            tbl['GLCAT_VISIT_ID'] = indexify(tbl,band=band)
        except ArrowInvalid:
            skipped.append(f)
            continue
        tables.append(tbl)
    if skipped:
        print(f'Skipped {len(skipped)} unreadable {band} catalog file(s)')
    if not tables:
        # pd.concat raises on an empty list; keep the "empty catalog" result.
        return pd.DataFrame()
    # One concat at the end avoids re-copying the growing frame on every file.
    return pd.concat(tables)


nuv_catalog = load_band_catalog(nd_catfiles, band='NUV')
fuv_catalog = load_band_catalog(fd_catfiles, band='FUV')

In [None]:
def compute_separation_matrix(ra1, dec1, ra2, dec2):
    """
    Pairwise angular separation between two lists of sky positions.

    All four inputs are multiplied by ``u.degree`` and wrapped in SkyCoord
    objects. The first set is given a trailing axis and the second a leading
    axis so that a single ``separation`` call broadcasts over every pair.

    Returns an array of shape (len(ra1), len(ra2)) holding the separations
    in arcseconds.
    """
    first_set = SkyCoord(ra=ra1*u.degree, dec=dec1*u.degree)
    second_set = SkyCoord(ra=ra2*u.degree, dec=dec2*u.degree)
    # Column vector against row vector -> full (N1, N2) grid of pairs.
    pairwise = first_set[:, None].separation(second_set[None, :])
    return pairwise.arcsec

# compute_separation_matrix(
#     nuv_catalog['RA'].values, nuv_catalog['DEC'].values,
#     fuv_catalog['RA'].values, fuv_catalog['DEC'].values
# )



In [None]:
# Scan the NUV catalog for the first source that has at least one neighbour
# from a *different* visit inside a small box, then stop. After the loop,
# `entry` is that source, `matches` its neighbours and `sep` their angular
# separations from it in arcseconds.
cat = nuv_catalog
boxwidth = 0.5/60 # 0.5 arcmin, expressed in degrees
# iterrows() has no length, so pass total= to give tqdm a real progress bar.
for index,entry in tqdm(nuv_catalog.iterrows(), total=len(nuv_catalog)):
    eclipse,leg,ra,dec = entry[['ECLIPSE','LEG','RA','DEC']]
    # Wrap the RA difference into [-180, 180) so sources on either side of
    # RA = 0/360 deg are still seen as close, then scale by cos(dec) to turn
    # it into an on-sky offset.
    ra_diff = ((cat['RA'] - ra + 180) % 360 - 180) * np.cos(np.radians(dec))
    dec_diff = cat['DEC'] - dec
    # A visit is identified by the (ECLIPSE, LEG) pair, so "another visit"
    # means either value differs. Requiring both to differ would discard
    # sources from other eclipses that happen to share the same leg number.
    other_visit = (cat['ECLIPSE']!=eclipse) | (cat['LEG']!=leg)
    matches = cat[other_visit & (np.abs(ra_diff)<=boxwidth) & (np.abs(dec_diff)<=boxwidth)]
    if not len(matches):
        continue
    sep = compute_separation_matrix([ra],[dec],
                                    matches['RA'].values, matches['DEC'].values)
    # Only the first source with a cross-visit neighbour is needed here.
    break

6451it [00:18, 345.91it/s]


In [None]:
# Collect the IDs of the reference source (`entry`) and of its neighbours
# (`matches`), both left over from the cross-match loop, and show the sky
# positions of every catalog row carrying one of those IDs.
neighbour_ids = matches['GLCAT_VISIT_ID'].values.tolist()
indices = [entry['GLCAT_VISIT_ID'], *neighbour_ids]
in_group = nuv_catalog['GLCAT_VISIT_ID'].isin(indices)
pos = nuv_catalog.loc[in_group, ['RA','DEC']]
pos

In [None]:
# Group NUV detections that lie within 3 arcsec of each other on the sky.
# A plain Euclidean distance on (RA, DEC) in degrees overstates RA offsets
# away from the equator and breaks at RA = 0/360, so use the great-circle
# (haversine) metric instead. It expects (latitude, longitude) = (DEC, RA)
# in radians, and eps must then be in radians too.
match_radius_arcsec = 3
sky_radians = np.radians(nuv_catalog[['DEC','RA']].values)
clustering = DBSCAN(
    eps=np.radians(match_radius_arcsec/60/60),
    min_samples=1,  # every detection belongs to a cluster, even if alone
    metric='haversine',
    algorithm='ball_tree',  # tree type that supports the haversine metric
).fit(sky_radians)
clustering.labels_

In [None]:
# List the distinct cluster labels DBSCAN assigned. With min_samples=1 every
# point is a core sample, so no -1 "noise" label is expected here.
np.unique(clustering.labels_)