In [1]:
from dask.distributed import Client
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
import matplotlib.pyplot as plt 
import numpy as np  
import lsdb
import pandas as pd
from catalog_filtering import bandFilterLenient, contains_PM

In [13]:
#Bands we care about
bandList = ['G','R','I','Z','Y']

#Keeping only the important columns
crucialCols = (
    [f'CLASS_STAR_{band}' for band in bandList] + 
    [f'FLAGS_{band}' for band in bandList] + 
    ['RA','DEC','COADD_OBJECT_ID'] + 
    [f'SPREAD_MODEL_{band}' for band in bandList] + 
    [f'WAVG_MAG_PSF_{band}' for band in bandList] + 
    [f'WAVG_MAGERR_PSF_{band}' for band in bandList]
)

catalog = lsdb.read_hipscat('hipscat/des_one_deg/', columns=crucialCols)

#Filtering for specific measurement vaTlues
qs = bandFilterLenient(bandList,classStar=0.95,spreadModel=0.05,magError=0.05,flag=True,invalidMags=True)
filtered_catalog = catalog.query(qs)

#Performing crossmatch
xmatch = filtered_catalog.crossmatch(filtered_catalog, n_neighbors=100, radius_arcsec=18, suffixes=['_1', '_2'])
xmatch_ddf = xmatch._ddf

with Client():
    df = xmatch.compute()



In [14]:
# neighbors = xmatch_ddf.groupby('_hipscat_index')['_dist_arcsec'].count()
# neighbors -= 1 #Double counting adjustment
# neighbors.name = 'neighbors'
neighbors = df.groupby('_hipscat_index')['_dist_arcsec'].count()
neighbors -= 1 #Double counting adjustment
df['neighbors'] = neighbors

In [16]:
# xmatch_ddf_neighbors = xmatch_ddf.join(neighbors, on='_hipscat_index')
# #may need .assign if .join shuffles, shuffling is computationally expensive

# three_or_more_matches = xmatch_ddf_neighbors.query('neighbors >= 3')
# star_groups = three_or_more_matches.groupby('_hipscat_index')


three_or_more_matches = df.query('neighbors >= 3')
star_groups = three_or_more_matches.groupby('_hipscat_index')

In [39]:
agg_func = lambda x: list(x)
star_groups_df = three_or_more_matches.groupby('_hipscat_index')
star_groups_df = star_groups_df.agg(agg_func)


  star_groups_df = star_groups_df.agg(agg_func)


ArrowNotImplementedError: Unsupported cast from list<item: double> to double using function cast_double

In [6]:
def distance_to_line(PQ, line_vector):
    return np.abs(np.cross(PQ, line_vector)) / np.linalg.norm(line_vector)

def closeToProjection(line_vector, x, y, error):
    PQ = np.array([x, y])
    distance = distance_to_line(PQ, line_vector)
    return distance < error

In [41]:
pdf = pd.DataFrame({
    'index': [1, 1, 2, 2, 2, 3, 3, 3],
    'A': [10, 10, 30, 40, 30, 50, 60, 70],
    'B': ['x', 'y', 'x', 'z', 'x', 'y', 'y', 'z']
})

ddf = pdf.set_index('index')

agg_func = lambda x: list(x.unique())

result = ddf.groupby(ddf.index).agg(agg_func)

print(result)


                  A       B
index                      
1              [10]  [x, y]
2          [30, 40]  [x, z]
3      [50, 60, 70]  [y, z]


In [5]:

PM_df = pd.DataFrame()

plt.figure(figsize=(8, 8))
for _hipscat_index, group in star_groups:
    #origin point
    ra1, dec1 = group.iloc[0][["RA_1", "DEC_1"]]
    
    #array of matched points
    ra2, dec2 = da.from_array(group[["RA_2", "DEC_2"]])
    # ra2, dec2 = group[["RA_2", "DEC_2"]].to_numpy().T
    print(ra2, dec2)

    #perform delta RA, DEC calculations
    x_vals = (ra2 - ra1) * np.cos(np.radians(dec1)) * 3600
    y_vals = (dec2 - dec1) * 3600

    #Create list of tuples of coordinates
    coords = zip(x_vals, y_vals)
    coords = list(coords)

    id_1 = group.iloc[0]['COADD_OBJECT_ID_1']
    id_2 = group['COADD_OBJECT_ID_2'].to_numpy().T


    for i in range(len(coords)):
        found = 0
        line_vector = coords[i]

        if (line_vector == (0,0)): continue #invalid line vector
        
        for j in range(len(coords)):
            if (j == i): continue #prevent double counting

            test_x, test_y = coords[j]
            if closeToProjection(line_vector, test_x, test_y, .2): #2 milliseconds
                found += 1

        if (found >= 3) and (id_1 == min(id_2)): #4 or more in a line (probable PMS) and prevent double graphing
            plt.scatter(x_vals, y_vals, s=20)
            PM_df = pd.concat([PM_df, group], axis=0)

plt.xlabel('ΔRight-Ascention (arcsecs)')
plt.ylabel('ΔDeclination (arcsecs)')
plt.title('Possible PM Stars Following Star Filtering and Line Projection')


NotImplementedError: Iteration of DataFrameGroupBy objects requires computing the groups which may be slow. You probably want to use 'apply' to execute a function for all the columns. To access individual groups, use 'get_group'. To list all the group names, use 'df[<group column>].unique().compute()'.

<Figure size 800x800 with 0 Axes>

In [None]:
def get_collinear_star_groups(star_groups, max_unalignment):
    for _hipscat_index, group in star_groups:
        #origin point
        ra1, dec1 = group.iloc[0][["RA_1", "DEC_1"]]
        
        #array of matched points
        ra2, dec2 = da.from_array(group[["RA_2", "DEC_2"]])
        # ra2, dec2 = group[["RA_2", "DEC_2"]].to_numpy().T
        print(ra2, dec2)

        #perform delta RA, DEC calculations
        x_vals = (ra2 - ra1) * np.cos(np.radians(dec1)) * 3600
        y_vals = (dec2 - dec1) * 3600

        #Create list of tuples of coordinates
        coords = zip(x_vals, y_vals)
        coords = list(coords)

        id_1 = group.iloc[0]['COADD_OBJECT_ID_1']
        id_2 = group['COADD_OBJECT_ID_2'].to_numpy().T


        for i in range(len(coords)):
            found = 0
            line_vector = coords[i]

            if (line_vector == (0,0)): continue #invalid line vector
            
            for j in range(len(coords)):
                if (j == i): continue #prevent double counting

                test_x, test_y = coords[j]
                if closeToProjection(line_vector, test_x, test_y, .2): #2 milliseconds
                    found += 1

            if (found >= 3) and (id_1 == min(id_2)): #4 or more in a line (probable PMS) and prevent double graphing
                plt.scatter(x_vals, y_vals, s=20)
                PM_df = pd.concat([PM_df, group], axis=0)
