In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('dark_background')
from astropy.io import fits
import smatch

from tqdm.notebook import tqdm

In [2]:
# Load the candidate catalogue and keep only fully finite rows:
# +/-inf are mapped to NaN first so a single dropna() removes both kinds
# of bad row, then the index is renumbered from 0.
data = (
    pd.read_csv('intersection.csv')
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .reset_index(drop=True)
)
print(data.columns)

Index(['K_RA', 'K_DEC', 'K_SCORE', 'Y3_COADD_OBJECT_ID', 'J_RA', 'J_DEC',
       'score_sims', 'score_real', 'score_both', 'COADD_OBJECT_ID', 'G_RA',
       'G_DEC', 'SINGLE', 'RING', 'SMOOTH', 'COMPANIONS', 'SDSS_SPIRALS',
       'DES_SPIRALS', 'CROWDED', 'ARTIFACTS', 'MOST_NEGATIVES'],
      dtype='object')


In [3]:
# Lenses in SLED database:
# cross-match the candidate catalogue (`data`) against SLED by sky position.
sled_data = pd.read_csv('data/SLED_database.csv')
sled_data = sled_data[sled_data['flag'] != 'CONTAMINANT']  # drop entries flagged as contaminants
sled_data = sled_data.reset_index(drop=True)

size = 0.0032  # match radius handed to smatch.match (0.00028 was tried: very small; 0.0024 also tried)

ra1, dec1 = data['G_RA'], data['G_DEC']
ra2, dec2 = sled_data['ra'], sled_data['dec']
matches = smatch.match(ra1, dec1, size, ra2, dec2, nside=32, maxmatch=1)

# Each match record carries (position in `data`, position in `sled_data`, ...).
idx_data = [int(m[0]) for m in matches]
idx_sled = [int(m[1]) for m in matches]

# Build the joined table in one shot: DataFrame.append was removed in pandas 2.0
# and growing a frame row-by-row is quadratic. Selecting whole rows positionally
# and concatenating column-wise also keeps each column's original dtype
# (integer IDs are no longer upcast to float).
data_tmp_sled = pd.concat(
    [
        data.iloc[idx_data].reset_index(drop=True),
        sled_data.iloc[idx_sled].reset_index(drop=True),
    ],
    axis=1,
)

print(len(data_tmp_sled))
#data_tmp_sled.to_csv('data/in_sled.csv', index=False)
data_tmp_sled.head()

  0%|          | 0/2271 [00:00<?, ?it/s]

2271


Unnamed: 0,K_RA,K_DEC,K_SCORE,Y3_COADD_OBJECT_ID,J_RA,J_DEC,score_sims,score_real,score_both,COADD_OBJECT_ID,...,score,image_sep,info,n_img,flag,image_conf,lens_type,source_type,contaminant_type,papers
0,2.146579,-39.377357,0.001118,182434686.0,2.146579,-39.377357,0.0604,0.0,0.0,1046786000.0,...,1,,,,CANDIDATE,,Galaxy,,,2019ApJS..243...17J
1,1.117249,-38.735816,0.190351,142345819.0,1.117249,-38.735816,0.9792,0.0,0.0,1037198000.0,...,1,,,,CANDIDATE,,Galaxy,,,2019ApJS..243...17J
2,0.719928,-38.28619,0.999342,142322552.0,0.719928,-38.28619,0.2046,0.0,0.0,1037155000.0,...,2,,,,CANDIDATE,,Galaxy,,,2022A&A...668A..73R
3,359.35464,-39.374435,0.999394,213772612.0,359.35464,-39.374435,0.0104,0.0,0.0,1031999000.0,...,1,,,,CANDIDATE,,Galaxy,,,2019ApJS..243...17J
4,359.132439,-38.75275,2.6e-05,172759386.0,359.132439,-38.75275,0.9329,0.0,0.0,1028411000.0,...,1,,,,CANDIDATE,,Galaxy,,,"2019ApJS..243...17J, 2022arXiv220602764S"


In [4]:
# Testing a different max distance for the SLED cross-match.
sled_data = pd.read_csv('data/SLED_database.csv')
sled_data = sled_data[sled_data['flag'] != 'CONTAMINANT']  # drop entries flagged as contaminants
sled_data = sled_data.reset_index(drop=True)

size = 0.001  # match radius handed to smatch.match (0.00028 was tried: very small; 0.0024 also tried)

ra1, dec1 = data['G_RA'], data['G_DEC']
ra2, dec2 = sled_data['ra'], sled_data['dec']
matches = smatch.match(ra1, dec1, size, ra2, dec2, nside=32, maxmatch=1)

# Each match record carries (position in `data`, position in `sled_data`, ...).
idx_data = [int(m[0]) for m in matches]
idx_sled = [int(m[1]) for m in matches]

# Build the joined table in one shot: DataFrame.append was removed in pandas 2.0
# and growing a frame row-by-row is quadratic. Selecting whole rows positionally
# and concatenating column-wise also keeps each column's original dtype.
data_tmp_sled = pd.concat(
    [
        data.iloc[idx_data].reset_index(drop=True),
        sled_data.iloc[idx_sled].reset_index(drop=True),
    ],
    axis=1,
)

print(len(data_tmp_sled))
#data_tmp_sled.to_csv('data/in_sled.csv', index=False)

def spherical_distance(ra1, dec1, ra2, dec2):
    """Great-circle separation between two sky positions, in degrees.

    Parameters
    ----------
    ra1, dec1 : float or array-like
        Right ascension and declination of the first position(s), in degrees.
    ra2, dec2 : float or array-like
        Right ascension and declination of the second position(s), in degrees.
        Inputs are combined element-wise (numpy broadcasting / pandas alignment).

    Returns
    -------
    float or array-like
        Angular separation in degrees, in the range [0, 180].

    Notes
    -----
    Uses the haversine formula rather than the spherical law of cosines.
    For arcsecond-scale separations the cosine of the angle is within ~1e-11
    of 1, so ``arccos`` loses most of its precision and can even return NaN
    when rounding pushes the cosine slightly above 1. The haversine form is
    well-conditioned for small angles and gives exactly 0 for identical points.
    """
    # Convert degrees to radians
    ra1, dec1, ra2, dec2 = map(np.radians, [ra1, dec1, ra2, dec2])

    sin_half_ddec = np.sin((dec2 - dec1) / 2.0)
    sin_half_dra = np.sin((ra2 - ra1) / 2.0)
    hav = sin_half_ddec ** 2 + np.cos(dec1) * np.cos(dec2) * sin_half_dra ** 2

    # Guard against rounding nudging the value outside arcsin's domain
    # (only possible for near-antipodal points).
    hav = np.clip(hav, 0.0, 1.0)
    distance = 2.0 * np.arcsin(np.sqrt(hav))

    # Convert distance from radians to degrees
    distance_deg = np.degrees(distance)
    return distance_deg

# Separation between every candidate and its matched SLED entry, in degrees.
separation_deg = spherical_distance(
    data_tmp_sled['G_RA'], data_tmp_sled['G_DEC'],
    data_tmp_sled['ra'], data_tmp_sled['dec'],
)
data_tmp_sled['distance_deg'] = separation_deg

# Same separation expressed in arcseconds (1 degree = 3600 arcsec).
data_tmp_sled['distance_arcsec'] = data_tmp_sled['distance_deg'] * 3600

# The five most widely separated matched pairs.
top_n_rows = data_tmp_sled.nlargest(5, 'distance_arcsec')

# For each of them print: the separation, the candidate position, the SLED position.
worst_pairs = zip(
    top_n_rows['distance_arcsec'].to_numpy(),
    top_n_rows['G_RA'].to_numpy(), top_n_rows['G_DEC'].to_numpy(),
    top_n_rows['ra'].to_numpy(), top_n_rows['dec'].to_numpy(),
)
for sep_arcsec, cand_ra, cand_dec, sled_ra, sled_dec in worst_pairs:
    print(' ')
    print(sep_arcsec)
    print(cand_ra, cand_dec)
    print(sled_ra, sled_dec)

  0%|          | 0/1754 [00:00<?, ?it/s]

1754
 
3.5997455824094713
46.676989 -33.459318
46.67754 -33.45843
 
3.576749909622807
15.305893 -23.451732
15.30481 -23.45173
 
3.5758676387322397
334.426509 -41.873092
334.4254 -41.87254
 
3.5569043127400777
351.29451 -41.190952
351.29546 -41.19027
 
3.5456185816954418
3.377745 -41.729467
3.37675 -41.72882


In [5]:
# Updating the in-SLED table to a closer correspondence (tighter match radius).

sled_data = pd.read_csv('data/SLED_database.csv')
sled_data = sled_data[sled_data['flag'] != 'CONTAMINANT']  # drop entries flagged as contaminants
sled_data = sled_data.reset_index(drop=True)

size = 0.0005  # match radius handed to smatch.match (0.00028 was tried: very small; 0.0024 also tried)

ra1, dec1 = data['G_RA'], data['G_DEC']
ra2, dec2 = sled_data['ra'], sled_data['dec']
matches = smatch.match(ra1, dec1, size, ra2, dec2, nside=32, maxmatch=1)

# Each match record carries (position in `data`, position in `sled_data`, ...).
idx_data = [int(m[0]) for m in matches]
idx_sled = [int(m[1]) for m in matches]

# Build the joined table in one shot: DataFrame.append was removed in pandas 2.0
# and growing a frame row-by-row is quadratic. Selecting whole rows positionally
# and concatenating column-wise also keeps each column's original dtype
# (integer IDs are no longer upcast to float).
data_tmp_sled = pd.concat(
    [
        data.iloc[idx_data].reset_index(drop=True),
        sled_data.iloc[idx_sled].reset_index(drop=True),
    ],
    axis=1,
)

print(len(data_tmp_sled))
#data_tmp_sled.to_csv('data/in_sled2_coadd_ids.csv', index=False)
#data_tmp_sled.to_csv('data/in_sled2.csv', index=False)
data_tmp_sled.head()

  0%|          | 0/1687 [00:00<?, ?it/s]

1687


Unnamed: 0,K_RA,K_DEC,K_SCORE,Y3_COADD_OBJECT_ID,J_RA,J_DEC,score_sims,score_real,score_both,COADD_OBJECT_ID,...,score,image_sep,info,n_img,flag,image_conf,lens_type,source_type,contaminant_type,papers
0,2.146579,-39.377357,0.001118,182434686.0,2.146579,-39.377357,0.0604,0.0,0.0,1046786000.0,...,1,,,,CANDIDATE,,Galaxy,,,2019ApJS..243...17J
1,1.117249,-38.735816,0.190351,142345819.0,1.117249,-38.735816,0.9792,0.0,0.0,1037198000.0,...,1,,,,CANDIDATE,,Galaxy,,,2019ApJS..243...17J
2,0.719928,-38.28619,0.999342,142322552.0,0.719928,-38.28619,0.2046,0.0,0.0,1037155000.0,...,2,,,,CANDIDATE,,Galaxy,,,2022A&A...668A..73R
3,359.35464,-39.374435,0.999394,213772612.0,359.35464,-39.374435,0.0104,0.0,0.0,1031999000.0,...,1,,,,CANDIDATE,,Galaxy,,,2019ApJS..243...17J
4,359.132439,-38.75275,2.6e-05,172759386.0,359.132439,-38.75275,0.9329,0.0,0.0,1028411000.0,...,1,,,,CANDIDATE,,Galaxy,,,"2019ApJS..243...17J, 2022arXiv220602764S"


In [6]:
# Lenses in Jacobs: SLED matches whose `papers` field cites bibcode 2019ApJS..243...17J.
# The dots are escaped so they match literal '.' characters only; unescaped,
# each '.' would match any character and could select look-alike bibcodes.
pattern = r'\b2019ApJS\.\.243\.\.\.17J\b'
# na=False: rows with a missing `papers` value count as "not cited"
# instead of producing a NaN mask that cannot be used for indexing.
data_tmp = data_tmp_sled[data_tmp_sled['papers'].str.contains(pattern, regex=True, na=False)]

print(len(data_tmp))
data_tmp.to_csv('data/in_jacobs.csv', index=False)
data_tmp.head()

610


Unnamed: 0,K_RA,K_DEC,K_SCORE,Y3_COADD_OBJECT_ID,J_RA,J_DEC,score_sims,score_real,score_both,COADD_OBJECT_ID,...,score,image_sep,info,n_img,flag,image_conf,lens_type,source_type,contaminant_type,papers
0,2.146579,-39.377357,0.001118,182434686.0,2.146579,-39.377357,0.0604,0.0,0.0,1046786000.0,...,1,,,,CANDIDATE,,Galaxy,,,2019ApJS..243...17J
1,1.117249,-38.735816,0.190351,142345819.0,1.117249,-38.735816,0.9792,0.0,0.0,1037198000.0,...,1,,,,CANDIDATE,,Galaxy,,,2019ApJS..243...17J
3,359.35464,-39.374435,0.999394,213772612.0,359.35464,-39.374435,0.0104,0.0,0.0,1031999000.0,...,1,,,,CANDIDATE,,Galaxy,,,2019ApJS..243...17J
4,359.132439,-38.75275,2.6e-05,172759386.0,359.132439,-38.75275,0.9329,0.0,0.0,1028411000.0,...,1,,,,CANDIDATE,,Galaxy,,,"2019ApJS..243...17J, 2022arXiv220602764S"
12,0.818255,-33.8012,0.730613,139823797.0,0.818255,-33.8012,1.0,0.0,0.0,1032630000.0,...,3,,,,CANDIDATE,,Galaxy,,,"2019ApJS..243...17J, 2019MNRAS.484.5330J"


In [7]:
# Lenses in Karina: SLED matches whose `papers` field cites bibcode 2022A&A...668A..73R.
# The dots are escaped so they match literal '.' characters only; unescaped,
# each '.' would match any character and could select look-alike bibcodes.
pattern = r'\b2022A&A\.\.\.668A\.\.73R\b'
# na=False: rows with a missing `papers` value count as "not cited"
# instead of producing a NaN mask that cannot be used for indexing.
data_tmp = data_tmp_sled[data_tmp_sled['papers'].str.contains(pattern, regex=True, na=False)]

print(len(data_tmp))
data_tmp.to_csv('data/in_karina.csv', index=False)
data_tmp.head()

241


Unnamed: 0,K_RA,K_DEC,K_SCORE,Y3_COADD_OBJECT_ID,J_RA,J_DEC,score_sims,score_real,score_both,COADD_OBJECT_ID,...,score,image_sep,info,n_img,flag,image_conf,lens_type,source_type,contaminant_type,papers
2,0.719928,-38.28619,0.999342,142322552.0,0.719928,-38.28619,0.2046,0.0,0.0,1037155000.0,...,2,,,,CANDIDATE,,Galaxy,,,2022A&A...668A..73R
5,0.528898,-37.674508,0.974356,140630354.0,0.528898,-37.674508,0.0,0.0,0.0,1036796000.0,...,1,,,,CANDIDATE,,Galaxy,,,"2022A&A...668A..73R, 2022arXiv220602764S"
7,3.498401,-34.493713,0.996786,179781037.0,3.498401,-34.493713,0.0174,0.0,0.0,1049545000.0,...,2,,,,CANDIDATE,,Galaxy,,,2022A&A...668A..73R
45,5.679873,-27.934942,0.998621,194032993.0,5.679873,-27.934942,0.0002,0.0,0.0,1062162000.0,...,2,,,,CANDIDATE,,Galaxy,,,2022A&A...668A..73R
53,9.593161,-25.842242,0.970344,155609778.0,9.593161,-25.842242,1.0,0.0,0.0,1084326000.0,...,3,4.94,,,CANDIDATE,,Galaxy,,,"2019ApJS..243...17J, 2019MNRAS.484.5330J, 2022..."


In [8]:
# Lenses in my results from experts:
# cross-match the candidate catalogue (`data`) against the expert-inspected sample.

# NOTE(review): absolute, machine-specific path; consider moving it to a
# configurable data directory so the notebook runs elsewhere.
filepath = '/Users/jimenagonzalez/research/DSPL/SpaceWarps_Inspection/Results/For_experts/Data_experts/'
known_data = pd.read_csv(filepath + 'complete_data_ano.csv')
known_data = known_data[known_data['SCORE'] > 0.00001]
# Renumber the table that was just filtered (the original reset `sled_data`
# here, a copy-paste slip that left `known_data` with a gappy index).
known_data = known_data.reset_index(drop=True)

size = 0.0005  # match radius handed to smatch.match (0.00028 was tried: very small; 0.0024 also tried)

ra1, dec1 = data['G_RA'], data['G_DEC']
ra2, dec2 = known_data['RA'], known_data['DEC']
matches = smatch.match(ra1, dec1, size, ra2, dec2, nside=32, maxmatch=1)

# Each match record carries (position in `data`, position in `known_data`, ...).
idx_data = [int(m[0]) for m in matches]
idx_known = [int(m[1]) for m in matches]

# Build the joined table in one shot: DataFrame.append was removed in pandas 2.0
# and growing a frame row-by-row is quadratic. Selecting whole rows positionally
# and concatenating column-wise also keeps each column's original dtype.
data_tmp = pd.concat(
    [
        data.iloc[idx_data].reset_index(drop=True),
        known_data.iloc[idx_known].reset_index(drop=True),
    ],
    axis=1,
)

print(len(data_tmp))
data_tmp.to_csv('data/in_jimena.csv', index=False)
data_tmp.head()

  0%|          | 0/521 [00:00<?, ?it/s]

521


Unnamed: 0,K_RA,K_DEC,K_SCORE,Y3_COADD_OBJECT_ID,J_RA,J_DEC,score_sims,score_real,score_both,COADD_OBJECT_ID,...,scorer_1_norm,scorer_2_norm,scorer_3_norm,scorer_4_norm,scorer_5_norm,scorer_6_norm,scorer_7_norm,Aver_norm_wo_8,Aver_norm_wo_78,most_common
0,0.482615,-40.438966,0.005141,141179639.0,0.482615,-40.438966,0.0602,0.0,0.0,1034975000.0,...,-0.511468,-1.046181,-1.083515,-1.040979,1.127265,1.221862,-0.460595,0.572758,0.624955,0.0
1,1.117249,-38.735816,0.190351,142345819.0,1.117249,-38.735816,0.9792,0.0,0.0,1037198000.0,...,-0.511468,0.004058,0.868572,-1.040979,1.127265,1.221862,-0.460595,0.963632,1.051451,0.0
2,359.35464,-39.374435,0.999394,213772612.0,359.35464,-39.374435,0.0104,0.0,0.0,1031999000.0,...,0.652356,-1.046181,-0.107472,-0.032784,-0.07345,1.221862,0.822461,0.993326,0.992718,1.0
3,359.902843,-36.059186,0.411994,140042778.0,359.902843,-36.059186,1.0,0.0,1.0,1032846000.0,...,-0.511468,1.054296,-0.107472,-1.040979,-0.07345,-0.776087,-0.460595,0.556856,0.607604,0.0
4,2.395415,-36.166892,0.000123,143217230.0,2.395415,-36.166892,0.0343,0.0,0.0,1047668000.0,...,-0.511468,2.104535,-1.083515,-0.032784,-0.07345,1.221862,-0.460595,0.957887,1.045182,0.0


In [9]:
# Total number of rows currently held in known_data.
n_known = len(known_data)
print(n_known)

# Number of those rows whose SCORE exceeds the 0.00001 threshold.
above_threshold = known_data['SCORE'] > 0.00001
print(int(above_threshold.sum()))

# TODO: remove the randomly selected subjects from this data; the subject id
# column could be used to identify them.

2502
2502
