In [3]:
from astropy.io import fits
import matplotlib.pyplot as plt
import numpy as np
from astroquery.gaia import Gaia
import logging
from tqdm import tqdm
from astropy.table import vstack
logging.basicConfig(level=logging.INFO)

In [4]:
# Import Apogee Data sets
apogee_data_file_NN = '../data/Apogee_DR17_vac_NN/apogee_astroNN-DR17.fits'
apogee_data_file_starhorse = '../data/Apogee_DR17_vac_starhorse/APOGEE_DR17_EDR3_STARHORSE_v2.fits'
apogee_data_file_allstar = '../data/Apogee_DR17_Allstar/allStar-dr17-synspec_rev1.fits'

#Importing data from file
with fits.open(apogee_data_file_allstar) as file:
    apogee_data = file[1].data
    # Cleaning the data set using mask statments for initial inputs
    # Filter for Main Red stars
    mrs_filter = apogee_data['EXTRATARG']==0

    # Filter bad star data
    bs_filter = apogee_data['ASPCAPFLAG'] != 'STAR_BAD'
    prog_filter = apogee_data['PROGRAMNAME'] != 'magclouds'
    rg_filter = apogee_data['LOGG'] < 3.0

    # Filter for valid element abundances
    fe_h_flag_filter = apogee_data['FE_H_FLAG'] == 0
    fe_h_err_filter = apogee_data['FE_H_ERR'] < 0.1

    # Cant find this may be part of the VAC data set
    # alpha_fe_flag_filter = apogee_data['alpha_FE_FLAG'] == 0
    # alpha_fe_err_filter = apogee_data['alpga_FE_ERR'] < 0.1

    al_fe_flag_filter = apogee_data['AL_FE_FLAG'] == 0
    al_fe_err_filter = apogee_data['AL_FE_ERR'] < 0.1

    # Cant find this may be part of the VAC data set
    # mg_mn_flag_filter = apogee_data['CE_FE_FLAG'] == 0
    # mg_mn_err_filter = apogee_data['CE_FE_ERR'] < 0.1

    ce_fe_flag_filter = apogee_data['CE_FE_FLAG'] == 0
    ce_fe_err_filter = apogee_data['CE_FE_ERR'] < 0.15

    # All Main Red Stars
    apogee_data_red = apogee_data[mrs_filter]
    # All stars remaining based on APOGEE filters 
    # Note this is currently missign alpha/Fe and Mg/Mn filters
    apogee_data_filtered= apogee_data[mrs_filter & bs_filter & prog_filter & rg_filter & fe_h_flag_filter & fe_h_err_filter & al_fe_flag_filter & al_fe_err_filter & ce_fe_flag_filter & ce_fe_err_filter]


In [5]:
print(f'Number of stars in the red giant sample: {len(apogee_data_red)}')
print(f'Number of stars in the (APOGEE) filtered sample: {len(apogee_data_filtered)}')
print(f'Columns: {apogee_data_filtered.columns}')

Number of stars in the red giant sample: 372458
Number of stars in the (APOGEE) filtered sample: 164627
Columns: ColDefs(
    name = 'FILE'; format = '64A'
    name = 'APOGEE_ID'; format = '30A'
    name = 'TARGET_ID'; format = '58A'
    name = 'APSTAR_ID'; format = '71A'
    name = 'ASPCAP_ID'; format = '77A'
    name = 'TELESCOPE'; format = '6A'
    name = 'LOCATION_ID'; format = 'J'
    name = 'FIELD'; format = '20A'
    name = 'ALT_ID'; format = '30A'
    name = 'RA'; format = 'D'
    name = 'DEC'; format = 'D'
    name = 'GLON'; format = 'D'
    name = 'GLAT'; format = 'D'
    name = 'J'; format = 'E'
    name = 'J_ERR'; format = 'E'
    name = 'H'; format = 'E'
    name = 'H_ERR'; format = 'E'
    name = 'K'; format = 'E'
    name = 'K_ERR'; format = 'E'
    name = 'SRC_H'; format = '16A'
    name = 'WASH_M'; format = 'E'
    name = 'WASH_M_ERR'; format = 'E'
    name = 'WASH_T2'; format = 'E'
    name = 'WASH_T2_ERR'; format = 'E'
    name = 'DDO51'; format = 'E'
    name = 'DDO

In [23]:
# Extract GAIA ID fron remaining stars
gaia_ids =  np.array(apogee_data_filtered['GAIAEDR3_SOURCE_ID'])

# Set size for SQL query and split up GAIA IDs
query_size = 1000
indiv_queries = np.array_split(gaia_ids, np.ceil(len(gaia_ids) / query_size))

# Empty list to store the results of each query
list_query_results = []

# Wrap the loop with tqdm for a progress bar
for i, query in enumerate(tqdm(indiv_queries, desc="Processing Queries")):
    # Convert the chunk to a comma-separated string for SQL syntax
    gaia_id_list = ", ".join(query.astype(str))
    
    # Define the query
    distance_query = f"""
    SELECT source_id, r_med_geo, r_lo_geo, r_hi_geo, r_med_photogeo, r_lo_photogeo, r_hi_photogeo
    FROM external.gaiaedr3_distance
    WHERE source_id IN ({gaia_id_list});
    """
    # Run the query with SQL
    job = Gaia.launch_job(distance_query)
    results = job.get_results()
    
    # Append the results to the list
    list_query_results.append(results)

# Combine all results into a single table
all_query_results = vstack(list_query_results)



Processing Queries: 100%|██████████| 165/165 [01:31<00:00,  1.80it/s]


In [24]:
# check GAIA IDs are same in both column before merging
if np.all(apogee_data_filtered['GAIAEDR3_SOURCE_ID'] == all_query_results['source_id']):
    # Merge the tables
    print('gfhi')
    # apogee_data_filtered = apogee_data_filtered.copy()
    # apogee_data_filtered['r_med_geo'] = all_query_results['r_med_geo']
    # apogee_data_filtered['r_lo_geo'] = all_query_results['r_lo_geo']
    # apogee_data_filtered['r_hi_geo'] = all_query_results['r_hi_geo']
    # apogee_data_filtered['r_med_photogeo'] = all_query_results['r_med_photogeo']
    # apogee_data_filtered['r_lo_photogeo'] = all_query_results['r_lo_photogeo']
    # apogee_data_filtered['r_hi_photogeo'] = all_query_results['r_hi_photogeo']

ValueError: operands could not be broadcast together with shapes (163794,) (164627,) 

In [7]:
# # Print the results
# print(results)


# # Filter for eccentricity
# ecc_filter = apogee_data['ECCENTRICITY'] > 0.85
# # Filter for orbital apocenter
# apo_filter = apogee_data['APOCENTER'] > 5
# # Filter for distance error
# dist_err_filter = apogee_data['DIST_ERR'] < 1.5
# # Filter for orbital energy
# energy_filter = apogee_data['ENERGY'] < 0




# # Plot the HR diagram
# plt.figure(figsize=(10, 8))
# scatter = plt.scatter(filtered_teff, filtered_logg, c=filtered_fe_h, cmap='viridis', s=10, alpha=0.7)
# plt.colorbar(scatter, label='[Fe/H] (Metallicity)')

# # Reverse x-axis (hotter stars on the left)
# plt.gca().invert_xaxis()

# # Label axes
# plt.xlabel('Effective Temperature (K)', fontsize=14)
# plt.ylabel('Surface Gravity (log g)', fontsize=14)
# plt.title('Hertzsprung-Russell Diagram (APOGEE Data)', fontsize=16)
# plt.grid(True)

# plt.show()





# # Extract relevant columns
# teff = apogee_data['TEFF']
# logg = apogee_data['LOGG']
# bp_rp = apogee_data['bp_rp']

# # Apply conditions for red stars (example thresholds)
# red_star_mask = (teff < 5000) & (logg < 3) & (bp_rp > 1.0)

# # Filter the data
# red_stars = apogee_data[red_star_mask]
# print(f'Number of red stars: {len(red_stars)}')
