In [8]:
import gc
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from astropy import units as u
from astropy.coordinates import SkyCoord


In [2]:
# Spectrum folders of the training set, keyed by object class
train_dirs = {
    'agn': 'training_set/agn_spectra',
    'star': 'training_set/star_spectra',
    'bin': 'training_set/bin_spectra',
    'gal': 'training_set/gal_spectra',
}

# For every class, collect the names of the regular files sitting directly in its folder
train_ids = {}
for label, directory in train_dirs.items():
    train_ids[label] = [
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]

# Expose one list per class under the names used by the rest of the notebook
id_train_agn = train_ids['agn']
id_train_star = train_ids['star']
id_train_bin = train_ids['bin']
id_train_gal = train_ids['gal']

# Report how many spectra each class contains
print('Number of AGN spectra:', len(id_train_agn))
print('Number of STAR spectra:', len(id_train_star))
print('Number of BIN spectra:', len(id_train_bin))
print('Number of GAL spectra:', len(id_train_gal))




Number of AGN spectra: 35936
Number of STAR spectra: 86037
Number of BIN spectra: 40676
Number of GAL spectra: 1699


In [3]:
# Spectrum folders of the validation set, keyed by object class
val_dirs = {
    'agn': 'validation_set/agn_spectra',
    'star': 'validation_set/star_spectra',
    'bin': 'validation_set/bin_spectra',
    'gal': 'validation_set/gal_spectra',
}

# For every class, collect the names of the regular files sitting directly in its folder
val_ids = {}
for label, directory in val_dirs.items():
    val_ids[label] = [
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]

# Expose one list per class under the names used by the rest of the notebook
id_val_agn = val_ids['agn']
id_val_star = val_ids['star']
id_val_bin = val_ids['bin']
id_val_gal = val_ids['gal']

# Report how many spectra each class contains
print('Number of AGN spectra:', len(id_val_agn))
print('Number of STAR spectra:', len(id_val_star))
print('Number of BIN spectra:', len(id_val_bin))
print('Number of GAL spectra:', len(id_val_gal))


Number of AGN spectra: 400
Number of STAR spectra: 400
Number of BIN spectra: 400
Number of GAL spectra: 400


## Note: everything has to be done twice — once for the validation set and once for the training set

In [4]:
# LAMOST catalogue used to look up obsid -> (ra, dec); the file is assumed to be plain CSV
lamost_catalog_file = "dr9_v2.0_LRS_catalogue.csv"
lamost_catalog = pd.read_csv(lamost_catalog_file)

# Starting off with GAL (galaxies)

In [21]:
# Cross-match the GAL table (Gaia-style astrometry/photometry columns) with the
# LAMOST positions of the training galaxies, then write one .npy feature vector
# per matched LAMOST obsid.
# This pass handles the training IDs; the validation IDs (id_val_gal) need a
# separate pass with their own output folder.

# File names in the spectra folder are parsed as integer LAMOST obsids
obsid_list = [int(obsid) for obsid in id_train_gal]

# Keep only the catalogue rows that belong to these spectra
filtered_df = lamost_catalog[lamost_catalog['obsid'].isin(obsid_list)]

# Show the positions that will be matched against
ra_dec_values = filtered_df[['ra', 'dec']]
print(ra_dec_values)

################################################################################################################

# Load the GAL table that carries the columns saved below.
# NOTE: read_pickle can execute arbitrary code - only load pickles you created yourself.
# NOTE(review): the other classes load from "Pickles/<class>_data.pkl"; confirm this path is intended.
gal_data = pd.read_pickle("gal_data.pkl")  # Loaded GAL data

# Force RA/Dec to numeric (unparseable entries become NaN), then drop rows without a position
gal_data['ra'] = pd.to_numeric(gal_data['ra'], errors='coerce')
gal_data['dec'] = pd.to_numeric(gal_data['dec'], errors='coerce')
gal_data = gal_data.dropna(subset=['ra', 'dec'])

# The match needs the obsid next to each LAMOST position
ra_dec_values = filtered_df[['obsid', 'ra', 'dec']]

# Build sky coordinates for both tables (RA/Dec are passed as degrees)
gal_coords = SkyCoord(ra=gal_data['ra'].values * u.deg, dec=gal_data['dec'].values * u.deg)
lamost_coords = SkyCoord(ra=ra_dec_values['ra'].values * u.deg, dec=ra_dec_values['dec'].values * u.deg)

# For every GAL row, find the nearest LAMOST position and the on-sky separation to it
idx, d2d, _ = gal_coords.match_to_catalog_sky(lamost_coords)

# Accept only pairs closer than the matching radius
match_radius = 1 * u.arcsec
matches = d2d < match_radius

matched_gal = gal_data.iloc[matches]
# Rename the LAMOST position columns: after the concat below, two columns named
# 'ra'/'dec' would make a selection by label return both copies, so every saved
# vector would hold 18 values instead of the 16 listed in feature_columns.
matched_lamost = ra_dec_values.iloc[idx[matches]].rename(
    columns={'ra': 'lamost_ra', 'dec': 'lamost_dec'}
)

# Put each GAL row side by side with its LAMOST counterpart
matched_data = pd.concat(
    [matched_gal.reset_index(drop=True), matched_lamost.reset_index(drop=True)],
    axis=1,
)

# Several GAL rows can fall within the radius of the same obsid and would
# overwrite each other's file; keep only the closest one per obsid.
matched_data['separation_arcsec'] = d2d[matches].to(u.arcsec).value
matched_data = (
    matched_data
    .sort_values('separation_arcsec')
    .drop_duplicates(subset='obsid', keep='first')
    .reset_index(drop=True)
)
assert matched_data['obsid'].is_unique

#######################################################################################################################

# Output folder for the per-obsid feature files
directory = 'gaia_training_set/gal_data'
os.makedirs(directory, exist_ok=True)

# Columns written to each file, in this order
feature_columns = [
    'ra', 'ra_error', 'dec', 'dec_error',
    'phot_g_mean_flux', 'phot_g_mean_flux_error',
    'phot_bp_mean_flux', 'phot_bp_mean_flux_error',
    'phot_rp_mean_flux', 'phot_rp_mean_flux_error',
    'parallax', 'parallax_error',
    'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
]
feature_table = matched_data[feature_columns].values

# Save one .npy file per matched obsid.
# obsid is read from its own column rather than from an iterrows() row, which can
# upcast it to float and produce names such as '<obsid>.0.npy'.
for obsid, values_to_save in zip(matched_data['obsid'], feature_table):
    np.save(os.path.join(directory, f"{int(obsid)}.npy"), values_to_save)


                  ra        dec
89        332.393310  -1.199750
617       332.627720   0.360050
1342      330.416750  -0.672220
2102      331.741550   1.177070
3482       43.313820  -0.989460
...              ...        ...
10670233  209.355176  55.069539
10678832  189.839336  45.334841
10680626  155.780246  47.714593
10681007  236.849923  32.482925
10682030  222.381889  -3.255226

[1699 rows x 2 columns]


# Doing the same for BIN (binaries)

In [28]:
# File names of the validation BIN spectra are parsed as integer LAMOST obsids
obsid_list = [int(name) for name in id_val_bin]

# Restrict the LAMOST catalogue to the rows whose obsid is in that list
in_selection = lamost_catalog['obsid'].isin(obsid_list)
filtered_df = lamost_catalog[in_selection]

# Show the sky positions of the selected rows
ra_dec_values = filtered_df[['ra', 'dec']]
print(ra_dec_values)


                  ra        dec
16577     351.174109  31.489124
119044    106.765840  27.194232
189217      0.000707  32.146507
198231     55.221293  52.095851
220948     45.609962  50.278437
...              ...        ...
10624028  357.123560  51.339421
10628359  125.487290  23.334434
10636107   37.437670  58.890582
10638219   54.564625  54.660116
10683416  275.201100   8.617276

[400 rows x 2 columns]


In [29]:
# Cross-match the BIN table (Gaia-style astrometry/photometry columns) with the
# LAMOST positions selected in the previous cell (filtered_df), then write one
# .npy feature vector per matched LAMOST obsid.

# Load the BIN table that carries the columns saved below.
# NOTE: read_pickle can execute arbitrary code - only load pickles you created yourself.
bin_data = pd.read_pickle("Pickles/bin_data.pkl")  # Loaded BIN data

# Force RA/Dec to numeric (unparseable entries become NaN), then drop rows without a position
bin_data['ra'] = pd.to_numeric(bin_data['ra'], errors='coerce')
bin_data['dec'] = pd.to_numeric(bin_data['dec'], errors='coerce')
bin_data = bin_data.dropna(subset=['ra', 'dec'])

# The match needs the obsid next to each LAMOST position
ra_dec_values = filtered_df[['obsid', 'ra', 'dec']]

# Build sky coordinates for both tables (RA/Dec are passed as degrees)
bin_coords = SkyCoord(ra=bin_data['ra'].values * u.deg, dec=bin_data['dec'].values * u.deg)
lamost_coords = SkyCoord(ra=ra_dec_values['ra'].values * u.deg, dec=ra_dec_values['dec'].values * u.deg)

# For every BIN row, find the nearest LAMOST position and the on-sky separation to it
idx, d2d, _ = bin_coords.match_to_catalog_sky(lamost_coords)

# Accept only pairs closer than the matching radius
match_radius = 1 * u.arcsec
matches = d2d < match_radius

matched_bin = bin_data.iloc[matches]
# Rename the LAMOST position columns: after the concat below, two columns named
# 'ra'/'dec' would make a selection by label return both copies, so every saved
# vector would hold 18 values instead of the 16 listed in feature_columns.
matched_lamost = ra_dec_values.iloc[idx[matches]].rename(
    columns={'ra': 'lamost_ra', 'dec': 'lamost_dec'}
)

# Put each BIN row side by side with its LAMOST counterpart
matched_data = pd.concat(
    [matched_bin.reset_index(drop=True), matched_lamost.reset_index(drop=True)],
    axis=1,
)

# Several BIN rows can fall within the radius of the same obsid and would
# overwrite each other's file; keep only the closest one per obsid.
matched_data['separation_arcsec'] = d2d[matches].to(u.arcsec).value
matched_data = (
    matched_data
    .sort_values('separation_arcsec')
    .drop_duplicates(subset='obsid', keep='first')
    .reset_index(drop=True)
)
assert matched_data['obsid'].is_unique

#######################################################################################################################
# Output folder for the per-obsid feature files
directory = 'gaia_validation_set/bin_data'
os.makedirs(directory, exist_ok=True)

# Columns written to each file, in this order
feature_columns = [
    'ra', 'ra_error', 'dec', 'dec_error',
    'phot_g_mean_flux', 'phot_g_mean_flux_error',
    'phot_bp_mean_flux', 'phot_bp_mean_flux_error',
    'phot_rp_mean_flux', 'phot_rp_mean_flux_error',
    'parallax', 'parallax_error',
    'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
]
feature_table = matched_data[feature_columns].values

# Save one .npy file per matched obsid.
# obsid is read from its own column rather than from an iterrows() row, which can
# upcast it to float and produce names such as '<obsid>.0.npy'.
for obsid, values_to_save in zip(matched_data['obsid'], feature_table):
    np.save(os.path.join(directory, f"{int(obsid)}.npy"), values_to_save)


# Doing the same for Stars

In [32]:
# Cross-match the STAR table (Gaia-style astrometry/photometry columns) with the
# LAMOST positions of the validation stars, then write one .npy feature vector
# per matched LAMOST obsid.
# This pass handles the validation IDs; the training IDs (id_train_star) need a
# separate pass with their own output folder.

# File names in the spectra folder are parsed as integer LAMOST obsids
obsid_list = [int(obsid) for obsid in id_val_star]

# Keep only the catalogue rows that belong to these spectra
filtered_df = lamost_catalog[lamost_catalog['obsid'].isin(obsid_list)]

# Show the positions that will be matched against
ra_dec_values = filtered_df[['ra', 'dec']]
print(ra_dec_values)

# Load the STAR table that carries the columns saved below.
# NOTE: read_pickle can execute arbitrary code - only load pickles you created yourself.
star_data = pd.read_pickle("Pickles/star_data.pkl")  # Loaded STAR data

# Force RA/Dec to numeric (unparseable entries become NaN), then drop rows without a position
star_data['ra'] = pd.to_numeric(star_data['ra'], errors='coerce')
star_data['dec'] = pd.to_numeric(star_data['dec'], errors='coerce')
star_data = star_data.dropna(subset=['ra', 'dec'])

# The match needs the obsid next to each LAMOST position
ra_dec_values = filtered_df[['obsid', 'ra', 'dec']]

# Build sky coordinates for both tables (RA/Dec are passed as degrees)
star_coords = SkyCoord(ra=star_data['ra'].values * u.deg, dec=star_data['dec'].values * u.deg)
lamost_coords = SkyCoord(ra=ra_dec_values['ra'].values * u.deg, dec=ra_dec_values['dec'].values * u.deg)

# For every STAR row, find the nearest LAMOST position and the on-sky separation to it
idx, d2d, _ = star_coords.match_to_catalog_sky(lamost_coords)

# Accept only pairs closer than the matching radius
match_radius = 1 * u.arcsec
matches = d2d < match_radius

matched_star = star_data.iloc[matches]
# Rename the LAMOST position columns: after the concat below, two columns named
# 'ra'/'dec' would make a selection by label return both copies, so every saved
# vector would hold 18 values instead of the 16 listed in feature_columns.
matched_lamost = ra_dec_values.iloc[idx[matches]].rename(
    columns={'ra': 'lamost_ra', 'dec': 'lamost_dec'}
)

# Put each STAR row side by side with its LAMOST counterpart
matched_data = pd.concat(
    [matched_star.reset_index(drop=True), matched_lamost.reset_index(drop=True)],
    axis=1,
)

# Several STAR rows can fall within the radius of the same obsid and would
# overwrite each other's file; keep only the closest one per obsid.
matched_data['separation_arcsec'] = d2d[matches].to(u.arcsec).value
matched_data = (
    matched_data
    .sort_values('separation_arcsec')
    .drop_duplicates(subset='obsid', keep='first')
    .reset_index(drop=True)
)
assert matched_data['obsid'].is_unique

#######################################################################################################################
# Output folder for the per-obsid feature files
directory = 'gaia_validation_set/star_data'
os.makedirs(directory, exist_ok=True)

# Columns written to each file, in this order
feature_columns = [
    'ra', 'ra_error', 'dec', 'dec_error',
    'phot_g_mean_flux', 'phot_g_mean_flux_error',
    'phot_bp_mean_flux', 'phot_bp_mean_flux_error',
    'phot_rp_mean_flux', 'phot_rp_mean_flux_error',
    'parallax', 'parallax_error',
    'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
]
feature_table = matched_data[feature_columns].values

# Save one .npy file per matched obsid.
# obsid is read from its own column rather than from an iterrows() row, which can
# upcast it to float and produce names such as '<obsid>.0.npy'.
for obsid, values_to_save in zip(matched_data['obsid'], feature_table):
    np.save(os.path.join(directory, f"{int(obsid)}.npy"), values_to_save)


                  ra        dec
78867      50.714358  51.674194
157499     56.451623  48.307761
245373    145.822784  31.473291
288677     19.738333  -0.429333
344065     39.539134  56.226054
...              ...        ...
10542009   62.107115  55.052085
10590464  156.773001  35.559637
10612659  316.005910  34.610072
10616792   44.856373  29.498262
10685162  282.309970   8.837147

[400 rows x 2 columns]


# Doing the same for AGN

In [35]:
# Cross-match the AGN table (Gaia-style astrometry/photometry columns) with the
# LAMOST positions of the training AGN, then write one .npy feature vector per
# matched LAMOST obsid.
# This pass handles the training IDs; the validation IDs (id_val_agn) need a
# separate pass with their own output folder.

# File names in the spectra folder are parsed as integer LAMOST obsids
obsid_list = [int(obsid) for obsid in id_train_agn]

# Keep only the catalogue rows that belong to these spectra
filtered_df = lamost_catalog[lamost_catalog['obsid'].isin(obsid_list)]

# Show the positions that will be matched against
ra_dec_values = filtered_df[['ra', 'dec']]
print(ra_dec_values)

# Load the AGN table that carries the columns saved below.
# NOTE: read_pickle can execute arbitrary code - only load pickles you created yourself.
agn_data = pd.read_pickle("Pickles/agn_data.pkl")  # Loaded AGN data

# Force RA/Dec to numeric (unparseable entries become NaN), then drop rows without a position
agn_data['ra'] = pd.to_numeric(agn_data['ra'], errors='coerce')
agn_data['dec'] = pd.to_numeric(agn_data['dec'], errors='coerce')
agn_data = agn_data.dropna(subset=['ra', 'dec'])

# The match needs the obsid next to each LAMOST position
ra_dec_values = filtered_df[['obsid', 'ra', 'dec']]

# Build sky coordinates for both tables (RA/Dec are passed as degrees)
agn_coords = SkyCoord(ra=agn_data['ra'].values * u.deg, dec=agn_data['dec'].values * u.deg)
lamost_coords = SkyCoord(ra=ra_dec_values['ra'].values * u.deg, dec=ra_dec_values['dec'].values * u.deg)

# For every AGN row, find the nearest LAMOST position and the on-sky separation to it
idx, d2d, _ = agn_coords.match_to_catalog_sky(lamost_coords)

# Accept only pairs closer than the matching radius
match_radius = 1 * u.arcsec
matches = d2d < match_radius

matched_agn = agn_data.iloc[matches]
# Rename the LAMOST position columns: after the concat below, two columns named
# 'ra'/'dec' would make a selection by label return both copies, so every saved
# vector would hold 18 values instead of the 16 listed in feature_columns.
matched_lamost = ra_dec_values.iloc[idx[matches]].rename(
    columns={'ra': 'lamost_ra', 'dec': 'lamost_dec'}
)

# Put each AGN row side by side with its LAMOST counterpart
matched_data = pd.concat(
    [matched_agn.reset_index(drop=True), matched_lamost.reset_index(drop=True)],
    axis=1,
)

# Several AGN rows can fall within the radius of the same obsid and would
# overwrite each other's file; keep only the closest one per obsid.
matched_data['separation_arcsec'] = d2d[matches].to(u.arcsec).value
matched_data = (
    matched_data
    .sort_values('separation_arcsec')
    .drop_duplicates(subset='obsid', keep='first')
    .reset_index(drop=True)
)
assert matched_data['obsid'].is_unique

#######################################################################################################################

# Output folder for the per-obsid feature files
directory = 'gaia_training_set/agn_data'
os.makedirs(directory, exist_ok=True)

# Columns written to each file, in this order
feature_columns = [
    'ra', 'ra_error', 'dec', 'dec_error',
    'phot_g_mean_flux', 'phot_g_mean_flux_error',
    'phot_bp_mean_flux', 'phot_bp_mean_flux_error',
    'phot_rp_mean_flux', 'phot_rp_mean_flux_error',
    'parallax', 'parallax_error',
    'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
]
feature_table = matched_data[feature_columns].values

# Save one .npy file per matched obsid.
# obsid is read from its own column rather than from an iterrows() row, which can
# upcast it to float and produce names such as '<obsid>.0.npy'.
for obsid, values_to_save in zip(matched_data['obsid'], feature_table):
    np.save(os.path.join(directory, f"{int(obsid)}.npy"), values_to_save)

                  ra        dec
191       330.559330  -1.194770
250       330.872710  -1.160080
265       330.263060  -0.883410
344       331.182280  -0.110750
357       330.774316   0.581631
...              ...        ...
10685617  223.215770  38.688223
10685708  241.194370   5.478200
10685736  242.269670   5.353891
10685779  243.686543   4.325836
10686193  226.997120  26.928283

[35936 rows x 2 columns]


In [2]:
# Run a full garbage-collection pass to release memory held by the large
# intermediate tables; the cell displays the value returned by gc.collect().
# (gc is imported in the import cell at the top of the notebook.)
gc.collect()

200