In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from astroquery.gaia import Gaia
from astroquery.simbad import Simbad
from astropy.coordinates import SkyCoord
import astropy.units as u
import numpy as np
import pyvo as vo
import pickle
from astroquery.vizier import Vizier

In [6]:
def GetGAIAData(GaiaDR3SourceIDs):
    try:
        qry = f"SELECT * FROM gaiadr3.gaia_source gs WHERE gs.source_id in ({GaiaDR3SourceIDs});"
        job = Gaia.launch_job_async(qry)
        tblGaia = job.get_results()
        dfGaia = tblGaia.to_pandas()
        return dfGaia
    except Exception as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error

def split_ids_into_chunks(id_string, chunk_size=50000):
    id_list = id_string.split(', ')
    chunks = [', '.join(id_list[i:i + chunk_size]) for i in range(0, len(id_list), chunk_size)]
    return chunks

## Getting AGN

#### Getting SIMBAD labels

In [14]:
criteria_ = "otype = 'AGN..' "

# Initialize SIMBAD
simbad = Simbad()
simbad.ROW_LIMIT = -1
simbad.add_votable_fields("ids", "otype")

# Define coordinates (random)
coordinates = SkyCoord(0, -90, unit=("deg", "deg"))
coordinates2 = SkyCoord(0, 90, unit=("deg", "deg"))

# Query region 1st half
result = simbad.query_region(coordinates, radius="90d0m",
                           #  criteria="otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' ")
                           criteria=criteria_)
filtered_result = result["ids", "otype"] # Keep only the columns we need
result_df = filtered_result.to_pandas() # Convert result to a Pandas DataFrame
print(result_df.shape)
filtered_result = result_df.dropna() # Remove rows with any NULL values
data1 = filtered_result.to_numpy() # Convert to numpy array


# Query region 2nd half
result2 = simbad.query_region(coordinates2, radius="90d0m",
                             #criteria="otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' ")
                             criteria=criteria_)
filtered_result2 = result2["ids", "otype"] # Keep only the columns we need
result_df2 = filtered_result2.to_pandas() # Convert result to a Pandas DataFrame
print(result_df2.shape)
filtered_result2 = result_df2.dropna() # Remove rows with any NULL values
data2 = filtered_result2.to_numpy() # Convert to numpy array

# Combine the two arrays & save
#otypes_agn = np.row_stack((data1, data2))
#np.save("fullsky4catsunfiltered", simbad_data)

# without filtering
otypes_agn = pd.concat([result_df, result_df2])

(136855, 2)
(819002, 2)


#### Getting coresponding Gaia data

In [15]:
# Concatenate the DataFrames
simbad_data = otypes_agn

# Filter SIMBAD data to only include rows where 'ids' contains 'Gaia DR3'
simbad_data['gaia_id'] = simbad_data['ids'].apply(lambda x: next((id for id in x.split('|') if id.startswith('Gaia DR3')), None))

# Remove 'Gaia DR3' prefix and drop rows with NaN values in 'gaia_id'
simbad_data['gaia_id'] = simbad_data['gaia_id'].str.lstrip('Gaia DR3')
simbad_data = simbad_data.dropna(subset=['gaia_id'])

# Ensure 'gaia_id' is a string
simbad_data['gaia_id'] = simbad_data['gaia_id'].astype(str)

# Split the Gaia DR3 source IDs into chunks of 30,000 IDs each
GaiaDR3SourceIDs = ', '.join(simbad_data['gaia_id'].astype(str))
chunks = split_ids_into_chunks(GaiaDR3SourceIDs)

# Initialize an empty DataFrame to store all data
combined_df = pd.DataFrame()

# Process each chunk and append the results to the combined DataFrame
for chunk in chunks:
    dfGaia = GetGAIAData(chunk)
    combined_df = pd.concat([combined_df, dfGaia], ignore_index=True)

# Convert the combined DataFrame to a NumPy array if needed
combined_matrix = combined_df.to_numpy()

# GAIA data
gaia_data = combined_df[["source_id", "ra", "ra_error", "dec", "dec_error", "phot_g_mean_flux", "phot_g_mean_flux_error", "pmra", "pmra_error", "pmdec", "pmdec_error", "parallax", "parallax_error", "phot_bp_mean_flux", "phot_bp_mean_flux_error", "phot_rp_mean_flux", "phot_rp_mean_flux_error"]]

# Convert Gaia source_id to string
gaia_data['source_id'] = gaia_data['source_id'].astype(str)

# Merge Gaia and SIMBAD data on matching IDs
agn_data = pd.merge(gaia_data, simbad_data, left_on='source_id', right_on='gaia_id', how='inner')
agn_data.to_pickle("agn_data.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simbad_data['gaia_id'] = simbad_data['gaia_id'].astype(str)


INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gaia_data['source_id'] = gaia_data['source_id'].astype(str)


In [6]:
with open("agn_data.pkl", "rb") as f:
    agn_data = pickle.load(f)
agn_data_high_otype = agn_data.copy()
agn_data_high_otype = agn_data_high_otype.assign(otype= "AGN")
agn_data_high_otype = agn_data_high_otype[["source_id", "ra", "ra_error", "dec", "dec_error", "phot_g_mean_flux", "phot_g_mean_flux_error", "pmra", "pmra_error", "pmdec", "pmdec_error", "parallax", "parallax_error", "phot_bp_mean_flux", "phot_bp_mean_flux_error", "phot_rp_mean_flux", "phot_rp_mean_flux_error", "otype"]]
agn_data_high_otype.to_pickle("agn_data_high_otype.pkl")

## Getting non-AGN Galaxies

In [17]:
criteria_ = "otype = 'LSB..' OR otype = 'bCG..' OR otype = 'SBG..' OR otype = 'H2G..' OR otype = 'EmG..'"

# Initialize SIMBAD
simbad = Simbad()
simbad.ROW_LIMIT = -1
simbad.add_votable_fields("ids", "otype")

# Define coordinates (random)
coordinates = SkyCoord(0, -90, unit=("deg", "deg"))
coordinates2 = SkyCoord(0, 90, unit=("deg", "deg"))

# Query region 1st half
result = simbad.query_region(coordinates, radius="90d0m",
                           #  criteria="otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' ")
                           criteria=criteria_)
filtered_result = result["ids", "otype"] # Keep only the columns we need
result_df = filtered_result.to_pandas() # Convert result to a Pandas DataFrame
print(result_df.shape)
filtered_result = result_df.dropna() # Remove rows with any NULL values
data1 = filtered_result.to_numpy() # Convert to numpy array


# Query region 2nd half
result2 = simbad.query_region(coordinates2, radius="90d0m",
                             #criteria="otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' ")
                             criteria=criteria_)
filtered_result2 = result2["ids", "otype"] # Keep only the columns we need
result_df2 = filtered_result2.to_pandas() # Convert result to a Pandas DataFrame
print(result_df2.shape)
filtered_result2 = result_df2.dropna() # Remove rows with any NULL values
data2 = filtered_result2.to_numpy() # Convert to numpy array

# Combine the two arrays & save
#otypes_agn = np.row_stack((data1, data2))
#np.save("fullsky4catsunfiltered", simbad_data)

# without filtering
otypes_gal = pd.concat([result_df, result_df2])

(57267, 2)
(42307, 2)


#### Getting corresponding gaia data

In [18]:
# Concatenate the DataFrames
simbad_data = otypes_gal

# Filter SIMBAD data to only include rows where 'ids' contains 'Gaia DR3'
simbad_data['gaia_id'] = simbad_data['ids'].apply(lambda x: next((id for id in x.split('|') if id.startswith('Gaia DR3')), None))

# Remove 'Gaia DR3' prefix and drop rows with NaN values in 'gaia_id'
simbad_data['gaia_id'] = simbad_data['gaia_id'].str.lstrip('Gaia DR3')
simbad_data = simbad_data.dropna(subset=['gaia_id'])

# Ensure 'gaia_id' is a string
simbad_data['gaia_id'] = simbad_data['gaia_id'].astype(str)

# Split the Gaia DR3 source IDs into chunks of 30,000 IDs each
GaiaDR3SourceIDs = ', '.join(simbad_data['gaia_id'].astype(str))
chunks = split_ids_into_chunks(GaiaDR3SourceIDs)

# Initialize an empty DataFrame to store all data
combined_df = pd.DataFrame()

# Process each chunk and append the results to the combined DataFrame
for chunk in chunks:
    dfGaia = GetGAIAData(chunk)
    combined_df = pd.concat([combined_df, dfGaia], ignore_index=True)

# Convert the combined DataFrame to a NumPy array if needed
combined_matrix = combined_df.to_numpy()

# GAIA data
gaia_data = combined_df[["source_id", "ra", "ra_error", "dec", "dec_error", "phot_g_mean_flux", "phot_g_mean_flux_error", "pmra", "pmra_error", "pmdec", "pmdec_error", "parallax", "parallax_error", "phot_bp_mean_flux", "phot_bp_mean_flux_error", "phot_rp_mean_flux", "phot_rp_mean_flux_error"]]

# Convert Gaia source_id to string
gaia_data['source_id'] = gaia_data['source_id'].astype(str)

# Merge Gaia and SIMBAD data on matching IDs
gal_data = pd.merge(gaia_data, simbad_data, left_on='source_id', right_on='gaia_id', how='inner')
gal_data.to_pickle("gal_data.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simbad_data['gaia_id'] = simbad_data['gaia_id'].astype(str)


INFO: Query finished. [astroquery.utils.tap.core]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gaia_data['source_id'] = gaia_data['source_id'].astype(str)


In [10]:
with open("gal_data.pkl", "rb") as f:
    gal_data = pickle.load(f)
gal_data_high_otype = gal_data.copy()
gal_data_high_otype = gal_data_high_otype.assign(otype= "GAL") 
gal_data_high_otype = gal_data_high_otype[["source_id", "ra", "ra_error", "dec", "dec_error", "phot_g_mean_flux", "phot_g_mean_flux_error", "pmra", "pmra_error", "pmdec", "pmdec_error", "parallax", "parallax_error", "phot_bp_mean_flux", "phot_bp_mean_flux_error", "phot_rp_mean_flux", "phot_rp_mean_flux_error", "otype"]]
print(gal_data_high_otype.shape)

(33531, 18)


# Getting Binaries

In [20]:
criteria_ = "otype = '**..'"

# Initialize SIMBAD
simbad = Simbad()
simbad.ROW_LIMIT = -1
simbad.add_votable_fields("ids", "otype")

# Define coordinates (random)
coordinates = SkyCoord(0, -90, unit=("deg", "deg"))
coordinates2 = SkyCoord(0, 90, unit=("deg", "deg"))

# Query region 1st half
result = simbad.query_region(coordinates, radius="90d0m",
                           #  criteria="otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' ")
                           criteria=criteria_)
filtered_result = result["ids", "otype"] # Keep only the columns we need
result_df = filtered_result.to_pandas() # Convert result to a Pandas DataFrame
print(result_df.shape)
filtered_result = result_df.dropna() # Remove rows with any NULL values
data1 = filtered_result.to_numpy() # Convert to numpy array


# Query region 2nd half
result2 = simbad.query_region(coordinates2, radius="90d0m",
                             #criteria="otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' ")
                             criteria=criteria_)
filtered_result2 = result2["ids", "otype"] # Keep only the columns we need
result_df2 = filtered_result2.to_pandas() # Convert result to a Pandas DataFrame
print(result_df2.shape)
filtered_result2 = result_df2.dropna() # Remove rows with any NULL values
data2 = filtered_result2.to_numpy() # Convert to numpy array

# Combine the two arrays & save
#otypes_agn = np.row_stack((data1, data2))
#np.save("fullsky4catsunfiltered", simbad_data)

# without filtering
otypes_bin = pd.concat([result_df, result_df2])

(1446685, 2)
(917897, 2)


#### Getting coresponding gaia data

In [21]:
# Concatenate the DataFrames
simbad_data = otypes_bin

# Filter SIMBAD data to only include rows where 'ids' contains 'Gaia DR3'
simbad_data['gaia_id'] = simbad_data['ids'].apply(lambda x: next((id for id in x.split('|') if id.startswith('Gaia DR3')), None))

# Remove 'Gaia DR3' prefix and drop rows with NaN values in 'gaia_id'
simbad_data['gaia_id'] = simbad_data['gaia_id'].str.lstrip('Gaia DR3')
simbad_data = simbad_data.dropna(subset=['gaia_id'])

# Ensure 'gaia_id' is a string
simbad_data['gaia_id'] = simbad_data['gaia_id'].astype(str)

# Split the Gaia DR3 source IDs into chunks of 30,000 IDs each
GaiaDR3SourceIDs = ', '.join(simbad_data['gaia_id'].astype(str))
chunks = split_ids_into_chunks(GaiaDR3SourceIDs)

# Initialize an empty DataFrame to store all data
combined_df = pd.DataFrame()

# Process each chunk and append the results to the combined DataFrame
for chunk in chunks:
    dfGaia = GetGAIAData(chunk)
    combined_df = pd.concat([combined_df, dfGaia], ignore_index=True)

# Convert the combined DataFrame to a NumPy array if needed
combined_matrix = combined_df.to_numpy()

# GAIA data
gaia_data = combined_df[["source_id", "ra", "ra_error", "dec", "dec_error", "phot_g_mean_flux", "phot_g_mean_flux_error", "pmra", "pmra_error", "pmdec", "pmdec_error", "parallax", "parallax_error", "phot_bp_mean_flux", "phot_bp_mean_flux_error", "phot_rp_mean_flux", "phot_rp_mean_flux_error"]]

# Convert Gaia source_id to string
gaia_data['source_id'] = gaia_data['source_id'].astype(str)

# Merge Gaia and SIMBAD data on matching IDs
bin_data = pd.merge(gaia_data, simbad_data, left_on='source_id', right_on='gaia_id', how='inner')
bin_data.to_pickle("bin_data.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simbad_data['gaia_id'] = simbad_data['gaia_id'].astype(str)


INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gaia_data['source_id'] = gaia_data['source_id'].astype(str)


In [11]:
with open("bin_data.pkl", "rb") as f:
    bin_data = pickle.load(f)
bin_data_high_otype = bin_data.copy()
bin_data_high_otype = bin_data_high_otype.assign(otype= "BIN")
bin_data_high_otype = bin_data_high_otype[["source_id", "ra", "ra_error", "dec", "dec_error", "phot_g_mean_flux", "phot_g_mean_flux_error", "pmra", "pmra_error", "pmdec", "pmdec_error", "parallax", "parallax_error", "phot_bp_mean_flux", "phot_bp_mean_flux_error", "phot_rp_mean_flux", "phot_rp_mean_flux_error", "otype"]]
print(bin_data_high_otype.shape)

(1700440, 18)


# Getting data for single stars

In [4]:
criteria_ = "otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' OR otype = 'Pe*..' OR otype = 'SN*' OR otype = 'LM*' OR otype = 'V*' OR otype = 'Em*' OR otype = 'PM*' OR otype = 'HV*'"

# Initialize SIMBAD
simbad = Simbad()
simbad.ROW_LIMIT = -1
simbad.add_votable_fields("ids", "otype")

# Define coordinates (random)
coordinates = SkyCoord(0, -90, unit=("deg", "deg"))
coordinates2 = SkyCoord(0, 90, unit=("deg", "deg"))

# Query region 1st half
result = simbad.query_region(coordinates, radius="90d0m",
                           #  criteria="otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' ")
                           criteria=criteria_)
filtered_result = result["ids", "otype"] # Keep only the columns we need
result_df = filtered_result.to_pandas() # Convert result to a Pandas DataFrame
print(result_df.shape)
filtered_result = result_df.dropna() # Remove rows with any NULL values
data1 = filtered_result.to_numpy() # Convert to numpy array


# Query region 2nd half
result2 = simbad.query_region(coordinates2, radius="90d0m",
                             #criteria="otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' ")
                             criteria=criteria_)
filtered_result2 = result2["ids", "otype"] # Keep only the columns we need
result_df2 = filtered_result2.to_pandas() # Convert result to a Pandas DataFrame
print(result_df2.shape)
filtered_result2 = result_df2.dropna() # Remove rows with any NULL values
data2 = filtered_result2.to_numpy() # Convert to numpy array

# Combine the two arrays & save
#otypes_agn = np.row_stack((data1, data2))
#np.save("fullsky4catsunfiltered", simbad_data)

# without filtering
otypes_star = pd.concat([result_df, result_df2])

(1237922, 2)
(810162, 2)


#### Getting coresponding gaia data 

In [12]:
# Concatenate the DataFrames
simbad_data = otypes_star

# Filter SIMBAD data to only include rows where 'ids' contains 'Gaia DR3'
simbad_data['gaia_id'] = simbad_data['ids'].apply(lambda x: next((id for id in x.split('|') if id.startswith('Gaia DR3')), None))

# Remove 'Gaia DR3' prefix and drop rows with NaN values in 'gaia_id'
simbad_data['gaia_id'] = simbad_data['gaia_id'].str.lstrip('Gaia DR3')
simbad_data = simbad_data.dropna(subset=['gaia_id'])

# Ensure 'gaia_id' is a string
simbad_data['gaia_id'] = simbad_data['gaia_id'].astype(str)

# Split the Gaia DR3 source IDs into chunks of 30,000 IDs each
GaiaDR3SourceIDs = ', '.join(simbad_data['gaia_id'].astype(str))
chunks = split_ids_into_chunks(GaiaDR3SourceIDs)

# Initialize an empty DataFrame to store all data
combined_df = pd.DataFrame()

# Process each chunk and append the results to the combined DataFrame
for chunk in chunks:
    dfGaia = GetGAIAData(chunk)
    combined_df = pd.concat([combined_df, dfGaia], ignore_index=True)

# Convert the combined DataFrame to a NumPy array if needed
combined_matrix = combined_df.to_numpy()

# GAIA data
gaia_data = combined_df[["source_id", "ra", "ra_error", "dec", "dec_error", "phot_g_mean_flux", "phot_g_mean_flux_error", "pmra", "pmra_error", "pmdec", "pmdec_error", "parallax", "parallax_error", "phot_bp_mean_flux", "phot_bp_mean_flux_error", "phot_rp_mean_flux", "phot_rp_mean_flux_error"]]

# Convert Gaia source_id to string
gaia_data['source_id'] = gaia_data['source_id'].astype(str)

# Merge Gaia and SIMBAD data on matching IDs
star_data = pd.merge(gaia_data, simbad_data, left_on='source_id', right_on='gaia_id', how='inner')
star_data.to_pickle("star_data.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simbad_data['gaia_id'] = simbad_data['gaia_id'].astype(str)


INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gaia_data['source_id'] = gaia_data['source_id'].astype(str)


In [13]:
with open('star_data.pkl', 'rb') as file:
    star_data = pickle.load(file)
star_data_high_otype = star_data.copy()
star_data_high_otype = star_data_high_otype.assign(otype= "STAR")
star_data_high_otype = star_data_high_otype[["source_id", "ra", "ra_error", "dec", "dec_error", "phot_g_mean_flux", "phot_g_mean_flux_error", "pmra", "pmra_error", "pmdec", "pmdec_error", "parallax", "parallax_error", "phot_bp_mean_flux", "phot_bp_mean_flux_error", "phot_rp_mean_flux", "phot_rp_mean_flux_error", "otype"]]
print(star_data_high_otype.shape)

(1499508, 18)


## Naive way combining all data

In [35]:
all_data_high_otype = pd.concat([agn_data_high_otype, gal_data_high_otype, bin_data_high_otype, star_data_high_otype])
all_data_high_otype = all_data_high_otype.dropna()  
np.save("all_data_high_otype", all_data_high_otype.to_numpy())
print(all_data_high_otype.shape)


(3267794, 18)


# Getting equal amounts of data for each category

In [39]:
n = min(agn_data_high_otype.dropna().shape[0], gal_data_high_otype.dropna().shape[0], bin_data_high_otype.dropna().shape[0], star_data_high_otype.dropna().shape[0])
equal_data_high_otype = pd.concat([agn_data_high_otype.sample(n=n), gal_data_high_otype, bin_data_high_otype.sample(n=n), star_data_high_otype.sample(n=n)])
equal_data_high_otype = equal_data_high_otype.dropna()
np.save("equal_data_high_otype", equal_data_high_otype.to_numpy())
print(equal_data_high_otype.shape)

(14071, 18)


### Using all except galaxies

In [15]:
n = min(agn_data_high_otype.dropna().shape[0], bin_data_high_otype.dropna().shape[0], star_data_high_otype.dropna().shape[0])
no_gal_high_otype = pd.concat([agn_data_high_otype.dropna().sample(n=n), bin_data_high_otype.dropna().sample(n=n), star_data_high_otype.dropna().sample(n=n)])
#equal_data_high_otype = equal_data_high_otype.dropna()
np.save("no_gal_high_otype",no_gal_high_otype.to_numpy())
print(no_gal_high_otype.shape)

(949731, 18)


In [None]:
no_gal_high_otype = pd.concat([agn_data_high_otype, bin_data_high_otype, star_data_high_otype])
no_gal_high_otype = no_gal_high_otype.dropna()  

In [7]:
del_gal = pd.DataFrame(all_data_high_otype)
# filter the rows that contain the substring
substring = 'GAL'
filter = del_gal['17'].str.contains(substring)
filtered_df = df[~filter]

# display the filtered data frame
print(f"\nData Frame after removing rows that contain '{substring}' in 'Name' column:")
print(filtered_df)

KeyError: '17'

## Getting LAMOST DATA

In [12]:
# Load your AGN data and LAMOST catalog (assuming you have a local CSV or FITS file for LAMOST)
agn_data = pd.read_pickle("agn_data.pkl")  # Loaded AGN data
lamost_catalog = pd.read_csv("dr9_v2.0_LRS_catalogue.csv")  # Assuming CSV format for LAMOST catalog

# Ensure that RA and Dec columns are numeric and have units
agn_data['ra'] = pd.to_numeric(agn_data['ra'], errors='coerce')  # Coerce non-numeric to NaN
agn_data['dec'] = pd.to_numeric(agn_data['dec'], errors='coerce')
lamost_catalog['ra'] = pd.to_numeric(lamost_catalog['ra'], errors='coerce')
lamost_catalog['dec'] = pd.to_numeric(lamost_catalog['dec'], errors='coerce')

# Drop rows with NaN values in RA or Dec
agn_data = agn_data.dropna(subset=['ra', 'dec'])
lamost_catalog = lamost_catalog.dropna(subset=['ra', 'dec'])

# Convert AGN and LAMOST data to SkyCoord objects for crossmatching
agn_coords = SkyCoord(ra=agn_data['ra'].values*u.deg, dec=agn_data['dec'].values*u.deg)
lamost_coords = SkyCoord(ra=lamost_catalog['ra'].values*u.deg, dec=lamost_catalog['dec'].values*u.deg)

# Perform the crossmatch using astropy's match_to_catalog_sky function
idx, d2d, _ = agn_coords.match_to_catalog_sky(lamost_coords)

# Define a matching radius
match_radius = 1 * u.arcsec
matches = d2d < match_radius

# Filter the matches
matched_agn = agn_data.iloc[matches]
matched_lamost = lamost_catalog.iloc[idx[matches]]

# Combine matched data
agn_lamost_data = pd.concat([matched_agn.reset_index(drop=True), matched_lamost.reset_index(drop=True)], axis=1)

# Save the crossmatched data
agn_lamost_data.to_pickle("agn_lamost_data.pkl")

print(f"Number of matches: {agn_lamost_data.shape[0]}")
print("out of ", agn_data.shape[0])


Number of matches: 40138
out of  412025
