In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from astroquery.gaia import Gaia
from astroquery.simbad import Simbad
from astropy.coordinates import SkyCoord
import astropy.units as u
import numpy as np
import pyvo as vo


## Checking column names for ref

In [2]:
gaiadr3_table = Gaia.load_table('gaiadr3.gaia_source')
print(gaiadr3_table)
for column in gaiadr3_table.columns:
  print(column.name)

TAP Table name: gaiadr3.gaia_source
Description: This table has an entry for every Gaia observed source as published with this data release. It contains the basic source parameters, in their final state as processed by the Gaia Data Processing and Analysis Consortium from the raw data coming from the spacecraft. The table is complemented with others containing information specific to certain kinds of objects (e.g.~Solar--system objects, non--single stars, variables etc.) and value--added processing (e.g.~astrophysical parameters etc.). Further array data types (spectra, epoch measurements) are presented separately via Datalink resources.
Size (bytes): 3646930329600
Num. columns: 152
solution_id
designation
source_id
random_index
ref_epoch
ra
ra_error
dec
dec_error
parallax
parallax_error
parallax_over_error
pm
pmra
pmra_error
pmdec
pmdec_error
ra_dec_corr
ra_parallax_corr
ra_pmra_corr
ra_pmdec_corr
dec_parallax_corr
dec_pmra_corr
dec_pmdec_corr
parallax_pmra_corr
parallax_pmdec_corr
pm

In [3]:
r = r.drop('pseudocolour', axis=1)
r = r.drop('pseudocolour_error', axis=1)

# Remove rows with any NULL values
result = r.dropna()

NameError: name 'r' is not defined

## Getting Simbad otypes for these bad boys

In [4]:
# Initialize SIMBAD
simbad = Simbad()
simbad.ROW_LIMIT = -1
simbad.add_votable_fields( "plx_value", "V", "I", "J", "H", "K","G", "pmdec", "pmra", "hpx","ids", "otype")

# Define coordinates (random)
coordinates = SkyCoord(0, -90, unit=("deg", "deg"))
coordinates2 = SkyCoord(0, 90, unit=("deg", "deg"))

# Query region 1st half
result = simbad.query_region(coordinates, radius="90d0m",
                             criteria="otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' ")

filtered_result = result["main_id", "ra", "dec", "plx_value", "V", "I", "J", "H", "K","G", "pmdec", "pmra", 
                         "hpx", "ids", "otype"]
result_df = filtered_result.to_pandas() # Convert result to a Pandas DataFrame
filtered_result = result_df.dropna() # Remove rows with any NULL values
data1 = filtered_result.to_numpy() # Convert to numpy array


# Query region 2nd half
result2 = simbad.query_region(coordinates2, radius="90d0m",
                             criteria="otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' ")

filtered_result2 = result2["main_id", "ra", "dec", "plx_value", "V", "I", "J", "H", "K","G", "pmdec", "pmra",
                            "hpx", "ids", "otype"]
result_df2 = filtered_result2.to_pandas() # Convert result to a Pandas DataFrame
filtered_result2 = result_df2.dropna() # Remove rows with any NULL values
data2 = filtered_result2.to_numpy() # Convert to numpy array

# Combine the two arrays & save
simbad_data = np.row_stack((data1, data2))
np.save("fullsky4catsunfiltered", simbad_data)

# without filtering
simbad_data_all = pd.concat([result_df, result_df2])



## Adding labels to gaia

In [28]:
simbad_data_all = pd.concat([result_df, result_df2])
simbad_data_ref = simbad_data_all[["ids", "otype"]]

# Filter SIMBAD data to only include rows where 'ids' contains 'Gaia DR3'
simbad_data_ref['gaia_id'] = simbad_data_ref['ids'].apply(lambda x: next((id for id in x.split('|') if id.startswith('Gaia DR3')), None))

# Remove 'Gaia DR3' prefix and drop rows with NaN values in 'gaia_id'
simbad_data_ref['gaia_id'] = simbad_data_ref['gaia_id'].str.lstrip('Gaia DR3')
simbad_data_ref = simbad_data_ref.dropna(subset=['gaia_id'])

# Ensure 'gaia_id' is a string
simbad_data_ref['gaia_id'] = simbad_data_ref['gaia_id'].astype(str)

# Convert Gaia source_id to string
gaia_data['source_id'] = gaia_data['source_id'].astype(str)

# Merge Gaia and SIMBAD data on matching IDs
merged_data2 = pd.merge(gaia_data, simbad_data_ref, left_on='source_id', right_on='gaia_id', how='inner')

merged_data2 = merged_data2.drop(['gaia_id', 'ids'], axis=1)
merged_data2 = merged_data2.dropna()
np.save("fullsky4catgaia", merged_data2.to_numpy())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simbad_data_ref['gaia_id'] = simbad_data_ref['ids'].apply(lambda x: next((id for id in x.split('|') if id.startswith('Gaia DR3')), None))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simbad_data_ref['gaia_id'] = simbad_data_ref['gaia_id'].str.lstrip('Gaia DR3')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

In [29]:
FirstData = np.load("fullsky4catgaia.npy", allow_pickle=True)
print(FirstData.shape)
X = FirstData[:, 1:-1]  # All columns except the last one
y = FirstData[:, -1]    # The last column

Macondition = np.isin(y, ["bC*", "sg*", "s*r", "s*y", "s*b", "WR*", "N*", "Psr"])
y4cat = np.where(Macondition, "Ma*", y)
YOcondition = np.isin(y, ["Or*", "TT*", "out", "Ae*", "HH"])
y4cat = np.where(YOcondition, "Y*O", y4cat)
MScondition = np.isin(y, ["Be*", "BS*", "SX*", "gD*", "dS*"])
y4cat = np.where(MScondition, "MS*", y4cat)
Evcondition = np.isin(y, ["RG*", "HS*", "RR*", "HB*", "WV*", "Ce*", "cC*", "C*", "S*", "LP*", "AS*", "AB*", "Mi*", "OH*", "pA*", "RV*", "PN", "WD*"])
y4cat = np.where(Evcondition, "Ev*", y4cat)

# Combine X and y4cat into a single array
combined_data = np.column_stack((X, y4cat))

# Save the combined array
np.save("cat4gaia.npy", combined_data)

(459713, 26)


## Combining both

# Getting gaia data for simbad sources 

In [6]:
def GetGAIAData(GaiaDR3SourceIDs):
    try:
        qry = f"SELECT * FROM gaiadr3.gaia_source gs WHERE gs.source_id in ({GaiaDR3SourceIDs});"
        job = Gaia.launch_job_async(qry)
        tblGaia = job.get_results()
        dfGaia = tblGaia.to_pandas()
        return dfGaia
    except Exception as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error

def split_ids_into_chunks(id_string, chunk_size=30000):
    id_list = id_string.split(', ')
    chunks = [', '.join(id_list[i:i + chunk_size]) for i in range(0, len(id_list), chunk_size)]
    return chunks

#load simbad data
#simbad_data = np.load("fullsky4catsunfiltered.npy", allow_pickle=True)
print(type(simbad_data))

# Example usage
GaiaDR3SourceIDs = ', '.join(simbad_data['gaia_id'].astype(str))
chunks = split_ids_into_chunks(GaiaDR3SourceIDs)

# Initialize an empty DataFrame to store all data
combined_df = pd.DataFrame()

# Process each chunk and append the results to the combined DataFrame
for chunk in chunks:
    dfGaia = GetGAIAData(chunk)
    combined_df = pd.concat([combined_df, dfGaia], ignore_index=True)

# Convert the combined DataFrame to a NumPy array if needed
combined_matrix = combined_df.to_numpy()

# Print the combined DataFrame and matrix to verify
print(combined_df)
print(combined_matrix)

<class 'pandas.core.frame.DataFrame'>
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query 

### gaia data

In [17]:
cleangaia = gaia_data.dropna() 

simbad_all_4cat = np.load("fullsky4cats.npy", allow_pickle=True)
simbad_all_4cat = simbad_all_4cat[]

## Combining both df

In [7]:
# Concatenate the DataFrames
simbad_data = pd.concat((result_df, result_df2))

# Filter SIMBAD data to only include rows where 'ids' contains 'Gaia DR3'
simbad_data['gaia_id'] = simbad_data['ids'].apply(lambda x: next((id for id in x.split('|') if id.startswith('Gaia DR3')), None))

# Remove 'Gaia DR3' prefix and drop rows with NaN values in 'gaia_id'
simbad_data['gaia_id'] = simbad_data['gaia_id'].str.lstrip('Gaia DR3')
simbad_data = simbad_data.dropna(subset=['gaia_id'])

# Ensure 'gaia_id' is a string
simbad_data['gaia_id'] = simbad_data['gaia_id'].astype(str)

# Split the Gaia DR3 source IDs into chunks of 30,000 IDs each
GaiaDR3SourceIDs = ', '.join(simbad_data['gaia_id'].astype(str))
chunks = split_ids_into_chunks(GaiaDR3SourceIDs)

# Initialize an empty DataFrame to store all data
combined_df = pd.DataFrame()

# Process each chunk and append the results to the combined DataFrame
for chunk in chunks:
    dfGaia = GetGAIAData(chunk)
    combined_df = pd.concat([combined_df, dfGaia], ignore_index=True)

# Convert the combined DataFrame to a NumPy array if needed
combined_matrix = combined_df.to_numpy()

# GAIA data
gaia_data = combined_df[["source_id", "ra", "dec", "phot_g_mean_flux", "phot_g_mean_flux_error", "pm", "parallax", "parallax_error", "phot_bp_mean_flux", "phot_bp_mean_flux_error", "phot_rp_mean_flux", "phot_rp_mean_flux_error", "teff_gspphot", "teff_gspphot_lower", "teff_gspphot_upper", "logg_gspphot", "logg_gspphot_lower", "logg_gspphot_upper", "mh_gspphot", "mh_gspphot_upper", "mh_gspphot_lower", "bp_rp", "bp_g", "g_rp", "ruwe" ]]

# Convert Gaia source_id to string
gaia_data['source_id'] = gaia_data['source_id'].astype(str)

# Merge Gaia and SIMBAD data on matching IDs
merged_data = pd.merge(gaia_data, simbad_data, left_on='source_id', right_on='gaia_id', how='inner')

np.save("fullsky4catsgaia", merged_data)
# Display the merged data
print(merged_data)


INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]
INFO: Query finished. [astroquery.utils.tap.core]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gaia_data['source_id'] = gaia_data['source_id'].astype(str)


                   source_id        ra_x      dec_x  phot_g_mean_flux  \
0        2314807774192293888  359.616575 -30.575340      8.217440e+02   
1        2315780777558518400    6.124880 -32.736242      2.055230e+06   
2        2315815721412502656    6.526350 -32.406611      2.893416e+03   
3        2317499000699411328    7.858203 -31.144719      5.026150e+02   
4        2319939469836304000    4.038796 -31.038104      4.600667e+02   
...                      ...         ...        ...               ...   
1033887  4602668309390248320  264.190870  35.380182      1.160842e+05   
1033888  4602859761853734784  270.991181  31.192658      3.651999e+04   
1033889  4603106464774587136  269.939597  31.588431      9.242272e+04   
1033890  4606894488492467200  273.617550  37.672687      1.638562e+05   
1033891  4610563558792185344  268.659376  37.337387      1.399180e+03   

         phot_g_mean_flux_error         pm  parallax  parallax_error  \
0                      1.528197  19.457048  3.62147

## Cleaning data

In [14]:
merged_datatemp = merged_data.dropna(subset=["teff_gspphot"])
merged_data2 = np.delete(merged_datatemp, 40, 1)
merged_data2 = np.delete(merged_data2, 38, 1)
merged_data2 = np.delete(merged_data2, 26, 1)
merged_data2 = np.delete(merged_data2, 25, 1)


np.save("fullsky4catsgaia3", merged_data2)

## Other trashy stuff