In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from astroquery.gaia import Gaia
from astroquery.simbad import Simbad
from astropy.coordinates import SkyCoord
import astropy.units as u
import numpy as np
import pyvo as vo
import pickle
from astroquery.vizier import Vizier

In [3]:
def GetGAIAData(GaiaDR3SourceIDs):
    """Fetch all ``gaiadr3.gaia_source`` columns for the given Gaia DR3 source IDs.

    Parameters
    ----------
    GaiaDR3SourceIDs : str or iterable
        Either the comma-separated ID text that is placed inside the ADQL
        ``IN (...)`` list, or an iterable of IDs that is joined into that text.

    Returns
    -------
    pandas.DataFrame
        The query result converted to pandas. An empty DataFrame is returned
        when no IDs are supplied or when anything fails; failures are printed
        rather than raised, so a caller looping over chunks keeps going.
    """
    try:
        if not isinstance(GaiaDR3SourceIDs, str):
            GaiaDR3SourceIDs = ', '.join(str(source_id) for source_id in GaiaDR3SourceIDs)
        if not GaiaDR3SourceIDs.strip():
            # Nothing to look up: avoid a remote round trip with an empty IN () list.
            return pd.DataFrame()
        qry = f"SELECT * FROM gaiadr3.gaia_source gs WHERE gs.source_id in ({GaiaDR3SourceIDs});"
        job = Gaia.launch_job_async(qry)
        tblGaia = job.get_results()
        dfGaia = tblGaia.to_pandas()
        return dfGaia
    except Exception as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error

def split_ids_into_chunks(id_string, chunk_size=50000):
    """Split a comma-separated ID string into smaller comma-separated strings.

    Parameters
    ----------
    id_string : str
        IDs separated by commas; whitespace around each ID is ignored and
        empty entries (e.g. from a trailing comma or an empty string) are dropped.
    chunk_size : int, optional
        Maximum number of IDs per returned chunk (default 50000).

    Returns
    -------
    list of str
        Each element holds at most ``chunk_size`` IDs joined with ``', '``.
        An input without any ID yields an empty list.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    stripped = (token.strip() for token in id_string.split(','))
    id_list = [token for token in stripped if token]
    return [', '.join(id_list[i:i + chunk_size]) for i in range(0, len(id_list), chunk_size)]

# Region queries using all otype criteria

In [None]:
# ADQL condition matching every object type of interest (one `otype = ...` test per type)
criteria_ = "otype = 'MS*' OR otype = 'Be*' OR otype = 'BS*' OR otype = 'SX*' OR otype = 'gD*' OR otype = 'dS*' OR otype = 'Ma*' OR otype = 'bC*' OR otype = 'sg*' \
OR otype = 's*r' OR otype = 's*y' OR otype = 's*b' OR otype = 'WR*' OR otype = 'N*' OR otype = 'Psr' OR otype = 'Y*O' OR otype = 'Or*' OR otype = 'TT*' OR  \
otype = 'Ae*' OR otype = 'out' OR otype = 'HH' OR otype = 'Ev*' OR otype = 'RG*' OR otype = 'HS*' OR otype = 'HB*' OR otype = 'RR*' OR otype = 'WV*' OR otype = 'Ce*' \
OR otype = 'cC*' OR otype = 'C*' OR otype = 'S*' OR otype = 'LP*' OR otype = 'AB*' OR otype = 'Mi*' OR otype = 'OH*' OR otype = 'pA*' OR otype = 'RV*' OR otype = 'PN' \
OR otype = 'WD*' OR otype = 'Pe*' OR otype = 'a2*' OR otype = 'RC*' OR otype = 'EB*' OR otype = 'El*' OR otype = 'SB*' OR otype = 'RS*' OR otype = 'BY*' OR otype = 'Sy*' \
OR otype = 'XB*' OR otype = 'LXB' OR otype = 'HXB' OR otype = 'CV*' OR otype = 'No*' OR otype = '**' OR otype = 'SN*' OR otype = 'LM*' OR otype = 'BD*' OR otype = 'Ir*' \
OR otype = 'Er*' OR otype = 'Ro*' OR otype = 'Pu*' OR otype = 'Em*' OR otype = 'PM*' OR otype = 'HV*'"

# Initialize SIMBAD with the extra output fields used below
simbad = Simbad()
simbad.ROW_LIMIT = -1
simbad.add_votable_fields("ids", "alltypes", "otype")

# Cone centres at the two celestial poles: with a 90 degree radius each cone
# is one hemisphere, so the two queries together span the whole sky.
coordinates = SkyCoord(0, -90, unit=("deg", "deg"))
coordinates2 = SkyCoord(0, 90, unit=("deg", "deg"))

# Query region 1st half (southern hemisphere)
# NOTE(review): the recorded run warned "Partial result set" and this query
# returned exactly 2000000 rows, so its result looks truncated — confirm.
result = simbad.query_region(coordinates, radius="90d0m", criteria=criteria_)
filtered_result = result["ids", "otypes", "otype"] # Keep only the columns we need
result_df = filtered_result.to_pandas() # Convert result to a Pandas DataFrame
print(result_df.shape)


# Query region 2nd half (northern hemisphere)
result2 = simbad.query_region(coordinates2, radius="90d0m", criteria=criteria_)
filtered_result2 = result2["ids", "otypes", "otype"] # Keep only the columns we need
result_df2 = filtered_result2.to_pandas() # Convert result to a Pandas DataFrame
print(result_df2.shape)


# Combine the two hemisphere DataFrames
# NOTE(review): `bin` shadows the Python builtin of the same name; kept so any
# existing reference keeps working, but a descriptive name would be safer.
bin = pd.concat([result_df, result_df2])

  warn("Partial result set. Potential causes MAXREC, async storage space, etc.",


(2000000, 3)
(1896034, 3)


# Getting the otype

In [123]:
# SIMBAD client; the instance is reused by later cells.
# NOTE(review): query_tap sends the ADQL text as written, so the votable
# fields added here presumably do not influence the query below — confirm.
simbad = Simbad()
simbad.ROW_LIMIT = -1
simbad.add_votable_fields("ids", "alltypes", "otype")

# Partition 1 of 4: objects of the selected types with pmra < 0 and pmdec < -4.
# maxrec is passed explicitly as 2000000 for every partition.
# NOTE(review): rows whose pmra or pmdec is NULL satisfy none of the four
# proper-motion ranges used by query1-query4 (comparisons with NULL are not
# true), so objects without a proper motion appear to be left out — confirm
# this is intended.
query1 = simbad.query_tap("""SELECT oid, otype
FROM basic
WHERE (otype = 'MS*' OR otype = 'Be*' OR otype = 'BS*' OR otype = 'SX*' OR otype = 'gD*' OR otype = 'dS*' OR otype = 'Ma*' OR otype = 'bC*' OR otype = 'sg*' \
OR otype = 's*r' OR otype = 's*y' OR otype = 's*b' OR otype = 'WR*' OR otype = 'N*' OR otype = 'Psr' OR otype = 'Y*O' OR otype = 'Or*' OR otype = 'TT*' OR  \
otype = 'Ae*' OR otype = 'out' OR otype = 'HH' OR otype = 'Ev*' OR otype = 'RG*' OR otype = 'HS*' OR otype = 'HB*' OR otype = 'RR*' OR otype = 'WV*' OR otype = 'Ce*' \
OR otype = 'cC*' OR otype = 'C*' OR otype = 'S*' OR otype = 'LP*' OR otype = 'AB*' OR otype = 'Mi*' OR otype = 'OH*' OR otype = 'pA*' OR otype = 'RV*' OR otype = 'PN' \
OR otype = 'WD*' OR otype = 'Pe*' OR otype = 'a2*' OR otype = 'RC*' OR otype = 'EB*' OR otype = 'El*' OR otype = 'SB*' OR otype = 'RS*' OR otype = 'BY*' OR otype = 'Sy*' \
OR otype = 'XB*' OR otype = 'LXB' OR otype = 'HXB' OR otype = 'CV*' OR otype = 'No*' OR otype = '**' OR otype = 'SN*' OR otype = 'LM*' OR otype = 'BD*' OR otype = 'Ir*' \
OR otype = 'Er*' OR otype = 'Ro*' OR otype = 'Pu*' OR otype = 'Em*' OR otype = 'PM*' OR otype = 'HV*') AND (pmra < 0 ) AND (pmdec < -4)""", maxrec=2000000)

In [121]:
# Partition 2 of 4: same object-type filter, restricted to pmra < 0 and pmdec >= -4.
adql_query2 = """SELECT oid, otype
FROM basic
WHERE (otype = 'MS*' OR otype = 'Be*' OR otype = 'BS*' OR otype = 'SX*' OR otype = 'gD*' OR otype = 'dS*' OR otype = 'Ma*' OR otype = 'bC*' OR otype = 'sg*' \
OR otype = 's*r' OR otype = 's*y' OR otype = 's*b' OR otype = 'WR*' OR otype = 'N*' OR otype = 'Psr' OR otype = 'Y*O' OR otype = 'Or*' OR otype = 'TT*' OR  \
otype = 'Ae*' OR otype = 'out' OR otype = 'HH' OR otype = 'Ev*' OR otype = 'RG*' OR otype = 'HS*' OR otype = 'HB*' OR otype = 'RR*' OR otype = 'WV*' OR otype = 'Ce*' \
OR otype = 'cC*' OR otype = 'C*' OR otype = 'S*' OR otype = 'LP*' OR otype = 'AB*' OR otype = 'Mi*' OR otype = 'OH*' OR otype = 'pA*' OR otype = 'RV*' OR otype = 'PN' \
OR otype = 'WD*' OR otype = 'Pe*' OR otype = 'a2*' OR otype = 'RC*' OR otype = 'EB*' OR otype = 'El*' OR otype = 'SB*' OR otype = 'RS*' OR otype = 'BY*' OR otype = 'Sy*' \
OR otype = 'XB*' OR otype = 'LXB' OR otype = 'HXB' OR otype = 'CV*' OR otype = 'No*' OR otype = '**' OR otype = 'SN*' OR otype = 'LM*' OR otype = 'BD*' OR otype = 'Ir*' \
OR otype = 'Er*' OR otype = 'Ro*' OR otype = 'Pu*' OR otype = 'Em*' OR otype = 'PM*' OR otype = 'HV*') AND (pmra < 0 ) AND (pmdec >= -4)"""

# Run the ADQL through the SIMBAD TAP service with the 2000000-row cap used by every partition.
query2 = Simbad.query_tap(adql_query2, maxrec=2000000)

In [122]:
# Partition 3 of 4: same object-type filter, restricted to pmra >= 0 and pmdec < -3.
adql_query3 = """SELECT oid, otype
FROM basic
WHERE (otype = 'MS*' OR otype = 'Be*' OR otype = 'BS*' OR otype = 'SX*' OR otype = 'gD*' OR otype = 'dS*' OR otype = 'Ma*' OR otype = 'bC*' OR otype = 'sg*' \
OR otype = 's*r' OR otype = 's*y' OR otype = 's*b' OR otype = 'WR*' OR otype = 'N*' OR otype = 'Psr' OR otype = 'Y*O' OR otype = 'Or*' OR otype = 'TT*' OR  \
otype = 'Ae*' OR otype = 'out' OR otype = 'HH' OR otype = 'Ev*' OR otype = 'RG*' OR otype = 'HS*' OR otype = 'HB*' OR otype = 'RR*' OR otype = 'WV*' OR otype = 'Ce*' \
OR otype = 'cC*' OR otype = 'C*' OR otype = 'S*' OR otype = 'LP*' OR otype = 'AB*' OR otype = 'Mi*' OR otype = 'OH*' OR otype = 'pA*' OR otype = 'RV*' OR otype = 'PN' \
OR otype = 'WD*' OR otype = 'Pe*' OR otype = 'a2*' OR otype = 'RC*' OR otype = 'EB*' OR otype = 'El*' OR otype = 'SB*' OR otype = 'RS*' OR otype = 'BY*' OR otype = 'Sy*' \
OR otype = 'XB*' OR otype = 'LXB' OR otype = 'HXB' OR otype = 'CV*' OR otype = 'No*' OR otype = '**' OR otype = 'SN*' OR otype = 'LM*' OR otype = 'BD*' OR otype = 'Ir*' \
OR otype = 'Er*' OR otype = 'Ro*' OR otype = 'Pu*' OR otype = 'Em*' OR otype = 'PM*' OR otype = 'HV*') AND (pmra >= 0) AND (pmdec < -3)"""

# Run the ADQL through the SIMBAD TAP service with the 2000000-row cap used by every partition.
query3 = Simbad.query_tap(adql_query3, maxrec=2000000)

In [120]:
# Partition 4 of 4: same object-type filter, restricted to pmra >= 0 and pmdec >= -3.
adql_query4 = """SELECT oid, otype
FROM basic
WHERE (otype = 'MS*' OR otype = 'Be*' OR otype = 'BS*' OR otype = 'SX*' OR otype = 'gD*' OR otype = 'dS*' OR otype = 'Ma*' OR otype = 'bC*' OR otype = 'sg*' \
OR otype = 's*r' OR otype = 's*y' OR otype = 's*b' OR otype = 'WR*' OR otype = 'N*' OR otype = 'Psr' OR otype = 'Y*O' OR otype = 'Or*' OR otype = 'TT*' OR  \
otype = 'Ae*' OR otype = 'out' OR otype = 'HH' OR otype = 'Ev*' OR otype = 'RG*' OR otype = 'HS*' OR otype = 'HB*' OR otype = 'RR*' OR otype = 'WV*' OR otype = 'Ce*' \
OR otype = 'cC*' OR otype = 'C*' OR otype = 'S*' OR otype = 'LP*' OR otype = 'AB*' OR otype = 'Mi*' OR otype = 'OH*' OR otype = 'pA*' OR otype = 'RV*' OR otype = 'PN' \
OR otype = 'WD*' OR otype = 'Pe*' OR otype = 'a2*' OR otype = 'RC*' OR otype = 'EB*' OR otype = 'El*' OR otype = 'SB*' OR otype = 'RS*' OR otype = 'BY*' OR otype = 'Sy*' \
OR otype = 'XB*' OR otype = 'LXB' OR otype = 'HXB' OR otype = 'CV*' OR otype = 'No*' OR otype = '**' OR otype = 'SN*' OR otype = 'LM*' OR otype = 'BD*' OR otype = 'Ir*' \
OR otype = 'Er*' OR otype = 'Ro*' OR otype = 'Pu*' OR otype = 'Em*' OR otype = 'PM*' OR otype = 'HV*') AND (pmra >= 0) AND (pmdec >= -3)"""

# Run the ADQL through the SIMBAD TAP service with the 2000000-row cap used by every partition.
query4 = Simbad.query_tap(adql_query4, maxrec=2000000)

In [124]:
# Row count of each TAP result, reported in query order
tap_tables = (query1, query2, query3, query4)
row_counts = [len(table) for table in tap_tables]
for row_count in row_counts:
    print(row_count)

# Grand total over the four results
print('Number of stars in all queries: ', sum(row_counts))

# Convert every result table to pandas, then stack them into one frame
df1, df2, df3, df4 = (table.to_pandas() for table in tap_tables)

df = pd.concat([df1, df2, df3, df4])

1528405
1737482
537097
746532
Number of stars in all queries:  4549516


In [125]:
# Take the leading column of df1, selected by position, as a Series
first_column_position = 0
dftest = df1.iloc[:, first_column_position]

# Getting the IDs

In [None]:
from astroquery.simbad import Simbad
from astropy.table import Table, vstack

# Fetch the identifier list (`ids.ids`) of every oid in dftest.
# The oids are uploaded to SIMBAD TAP as a temporary table and joined against
# the `ids` table on `oidref`; a column cannot be compared to a whole table.

# astroquery documents a 200000-row limit for tables uploaded through
# query_tap, so the oids are sent in slices of at most that size.
UPLOAD_MAX_ROWS = 200000

oid_values = dftest.to_numpy()

id_tables = []
for start in range(0, len(oid_values), UPLOAD_MAX_ROWS):
    # Uploads must be astropy Tables (not pandas objects); the keyword name
    # used below is the table name visible as TAP_UPLOAD.<name> in the ADQL.
    oid_upload = Table({"oid": oid_values[start:start + UPLOAD_MAX_ROWS]})
    id_tables.append(
        Simbad.query_tap(
            """SELECT ids.oidref, ids.ids
            FROM TAP_UPLOAD.oid_upload AS uploaded
            JOIN ids ON ids.oidref = uploaded.oid""",
            # query_tap's default maxrec is far below the slice size, so raise
            # it to the same cap the other TAP queries in this notebook use.
            maxrec=2000000,
            oid_upload=oid_upload,
        )
    )

# One table with an (oidref, ids) row per matched object
ids_table = vstack(id_tables) if id_tables else Table(names=("oidref", "ids"))
ids_table

DALServiceError: 404 Client Error: 404 for url: https://simbad.cds.unistra.fr/simbad/sim-tap/async/phase

In [83]:
# Tally how many rows carry each otype, then show the header and the tally
otype_column = df['otype']
counts = otype_column.value_counts()
print('Number of stars in each category: ', counts, sep='\n')

Number of stars in each category: 
otype
RR*    2075616
PM*    2050194
LM*    2010719
EB*     319244
LP*      92719
SB*      33090
WD*      22148
RG*      20078
Em*      12888
Ro*      12662
C*       12316
**       10547
HB*      10258
cC*      10101
Pu*       8646
Pe*       6911
Y*O       6247
Mi*       4202
RS*       3768
El*       3572
dS*       3548
TT*       2716
BY*       2333
Er*       1701
Or*       1335
HS*        967
s*r        924
BS*        840
AB*        749
Ce*        633
s*b        516
Be*        451
gD*        386
WV*        361
CV*        358
HV*        293
S*         280
BD*        271
WR*        249
sg*        206
s*y        195
MS*        186
a2*        159
PN         153
HXB        125
RV*        114
OH*         91
Ir*         80
Psr         78
pA*         51
RC*         49
SX*         42
bC*         41
Ae*         34
No*         27
Sy*         27
SN*         11
XB*         11
LXB         10
HH           5
Name: count, dtype: int64


In [74]:
# Write the combined frame to CSV without the index column.
# NOTE(review): if df comes from the `SELECT oid, otype` TAP queries it has no
# 'ids' column, which the cell that reads this file back expects — confirm.
df.to_csv(path_or_buf="all_stars.csv", index=False)

In [75]:
# Load the SIMBAD export
simbad_data = pd.read_csv("all_stars.csv")

# Everything below needs the '|'-separated identifier list; fail with an
# actionable message instead of a bare KeyError when the CSV lacks it.
if 'ids' not in simbad_data.columns:
    raise KeyError("'ids' column not found in all_stars.csv; export the SIMBAD identifiers together with oid/otype before running this cell")

GAIA_DR3_PREFIX = 'Gaia DR3'


def _extract_gaia_dr3_id(ids):
    """Return the source ID of the first 'Gaia DR3 ...' entry in a '|'-separated ids string, else None."""
    if not isinstance(ids, str):
        return None  # a missing identifier list is not a string (e.g. NaN)
    for identifier in ids.split('|'):
        identifier = identifier.strip()
        if identifier.startswith(GAIA_DR3_PREFIX):
            # Slice the prefix off. str.lstrip('Gaia DR3') strips a *set of
            # characters* and would also remove leading '3' digits of the ID.
            return identifier[len(GAIA_DR3_PREFIX):].strip() or None
    return None


# Keep only rows that have a Gaia DR3 identifier, stored as a string in 'gaia_id'
simbad_data = (
    simbad_data
    .assign(gaia_id=simbad_data['ids'].map(_extract_gaia_dr3_id))
    .dropna(subset=['gaia_id'])
    .astype({'gaia_id': str})
)

# Split the Gaia DR3 source IDs into chunks (size = split_ids_into_chunks' default)
GaiaDR3SourceIDs = ', '.join(simbad_data['gaia_id'])
chunks = split_ids_into_chunks(GaiaDR3SourceIDs)

# Query Gaia once per chunk and concatenate a single time at the end
# (concatenating inside the loop re-copies the accumulated frame every pass).
chunk_frames = [GetGAIAData(chunk) for chunk in chunks]
combined_df = pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame()

# Convert the combined DataFrame to a NumPy array if needed
# NOTE(review): nothing in this cell uses combined_matrix; kept for compatibility.
combined_matrix = combined_df.to_numpy()

# GAIA data: astrometry and photometry columns, with source_id as a string
# so it can be matched against simbad_data['gaia_id'].
# .assign returns a new frame, so no write happens on a slice of combined_df.
GAIA_COLUMNS = ["source_id", "ra", "ra_error", "dec", "dec_error", "phot_g_mean_flux", "phot_g_mean_flux_error", "pmra", "pmra_error", "pmdec", "pmdec_error", "parallax", "parallax_error", "phot_bp_mean_flux", "phot_bp_mean_flux_error", "phot_rp_mean_flux", "phot_rp_mean_flux_error"]
gaia_data = combined_df[GAIA_COLUMNS].assign(
    source_id=lambda frame: frame['source_id'].astype(str)
)

# Merge Gaia and SIMBAD data on matching IDs
star_data = pd.merge(gaia_data, simbad_data, left_on='source_id', right_on='gaia_id', how='inner')
star_data.to_pickle("star_data.pkl")

KeyError: 'ids'

In [4]:
# Reload the merged Gaia/SIMBAD table from its pickle file
star_data = pd.read_pickle("star_data.pkl")

# Show how many rows fall under each otype value
otype_counts = star_data["otype"].value_counts()
print(otype_counts)

otype
*    3845419
Name: count, dtype: int64
