Experimenting with Astroquery

In [2]:
from astroquery.simbad import Simbad
from astropy.coordinates import SkyCoord
import numpy as np

In [2]:
# Show the wildcard patterns SIMBAD understands (see the listing printed below).
Simbad().list_wildcards()

*: Any string of characters (including an empty one)
?: Any character (exactly one character)
[abc]: Exactly one character taken in the list. Can also be defined by a range of characters: [A-Z]
[^0-9]: Any (one) character not in the list.


# First data (Half Sky 4cat)

In [3]:
# Fields requested from SIMBAD in addition to its default columns.
extra_fields = ("plx_value", "V", "I", "J", "H", "K", "G", "pmdec", "pmra", "hpx", "otype")

# Restrict the query to the four object-type families of interest.
otype_criteria = (
    "otype = 'Ma*..' OR otype = 'MS*..' "
    "OR otype = 'Y*O..' OR otype = 'Ev*..' "
)

# Unlimited-row SIMBAD client with the extra fields attached.
simbad = Simbad()
simbad.ROW_LIMIT = -1
simbad.add_votable_fields(*extra_fields)

# Arbitrary cone centre; a 90 degree radius covers half of the sky.
coordinates = SkyCoord(135.9, -65.3, unit=("deg", "deg"))
result = simbad.query_region(
    coordinates,
    radius="90d0m",
    criteria=otype_criteria,
)

# Keep identifier, position and the requested fields, then move to pandas.
result_df = result[("main_id", "ra", "dec") + extra_fields].to_pandas()

# Only rows with every column populated are kept.
filtered_result = result_df.dropna()
print(filtered_result)

# Persist the cleaned sample as a NumPy array.
FirstData = filtered_result.to_numpy()
np.save("FirstData", FirstData)

KeyboardInterrupt: 

In [7]:
# Keep the list of available VOTable fields around as a NumPy array for inspection.
votable_fields = Simbad.list_votable_fields()
listvotablefield = np.array(votable_fields)

# Full Sky 4cat

In [2]:
# Fields requested from SIMBAD in addition to its default columns.
EXTRA_FIELDS = ("plx_value", "V", "I", "J", "H", "K", "G", "pmdec", "pmra", "hpx", "otype")
# Columns kept in the saved array: identifier and position first, object type last.
KEPT_COLUMNS = ("main_id", "ra", "dec") + EXTRA_FIELDS
# Restrict the query to the four object-type families of interest.
OTYPE_CRITERIA = "otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' OR otype = 'Ev*..' "


def _query_hemisphere(client, center):
    """Query a 90 degree cone around ``center`` and return the complete rows.

    Parameters
    ----------
    client : Simbad
        Configured SIMBAD client (row limit and extra fields already set).
    center : SkyCoord
        Centre of the cone search.

    Returns
    -------
    numpy.ndarray
        One row per object, restricted to ``KEPT_COLUMNS``; rows with any
        missing value are dropped.
    """
    table = client.query_region(center, radius="90d0m", criteria=OTYPE_CRITERIA)
    return table[KEPT_COLUMNS].to_pandas().dropna().to_numpy()


# Initialize SIMBAD
simbad = Simbad()
simbad.ROW_LIMIT = -1
simbad.add_votable_fields(*EXTRA_FIELDS)

# Two cones centred on the celestial poles together cover the full sky.
coordinates = SkyCoord(0, -90, unit=("deg", "deg"))
coordinates2 = SkyCoord(0, 90, unit=("deg", "deg"))

data1 = _query_hemisphere(simbad, coordinates)   # southern half
data2 = _query_hemisphere(simbad, coordinates2)  # northern half

# np.row_stack is a deprecated alias (NumPy 2.0); np.vstack is the same operation.
data = np.vstack((data1, data2))

# Both cones include the celestial equator, so an object at dec = 0 can be
# returned twice. Keep the first occurrence of each main_id, preserving order.
_, first_seen = np.unique(data[:, 0].astype(str), return_index=True)
data = data[np.sort(first_seen)]

np.save("fullsky4cats", data)

# 5 cat (with binaries)

In [6]:
# Fields requested from SIMBAD in addition to its default columns.
extra_fields = ("plx_value", "V", "I", "J", "H", "K", "G", "pmdec", "pmra", "hpx", "otype")

# Same four families as before plus the '**' (double/multiple star) branch.
otype_criteria = (
    "otype = 'Ma*..' OR otype = 'MS*..' OR otype = 'Y*O..' "
    "OR otype = 'Ev*..' OR otype = '**..' "
)

# Unlimited-row SIMBAD client with the extra fields attached.
simbad = Simbad()
simbad.ROW_LIMIT = -1
simbad.add_votable_fields(*extra_fields)

# Arbitrary cone centre with an 89 degree search radius.
coordinates = SkyCoord(135.9, -65.3, unit=("deg", "deg"))
result = simbad.query_region(
    coordinates,
    radius="89d0m",
    criteria=otype_criteria,
)

# Show the raw table and keep a NumPy copy of it.
print("result")
print(result)
preresult = np.array(result)

# Keep identifier, position and the requested fields, then move to pandas.
result_df = result[("main_id", "ra", "dec") + extra_fields].to_pandas()

# Only rows with every column populated are kept.
filtered_result = result_df.dropna()
print("filtered_result")
print(filtered_result)

# Persist the cleaned sample as a NumPy array.
Withbinaries = filtered_result.to_numpy()
np.save("Withbinaries", Withbinaries)

  warn("Partial result set. Potential causes MAXREC, async storage space, etc.",


result
          main_id                    ra         ...  pmdec   otype
                                    deg         ... mas / yr      
---------------------------- ------------------ ... -------- -----
                   HD  89205 153.77614408634466 ...    19.79    **
     2MASS J05443173+0912459      86.1322393154 ...   -1.215   Y*O
     2MASS J05484868+1020286     87.20284830913 ...    0.389   Y*O
           OGLE BLG-ELL-6837    267.20515207752 ...   -2.219   El*
        OGLE SMC106.6  21797 14.467708333333334 ...       --   EB*
            UCAC4 276-214163        317.4141433 ...    -10.2    **
       CRTS J182214.8-540825 275.56161211834916 ...   -5.753   RR*
Gaia DR3 6652725606444204800  275.5616105570904 ...   -2.179   RR*
       CRTS J143012.9-082355 217.55391979307998 ...   -0.826   RR*
                         ...                ... ...      ...   ...
Gaia DR3 6762548092012280576  286.1116738822854 ...   -7.508   EB*
Gaia DR3 6762551042648799360 286.31771105350043 ...    

# Collapsing object types into the 4 top-level categories

In [7]:
# Reload the saved full-sky sample (allow_pickle is required to read it back;
# presumably because the array has object dtype).
FirstData = np.load("fullsky4cats.npy", allow_pickle=True)
print(FirstData.shape)

# Features: every column strictly between the first and the last one
# (column 0 is dropped as well). Label: the last column.
X = FirstData[:, 1:-1]
y = FirstData[:, -1]

# (top-level class, fine-grained labels folded into it)
otype_groups = (
    ("Ma*", ["bC*", "sg*", "s*r", "s*y", "s*b", "WR*", "N*", "Psr"]),
    ("Y*O", ["Or*", "TT*", "out", "Ae*", "HH"]),
    ("MS*", ["Be*", "BS*", "SX*", "gD*", "dS*"]),
    ("Ev*", ["RG*", "HS*", "RR*", "HB*", "WV*", "Ce*", "cC*", "C*", "S*", "LP*",
             "AS*", "AB*", "Mi*", "OH*", "pA*", "RV*", "PN", "WD*"]),
)

# Membership is always tested against the original labels; anything not
# listed above keeps its label unchanged.
y4cat = y
for parent, members in otype_groups:
    y4cat = np.where(np.isin(y, members), parent, y4cat)

# Re-attach the relabelled column to the features and save.
combined_data = np.column_stack((X, y4cat))
np.save("cat4.npy", combined_data)

(92526, 14)


# Same mapping for the combined Gaia + SIMBAD data

In [4]:
# Load the Gaia + SIMBAD sample (allow_pickle is required to read it back).
FirstData = np.load("fullsky4catsgaiac.npy", allow_pickle=True)
print(FirstData.shape)

# Features: every column strictly between the first and the last one
# (column 0 is dropped as well). Label: the last column.
X = FirstData[:, 1:-1]
y = FirstData[:, -1]

# Top-level class -> fine-grained labels that are folded into it.
children_of = {
    "Ma*": ["bC*", "sg*", "s*r", "s*y", "s*b", "WR*", "N*", "Psr"],
    "Y*O": ["Or*", "TT*", "out", "Ae*", "HH"],
    "MS*": ["Be*", "BS*", "SX*", "gD*", "dS*"],
    "Ev*": ["RG*", "HS*", "RR*", "HB*", "WV*", "Ce*", "cC*", "C*", "S*", "LP*",
            "AS*", "AB*", "Mi*", "OH*", "pA*", "RV*", "PN", "WD*"],
}

# Each mask is computed from the original labels; unlisted labels are kept as-is.
y4cat = y
for top_level, subtypes in children_of.items():
    is_subtype = np.isin(y, subtypes)
    y4cat = np.where(is_subtype, top_level, y4cat)

# Re-attach the relabelled column to the features and save.
combined_data = np.column_stack((X, y4cat))
np.save("fullsky4catsgaia.npy", combined_data)

(48466, 38)


In [9]:
# Count the occurrences of each class
unique_classes, counts = np.unique(combined_data[:, -1], return_counts=True)

# Print the results
for cls, count in zip(unique_classes, counts):
    print(f"Class {cls}: {count} occurrences")

Class Ev*: 80207 occurrences
Class MS*: 1416 occurrences
Class Ma*: 976 occurrences
Class Y*O: 9927 occurrences
