In [28]:
import pandas as pd
import numpy as np

In [29]:
# low_memory=False stops pandas from inferring dtypes chunk by chunk,
# which is what can leave a column with mixed types.
df = pd.read_csv(
    "./hygdata_v3.csv",
    low_memory=False,
)

In [30]:
# Preview the first five rows of the raw catalogue.
df.head(n=5)

Unnamed: 0,id,hip,hd,hr,gl,bf,proper,ra,dec,dist,...,bayer,flam,con,comp,comp_primary,base,lum,var,var_min,var_max
0,0,,,,,,Sol,0.0,0.0,0.0,...,,,,1,0,,1.0,,,
1,1,1.0,224700.0,,,,,6e-05,1.089009,219.7802,...,,,Psc,1,1,,9.63829,,,
2,2,2.0,224690.0,,,,,0.000283,-19.49884,47.9616,...,,,Cet,1,2,,0.392283,,,
3,3,3.0,224699.0,,,,,0.000335,38.859279,442.4779,...,,,And,1,3,,386.901132,,,
4,4,4.0,224707.0,,,,,0.000569,-51.893546,134.2282,...,,,Phe,1,4,,9.366989,,,


## Fields in the HYG (Hipparcos-Yale-Gliese) Database
[More about the HYG database](http://www.astronexus.com/hyg)
1. `StarID`: The database primary key from a larger "master database" of stars.
2. `HD`: The star's ID in the Henry Draper catalog, if known.
3. `HR`: The star's ID in the Harvard Revised catalog, which is the same as its number in the Yale Bright Star Catalog.
4. `Gliese`: The star's ID in the third edition of the Gliese Catalog of Nearby Stars.
5. `BayerFlamsteed`: The Bayer / Flamsteed designation, from the Fifth Edition of the Yale Bright Star Catalog. This is a combination of the two designations. The Flamsteed number, if present, is given first; then a three-letter abbreviation for the Bayer Greek letter; the Bayer superscript number, if present; and finally, the three-letter constellation abbreviation. Thus Alpha Andromedae has the field value "21Alp And", and Kappa1 Sculptoris (no Flamsteed number) has "Kap1Scl".
6. `RA, Dec`: The star's right ascension and declination, for epoch 2000.0. Stars present only in the Gliese Catalog, which uses 1950.0 coordinates, have had these coordinates precessed to 2000.
7. `ProperName`: A common name for the star, such as "Barnard's Star" or "Sirius". I have taken these names primarily from the Hipparcos project's web site, which lists representative names for the 150 brightest stars and many of the 150 closest stars. I have added a few names to this list. Most of the additions are designations from catalogs mostly now forgotten (e.g., Lalande, Groombridge, and Gould ["G."]) except for certain nearby stars which are still best known by these designations.
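8. `Distance`: The star's distance in parsecs, the most common unit in astrometry. To convert parsecs to light years, multiply by 3.262. A value of 10000000 indicates missing or dubious (e.g., negative) parallax data in Hipparcos. Note that this description comes from the database's field documentation; the `hygdata_v3.csv` file loaded in this notebook appears to use 100000 as the sentinel instead, which is the value replaced with NaN in the processing code.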
9. `Mag`: The star's apparent visual magnitude.
10. `AbsMag`: The star's absolute visual magnitude (its apparent magnitude from a distance of 10 parsecs).
11. `Spectrum`: The star's spectral type, if known.
12. `ColorIndex`: The star's color index (blue magnitude - visual magnitude), where known.
13. `X, Y, Z`: The Cartesian coordinates of the star, in a system based on the equatorial coordinates as seen from Earth. +X is in the direction of the vernal equinox (at epoch 2000), +Z towards the north celestial pole, and +Y in the direction of R.A. 6 hours, declination 0 degrees.
14. `VX, VY, VZ`: The Cartesian velocity components of the star, in the same coordinate system described immediately above. They are determined from the proper motion and the radial velocity (when known). The velocity unit is parsecs per year; these are small values (around $10^{-5}$ to $10^{-6}$), but they enormously simplify calculations using parsecs as base units for celestial mapping.

In [31]:
# A star chart has no place for the Sun, so drop its row.
is_not_sun = df['proper'] != 'Sol'
df = df[is_not_sun]

# Lookup table: plaintext Bayer abbreviation -> non-ASCII Greek letter.
greek_dict = {
    'Alp': "α", 'Bet': "β", 'Chi': "χ", 'Del': "δ",
    'Eps': "ε", 'Eta': "η", 'Gam': "γ", 'Iot': "ι",
    'Kap': "κ", 'Lam': "λ", 'Mu': "μ", 'Nu': "ν",
    'Ome': "ω", 'Omi': "ο", 'Phi': "φ", 'Pi': "π",
    'Psi': "ψ", 'Rho': "ρ", 'Sig': "σ", 'Tau': "τ",
    'The': "θ", 'Ups': "υ", 'Xi': "ξ", 'Zet': "ζ",
}

# Show every distinct Bayer designation that is actually present.
print(df['bayer'].dropna().unique())

['Tau' 'The' 'Zet' 'Alp' 'Bet' 'Kap-1' 'Eps' 'Gam-3' 'Kap-2' 'Gam' 'Chi'
 'Sig' 'Iot' 'Pi' 'Rho' 'Kap' 'Eta' 'Lam-1' 'Bet-1' 'Bet-2' 'Lam' 'Bet-3'
 'Lam-2' 'Del' 'Mu' 'Xi' 'Phi-1' 'Omi' 'Nu' 'Phi-2' 'Ups-1' 'Phi-3'
 'Ups-2' 'Phi-4' 'Ome' 'Psi-1' 'Ups' 'Psi-2' 'Phi' 'Psi-3' 'Psi' 'Tau-1'
 'Tau-2' 'Eta-1' 'Gam-2' 'Eta-2' 'Gam-1' 'Xi-1' 'Pi-1' 'Pi-2' 'Xi-2'
 'Iot-1' 'Iot-2' 'Eta-3' 'Rho-1' 'Rho-2' 'Rho-3' 'The-1' 'Tau-3' 'Zet-1'
 'Zet-2' 'Tau-4' 'Chi-1' 'Chi-2' 'Chi-3' 'Tau-5' 'Tau-6' 'Tau-7' 'Tau-8'
 'Tau-9' 'Ome-1' 'Omi-1' 'Omi-2' 'Ome-2' 'Ups-4' 'Del-1' 'Del-2' 'Del-3'
 'The-2' 'Sig-1' 'Sig-2' 'Pi-3' 'Pi-4' 'Pi-5' 'Pi-6' 'Nu-1' 'Nu-2' 'Nu-3'
 'Psi-4' 'Psi-5' 'Psi-6' 'Psi-7' 'Psi-8' 'Psi-9' 'Mu-1' 'Mu-2' 'Sig-3'
 'Alp-1' 'Alp-2' 'Zet-3' 'Zet-4' 'Eps-1' 'Eps-2']


In [32]:
def get_greek_letter(n, letters=None):
    '''Translate a plaintext Bayer designation into its Greek-letter form.

    A designation is an abbreviation optionally followed by "-<suffix>",
    e.g. 'Alp' or 'Kap-1'. The abbreviation is looked up in ``letters`` and
    the suffix, when present, is appended directly: 'Kap-1' -> 'κ1'.

    Parameters
    ----------
    n : str or missing value
        The Bayer designation. Missing values (NaN / None) yield NaN.
    letters : dict, optional
        Mapping of abbreviation -> Greek letter. Defaults to the
        module-level ``greek_dict``.

    Returns
    -------
    str or float
        The translated designation, or ``np.nan`` for missing input.
        An abbreviation that is absent from ``letters`` is kept as-is
        rather than raising or producing None.
    '''
    # pd.isna covers both NaN and None; comparing str(n) to 'nan' would
    # let None through to .split() and crash.
    if pd.isna(n):
        return np.nan
    if letters is None:
        letters = greek_dict
    parts = n.split("-")
    # Unknown abbreviations fall back to their plaintext form; a bare
    # .get() would return None and break the concatenation below.
    greek = letters.get(parts[0], parts[0])
    if len(parts) > 1:
        return greek + parts[1]
    return greek

# Translate every Bayer designation, store the result, and list the
# distinct translated values.
greek_column = df['bayer'].apply(get_greek_letter)
df['greek_letters'] = greek_column
print(greek_column.unique())

[nan 'τ' 'θ' 'ζ' 'α' 'β' 'κ1' 'ε' 'γ3' 'κ2' 'γ' 'χ' 'σ' 'ι' 'π' 'ρ' 'κ'
 'η' 'λ1' 'β1' 'β2' 'λ' 'β3' 'λ2' 'δ' 'μ' 'ξ' 'φ1' 'ο' 'ν' 'φ2' 'υ1' 'φ3'
 'υ2' 'φ4' 'ω' 'ψ1' 'υ' 'ψ2' 'φ' 'ψ3' 'ψ' 'τ1' 'τ2' 'η1' 'γ2' 'η2' 'γ1'
 'ξ1' 'π1' 'π2' 'ξ2' 'ι1' 'ι2' 'η3' 'ρ1' 'ρ2' 'ρ3' 'θ1' 'τ3' 'ζ1' 'ζ2'
 'τ4' 'χ1' 'χ2' 'χ3' 'τ5' 'τ6' 'τ7' 'τ8' 'τ9' 'ω1' 'ο1' 'ο2' 'ω2' 'υ4'
 'δ1' 'δ2' 'δ3' 'θ2' 'σ1' 'σ2' 'π3' 'π4' 'π5' 'π6' 'ν1' 'ν2' 'ν3' 'ψ4'
 'ψ5' 'ψ6' 'ψ7' 'ψ8' 'ψ9' 'μ1' 'μ2' 'σ3' 'α1' 'α2' 'ζ3' 'ζ4' 'ε1' 'ε2']


In [33]:
# nunique() ignores missing values by default, so this counts only real
# spectral designations.
print(df['spect'].nunique(), 'unique spectral designations')

4307 unique spectral designations


In [34]:
# Peek at the first 1000 distinct (non-missing) spectral designations.
print(df['spect'].dropna().unique()[:1000])

['F5' 'K3V' 'B9' 'F0V' 'G8III' 'M0V:' 'G0' 'M6e-M8.5e Tc' 'G5' 'F6V' 'A2'
 'K4III' 'K0III' 'K0' 'K2' 'F3V' 'K5' 'G8/K0III/IV' 'F2V' 'G0V' 'G3IV'
 'F7V' 'G5V' 'F3/F5V' 'A0' 'B8' 'F2' 'F7.5IV-V' 'G6V' 'B...' 'G9III-IV'
 'G6/G8V:' 'K1III' 'K0/K1III' 'G1IV' 'A4V' 'M:' 'G2IV/V' 'K2V' 'B5' 'F2IV'
 'B9p SiEu' 'K2III' 'G3/G5V' 'M0' 'A0V' 'G2V' 'F5IV/V' 'F8V' 'G2' 'F5V'
 'F6/F7V' 'C5p' 'M1III' 'F0' 'G7II-III' 'M2III' 'K3II/III' 'M2' 'M1.5V:'
 'G0IV' 'A2IV' 'K1IIICNIV' 'F0III' 'Am...' 'F8' 'F5IV' 'K2III-IV'
 'B9IIIp Mn' 'K0/K1IV' 'B0' 'K3' 'B7III-IV' 'A9V' 'F3IV...' 'M3III' 'B8V'
 'A3' 'B9V' 'A3V' 'G9III' 'Ap...' 'G3V' 'R...' 'M0V' 'O7' 'G9II-III'
 'A1m...' 'B4V' 'A6V' 'M3' 'G8/K0III' 'K3III' 'G6/G8III' 'A2V' 'G8'
 'F0/F2V' 'F6/F7IV' 'G8IV' 'K0...' 'F2/F3V' 'M1' 'K1IIICN...' 'K5III:'
 'A2III' 'B3Ia' 'B2IVne+...' 'A5' 'K1IV/VCN...' 'F8/G0V' 'G8V' 'B9IVn'
 'M4' 'G5IV' 'A7V' 'B0Ibp' 'F0III/IV' 'G8/K0IV' 'O8e' 'F1:' 'G2/G3V'
 'A1III' 'Gp' 'A1/A2IV/V' 'B' 'A1V' 'M5e-M7e' 'B3V' 'B8/B9V' 'K0V'
 'K3Ibva

## Spectral Designation
[A note on the spectral atlas and spectral classification](https://lweb.cfa.harvard.edu/~pberlind/atlas/htmls/note.html)

### Morgan-Keenan designations
The Morgan–Keenan (MK) system classifies stars using the letters O, B, A, F, G, K, and M, a sequence running from the hottest (O type) to the coolest (M type).

In [35]:
def get_first_letter(name):
    '''Reduce a spectral designation to its upper-cased leading class letter.

    A leading 'sd' prefix is dropped first, then the first character that
    is neither whitespace nor one of ``?:!/;.,[]{}()`` is returned in
    upper case, e.g. 'sdM4' -> 'M', '(G5)' -> 'G', 'k' -> 'K'.

    Parameters
    ----------
    name : str or missing value
        The raw spectral designation.

    Returns
    -------
    str or float
        The class letter. Missing input is passed through unchanged, and
        ``np.nan`` is returned when no usable character remains (for
        example '...' or a bare 'sd').
    '''
    if pd.isna(name):
        return name
    text = str(name)
    if text[:2] == 'sd':
        # Strip the 'sd' prefix so only the Morgan-Keenan letter is examined.
        text = text[2:]
    ignored = '?:!/;.,[]{}()'
    for ch in text:
        if ch not in ignored and not ch.isspace():
            return ch.upper()
    # Nothing but punctuation/whitespace: treat as unknown instead of
    # indexing into an empty string.
    return np.nan

In [36]:
# 100000 is treated here as a placeholder distance rather than a real
# measurement, so it is turned into NaN.
# NOTE(review): the HYG field notes describe a large sentinel for missing or
# dubious parallax; confirm that 100000 is the sentinel used by the v3 file.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: under pandas Copy-on-Write that chained in-place call
# leaves `df` untouched.
df['dist'] = df['dist'].replace(to_replace=100000, value=np.nan)

df['spect_desig'] = df['spect'].apply(get_first_letter)
spect_classes = df['spect_desig'].dropna().unique()
print(len(spect_classes), 'unique spectral designations')
print(spect_classes)

14 unique spectral designations
['F' 'K' 'B' 'G' 'M' 'A' 'C' 'R' 'O' 'W' 'N' 'S' 'D' 'P']


In [37]:
# Colour scheme: spectral class letter -> hex colour.
color_dict = {
    'O': '#5A90C3', 'B': '#93C2F1', 'A': '#f3e8d3', 'F': '#d4bf94',
    'G': '#FFD423', 'K': '#F99220', 'M': '#FF2620', 'L': '#FF2620',
    'T': '#FF6199', 'Y': '#6B22FF',
    'C': '#979330', 'R': '#979330', 'W': '#979330', 'N': '#979330',
    'S': '#979330', 'D': '#979330', 'P': '#979330',
    # Kept for anything that looks up the string 'nan'; real NaN values never
    # match this key and are handled by fillna() below.
    'nan': '#000000'  # unknown
}
# map() yields NaN for every class that has no entry (including missing
# values), so fillna() gives all of them the "unknown" colour. replace() would
# instead leave an unmapped letter in the colour column.
df['color'] = df['spect_desig'].map(color_dict).fillna('#000000')
# Black markers get a beige outline so unknown stars remain visible.
df['linecolor'] = df['color'].replace(['#000000'], ['#f3e8d3'])


In [38]:
# Show the processed frame, then write the full catalogue (no index column).
display(df.head())
df.to_csv("./hygdata_v3_processed.csv", index=False)
print(len(df), 'total stars available in database')

# Keep only stars with apparent magnitude <= 6.5, i.e. those visible to the
# naked eye, and write that subset as well (again without the index column).
is_naked_eye = df['mag'] <= 6.5
df = df[is_naked_eye]
print(len(df), 'stars visible to the human eye')
df.to_csv('./hygdata_v3_processed_mag65.csv', index=False)

Unnamed: 0,id,hip,hd,hr,gl,bf,proper,ra,dec,dist,...,comp_primary,base,lum,var,var_min,var_max,greek_letters,spect_desig,color,linecolor
1,1,1.0,224700.0,,,,,6e-05,1.089009,219.7802,...,1,,9.63829,,,,,F,#d4bf94,#d4bf94
2,2,2.0,224690.0,,,,,0.000283,-19.49884,47.9616,...,2,,0.392283,,,,,K,#F99220,#F99220
3,3,3.0,224699.0,,,,,0.000335,38.859279,442.4779,...,3,,386.901132,,,,,B,#93C2F1,#93C2F1
4,4,4.0,224707.0,,,,,0.000569,-51.893546,134.2282,...,4,,9.366989,,,,,F,#d4bf94,#d4bf94
5,5,5.0,224705.0,,,,,0.000665,-40.591202,257.732,...,5,,21.998851,,,,,G,#FFD423,#FFD423


119613 total stars available in database
8912 stars visible to the human eye
