# Red Giant Classification Project

**The goal - predicting the evolutionary stage of Red Giant stars**

## initializing

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

while True:
    try:
        import altair as alt
        break
    except ModuleNotFoundError:
        !pip install altair

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

### upload

In [5]:
upload = pd.read_csv('/'.join(['https:/',
                               'raw.githubusercontent.com',
                               'Morshay',
                               'red-giant-blue-giant',
                               'main',
                               'red_giant_data.csv'])
                     ).rename(str.lower, axis='columns')

In [8]:
df = upload.copy()
df.sample(10)

Unnamed: 0,kpmag,numax,e_numax,delnu,e_delnu,a,e_a,width,e_width,teff,...,e_nocorr,rgbcorm,e_rgbcorm,rgbcorr,e_rgbcorr,clcorm,e_clcorm,clcorr,e_clcorr,phase
8527,15.8,106.15,1.29,9.875,0.055,59.2,4.2,32.4,2.9,5366,...,0.17,1.33,0.1,6.35,0.17,1.29,0.1,6.25,0.17,1
11008,12.3,57.43,0.65,5.583,0.014,65.2,3.6,18.0,1.3,4892,...,0.24,1.66,0.11,9.86,0.24,1.75,0.12,10.1,0.24,2
4265,12.57,43.57,0.56,4.61,0.044,82.9,3.6,17.0,1.3,5017,...,0.34,1.62,0.13,11.1,0.34,1.71,0.13,11.4,0.35,2
10758,13.85,25.82,0.4,3.33,0.027,150.9,8.8,8.1,0.7,4921,...,0.39,1.15,0.09,12.21,0.37,1.26,0.1,12.8,0.39,2
4006,13.42,26.11,0.56,3.149,0.078,74.8,4.0,10.6,2.9,5035,...,0.82,1.54,0.19,13.99,0.78,1.71,0.22,14.73,0.84,2
90,11.34,54.12,0.48,5.505,0.015,88.4,4.5,17.7,1.0,4632,...,0.19,1.27,0.07,9.02,0.18,1.42,0.08,9.5,0.2,1
6041,10.88,33.34,0.91,4.066,0.046,108.8,4.9,14.4,1.1,4729,...,0.42,1.05,0.11,10.39,0.4,1.14,0.12,10.81,0.42,2
8793,13.54,47.7,0.56,5.263,0.016,107.1,6.7,15.0,1.1,4946,...,0.23,1.19,0.08,9.14,0.22,1.28,0.09,9.48,0.23,1
1083,12.39,38.3,0.97,4.001,0.06,48.7,2.7,16.1,1.4,5008,...,0.56,1.87,0.2,12.72,0.54,2.04,0.22,13.28,0.57,2
14936,13.69,50.06,0.56,5.361,0.013,90.3,4.4,16.8,1.0,4961,...,0.23,1.32,0.09,9.37,0.23,1.39,0.09,9.61,0.23,1


### prelim cleanup

In [9]:
cleanup_df = df.copy()

columns that start in *e_* describe the error in measurement, and are irrelevant for the calculation process.

In [10]:
cleanup_df.drop(
    columns=cleanup_df.columns[
        cleanup_df.columns.str.contains('e_')],
    inplace=True)

cleanup_df.columns

Index(['kpmag', 'numax', 'delnu', 'a', 'width', 'teff', 'log(g)', '[fe/h]',
       'nocorm', 'nocorr', 'rgbcorm', 'rgbcorr', 'clcorm', 'clcorr', 'phase'],
      dtype='object')

the mass and radius corrections can be averaged

In [11]:
cleanup_df[cleanup_df.columns[
        cleanup_df.columns.str.contains('cor')]
          ].sample(3)

Unnamed: 0,nocorm,nocorr,rgbcorm,rgbcorr,clcorm,clcorr
12055,1.22,11.08,1.15,10.74,1.24,11.19
11128,1.26,7.49,1.15,7.15,1.27,7.51
1503,1.26,11.34,1.26,11.35,1.27,11.39


In [12]:
cleanup_df['rad'] = cleanup_df.nocorr + cleanup_df.rgbcorr + cleanup_df.clcorr
cleanup_df['mass'] = cleanup_df.nocorm + cleanup_df.rgbcorm + cleanup_df.clcorm

cleanup_df.drop(
    columns=cleanup_df.columns[
        cleanup_df.columns.str.contains('cor')],
    inplace=True)

cleanup_df.sample(10)

Unnamed: 0,kpmag,numax,delnu,a,width,teff,log(g),[fe/h],phase,rad,mass
4651,12.51,58.96,5.929,90.3,16.9,4791,2.678,-0.09,1,26.73,4.14
2181,11.66,44.69,4.917,96.5,15.4,4710,2.554,-0.18,1,28.99,3.66
13959,12.82,24.24,2.884,127.5,9.7,4483,2.278,0.24,1,44.68,4.6
9955,11.9,58.18,5.425,45.9,21.8,4883,2.676,0.17,2,32.1,5.95
6894,13.56,78.98,7.562,75.1,24.8,4875,2.809,0.13,1,22.39,3.93
13245,12.72,66.99,6.576,82.6,20.4,4741,2.731,-0.03,1,24.53,3.94
15182,10.94,29.94,3.883,137.0,12.6,4567,2.373,0.25,2,30.83,2.74
1567,13.51,39.52,4.246,103.6,15.8,4791,2.504,0.4,2,35.02,4.77
1441,13.37,52.85,5.486,86.3,16.8,4879,2.634,-0.18,1,28.34,4.21
14779,13.25,58.98,6.098,81.2,18.6,5157,2.694,-0.21,1,26.67,4.27


renaming and reordering columns

In [13]:
cleanup_df.rename(
    {
        'kpmag': 'ap_mag',  # apparent magnitude (brightness)
        'numax': 'freq',  # maximum oscillation frequency
        'delnu': 'fr_sep',  # frequency separation of oscillation modes
        'a': 'amp',  # oscillation amplitude
        'width': 'pow_ex',  # power excess width
        'teff': 'temp',  # effective temperature
        'log(g)': 'grav', # surface gravity logarithm
        '[fe/h]': 'metal',  # metallicity
    },
axis='columns',
inplace=True)

Index(['ap_mag', 'freq', 'fr_sep', 'amp', 'pow_ex', 'temp', 'grav', 'metal',
       'phase', 'rad', 'mass'],
      dtype='object')

In [16]:
cleanup_df = cleanup_df[['phase'] + list(cleanup_df.columns.drop('phase'))]
cleanup_df.columns

Index(['phase', 'ap_mag', 'freq', 'fr_sep', 'amp', 'pow_ex', 'temp', 'grav',
       'metal', 'rad', 'mass'],
      dtype='object')

change phase names for clarity

In [17]:
cleanup_df.phase.replace({
    1: 'RGB', # Red Giant Branch
    2: 'HeB', # Helium Burning Phase
}, inplace=True)
cleanup_df.sample(10)

Unnamed: 0,phase,ap_mag,freq,fr_sep,amp,pow_ex,temp,grav,metal,rad,mass
7708,HeB,12.13,75.22,6.495,48.4,29.5,4920,2.79,0.39,29.18,6.37
3532,HeB,11.63,32.37,4.023,114.7,13.2,4722,2.414,-0.18,31.43,3.12
4368,HeB,13.5,26.55,3.311,98.7,10.9,4763,2.33,-0.09,38.42,3.84
10255,HeB,11.46,29.79,3.892,135.4,13.1,4882,2.386,-0.21,31.79,2.98
4597,RGB,13.66,170.48,13.535,38.9,50.5,4945,3.146,-0.23,15.16,3.91
605,RGB,13.49,91.41,8.186,36.9,19.0,4980,2.877,0.04,22.39,4.6
6946,HeB,8.77,36.29,3.662,57.9,19.2,4986,2.476,0.0,44.7,7.27
2453,RGB,13.47,61.23,6.255,85.0,17.7,4574,2.684,-0.25,24.01,3.39
4132,HeB,13.8,29.92,4.452,172.6,13.9,5070,2.396,0.28,25.5,1.97
13397,HeB,11.07,31.7,4.259,142.9,10.5,5052,2.42,0.1,29.24,2.73


In [18]:
df = cleanup_df.copy()

## data overview

### globals and defs

In [2]:
basic_forest = RandomForestClassifier(
    max_samples=.25,
    n_estimators=10,
    random_state=6582119)

In [3]:
def plot_results(model, X, y, stage):

    y_pred = model.predict(X)

    names = model.classes_
    
    print(f'{stage.upper()}:', end='\n\n')
    
    print('confusion matrix:')
    print(pd.DataFrame(
        confusion_matrix(y, y_pred, labels=names),
        index=names, columns=names), end='\n\n')

    print('classification report:')
    print('\n', classification_report(
            y, y_pred, labels=names), end='\n\n')

    res_chart = alt.Chart(
        pd.DataFrame({'feature': model.feature_names_in_,
                     'importance': model.feature_importances_}
                     ).sort_values('importance').tail(10),
        title=f'{stage} feature importances'
    ).mark_bar(color='firebrick' # red fire
               ).encode(
        y=alt.Y('feature:N', sort='-x'),
        x=alt.X('importance:Q', axis=alt.Axis(format='.0%'))
    ).properties(height=150, width=400)

    return res_chart

In [4]:
def quick_test(data, target='phase', model=basic_forest):

    y = data[target]
    X = data.drop(columns=target)

    for col in X.select_dtypes(exclude='number'):
        X[col] = X[col].astype('category').cat.codes

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=.3,
        random_state=6582119,  # hbar, if you're asking
        stratify=y,
        shuffle=True)


    res = plot_results(model.fit(X_train, y_train),
                             X_test, y_test, 'test')

    return res

### feature_selection

In [19]:
df

Unnamed: 0,phase,ap_mag,freq,fr_sep,amp,pow_ex,temp,grav,metal,rad,mass
0,RGB,9.20,29.99,3.399,104.9,12.3,4751,2.383,-0.08,41.14,4.97
1,HeB,13.23,29.48,3.962,149.7,12.0,5188,2.394,-0.21,31.77,3.04
2,RGB,12.58,41.39,4.311,86.1,15.3,4728,2.522,-0.15,35.11,4.98
3,HeB,12.14,41.17,4.414,63.8,24.8,5072,2.534,-0.12,35.07,5.13
4,HeB,11.74,36.91,3.991,116.1,14.0,4718,2.471,-0.02,36.60,4.82
...,...,...,...,...,...,...,...,...,...,...,...
15383,HeB,12.78,37.81,4.335,108.0,14.4,4906,2.490,-0.09,32.66,4.01
15384,HeB,13.69,28.74,4.054,147.8,14.6,5077,2.379,-0.53,29.21,2.49
15385,RGB,12.60,92.86,8.187,61.8,31.7,4846,2.878,0.07,22.34,4.58
15386,RGB,11.84,52.02,5.354,86.6,16.2,4855,2.627,0.12,29.32,4.42


In [None]:
base = alt.Chart(upload).encode(
    x='phase:N',
    y='count(phase):Q'
).properties(width=300)

bar = base.mark_bar(
    size=100, color='indianred')

text = base.mark_text(
    baseline='bottom', size=20
).encode(text='count(phase):Q',
         color=alt.value('black'))

bar+text

changing col names from parameters to names:

In [None]:
sns.pairplot(feat_df.sample(1000), hue='phase')

there seems to be a very clear mapping between the frequency and the frequency separation, surface gravity and radius.  
all make sense physically, but it's weird that the mass doesn't have such a mapping as well.

In [None]:
feat_df.drop(columns=['fr_sep', 'grav', ''])

In [None]:
quick_test(basic_forest, trimmed_df)

SVM

logit