# Red Giant Classification Project

**The goal - predicting the evolutionary stage of Red Giant stars**

## initializing

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# altair may be missing from a stock environment. Try to install it exactly
# once; the previous `while True` retry never terminated when the install
# itself failed (e.g. no network access).
try:
    import altair as alt
except ModuleNotFoundError:
    import importlib
    import subprocess
    import sys

    # Run pip through the kernel's own interpreter so the package lands in the
    # environment this notebook is actually running in. check_call raises
    # CalledProcessError on failure instead of silently looping.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'altair'])
    importlib.invalidate_caches()
    import altair as alt

# Lift altair's default row limit so the full dataset can be charted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

### upload and cleanup

In [4]:
# Read the raw CSV from GitHub and lower-case every column name.
upload = pd.read_csv(
    '/'.join([
        'https:/',
        'raw.githubusercontent.com',
        'Morshay',
        'red-giant-blue-giant',
        'main',
        'red_giant_data.csv',
    ])
).rename(columns=str.lower)

TODO: rename columns

TODO: remove the `e_*` columns

TODO: handle nulls and zeroes

In [5]:
# Work on a copy so the freshly downloaded `upload` frame stays untouched.
df = upload.copy()
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
df.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15388 entries, 0 to 15387
Data columns (total 28 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   kpmag      15388 non-null  float64
 1   numax      15388 non-null  float64
 2   e_numax    15388 non-null  float64
 3   delnu      15388 non-null  float64
 4   e_delnu    15388 non-null  float64
 5   a          15388 non-null  object 
 6   e_a        15388 non-null  object 
 7   width      15388 non-null  object 
 8   e_width    15388 non-null  object 
 9   teff       15388 non-null  int64  
 10  e_teff     15388 non-null  int64  
 11  log(g)     15388 non-null  float64
 12  e_log(g)   15388 non-null  float64
 13  [fe/h]     15388 non-null  float64
 14  e_[fe/h]   15388 non-null  float64
 15  nocorm     15388 non-null  float64
 16  e_nocorm   15388 non-null  float64
 17  nocorr     15388 non-null  float64
 18  e_nocorr   15388 non-null  float64
 19  rgbcorm    15388 non-null  float64
 20  e_rgbc

Unnamed: 0,kpmag,numax,e_numax,delnu,e_delnu,a,e_a,width,e_width,teff,...,e_nocorr,rgbcorm,e_rgbcorm,rgbcorr,e_rgbcorr,clcorm,e_clcorm,clcorr,e_clcorr,phase
650,11.22,9.91,0.39,1.56,0.025,278.4,23.6,4.4,0.7,4431,...,1.13,1.09,0.15,19.67,1.04,1.23,0.17,20.9,1.12,1
10695,11.44,28.43,0.79,3.391,0.044,101.1,5.2,12.1,1.7,4773,...,0.55,1.34,0.14,12.65,0.52,1.49,0.16,13.33,0.55,2
6366,13.52,30.03,0.46,3.797,0.123,126.6,5.3,11.5,0.8,4876,...,0.78,1.03,0.15,10.73,0.72,1.15,0.17,11.33,0.78,2
14243,12.27,124.09,0.46,11.315,0.02,56.0,1.9,34.0,1.1,4819,...,0.09,0.88,0.04,4.9,0.08,0.98,0.04,5.17,0.09,1
13988,11.43,27.61,1.43,3.26,0.056,76.9,4.2,12.5,2.4,4934,...,0.91,1.51,0.26,13.51,0.86,1.68,0.29,14.26,0.92,2
9628,9.43,31.4,1.33,4.004,0.12,104.2,13.2,12.0,2.3,4902,...,0.81,1.07,0.2,10.7,0.82,1.07,0.2,10.67,0.81,2
1433,13.09,37.5,0.79,4.324,0.025,91.4,4.2,17.2,1.5,4915,...,0.35,1.31,0.12,10.82,0.34,1.35,0.12,10.99,0.35,2
13302,13.87,33.8,0.49,3.976,0.236,110.3,4.1,13.6,2.1,5141,...,1.45,1.48,0.37,11.98,1.46,1.49,0.37,12.02,1.47,2
5976,10.86,87.14,0.69,8.034,0.011,75.0,2.9,27.6,1.3,4629,...,0.13,1.2,0.06,6.89,0.13,1.3,0.07,7.17,0.13,1
6766,13.11,30.88,0.76,3.905,0.068,126.9,6.8,12.9,1.7,4775,...,0.5,0.91,0.1,10.02,0.45,1.02,0.11,10.61,0.49,2


In [6]:
# Number of rows per value of the target column.
df['phase'].value_counts()

2    7703
1    7685
Name: phase, dtype: int64

In [7]:
def evaluate(model, X, y, stage):
    """Print classification diagnostics for a fitted model and chart its top features.

    Parameters
    ----------
    model
        Fitted estimator. Must provide ``predict``, ``classes_``,
        ``feature_names_in_`` and ``feature_importances_``.
    X
        Feature table passed to ``model.predict``.
    y
        True labels matching the rows of ``X``.
    stage : str
        Label for this evaluation; printed upper-cased as a header and used
        in the chart title.

    Returns
    -------
    altair chart
        Horizontal bar chart of the ten largest feature importances.
    """
    predictions = model.predict(X)
    class_labels = model.classes_

    print(f'{stage.upper()}:')

    # Confusion matrix, labelled on both axes with the model's class labels.
    print('confusion matrix:')
    matrix = pd.DataFrame(
        confusion_matrix(y, predictions, labels=class_labels),
        index=class_labels,
        columns=class_labels,
    )
    print(matrix, end='\n\n')

    print('classification report:')
    report = classification_report(y, predictions, labels=class_labels)
    print('\n', report, end='\n\n')

    # Ascending sort followed by tail(10) keeps the ten most important features.
    top_features = (
        pd.DataFrame({
            'feature': model.feature_names_in_,
            'importance': model.feature_importances_,
        })
        .sort_values('importance')
        .tail(10)
    )

    bars = alt.Chart(
        top_features,
        title=f'{stage} feature importances',
    ).mark_bar(color='firebrick')

    return bars.encode(
        y=alt.Y('feature:N', sort='-x'),
        x=alt.X('importance:Q', axis=alt.Axis(format='.0%')),
    ).properties(height=150, width=400)

In [8]:
def quick_test(model, data, target='phase'):
    """Fit ``model`` on a stratified 70/30 split of ``data`` and evaluate both parts.

    Parameters
    ----------
    model
        Unfitted estimator exposing ``fit``; after fitting it is handed to
        ``evaluate``.
    data : pandas.DataFrame
        Table holding the feature columns and the ``target`` column. It is
        not modified: features are taken from a new frame returned by
        ``drop``.
    target : str, default 'phase'
        Name of the label column.

    Returns
    -------
    The train chart and the test chart returned by ``evaluate``, combined
    with the ``&`` operator.
    """
    y = data[target]
    X = data.drop(columns=target)

    # Replace every non-numeric column by its integer category codes so the
    # estimator receives purely numeric input.
    for col in X.select_dtypes(exclude='number'):
        X[col] = X[col].astype('category').cat.codes

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=.3,
        random_state=6582119, # hbar, if you're asking
        stratify=y,
        shuffle=True)

    # Fit exactly once. Refitting before the test evaluation doubled the work
    # and, for an unseeded stochastic estimator, made the train and test
    # reports describe two different fitted models.
    fitted = model.fit(X_train, y_train)

    train_eval = evaluate(fitted, X_train, y_train, 'train')
    test_eval = evaluate(fitted, X_test, y_test, 'test')

    return train_eval & test_eval

In [13]:
# Random forest baseline. `fit_evaluate` is not defined in the visible
# notebook, so the call would raise NameError on a fresh kernel; use the
# `quick_test` helper instead. A fixed random_state keeps the run reproducible.
quick_test(RandomForestClassifier(random_state=0), df)

TRAIN:
confusion matrix:
      1     2
1  5379     0
2     0  5392

classification report:

               precision    recall  f1-score   support

           1       1.00      1.00      1.00      5379
           2       1.00      1.00      1.00      5392

    accuracy                           1.00     10771
   macro avg       1.00      1.00      1.00     10771
weighted avg       1.00      1.00      1.00     10771


TEST:
confusion matrix:
      1     2
1  2217    89
2    96  2215

classification report:

               precision    recall  f1-score   support

           1       0.96      0.96      0.96      2306
           2       0.96      0.96      0.96      2311

    accuracy                           0.96      4617
   macro avg       0.96      0.96      0.96      4617
weighted avg       0.96      0.96      0.96      4617




SVM

logit