In [1]:
import pprint
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, PowerTransformer, RobustScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [2]:
df = pd.read_csv('../data/star_data.csv')
df.sample(5)

Unnamed: 0,Radius,Temperature,Luminosity,Absolute_Magnitude,Star_Color,Spectral_Class,Star_Type
23432,1.198906,3550.725213,0.204988,6.550678,Blue White,F,Main Sequence
95456,1.157502,5757.68442,1.321067,4.527688,Yellow White,K,White Dwarf
96705,0.986047,5711.298447,0.928163,4.910939,Blue White,G,White Dwarf
64221,0.851308,2000.0,0.010404,9.787042,Blue,G,Main Sequence
84855,0.95128,7135.545605,2.10482,4.021962,Blue,A,White Dwarf


In [3]:
df.shape

(100000, 7)

# Star Type Predictor

In [4]:
X = df.drop(columns=['Star_Type'])
y = df['Star_Type']

In [5]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

In [7]:
numerical_cols = ['Radius', 'Temperature', 'Luminosity', 'Absolute_Magnitude']
categorical_cols = ['Star_Color', 'Spectral_Class']

In [8]:
numerical_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('pow_tnfr', PowerTransformer())
])
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

In [9]:
preprocesses = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_cols),
        ('categorical', categorical_pipeline, categorical_cols)
    ]
)

In [10]:
star_type_predictor = Pipeline([
    ('preprocesses', preprocesses),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [11]:
star_type_predictor.fit(X_train, y_train)

In [12]:
y_pred = star_type_predictor.predict(X_test)

In [13]:
print(f"Accuracy Score: {accuracy_score(y_test, y_pred) * 100:.4f}%")

Accuracy Score: 16.6000%


In [14]:
mean_score = np.mean(cross_val_score(star_type_predictor, X, y_encoded, cv=5))
print(f"Mean Cross Val Score: {mean_score * 100:.4f}%")

Mean Cross Val Score: 16.8600%


In [15]:
y_test_decoded = label_encoder.inverse_transform(y_test)
y_pred_decoded = label_encoder.inverse_transform(y_pred)

report = classification_report(y_test_decoded, y_pred_decoded, output_dict=True)
pprint.pprint(report)
report_df = pd.DataFrame(report).transpose()
report_df

{'Brown Dwarf': {'f1-score': 0.16981883336651404,
                 'precision': 0.16784730421094057,
                 'recall': 0.17183722804190169,
                 'support': 4964.0},
 'Hypergiant': {'f1-score': 0.16344516775162743,
                'precision': 0.16310213871676993,
                'recall': 0.16378964271376958,
                'support': 4982.0},
 'Main Sequence': {'f1-score': 0.16790669615926002,
                   'precision': 0.17166940789473684,
                   'recall': 0.16430539157811885,
                   'support': 5082.0},
 'Red Dwarf': {'f1-score': 0.1680874536154849,
               'precision': 0.16743256743256743,
               'recall': 0.16874748288360855,
               'support': 4966.0},
 'Supergiant': {'f1-score': 0.16478956727919383,
                'precision': 0.16407633287428683,
                'recall': 0.165509029569359,
                'support': 5039.0},
 'White Dwarf': {'f1-score': 0.1619335347432024,
                 'precision': 0.

Unnamed: 0,precision,recall,f1-score,support
Brown Dwarf,0.167847,0.171837,0.169819,4964.0
Hypergiant,0.163102,0.16379,0.163445,4982.0
Main Sequence,0.171669,0.164305,0.167907,5082.0
Red Dwarf,0.167433,0.168747,0.168087,4966.0
Supergiant,0.164076,0.165509,0.16479,5039.0
White Dwarf,0.161999,0.161868,0.161934,4967.0
accuracy,0.166,0.166,0.166,0.166
macro avg,0.166021,0.16601,0.165997,30000.0
weighted avg,0.166036,0.166,0.166,30000.0


# Star Color Predictor

In [16]:
X = df.drop(columns=['Star_Color'])
y = df['Star_Color']

In [17]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

In [19]:
numerical_cols = ['Radius', 'Temperature', 'Luminosity', 'Absolute_Magnitude']
categorical_cols = ['Star_Type', 'Spectral_Class']

In [20]:
numerical_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('pow_tnfr', PowerTransformer())
])
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

In [21]:
preprocesses = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_cols),
        ('categorical', categorical_pipeline, categorical_cols)
    ]
)

In [22]:
star_color_predictor = Pipeline([
    ('preprocesses', preprocesses),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [23]:
star_color_predictor.fit(X_train, y_train)

In [24]:
y_pred = star_color_predictor.predict(X_test)

In [25]:
print(f"Accuracy Score: {accuracy_score(y_test, y_pred) * 100:.4f}%")

Accuracy Score: 16.9933%


In [26]:
mean_score = np.mean(cross_val_score(star_color_predictor, X, y_encoded, cv=5))
print(f"Mean Cross Val Score: {mean_score * 100:.4f}%")

Mean Cross Val Score: 16.5670%


In [27]:
y_test_decoded = label_encoder.inverse_transform(y_test)
y_pred_decoded = label_encoder.inverse_transform(y_pred)

report = classification_report(y_test_decoded, y_pred_decoded, output_dict=True)
pprint.pprint(report)
report_df = pd.DataFrame(report).transpose()
report_df

{'Blue': {'f1-score': 0.17365153279006595,
          'precision': 0.16921913405180564,
          'recall': 0.17832237497509465,
          'support': 5019.0},
 'Blue White': {'f1-score': 0.1605324190783503,
                'precision': 0.16071068039571976,
                'recall': 0.16035455278001612,
                'support': 4964.0},
 'Orange': {'f1-score': 0.1700990099009901,
            'precision': 0.17238611278346377,
            'recall': 0.16787179988274378,
            'support': 5117.0},
 'Red': {'f1-score': 0.17420132610006028,
         'precision': 0.17219463753723932,
         'recall': 0.17625533645049807,
         'support': 4919.0},
 'White': {'f1-score': 0.16751269035532995,
           'precision': 0.16707168894289187,
           'recall': 0.1679560260586319,
           'support': 4912.0},
 'Yellow White': {'f1-score': 0.17343734170803363,
                  'precision': 0.17825905872553102,
                  'recall': 0.16886959952653383,
                  'support': 

Unnamed: 0,precision,recall,f1-score,support
Blue,0.169219,0.178322,0.173652,5019.0
Blue White,0.160711,0.160355,0.160532,4964.0
Orange,0.172386,0.167872,0.170099,5117.0
Red,0.172195,0.176255,0.174201,4919.0
White,0.167072,0.167956,0.167513,4912.0
Yellow White,0.178259,0.16887,0.173437,5069.0
accuracy,0.169933,0.169933,0.169933,0.169933
macro avg,0.169974,0.169938,0.169906,30000.0
weighted avg,0.170015,0.169933,0.169924,30000.0


# Spectral Class Predictor

In [28]:
X = df.drop(columns=['Spectral_Class'])
y = df['Spectral_Class']

In [29]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

In [31]:
numerical_cols = ['Radius', 'Temperature', 'Luminosity', 'Absolute_Magnitude']
categorical_cols = ['Star_Type', 'Star_Color']

In [32]:
numerical_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('pow_tnfr', PowerTransformer())
])
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

In [33]:
preprocesses = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_cols),
        ('categorical', categorical_pipeline, categorical_cols)
    ]
)

In [34]:
spectral_class_predictor = Pipeline([
    ('preprocesses', preprocesses),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [35]:
spectral_class_predictor.fit(X_train, y_train)

In [36]:
y_pred = spectral_class_predictor.predict(X_test)

In [37]:
print(f"Accuracy Score: {accuracy_score(y_test, y_pred) * 100:.4f}%")

Accuracy Score: 22.9633%


In [38]:
mean_score = np.mean(cross_val_score(spectral_class_predictor, X, y_encoded, cv=5))
print(f"Mean Cross Val Score: {mean_score * 100:.4f}%")

Mean Cross Val Score: 23.3130%


In [39]:
y_test_decoded = label_encoder.inverse_transform(y_test)
y_pred_decoded = label_encoder.inverse_transform(y_pred)

report = classification_report(y_test_decoded, y_pred_decoded, output_dict=True)
pprint.pprint(report)
report_df = pd.DataFrame(report).transpose()
report_df

{'A': {'f1-score': 0.08540646425073457,
       'precision': 0.10856573705179283,
       'recall': 0.07039070067807555,
       'support': 3097.0},
 'B': {'f1-score': 0.040479140850888065,
       'precision': 0.05651672433679354,
       'recall': 0.03153153153153153,
       'support': 1554.0},
 'F': {'f1-score': 0.19628378378378378,
       'precision': 0.1991772368872129,
       'recall': 0.19347319347319347,
       'support': 6006.0},
 'G': {'f1-score': 0.3298072170044488,
       'precision': 0.29611219598792826,
       'recall': 0.37215528781793844,
       'support': 8964.0},
 'K': {'f1-score': 0.2540766999677731,
       'precision': 0.24243542435424353,
       'recall': 0.26689234935680434,
       'support': 7385.0},
 'M': {'f1-score': 0.06876272336575436,
       'precision': 0.08740655549166187,
       'recall': 0.05667412378821775,
       'support': 2682.0},
 'O': {'f1-score': 0.004273504273504274,
       'precision': 0.00641025641025641,
       'recall': 0.003205128205128205,
     

Unnamed: 0,precision,recall,f1-score,support
A,0.108566,0.070391,0.085406,3097.0
B,0.056517,0.031532,0.040479,1554.0
F,0.199177,0.193473,0.196284,6006.0
G,0.296112,0.372155,0.329807,8964.0
K,0.242435,0.266892,0.254077,7385.0
M,0.087407,0.056674,0.068763,2682.0
O,0.00641,0.003205,0.004274,312.0
accuracy,0.229633,0.229633,0.229633,0.229633
macro avg,0.142375,0.142046,0.13987,30000.0
weighted avg,0.210049,0.229633,0.217493,30000.0


# Radius Predictor

In [40]:
X = df.drop(columns=['Radius'])
y = df['Radius']

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [42]:
numerical_cols = ['Temperature', 'Luminosity', 'Absolute_Magnitude']
categorical_cols = ['Star_Type', 'Star_Color', 'Spectral_Class']

In [43]:
numerical_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('pow_tnfr', PowerTransformer())
])
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

In [44]:
preprocesses = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_cols),
        ('categorical', categorical_pipeline, categorical_cols)
    ]
)

In [45]:
radius_predictor = Pipeline([
    ('preprocesses', preprocesses),
    ('regressor', RandomForestRegressor())
])

In [None]:
radius_predictor.fit(X_train, y_train)

In [None]:
y_pred = radius_predictor.predict(X_test)

In [None]:
print(f"R2 Score: {r2_score(y_test, y_pred) * 100:.4f}%")

In [None]:
mean_score = np.mean(cross_val_score(radius_predictor, X, y, cv=5))
print(f"Mean Cross Val Score: {mean_score * 100:.4f}%")

# Temperature Predictor

In [None]:
X = df.drop(columns=['Temperature'])
y = df['Temperature']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
numerical_cols = ['Radius', 'Luminosity', 'Absolute_Magnitude']
categorical_cols = ['Star_Type', 'Star_Color', 'Spectral_Class']

In [None]:
numerical_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('pow_tnfr', PowerTransformer())
])
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

In [None]:
preprocesses = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_cols),
        ('categorical', categorical_pipeline, categorical_cols)
    ]
)

In [None]:
temperature_predictor = Pipeline([
    ('preprocesses', preprocesses),
    ('regressor', RandomForestRegressor())
])

In [None]:
temperature_predictor.fit(X_train, y_train)

In [None]:
y_pred = temperature_predictor.predict(X_test)

In [None]:
print(f"R2 Score: {r2_score(y_test, y_pred) * 100:.4f}%")

In [None]:
mean_score = np.mean(cross_val_score(temperature_predictor, X, y, cv=5))
print(f"Mean Cross Val Score: {mean_score * 100:.4f}%")

# Luminosity Predictor

In [None]:
X = df.drop(columns=['Luminosity'])
y = df['Luminosity']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
numerical_cols = ['Radius', 'Temperature', 'Absolute_Magnitude']
categorical_cols = ['Star_Type', 'Star_Color', 'Spectral_Class']

In [None]:
numerical_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('pow_tnfr', PowerTransformer())
])
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

In [None]:
preprocesses = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_cols),
        ('categorical', categorical_pipeline, categorical_cols)
    ]
)

In [None]:
luminosity_predictor = Pipeline([
    ('preprocesses', preprocesses),
    ('regressor', RandomFrestRegressor())
])

In [None]:
luminosity_predictor.fit(X_train, y_train)

In [None]:
y_pred = luminosity_predictor.predict(X_test)

In [None]:
print(f"R2 Score: {r2_score(y_test, y_pred) * 100:.4f}%")

In [None]:
mean_score = np.mean(cross_val_score(luminosity_predictor, X, y, cv=5))
print(f"Mean Cross Val Score: {mean_score * 100:.4f}%")

# Absolute Magnitude Predictor

In [None]:
X = df.drop(columns=['Absolute_Magnitude'])
y = df['Absolute_Magnitude']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
numerical_cols = ['Radius', 'Temperature', 'Luminosity']
categorical_cols = ['Star_Type', 'Star_Color', 'Spectral_Class']

In [None]:
numerical_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('pow_tnfr', PowerTransformer())
])
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

In [None]:
preprocesses = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_cols),
        ('categorical', categorical_pipeline, categorical_cols)
    ]
)

In [None]:
absolute_magnitude_predictor = Pipeline([
    ('preprocesses', preprocesses),
    ('regressor', RandomFrestRegressor())
])

In [None]:
absolute_magnitude_predictor.fit(X_train, y_train)

In [None]:
y_pred = absolute_magnitude_predictor.predict(X_test)

In [None]:
print(f"R2 Score: {r2_score(y_test, y_pred) * 100:.4f}%")

In [None]:
mean_score = np.mean(cross_val_score(absolute_magnitude_predictor, X, y, cv=5))
print(f"Mean Cross Val Score: {mean_score * 100:.4f}%")