# Setup

In [1]:
# Importing the libraries
import numpy as np                                # Numerical operations
import pandas as pd                               # Data manipulation
import matplotlib.pyplot as plt                   # Basic plotting
import seaborn as sns                             # Statistical visualizations

from sklearn.preprocessing import LabelEncoder       # Encode categorical labels
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (                        # Model evaluation metrics
    accuracy_score,
    classification_report,
    confusion_matrix
)
from sklearn.model_selection import train_test_split  # Data splitting

from imblearn.over_sampling import SMOTE              # Synthetic minority oversampling
from imblearn.pipeline import Pipeline                # Build ML pipelines

#Path setup
import os                             # Operating system utilities
from pathlib import Path              # Object-oriented filesystem paths
notebook_path = Path().absolute()     # Current notebook location
project_root = notebook_path.parent   # Project root directory

In [2]:
#df = pd.read_csv('data_lakehouse/gold/starG.csv')
df = pd.read_csv(project_root/'data'/'gold'/'starG.csv')

# Imputation
#df['corrected_effective_temperature_1'] = df['effective_temperature_1'].combine_first(df['effective_temperature_2'])
#df['corrected_log_surface_gravity_1'] = df['log_surface_gravity_1'].combine_first(df['log_surface_gravity_2'])
#df['corrected_metallicity_fe_h_2'] = df['metallicity_fe_h_2'].combine_first(df['metallicity_fe_h_1'])

df

Unnamed: 0,identifier,class,full_class,effective_temperature,log_surface_gravity,metallicity_fe_h,radial_velocity,redshift
0,300702165|2015/01/19,K,K4,4729.36,4.741,-0.128,42.15,0.000141
1,215109|2011/10/23,G,G8,4642.57,4.662,-0.363,36.61,0.000122
2,18112111|2011/12/18,K,K3,4664.49,4.690,-0.207,35.04,0.000117
3,367412199|2015/10/07,G,G9,4960.44,4.613,-0.262,-35.46,-0.000118
4,18112179|2011/12/18,G,G8,5371.04,4.296,-0.248,-29.91,-0.000100
...,...,...,...,...,...,...,...,...
999995,688911092|2018/11/08,K,K7,3937.17,4.453,-0.360,6.12,0.000020
999996,82005095|2012/11/22,F,F5,6411.74,4.198,-0.598,-31.98,-0.000107
999997,392910161|2015/12/20,F,F0,6386.60,4.185,-0.580,-33.19,-0.000111
999998,184205097|2013/11/24,K,K3,4769.89,4.730,-0.221,9.04,0.000030


# 1. Class Classification

In [3]:
# Configuration
TARGET_COLUMN    = ['class']                                   # Target label
# FEATURE_COLUMNS  = ['effective_temperature_2',                 # Input features
#                     'radius', 'mass', 'distance']

FEATURE_COLUMNS = [
    'effective_temperature', 'log_surface_gravity',
    'metallicity_fe_h', 'radial_velocity', 'redshift'
]

# Data preparation
label_encoder = LabelEncoder()                                 # Initialize label encoder
y = label_encoder.fit_transform(df[TARGET_COLUMN])           # Encode target variable
X = df[FEATURE_COLUMNS]                                      # Select features

# Stratified train-test split
X_train, X_val, y_train, y_val = train_test_split(             # Split dataset
    X, y, test_size=0.1, stratify=y, random_state=42
)

# Class balancing with SMOTE
smote = SMOTE(sampling_strategy='not majority', random_state=42)  # Initialize SMOTE
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)   # Apply to training data

# Model training
model = RandomForestClassifier(                                # Random Forest classifier
    n_estimators=20,                                           # Number of trees
    max_depth=10,                                               # Maximum tree depth
    bootstrap=True,                                            # Use bootstrapping
    class_weight='balanced_subsample',                         # Class weighting
    random_state=42                                            # Seed
)
model.fit(X_train_res, y_train_res)                            # Train on resampled data

# Evaluation
y_pred = model.predict(X_val)                                  # Predict on validation set
print("\n=== Enhanced Evaluation ===")                         
print(f"Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}")  # Print accuracy
print("\nClassification Report:")                              
print(classification_report(y_val, y_pred,                      # Report metrics
                            target_names=label_encoder.classes_,
                            zero_division=0))

# Model wrapper for reuse
def class_classifier():
    return model

  y = column_or_1d(y, warn=True)



=== Enhanced Evaluation ===
Validation Accuracy: 0.8712

Classification Report:
              precision    recall  f1-score   support

           A       0.50      0.94      0.65      2043
           F       0.96      0.79      0.86     34710
           G       0.86      0.93      0.89     47761
           K       0.85      0.89      0.87     15486

    accuracy                           0.87    100000
   macro avg       0.79      0.88      0.82    100000
weighted avg       0.88      0.87      0.87    100000



In [None]:
# Initialize models
models = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=20, max_depth=10, random_state=42, bootstrap=True, # Use bootstrapping
    class_weight='balanced_subsample'),   # Class weighting
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(random_state=42),
    'GaussianNB': GaussianNB()
}

best_accuracy = 0
best_model = None
best_model_name = ""
best_cm = None

# Train and evaluate each model
for model_name, model in models.items():
    # Apply SMOTE to training data
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_val)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val, y_pred)
    train_accuracy = accuracy_score(y_train_res, model.predict(X_train_res))
    
    # Print results
    print(f'\n[******* {model_name} ******]')
    print(f'\n   Validation Accuracy: {accuracy:.4f}')
    print(f'   Training Accuracy: {train_accuracy:.4f}')
    print('\n', classification_report(y_val, y_pred, target_names=label_encoder.classes_, zero_division=0))
    
    # Store best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_model_name = model_name
        best_cm = confusion_matrix(y_val, y_pred)

# Print model scores
print("\nModel Performance Summary:")
for model_name, model in models.items():
    print(f"{model_name}: {model.score(X_val, y_val)*100:.2f}%")

# Set best model to original variable names
model = best_model
print(f"\nBest Model: {best_model_name} with accuracy {best_accuracy:.4f}")

# Confusion matrix of best model (matches original variable name)
y_pred = model.predict(X_val)
cm = confusion_matrix(y_val, y_pred)

# For next cell compatibility:
# model, y_pred, and cm variables are ready for plotting

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



[******* LogisticRegression ******]

   Validation Accuracy: 0.6502
   Training Accuracy: 0.7438

               precision    recall  f1-score   support

           A       0.22      0.84      0.35      2043
           F       0.67      0.78      0.72     34710
           G       0.79      0.49      0.61     47761
           K       0.57      0.82      0.67     15486

    accuracy                           0.65    100000
   macro avg       0.56      0.73      0.59    100000
weighted avg       0.70      0.65      0.65    100000


[******* DecisionTreeClassifier ******]

   Validation Accuracy: 0.8280
   Training Accuracy: 1.0000

               precision    recall  f1-score   support

           A       0.56      0.85      0.67      2043
           F       0.84      0.84      0.84     34710
           G       0.87      0.80      0.83     47761
           K       0.75      0.86      0.81     15486

    accuracy                           0.83    100000
   macro avg       0.76      0.84  

In [None]:
# Confusion matrix visualizations
cm = confusion_matrix(y_val, class_classifier().predict(X_val))     # Compute confusion matrix
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] # Normalize by row

plt.figure(figsize=(8, 6))                                          # Set figure size
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',                  # Absolute values heatmap
            xticklabels=label_encoder.classes_,                    
            yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix (Absolute Values)')                     
plt.ylabel('True Label')                                            
plt.xlabel('Predicted Label')                                       
plt.show()

plt.figure(figsize=(8, 6))                                          # Set figure size
sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Greens',   # Normalized heatmap
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix (Percentages)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


In [None]:
# 1. Count and display the number of entries per category
print("\n=== Category Counts for 'sub_class' ===")
print(df['full_class'].value_counts())

# 2. Define minimum count threshold (customizable)
MIN_COUNT = 10  # Modify this value as needed

# 3. Filter out classes with fewer entries than MIN_COUNT
valid_classes = df['full_class'].dropna().value_counts()
valid_classes = valid_classes[valid_classes >= MIN_COUNT].index

df_filtered = df[df['full_class'].isin(valid_classes)].reset_index(drop=True).dropna()

print("\n=== Remaining Classes After Filtering ===")
print(df_filtered['full_class'].value_counts())

# 2. SubClass Classification

In [None]:
# Drop NaNs and convert subclass to string type
#df_filtered['type'] = df_filtered['subclass'].astype(str)                  # Create 'type' column

# Filter groups by type
#group_A_types = ['A1', 'A2', 'A3', 'A5', 'A6', 'A7', 'A8', 'A9']           # Group A types
group_F_types = ['F0','F1', 'F2','F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9']  # Group F types
group_G_types = ['G0','G1', 'G2','G3', 'G4', 'G5', 'G6', 'G7', 'G8', 'G9']  # Group G types
group_K_types = ['K0','K1', 'K2','K3', 'K4', 'K5', 'K6', 'K7', 'K8', 'K9']  # Group K types

#df_A = df_filtered[df_filtered['type'].isin(group_A_types)].copy()         # Filter Group A
df_F = df_filtered[df_filtered['full_class'].isin(group_F_types)].copy()    # Filter Group F
df_G = df_filtered[df_filtered['full_class'].isin(group_G_types)].copy()    # Filter Group G
df_K = df_filtered[df_filtered['full_class'].isin(group_K_types)].copy()    # Filter Group K

# Define training function with SMOTE
def train_subset_model_with_smote(df_subset):                                
    X = df_subset[FEATURE_COLUMNS]                                          # Features
    y = df_subset['full_class']                                             # Target
    
    le = LabelEncoder()                                                     # Label encoder
    y_encoded = le.fit_transform(y)                                         # Encode target

    X_train, X_val, y_train, y_val = train_test_split(                      # Stratified split
        X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42    
    )

    smote = SMOTE(sampling_strategy='not majority', random_state=42)        # Apply SMOTE
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)         

    model = RandomForestClassifier(                                         # Random forest config
        n_estimators=20,
        max_depth=15,
        bootstrap=True,
        random_state=42,
        class_weight='balanced_subsample'
    )
    model.fit(X_train_res, y_train_res)                                     # Train model
    y_pred = model.predict(X_val)                                           # Predict validation

    return y_val, y_pred, le                                                # Return results

# Train all groups separately
#y_val_A, y_pred_A, le_A = train_subset_model_with_smote(df_A)              # Group A results
y_val_F, y_pred_F, le_F = train_subset_model_with_smote(df_F)              # Group F results
y_val_G, y_pred_G, le_G = train_subset_model_with_smote(df_G)              # Group G results
y_val_K, y_pred_K, le_K = train_subset_model_with_smote(df_K)              # Group K results

# Combine results for final evaluation
y_val_combined = (list(le_F.inverse_transform(y_val_F)) + 
                 list(le_G.inverse_transform(y_val_G)) + 
                 list(le_K.inverse_transform(y_val_K)))

y_pred_combined = (list(le_F.inverse_transform(y_pred_F)) + 
                  list(le_G.inverse_transform(y_pred_G)) + 
                  list(le_K.inverse_transform(y_pred_K)))

all_classes = sorted(set(y_val_combined + y_pred_combined))                # All unique classes

# Accuracy evaluation
print(f"\nCombined Accuracy: {accuracy_score(y_val_combined, y_pred_combined):.4f}")  # Accuracy

# Absolute confusion matrix
cm = confusion_matrix(y_val_combined, y_pred_combined, labels=all_classes)           # Matrix
print("\nConfusion Matrix:")
print(pd.DataFrame(cm, index=all_classes, columns=all_classes))                      # Print matrix

plt.figure(figsize=(10, 8))                                                          # Plot size
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',                                   # Heatmap
            xticklabels=all_classes,
            yticklabels=all_classes)
plt.title('Confusion Matrix (Absolute Values)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# Normalized confusion matrix
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]                        # Normalize
plt.figure(figsize=(10, 8))
sns.heatmap(cm_norm, annot=True, fmt='.2%', cmap='Greens',                          # Heatmap
            xticklabels=all_classes,
            yticklabels=all_classes)
plt.title('Normalized Confusion Matrix (Percentages)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

In [None]:
%%time

# Initialize models with optimized parameters
models = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=5000, multi_class='multinomial', solver='lbfgs'),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42, max_depth=15),
    'RandomForestClassifier': RandomForestClassifier(                                         # Random forest config
        n_estimators=20,
        max_depth=15,
        bootstrap=True,
        random_state=42,
        class_weight='balanced_subsample'
    ),
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=10, weights='distance'),
    'SVC': SVC(random_state=42, kernel='rbf', gamma='scale', C=200.0, class_weight='balanced'),
    'GaussianNB': GaussianNB(var_smoothing=1e-9)
}

best_accuracy = 0
best_model = None
best_model_name = ""
best_cm = None
best_le = None

# Define training function with SMOTE and StandardScaler
def train_subset_model_with_smote(df_subset, model, model_name):
    X = df_subset[FEATURE_COLUMNS]
    y = df_subset['full_class']
    
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42
    )

    smote = SMOTE(sampling_strategy='not majority', random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    
    if model_name in ['LogisticRegression', 'SVC', 'KNeighborsClassifier']:
        scaler = StandardScaler()
        X_train_res = scaler.fit_transform(X_train_res)
        X_val = scaler.transform(X_val)

    model.fit(X_train_res, y_train_res)
    y_train_pred = model.predict(X_train_res)
    y_val_pred = model.predict(X_val)
    
    train_acc = accuracy_score(y_train_res, y_train_pred)
    val_acc = accuracy_score(y_val, y_val_pred)
    
    return y_val, y_val_pred, le, train_acc, val_acc

# Train and evaluate each model
for model_name, model in models.items():
    try:
        # Train Groups F, G, K
        y_val_F, y_pred_F, le_F, train_acc_F, val_acc_F = train_subset_model_with_smote(df_F, model, model_name)
        y_val_G, y_pred_G, le_G, train_acc_G, val_acc_G = train_subset_model_with_smote(df_G, model, model_name)
        y_val_K, y_pred_K, le_K, train_acc_K, val_acc_K = train_subset_model_with_smote(df_K, model, model_name)
        
        # Combine results
        y_val_combined = (list(le_F.inverse_transform(y_val_F)) + 
                         list(le_G.inverse_transform(y_val_G)) + 
                         list(le_K.inverse_transform(y_val_K)))
        y_pred_combined = (list(le_F.inverse_transform(y_pred_F)) + 
                          list(le_G.inverse_transform(y_pred_G)) + 
                          list(le_K.inverse_transform(y_pred_K)))
        
        combined_val_acc = accuracy_score(y_val_combined, y_pred_combined)
        combined_train_acc = (train_acc_F + train_acc_G + train_acc_K)/3
        
        print(f'\n[******* {model_name} ******]')
        print(f'  Training Accuracy (Group F): {train_acc_F:.4f}')
        print(f'  Validation Accuracy (Group F): {val_acc_F:.4f}')
        print(f'  Training Accuracy (Group G): {train_acc_G:.4f}')
        print(f'  Validation Accuracy (Group G): {val_acc_G:.4f}')
        print(f'  Training Accuracy (Group K): {train_acc_K:.4f}')
        print(f'  Validation Accuracy (Group K): {val_acc_K:.4f}')
        print(f'\n  Combined Validation Accuracy: {combined_val_acc:.4f}')
        print(f'  Approx. Combined Training Accuracy: {combined_train_acc:.4f}')
        
        print('\nClassification Report:')
        print(classification_report(y_val_combined, y_pred_combined, zero_division=0))
        
        if combined_val_acc > best_accuracy:
            best_accuracy = combined_val_acc
            best_model = model
            best_model_name = model_name
            best_cm = confusion_matrix(y_val_combined, y_pred_combined, 
                                     labels=sorted(set(y_val_combined + y_pred_combined)))
            best_le = le_F

    except Exception as e:
        print(f"\n[!!! Failed to train {model_name} !!!]")
        print(f"Error: {str(e)}")
        continue

print(f"\nBest Model: {best_model_name} with validation accuracy {best_accuracy:.4f}")

In [None]:
# Create confusion matrix
labels = sorted(set(y_val_combined))                                          # Sorted unique labels
cm = confusion_matrix(y_val_combined, y_pred_combined, labels=labels)        # Absolute confusion matrix
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]           # Normalized matrix (percentages)

# Plot absolute confusion matrix
plt.figure(figsize=(10, 8))                                                  # Figure size
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',                           # Absolute heatmap
            xticklabels=labels,
            yticklabels=labels)
plt.title('Confusion Matrix (Absolute Values)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# Plot normalized confusion matrix
plt.figure(figsize=(10, 8))                                                  # Figure size
sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Greens',            # Percentage heatmap
            xticklabels=labels,
            yticklabels=labels)
plt.title('Confusion Matrix (Percentages)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# Display remaining classes
print("\n=== Remaining Classes After Filtering ===")
print(df_filtered['full_class'].value_counts())                                # Count remaining subclasses


In [None]:
df_lum.columns

In [None]:
# Configuration
TARGET_COLUMN_lum    = 'luminosity_class'                                    # Target column name
FEATURE_COLUMNS_lum  = ['luminosity', 'visual_magnitude',                   # Selected features
                        'mass', 'log_surface_gravity_2', 'corrected_log_surface_gravity_1',
                        'metallicity_fe_h_1', 'corrected_metallicity_fe_h_2',
                        'effective_temperature_2', 'corrected_effective_temperature_1',
                        'distance']

# Prepare feature data
X_lum = df_lum[FEATURE_COLUMNS_lum]                                         # Feature matrix

# Prepare target data
y_lum = df_lum[TARGET_COLUMN_lum]                                           # Target labels

# Split data (stratified)
X_train_lum, X_val_lum, y_train_lum, y_val_lum = train_test_split(         # Stratified split
    X_lum, y_lum, test_size=0.3, stratify=y_lum, random_state=42
)

# Handle class imbalance with SMOTE
smote_lum = SMOTE(sampling_strategy='not majority', random_state=42)       # SMOTE setup
X_train_res_lum, y_train_res_lum = smote_lum.fit_resample(X_train_lum, y_train_lum)  # Oversample training set

# Train model
model_lum = RandomForestClassifier(                                        # RandomForest setup
    n_estimators=100,
    max_depth=7,
    class_weight='balanced_subsample',
    random_state=42
)
model_lum.fit(X_train_res_lum, y_train_res_lum)                            # Fit model on resampled data

# Evaluation
y_pred_lum = model_lum.predict(X_val_lum)                                  # Predict validation data

# Create confusion matrix
classes_lum = np.unique(y_lum)                                             # Sorted class labels
cm_lum = confusion_matrix(y_val_lum, y_pred_lum, labels=classes_lum)       # Confusion matrix

# Print results
print(f"Validation Accuracy: {accuracy_score(y_val_lum, y_pred_lum):.4f}") # Print accuracy
print("\nConfusion Matrix (True vs Predicted):")
print(pd.DataFrame(                                                        # Format confusion matrix
    cm_lum,
    index=classes_lum,
    columns=classes_lum
))


In [None]:
# Configuration
TARGET_COLUMN_lum = 'luminosity_class'
FEATURE_COLUMNS_lum = [
    'luminosity', 'visual_magnitude', 'mass', 
    'log_surface_gravity_2', 'corrected_log_surface_gravity_1',
    'metallicity_fe_h_1', 'corrected_metallicity_fe_h_2',
    'effective_temperature_2', 'corrected_effective_temperature_1',
    'distance'
]

# Prepare data
X_lum = df_lum[FEATURE_COLUMNS_lum]
y_lum = df_lum[TARGET_COLUMN_lum]

# Split data (stratified)
X_train_lum, X_val_lum, y_train_lum, y_val_lum = train_test_split(
    X_lum, y_lum, test_size=0.3, stratify=y_lum, random_state=42
)

# Handle class imbalance with SMOTE
smote_lum = SMOTE(sampling_strategy='not majority', random_state=42)
X_train_res_lum, y_train_res_lum = smote_lum.fit_resample(X_train_lum, y_train_lum)

# Initialize models with optimized parameters
models_lum = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=5000, multi_class='multinomial', solver='lbfgs'),
    'DecisionTree': DecisionTreeClassifier(random_state=42, max_depth=10),
    'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=15, class_weight='balanced_subsample', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5, weights='distance'),
    'SVC': SVC(random_state=42, kernel='rbf', gamma='scale', C=10, class_weight='balanced', probability=True),
    'GaussianNB': GaussianNB(var_smoothing=1e-9)
}

best_accuracy_lum = 0
best_model_lum = None
best_model_name_lum = ""
best_cm_lum = None
classes_lum = np.unique(y_lum)

# Train and evaluate each model
for model_name, model in models_lum.items():
    try:
        # Create fresh instance (alternative to clone)
        current_model = model.__class__(**model.get_params())
        
        # Scale data for sensitive models
        if model_name in ['LogisticRegression', 'SVC', 'KNN']:
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_res_lum)
            X_val_scaled = scaler.transform(X_val_lum)
            X_to_predict = X_val_scaled
        else:
            X_train_scaled = X_train_res_lum
            X_val_scaled = X_val_lum
            X_to_predict = X_val_lum
        
        # Train model
        current_model.fit(X_train_scaled, y_train_res_lum)
        
        # Get predictions
        y_train_pred = current_model.predict(X_train_scaled)
        y_val_pred = current_model.predict(X_to_predict)
        
        # Calculate accuracies
        train_acc = accuracy_score(y_train_res_lum, y_train_pred)
        val_acc = accuracy_score(y_val_lum, y_val_pred)
        
        # Print results
        print(f'\n[******* {model_name} ******]')
        print(f'Training Accuracy: {train_acc:.4f}')
        print(f'Validation Accuracy: {val_acc:.4f}')
        print('\nClassification Report:')
        print(classification_report(y_val_lum, y_val_pred, zero_division=0, target_names=classes_lum))
        
        # Track best model
        if val_acc > best_accuracy_lum:
            best_accuracy_lum = val_acc
            best_model_lum = current_model
            best_model_name_lum = model_name
            best_cm_lum = confusion_matrix(y_val_lum, y_val_pred, labels=classes_lum)
            
    except Exception as e:
        print(f"\n[!!! Failed to train {model_name} !!!]")
        print(f"Error: {str(e)}")
        continue

# Final summary
print("\n=== Model Performance Summary ===")
for model_name, model in models_lum.items():
    try:
        if model_name in ['LogisticRegression', 'SVC', 'KNN']:
            X_train_eval = scaler.transform(X_train_res_lum)
            X_val_eval = scaler.transform(X_val_lum)
        else:
            X_train_eval = X_train_res_lum
            X_val_eval = X_val_lum
            
        train_score = accuracy_score(y_train_res_lum, model.predict(X_train_eval))
        val_score = accuracy_score(y_val_lum, model.predict(X_val_eval))
        print(f"{model_name}:")
        print(f"  Training: {train_score:.4f}  Validation: {val_score:.4f}  Diff: {train_score-val_score:.4f}")
    except:
        print(f"{model_name}: Evaluation failed")
        continue

print(f"\nBest Model: {best_model_name_lum} with validation accuracy {best_accuracy_lum:.4f}")

# Set up variables for plotting
model_lum = best_model_lum
y_pred_lum = best_model_lum.predict(X_val_scaled if best_model_name_lum in ['LogisticRegression', 'SVC', 'KNN'] else X_val_lum)
cm_lum = best_cm_lum

In [None]:
# Plot confusion matrix (absolute values) - Luminosity Model
plt.figure(figsize=(10, 8))                                                # Set figure size
sns.heatmap(                                                                # Draw heatmap
    cm_lum,
    annot=True,                                                             # Annotate cells
    fmt="d",                                                                # Integer format
    cmap="Blues",                                                           # Blue color scheme
    xticklabels=classes_lum,                                                # Predicted labels
    yticklabels=classes_lum,                                                # True labels
    cbar=False                                                              # No color bar
)
plt.title("Confusion Matrix (True vs Predicted) - Luminosity Model")        # Title
plt.xlabel("Predicted Label")                                               # X-axis label
plt.ylabel("True Label")                                                    # Y-axis label
plt.tight_layout()                                                          # Fit layout
plt.show()                                                                  # Display plot

# Normalized confusion matrix - Luminosity Model
cm_normalized_lum = cm_lum.astype('float') / cm_lum.sum(axis=1)[:, np.newaxis]  # Normalize by row

plt.figure(figsize=(10, 8))                                                # Set figure size
sns.heatmap(                                                                # Draw heatmap
    cm_normalized_lum,
    annot=True,                                                             # Annotate cells
    fmt=".2%",                                                              # Percentage format
    cmap="Greens",                                                          # Green color scheme
    xticklabels=classes_lum,                                                # Predicted labels
    yticklabels=classes_lum                                                 # True labels
)
plt.title("Normalized Confusion Matrix (Percentages) - Luminosity Model")   # Title
plt.xlabel("Predicted Label")                                               # X-axis label
plt.ylabel("True Label")                                                    # Y-axis label
plt.tight_layout()                                                          # Fit layout
plt.show()                                                                  # Display plot
