In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.utils import shuffle

In [None]:
# Install Optuna for Google Colab
try:
    import google.colab
    !pip install -q optuna
except:
    pass  # For local use: install via pip install optuna or requirements.txt

import optuna

In [None]:
# ====== CONFIGURATION ======
PROPERTY_SELECTION = 1  # 1: Hc (Coercivity) | 2: Js (Saturation Polarization) | 3: rho (Resistivity)
RUN_OPTIMIZATION = False  # True: run hyperparameter tuning | False: use pre-optimized parameters

# Selection code -> property name (used when naming the output file)
PROPERTY_NAMES = {1: 'Hc', 2: 'Js', 3: 'rho'}

# Fail fast on an unsupported selection instead of leaving `property_name`
# undefined and hitting a NameError in a later cell.
if PROPERTY_SELECTION not in PROPERTY_NAMES:
    raise ValueError(
        f"PROPERTY_SELECTION must be one of {sorted(PROPERTY_NAMES)}, "
        f"got {PROPERTY_SELECTION!r}"
    )
property_name = PROPERTY_NAMES[PROPERTY_SELECTION]

# Optuna configuration: limits passed to study.optimize().
# A value of None means "no limit" and is filtered out before the call.
OPTUNA_CONFIG = {
    'optimization': {'n_trials': 500, 'timeout': None},      # For hyperparameter tuning
    'feature_selection': {'n_trials': None, 'timeout': 1200},  # For feature selection (timeout in seconds)
}

In [None]:
# ====== LOAD DATA ======
# Local CSV used when not running on Colab, keyed by PROPERTY_SELECTION
LOCAL_DATA_PATHS = {
    1: '../data/raw/oliynyk_Hc_raw.csv',
    2: '../data/raw/oliynyk_Js_raw.csv',
    3: '../data/raw/oliynyk_rho_raw.csv',
}

try:
    # Only importable inside a Colab runtime
    from google.colab import files
except ImportError:
    # Local execution: use the predefined path for the selected property.
    # Validate the selection explicitly so `df` is never left undefined.
    if PROPERTY_SELECTION not in LOCAL_DATA_PATHS:
        raise ValueError(
            f"PROPERTY_SELECTION must be one of {sorted(LOCAL_DATA_PATHS)}, "
            f"got {PROPERTY_SELECTION!r}"
        )
    df = pd.read_csv(LOCAL_DATA_PATHS[PROPERTY_SELECTION])
else:
    # Colab: ask the user to upload a file and read the first one.
    # Errors from the upload or the CSV parser now surface directly instead
    # of being masked by a fallback to a local path.
    uploaded = files.upload()
    if not uploaded:
        raise RuntimeError("No file was uploaded; re-run this cell and select a CSV file.")
    filename = next(iter(uploaded))
    df = pd.read_csv(filename)

print(f"Loaded {len(df)} samples with {len(df.columns)} columns")
display(df.head())

In [None]:
# Separate the target variable from the feature columns.
# The target is selected by name, so the features are obtained by dropping
# that same column by name rather than by position: this keeps 'target' out
# of the feature matrix even if it is not the last column of the file.
target = df.loc[:, 'target']
df_drop_column = df.drop(columns=['target'])

In [None]:
# Sanitize feature names: every '[', ']' or '<' becomes an underscore.
# LightGBM and some other ML libraries reject these characters in column names.
import re

regex = re.compile(r"\[|\]|<", re.IGNORECASE)

# Names without any of the three characters are passed through untouched
# (including non-string labels, which are only inspected via str()).
df_drop_column.columns = [
    regex.sub("_", col) if regex.search(str(col)) else col
    for col in df_drop_column.columns.values
]

In [None]:
# Feature matrix: the feature columns after name cleaning
# (same object as df_drop_column, not a copy)
features = df_drop_column

In [None]:
# Randomise the row order of features and target together (fixed seed)
features, target = shuffle(features, target, random_state=42)

# Hold out 20% of the rows as the test set; the rest is used for training
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# ================================================================================
# LIGHTGBM HYPERPARAMETER TUNING FOR FEATURE SELECTION
# ================================================================================

def objective_tune_lgbm(trial):
    """
    Optuna objective: score one LightGBM hyperparameter configuration.

    A configuration is drawn from ``trial``, an ``LGBMRegressor`` is built
    from it, and the model is scored on ``X_train`` / ``y_train`` with
    shuffled 5-fold cross-validation.

    The ``MIN_*`` / ``MAX_*`` search bounds are read from module scope. They
    are template placeholders and are not defined in the visible part of this
    notebook, so they must be defined before this function is called.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.

    Returns:
        Mean R² score across the five folds.
    """

    # Settings that are the same for every trial
    # NOTE(review): 'r2' is not among the metric names I know LightGBM to
    # document -- confirm it is accepted; scoring below uses sklearn's 'r2'.
    param = {
        'objective': 'regression',
        'metric': 'r2',
        'verbosity': -1,
    }

    # ====================================================================
    # HYPERPARAMETER SEARCH SPACE
    # Insert your search ranges via the MIN_* / MAX_* constants
    # ====================================================================

    # Boosting algorithm
    param['boosting_type'] = trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss'])

    # Number of boosting iterations
    param['n_estimators'] = trial.suggest_int('n_estimators', MIN_ESTIMATORS, MAX_ESTIMATORS)

    # Learning rate, sampled on a log scale
    param['learning_rate'] = trial.suggest_float('learning_rate', MIN_LR, MAX_LR, log=True)

    # Tree shape: leaves per tree and depth limit
    param['num_leaves'] = trial.suggest_int('num_leaves', MIN_LEAVES, MAX_LEAVES)
    param['max_depth'] = trial.suggest_int('max_depth', MIN_DEPTH, MAX_DEPTH)

    # Minimum number of samples in a leaf
    param['min_child_samples'] = trial.suggest_int('min_child_samples', MIN_SAMPLES, MAX_SAMPLES)

    # Fraction of features used per tree
    param['colsample_bytree'] = trial.suggest_float('colsample_bytree', MIN_COLSAMPLE, MAX_COLSAMPLE)

    # L1 / L2 regularization, both sampled on a log scale
    param['reg_alpha'] = trial.suggest_float('reg_alpha', MIN_ALPHA, MAX_ALPHA, log=True)
    param['reg_lambda'] = trial.suggest_float('reg_lambda', MIN_LAMBDA, MAX_LAMBDA, log=True)

    # Row subsampling and bagging frequency are only suggested for
    # gbdt / dart (not compatible with goss)
    uses_goss = param['boosting_type'] == 'goss'
    if not uses_goss:
        param['subsample'] = trial.suggest_float('subsample', MIN_SUBSAMPLE, MAX_SUBSAMPLE)
        param['bagging_freq'] = trial.suggest_int('bagging_freq', MIN_FREQ, MAX_FREQ)

    # Score the configuration with shuffled 5-fold cross-validation
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    estimator = lgb.LGBMRegressor(random_state=42, **param)
    fold_scores = cross_val_score(estimator, X_train, y_train, cv=folds, scoring='r2')

    # Mean R² across all folds is the value Optuna optimizes
    return np.mean(fold_scores)

In [None]:
# ====== RUN OPTIMIZATION OR USE PRE-OPTIMIZED PARAMETERS ======
if RUN_OPTIMIZATION:
    print("Running hyperparameter optimization...")

    # Seed the sampler so repeated runs suggest the same trial sequence,
    # matching the fixed random_state=42 used everywhere else in the notebook.
    # TPESampler is also Optuna's default single-objective sampler, so the
    # search algorithm itself is unchanged.
    study_lgbm = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )

    # Forward only the limits that are set (None means "no limit")
    config = OPTUNA_CONFIG['optimization']
    optimize_kwargs = {k: v for k, v in config.items() if v is not None}

    study_lgbm.optimize(objective_tune_lgbm, **optimize_kwargs)

    # Display results
    print('\nBest trial results:')
    trial = study_lgbm.best_trial
    print(f'R² score: {trial.value:.4f}\n')

    # Print the best parameters as a dict literal that can be pasted into the
    # pre-optimized branch below; repr() quotes strings and leaves numbers bare.
    print('Best parameters:')
    print('best_lgbm_params = {')
    for key, value in trial.params.items():
        print(f"    '{key}': {value!r},")
    print('}\n')

    best_lgbm_params = study_lgbm.best_params

# ====== PRE-OPTIMIZED PARAMETERS ======
else:
    # Insert your optimized parameters below
    print("Using pre-optimized hyperparameters...")

    best_lgbm_params = {
        # ====== REPLACE WITH YOUR OPTIMIZED PARAMETERS ======
        'boosting_type': 'gbdt',           # gbdt, dart, or goss
        'n_estimators': 100,                # Number of iterations
        'learning_rate': 0.01,              # Step size
        'num_leaves': 31,                   # Max leaves per tree
        'max_depth': 10,                    # Tree depth
        'min_child_samples': 10,            # Min samples per leaf
        'colsample_bytree': 0.8,            # Feature fraction
        'reg_alpha': 0.1,                   # L1 regularization
        'reg_lambda': 0.1,                  # L2 regularization
        # 'subsample': 0.8,                 # Row subsampling (gbdt/dart only)
        # 'bagging_freq': 5,                # Bagging frequency (gbdt/dart only)
        # ====================================================
    }

    print("Parameters:")
    for key, value in best_lgbm_params.items():
        print(f"  {key}: {value}")
    print()

In [None]:
# ================================================================================
# FEATURE SELECTION USING OPTUNA
# ================================================================================
# Uses LightGBM with optimized hyperparameters to select the best feature subset.
# Features are selected individually (True/False) to maximize R² with a penalty
# for deviating from the target number of features (10 by default).
# ================================================================================


def objective_feature_selection(trial, target_n_features=10, penalty_per_feature=0.025):
    """
    Optuna objective for feature-subset selection.

    For every column of ``X_train`` the trial suggests whether to include it
    (True/False). A LightGBM regressor built from ``best_lgbm_params`` is then
    scored on the selected columns with shuffled 5-fold cross-validation, and
    a penalty proportional to the distance from the target feature count is
    subtracted.

    Args:
        trial: Optuna trial used to suggest the include/exclude flags.
        target_n_features: Desired number of selected features (default 10).
        penalty_per_feature: Score deducted per feature above or below the
            target (default 0.025). Reduce it (e.g. 0.01 or 0.005) if the
            optimization struggles, for instance when R² is negative.

    Returns:
        Mean cross-validated R² minus the penalty, or ``-np.inf`` when no
        feature was selected.
    """

    # One True/False decision per feature, in column order
    selected_features = [
        feature_name
        for feature_name in X_train.columns
        if trial.suggest_categorical(feature_name, [True, False])
    ]

    # Return worst score if no features selected
    if not selected_features:
        return -np.inf

    # Train model on selected features only
    X_selected = X_train[selected_features]

    # Use LightGBM with pre-optimized hyperparameters
    model = lgb.LGBMRegressor(random_state=42, **best_lgbm_params)

    # Evaluate using 5-fold cross-validation
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_selected, y_train, cv=cv, scoring='r2')
    # Named mean_r2 (not r2_score) to avoid shadowing sklearn.metrics.r2_score
    mean_r2 = np.mean(scores)

    # Penalize each feature away from the target count
    penalty = penalty_per_feature * abs(len(selected_features) - target_n_features)

    return mean_r2 - penalty

In [None]:
# ====== RUN FEATURE SELECTION ======
print("Running feature selection optimization...")

# Create Optuna study to maximize (R² - penalty).
# The sampler is seeded so the sequence of suggested feature subsets is the
# same on every run, consistent with the fixed random_state=42 used elsewhere.
# When the run is bounded by a timeout, the number of completed trials can
# still differ between machines.
study_fs = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Forward only the limits that are set (None means "no limit")
config = OPTUNA_CONFIG['feature_selection']
optimize_kwargs = {k: v for k, v in config.items() if v is not None}

# Run feature selection optimization
study_fs.optimize(objective_feature_selection, **optimize_kwargs)

In [None]:
# ====== DISPLAY FEATURE SELECTION RESULTS ======
best_trial = study_fs.best_trial

# Each trial parameter is named after a feature; keep those set to True
selected_features = [
    param_name
    for param_name, is_selected in best_trial.params.items()
    if is_selected
]

# Summary banner
print()
print("=" * 50)
print("FEATURE SELECTION COMPLETED")
print(f"Objective value (R² - penalty): {best_trial.value:.4f}")
print(f"Number of selected features: {len(selected_features)}")
print()
print("Selected features:")
# One feature per line, in alphabetical order
for feature in sorted(selected_features):
    print(f"  - {feature}")
print("=" * 50)

In [None]:
# ====== SAVE RESULTS ======
# Create new dataset with selected features only
new_dataset = df_drop_column[selected_features].copy()
print("\nCreated new dataset 'new_dataset' with shape:", new_dataset.shape)

# Combine selected features with target column (pd.concat aligns rows on the index)
df_to_download = pd.concat([new_dataset, target], axis=1)

# Save to CSV
output_filename = f'Oliynyk_{property_name}_optunaFS.csv'
df_to_download.to_csv(output_filename, index=False)

# Trigger a browser download only on Colab. The import is repeated here so
# that a local run does not fail with a NameError on `files` after the CSV
# has already been written.
try:
    from google.colab import files
except ImportError:
    print(f"Saved results to '{output_filename}'")
else:
    files.download(output_filename)