<a href="https://www.kaggle.com/code/shiwayz/obesity-classification-98-11-orig-91-47-comp?scriptVersionId=162397511" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
'''

https://www.kaggle.com/code/oscarm524/ps-s3-ep23-eda-modeling-submission/notebook

TOP MODELS
-- XGBoost
-- LGBM
-- CatBoost
-- Hist GradBoost

'''

In [None]:
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import logging
from imblearn.over_sampling import SMOTE
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from scipy.stats import randint, uniform


from sklearn.ensemble import RandomForestClassifier, VotingClassifier,  HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from lightgbm import LGBMClassifier


In [None]:
# Seed reused by several of the model configurations below.
random_state = 42

# Original obesity dataset plus the playground-series train/test files.
original_df = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')
comp_df = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv')

# <span style="color:red"> Optuna Ensemble Original Data</span>


## <span style="color:orange"> Without CV, XGB+LGBM [97.87%]</span>


In [None]:
# Reload the original dataset and work on a copy so the raw frame stays intact.
original_df = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')
df = original_df.copy()

# Column groups are captured before any transformation: object columns are
# ordinal-encoded, every other column is standardised.
non_categorical_features = df.select_dtypes(exclude=['object'])
categorical_features = df.select_dtypes(include=['object']).columns

# Fold the 'Always' level of CALC into 'Frequently'.
df['CALC'] = df['CALC'].mask(df['CALC'] == 'Always', 'Frequently')

ordinal_encoder = OrdinalEncoder()
scaler = StandardScaler()
df[categorical_features] = ordinal_encoder.fit_transform(df[categorical_features])
df[non_categorical_features.columns] = scaler.fit_transform(df[non_categorical_features.columns])

# Split features from the (already encoded) target column.
X = df.drop(columns=['NObeyesdad'])
y = df['NObeyesdad']

# 80/20 hold-out split used by the objective defined right after this setup.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One accuracy value is appended here per Optuna trial.
accuracies = []

def objective(trial):
    """Score one Optuna trial of the LightGBM + XGBoost averaging ensemble.

    Both boosters are trained for 100 rounds on the module-level
    ``X_train``/``y_train`` using hyperparameters drawn from ``trial``.
    Their per-class outputs are averaged, the arg-max class is taken as the
    prediction, and accuracy on the module-level ``X_test``/``y_test`` is
    returned. The score is also appended to the module-level ``accuracies``
    list so it can be plotted after the search.
    """
    # LightGBM: fixed settings first, then the tuned ones (draw order preserved).
    lgbm_fixed = {
        'objective': 'multiclass',
        'num_class': 7,
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': random_state,
    }
    lgbm_tuned = {
        'lambda_l1': trial.suggest_float('lgb_lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lgb_lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('lgb_num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('lgb_feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('lgb_bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('lgb_bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('lgb_min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.01, 0.3),
    }
    lgbm_config = {**lgbm_fixed, **lgbm_tuned}

    # XGBoost: same layout.
    xgb_fixed = {
        'objective': 'multi:softprob',
        'num_class': 7,
        'verbosity': 0,
        'random_state': random_state,
    }
    xgb_tuned = {
        'lambda': trial.suggest_float('xgb_lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('xgb_alpha', 1e-8, 10.0, log=True),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
        'eta': trial.suggest_float('xgb_eta', 0.01, 0.3),
        'subsample': trial.suggest_float('xgb_subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.4, 1.0),
    }
    xgb_config = {**xgb_fixed, **xgb_tuned}

    # Fit both boosters on the training split.
    lgbm_booster = lgb.train(lgbm_config, lgb.Dataset(X_train, y_train), num_boost_round=100)
    xgb_booster = xgb.train(xgb_config, xgb.DMatrix(X_train, label=y_train), num_boost_round=100)

    # Per-class scores from each model on the hold-out split.
    lgbm_scores = lgbm_booster.predict(X_test, num_iteration=lgbm_booster.best_iteration)
    xgb_scores = xgb_booster.predict(xgb.DMatrix(X_test))

    # Average the two score matrices and pick the highest-scoring class per row.
    blended = np.mean([lgbm_scores, xgb_scores], axis=0)
    predicted = blended.argmax(axis=1)

    score = accuracy_score(y_test, predicted)
    accuracies.append(score)
    return score

# Only warnings and above are logged by Optuna during the search.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Maximise the accuracy returned by `objective` over 100 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print("Best trial:", study.best_trial.params)

# Accuracy recorded by each trial, in the order the trials ran.
plt.figure(figsize=(12, 6))
plt.plot(accuracies, label='Accuracy')
plt.xlabel('Trial')
plt.ylabel('Accuracy')
plt.title('Accuracy over Trials')
plt.legend()
plt.show()

In [None]:
# Highest accuracy recorded in `accuracies` across all trials.
max(accuracies)

In [None]:
'''
best_trial_params = {
    'lgb_lambda_l1': 0.00022450967926345529,
    'lgb_lambda_l2': 3.0346284577653767e-07,
    'lgb_num_leaves': 147,
    'lgb_feature_fraction': 0.7138759815745731,
    'lgb_bagging_fraction': 0.9295853923373891,
    'lgb_bagging_freq': 2,
    'lgb_min_child_samples': 48,
    'lgb_learning_rate': 0.18936631482820396,
    'xgb_lambda': 5.363552546585817e-06,
    'xgb_alpha': 0.002670337201689084,
    'xgb_max_depth': 8,
    'xgb_eta': 0.09938218925297368,
    'xgb_subsample': 0.6931943353971597,
    'xgb_colsample_bytree': 0.7136240974017022
}

# Converted parameters
lgb_params = {
    'objective': 'multiclass',
    'num_class': 7,
    'metric': 'multi_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'random_state' : 42,
    'lambda_l1': best_trial_params['lgb_lambda_l1'],
    'lambda_l2': best_trial_params['lgb_lambda_l2'],
    'num_leaves': best_trial_params['lgb_num_leaves'],
    'feature_fraction': best_trial_params['lgb_feature_fraction'],
    'bagging_fraction': best_trial_params['lgb_bagging_fraction'],
    'bagging_freq': best_trial_params['lgb_bagging_freq'],
    'min_child_samples': best_trial_params['lgb_min_child_samples'],
    'learning_rate': best_trial_params['lgb_learning_rate']
}

xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 7,
    'verbosity': 0,
    'random_state' : 42,
    'lambda': best_trial_params['xgb_lambda'],
    'alpha': best_trial_params['xgb_alpha'],
    'max_depth': best_trial_params['xgb_max_depth'],
    'eta': best_trial_params['xgb_eta'],
    'subsample': best_trial_params['xgb_subsample'],
    'colsample_bytree': best_trial_params['xgb_colsample_bytree']
}
'''

## <span style="color:orange"> With CV, XGB+LGBM [97.92%]</span>


In [None]:
# Reload the original dataset and work on a copy so the raw frame stays intact.
original_df = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')

df = original_df.copy()

# Column groups are captured before any transformation: object columns are
# ordinal-encoded, every other column is standardised.
categorical_features = df.select_dtypes(include=['object']).columns
non_categorical_features = df.select_dtypes(exclude=['object'])

# Fold the 'Always' level of CALC into 'Frequently'.
df['CALC'] = np.where(df['CALC'] == 'Always', 'Frequently', df['CALC'])

ordinal_encoder = OrdinalEncoder()
df[categorical_features] = ordinal_encoder.fit_transform(df[categorical_features])

scaler = StandardScaler()
df[non_categorical_features.columns] = scaler.fit_transform(df[non_categorical_features.columns])

y = df['NObeyesdad']
X = df.drop(columns=['NObeyesdad'])

# Hold-out split kept from the previous section; the cross-validated
# objective in this cell re-splits X/y itself.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

# The objective and the plot in this cell both use `mean_accuracies`; it was
# never initialised here (only `accuracies` was), which raised NameError when
# the notebook was run top to bottom.
mean_accuracies = []

def objective(trial):
    """Return the mean 5-fold CV accuracy of a LightGBM + XGBoost ensemble.

    Hyperparameters for both boosters are drawn from ``trial`` once per
    trial. For every stratified fold of the module-level ``X``/``y`` both
    boosters are trained for 100 rounds, their per-class outputs are
    averaged, and the arg-max class is scored with accuracy.

    Side effect: the trial's mean accuracy is appended to the module-level
    ``mean_accuracies`` list, which must exist before the study runs.
    """
    # Drawn once per trial: Optuna returns the same value for a repeated
    # parameter name within a trial, so rebuilding these dicts inside the
    # fold loop was redundant work.
    lgb_params = {
        'objective': 'multiclass',
        'num_class': 7,
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': random_state,
        'lambda_l1': trial.suggest_float('lgb_lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lgb_lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('lgb_num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('lgb_feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('lgb_bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('lgb_bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('lgb_min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.01, 0.3),
    }

    xgb_params = {
        'objective': 'multi:softprob',
        'num_class': 7,
        'verbosity': 0,
        'random_state': random_state,
        'lambda': trial.suggest_float('xgb_lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('xgb_alpha', 1e-8, 10.0, log=True),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
        'eta': trial.suggest_float('xgb_eta', 0.01, 0.3),
        'subsample': trial.suggest_float('xgb_subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.4, 1.0),
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    accuracies = []

    for train_index, test_index in kf.split(X, y):
        # The splitter yields positional indices, so select by position on
        # both X and y instead of relying on y's index labels being 0..n-1.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Training LightGBM
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)

        # Training XGBoost
        xgb_train = xgb.DMatrix(X_train, label=y_train)
        xgb_model = xgb.train(xgb_params, xgb_train, num_boost_round=100)

        lgb_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
        xgb_pred = xgb_model.predict(xgb.DMatrix(X_test))

        # Average the two per-class score matrices, then take the top class.
        ensemble_pred = np.mean([lgb_pred, xgb_pred], axis=0)
        final_pred = np.argmax(ensemble_pred, axis=1)

        accuracies.append(accuracy_score(y_test, final_pred))

    mean_accuracy = np.mean(accuracies)
    mean_accuracies.append(mean_accuracy)
    return mean_accuracy
            
# Only warnings and above are logged by Optuna during the search.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Maximise the mean CV accuracy returned by `objective` over 100 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best = study.best_trial
print("Best trial:", best.params)
print("\n Best score:", best.value, "\n")

# Mean CV accuracy recorded by each trial, in the order the trials ran.
plt.figure(figsize=(12, 6))
plt.plot(mean_accuracies, label='Accuracy')
plt.xlabel('Trial')
plt.ylabel('Accuracy')
plt.title('Accuracy over Trials')
plt.legend()
plt.show()

In [None]:
'''
best_trial_params = {
    'lgb_lambda_l1': 1.7468312934870682e-08,
    'lgb_lambda_l2': 4.514597403500374e-07,
    'lgb_num_leaves': 108,
    'lgb_feature_fraction': 0.8355879678246828,
    'lgb_bagging_fraction': 0.9429435908403985,
    'lgb_bagging_freq': 3,
    'lgb_min_child_samples': 53,
    'lgb_learning_rate': 0.13936934533091533,
    'xgb_lambda': 1.549322630830155e-07,
    'xgb_alpha': 0.002155193159445727,
    'xgb_max_depth': 7,
    'xgb_eta': 0.2604963851766582,
    'xgb_subsample': 0.8935684621215217,
    'xgb_colsample_bytree': 0.868808428718869
}

# Converted parameters
lgb_params = {
    'objective': 'multiclass',
    'num_class': 7,
    'metric': 'multi_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'random_state' : 42,
    'lambda_l1': best_trial_params['lgb_lambda_l1'],
    'lambda_l2': best_trial_params['lgb_lambda_l2'],
    'num_leaves': best_trial_params['lgb_num_leaves'],
    'feature_fraction': best_trial_params['lgb_feature_fraction'],
    'bagging_fraction': best_trial_params['lgb_bagging_fraction'],
    'bagging_freq': best_trial_params['lgb_bagging_freq'],
    'min_child_samples': best_trial_params['lgb_min_child_samples'],
    'learning_rate': best_trial_params['lgb_learning_rate']
}

xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 7,
    'verbosity': 0,
    'random_state' : 42,
    'lambda': best_trial_params['xgb_lambda'],
    'alpha': best_trial_params['xgb_alpha'],
    'max_depth': best_trial_params['xgb_max_depth'],
    'eta': best_trial_params['xgb_eta'],
    'subsample': best_trial_params['xgb_subsample'],
    'colsample_bytree': best_trial_params['xgb_colsample_bytree']
}

'''

## <span style="color:orange"> With CV, XGB+LGBM Best Parameters [97.92%]</span>


In [None]:
# Reload the original dataset and work on a copy so the raw frame stays intact.
original_df = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')
df = original_df.copy()

# Column groups are captured before any transformation: object columns are
# ordinal-encoded, every other column is standardised.
non_categorical_features = df.select_dtypes(exclude=['object'])
categorical_features = df.select_dtypes(include=['object']).columns

# Fold the 'Always' level of CALC into 'Frequently'.
df['CALC'] = df['CALC'].mask(df['CALC'] == 'Always', 'Frequently')

ordinal_encoder = OrdinalEncoder()
scaler = StandardScaler()
df[categorical_features] = ordinal_encoder.fit_transform(df[categorical_features])
df[non_categorical_features.columns] = scaler.fit_transform(df[non_categorical_features.columns])

# Split features from the (already encoded) target column.
X = df.drop(columns=['NObeyesdad'])
y = df['NObeyesdad']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)


# Tuned values keyed by '<model prefix>_<library parameter name>'.
best_params = {
    'lgb_lambda_l1': 1.7468312934870682e-08,
    'lgb_lambda_l2': 4.514597403500374e-07,
    'lgb_num_leaves': 108,
    'lgb_feature_fraction': 0.8355879678246828,
    'lgb_bagging_fraction': 0.9429435908403985,
    'lgb_bagging_freq': 3,
    'lgb_min_child_samples': 53,
    'lgb_learning_rate': 0.13936934533091533,
    'xgb_lambda': 1.549322630830155e-07,
    'xgb_alpha': 0.002155193159445727,
    'xgb_max_depth': 7,
    'xgb_eta': 0.2604963851766582,
    'xgb_subsample': 0.8935684621215217,
    'xgb_colsample_bytree': 0.868808428718869
}

# LightGBM parameters: fixed settings, then every 'lgb_*' entry of
# best_params with the prefix stripped (insertion order is preserved).
lgb_params = {
    'objective': 'multiclass',
    'num_class': 7,
    'metric': 'multi_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'random_state': 42,
}
lgb_params.update(
    (name[len('lgb_'):], value)
    for name, value in best_params.items()
    if name.startswith('lgb_')
)

# XGBoost parameters: same construction with the 'xgb_' prefix.
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 7,
    'verbosity': 0,
    'random_state': 42,
}
xgb_params.update(
    (name[len('xgb_'):], value)
    for name, value in best_params.items()
    if name.startswith('xgb_')
)

# Stratified 5-fold evaluation of the LightGBM + XGBoost ensemble using the
# fixed `lgb_params` / `xgb_params` defined earlier in this cell.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

test_accuracies = []

for train_index, test_index in kf.split(X, y):
    # The splitter yields positional indices, so select by position on both
    # X and y instead of relying on y's index labels being 0..n-1.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training LightGBM
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)

    # Training XGBoost
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    xgb_model = xgb.train(xgb_params, xgb_train, num_boost_round=100)

    lgb_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
    xgb_pred = xgb_model.predict(xgb.DMatrix(X_test))

    # Average the two per-class score matrices, then take the top class.
    ensemble_pred = np.mean([lgb_pred, xgb_pred], axis=0)
    final_pred = np.argmax(ensemble_pred, axis=1)

    test_accuracy = accuracy_score(y_test, final_pred)
    test_accuracies.append(test_accuracy)

print("Average test set accuracy:", np.mean(test_accuracies))

## <span style="color:orange"> With CV, XGB+LGBM+CB+HGB [98.11%]</span>


In [None]:
# Reload the original dataset and work on a copy so the raw frame stays intact.
original_df = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')
df = original_df.copy()

# Column groups are captured before any transformation: object columns are
# ordinal-encoded, every other column is standardised.
non_categorical_features = df.select_dtypes(exclude=['object'])
categorical_features = df.select_dtypes(include=['object']).columns

# Fold the 'Always' level of CALC into 'Frequently'.
df['CALC'] = df['CALC'].mask(df['CALC'] == 'Always', 'Frequently')

ordinal_encoder = OrdinalEncoder()
scaler = StandardScaler()
df[categorical_features] = ordinal_encoder.fit_transform(df[categorical_features])
df[non_categorical_features.columns] = scaler.fit_transform(df[non_categorical_features.columns])

# Split features from the (already encoded) target column.
X = df.drop(columns=['NObeyesdad'])
y = df['NObeyesdad']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

# One mean CV accuracy is appended here per Optuna trial.
mean_accuracies = []

def objective(trial):
    """Return the mean 5-fold CV accuracy of a four-model averaging ensemble.

    Hyperparameters for LightGBM, XGBoost, CatBoost and
    HistGradientBoosting are drawn from ``trial`` once per trial. For every
    stratified fold of the module-level ``X``/``y`` all four models are
    fitted, their per-class outputs are averaged, and the arg-max class is
    scored with accuracy.

    Side effect: the trial's mean accuracy is appended to the module-level
    ``mean_accuracies`` list.
    """
    # Drawn once per trial: Optuna returns the same value for a repeated
    # parameter name within a trial, so rebuilding these dicts inside the
    # fold loop was redundant work.

    # LightGBM parameters
    lgb_params = {
        'objective': 'multiclass',
        'num_class': 7,
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'lambda_l1': trial.suggest_float('lgb_lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lgb_lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('lgb_num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('lgb_feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('lgb_bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('lgb_bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('lgb_min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.01, 0.3),
    }

    # XGBoost parameters
    xgb_params = {
        'objective': 'multi:softprob',
        'num_class': 7,
        'verbosity': 0,
        'random_state': 42,
        'lambda': trial.suggest_float('xgb_lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('xgb_alpha', 1e-8, 10.0, log=True),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
        'eta': trial.suggest_float('xgb_eta', 0.01, 0.3),
        'subsample': trial.suggest_float('xgb_subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.4, 1.0),
    }

    # CatBoost parameters ('iterations' keeps its unprefixed study name so
    # previously recorded best-parameter dicts remain usable).
    cb_params = {
        'objective': 'MultiClass',
        'verbose': 0,
        'random_seed': 42,
        'iterations': trial.suggest_int('iterations', 50, 500),
        'l2_leaf_reg': trial.suggest_float('cb_l2_leaf_reg', 1e-8, 10.0, log=True),
        'depth': trial.suggest_int('cb_depth', 3, 10),
        'learning_rate': trial.suggest_float('cb_learning_rate', 0.01, 0.3),
        'colsample_bylevel': trial.suggest_float('cb_colsample_bylevel', 0.4, 1.0),
    }

    # HistGradientBoosting parameters
    hgb_params = {
        'max_iter': 100,
        'random_state': random_state,
        'learning_rate': trial.suggest_float('hgb_learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('hgb_max_depth', 3, 10),
        'min_samples_leaf': trial.suggest_int('hgb_min_samples_leaf', 5, 100),
        'max_bins': trial.suggest_int('hgb_max_bins', 50, 255),
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    accuracies = []

    for train_index, test_index in kf.split(X, y):
        # The splitter yields positional indices, so select by position on
        # both X and y instead of relying on y's index labels being 0..n-1.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)

        xgb_train = xgb.DMatrix(X_train, label=y_train)
        xgb_model = xgb.train(xgb_params, xgb_train, num_boost_round=100)

        cb_model = cb.CatBoostClassifier(**cb_params)
        cb_model.fit(X_train, y_train)

        hgb_model = HistGradientBoostingClassifier(**hgb_params)
        hgb_model.fit(X_train, y_train)

        # Predictions: one per-class score matrix per model.
        lgb_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
        xgb_pred = xgb_model.predict(xgb.DMatrix(X_test))
        cb_pred = cb_model.predict_proba(X_test)
        hgb_pred = hgb_model.predict_proba(X_test)

        # Average the four matrices, then take the top class per row.
        ensemble_pred = np.mean([lgb_pred, xgb_pred, cb_pred, hgb_pred], axis=0)
        final_pred = np.argmax(ensemble_pred, axis=1)

        accuracies.append(accuracy_score(y_test, final_pred))

    mean_accuracy = np.mean(accuracies)
    mean_accuracies.append(mean_accuracy)
    return mean_accuracy

# Only warnings and above are logged by Optuna during the search.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Maximise the mean CV accuracy returned by `objective` over 100 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best = study.best_trial
print("Best trial:", best.params)
print("\n Best score:", best.value, "\n")

# Mean CV accuracy recorded by each trial, in the order the trials ran.
plt.figure(figsize=(12, 6))
plt.plot(mean_accuracies, label='Accuracy')
plt.xlabel('Trial')
plt.ylabel('Accuracy')
plt.title('Accuracy over Trials')
plt.legend()
plt.show()

In [None]:
'''
best_trial_params = {
    'lgb_lambda_l1': 6.428285788433516e-05,
    'lgb_lambda_l2': 5.023813910951442e-06,
    'lgb_num_leaves': 95,
    'lgb_feature_fraction': 0.9090165394440615,
    'lgb_bagging_fraction': 0.941177308780672,
    'lgb_bagging_freq': 1,
    'lgb_min_child_samples': 49,
    'lgb_learning_rate': 0.17493106395036273,
    'xgb_lambda': 0.0002340403415754626,
    'xgb_alpha': 0.03382506356447083,
    'xgb_max_depth': 10,
    'xgb_eta': 0.056646106213682795,
    'xgb_subsample': 0.6136751692955198,
    'xgb_colsample_bytree': 0.5094177734897046,
    'iterations': 313,
    'cb_l2_leaf_reg': 2.1202520834506392e-07,
    'cb_depth': 8,
    'cb_learning_rate': 0.06816277412483977,
    'cb_colsample_bylevel': 0.919245284773473,
    'hgb_learning_rate': 0.2156436115943959,
    'hgb_max_depth': 4,
    'hgb_min_samples_leaf': 88,
    'hgb_max_bins': 65
}
'''

In [None]:
# Reload the original dataset and work on a copy so the raw frame stays intact.
original_df = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')
df = original_df.copy()

# Column groups are captured before any transformation: object columns are
# ordinal-encoded, every other column is standardised.
non_categorical_features = df.select_dtypes(exclude=['object'])
categorical_features = df.select_dtypes(include=['object']).columns

# Fold the 'Always' level of CALC into 'Frequently'.
df['CALC'] = df['CALC'].mask(df['CALC'] == 'Always', 'Frequently')

ordinal_encoder = OrdinalEncoder()
scaler = StandardScaler()
df[categorical_features] = ordinal_encoder.fit_transform(df[categorical_features])
df[non_categorical_features.columns] = scaler.fit_transform(df[non_categorical_features.columns])

# Split features from the (already encoded) target column.
X = df.drop(columns=['NObeyesdad'])
y = df['NObeyesdad']

# Tuned values for the four-model ensemble, keyed by
# '<model prefix>_<library parameter name>' (CatBoost's 'iterations' is the
# one entry stored without a prefix).
best_trial_params = {
    'lgb_lambda_l1': 6.428285788433516e-05,
    'lgb_lambda_l2': 5.023813910951442e-06,
    'lgb_num_leaves': 95,
    'lgb_feature_fraction': 0.9090165394440615,
    'lgb_bagging_fraction': 0.941177308780672,
    'lgb_bagging_freq': 1,
    'lgb_min_child_samples': 49,
    'lgb_learning_rate': 0.17493106395036273,
    'xgb_lambda': 0.0002340403415754626,
    'xgb_alpha': 0.03382506356447083,
    'xgb_max_depth': 10,
    'xgb_eta': 0.056646106213682795,
    'xgb_subsample': 0.6136751692955198,
    'xgb_colsample_bytree': 0.5094177734897046,
    'iterations': 313,
    'cb_l2_leaf_reg': 2.1202520834506392e-07,
    'cb_depth': 8,
    'cb_learning_rate': 0.06816277412483977,
    'cb_colsample_bylevel': 0.919245284773473,
    'hgb_learning_rate': 0.2156436115943959,
    'hgb_max_depth': 4,
    'hgb_min_samples_leaf': 88,
    'hgb_max_bins': 65
}


# LightGBM parameters: fixed settings, then every 'lgb_*' entry with the
# prefix stripped (insertion order is preserved).
lgb_params = {
    'objective': 'multiclass',
    'num_class': 7,
    'metric': 'multi_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'random_state': random_state,
}
lgb_params.update(
    (name[len('lgb_'):], value)
    for name, value in best_trial_params.items()
    if name.startswith('lgb_')
)

# XGBoost parameters: same construction with the 'xgb_' prefix.
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 7,
    'verbosity': 0,
    'random_state': 42,
}
xgb_params.update(
    (name[len('xgb_'):], value)
    for name, value in best_trial_params.items()
    if name.startswith('xgb_')
)

# CatBoost parameters
cb_params = dict(
    objective='MultiClass',
    verbose=0,
    random_seed=random_state,
    iterations=best_trial_params['iterations'],
    learning_rate=best_trial_params['cb_learning_rate'],
    depth=best_trial_params['cb_depth'],
    l2_leaf_reg=best_trial_params['cb_l2_leaf_reg'],
    colsample_bylevel=best_trial_params['cb_colsample_bylevel'],
)

# HistGradientBoosting parameters
hgb_params = dict(
    max_iter=100,
    random_state=random_state,
    learning_rate=best_trial_params['hgb_learning_rate'],
    max_depth=best_trial_params['hgb_max_depth'],
    min_samples_leaf=best_trial_params['hgb_min_samples_leaf'],
    max_bins=best_trial_params['hgb_max_bins'],
)

# Stratified 5-fold evaluation of the four-model averaging ensemble using the
# fixed parameter dicts defined earlier in this cell.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

test_accuracies = []

for train_index, test_index in kf.split(X, y):
    # The splitter yields positional indices, so select by position on both
    # X and y instead of relying on y's index labels being 0..n-1.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training LightGBM
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)

    # Training XGBoost
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    xgb_model = xgb.train(xgb_params, xgb_train, num_boost_round=100)

    # Training CatBoost
    cb_model = cb.CatBoostClassifier(**cb_params)
    cb_model.fit(X_train, y_train)

    # Training HistGradBoost
    hgb_model = HistGradientBoostingClassifier(**hgb_params)
    hgb_model.fit(X_train, y_train)

    # One per-class score matrix per model.
    lgb_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
    xgb_pred = xgb_model.predict(xgb.DMatrix(X_test))
    cb_pred = cb_model.predict_proba(X_test)
    hgb_pred = hgb_model.predict_proba(X_test)

    # Average the four matrices, then take the top class per row.
    ensemble_pred = np.mean([lgb_pred, xgb_pred, cb_pred, hgb_pred], axis=0)
    final_pred = np.argmax(ensemble_pred, axis=1)

    test_accuracy = accuracy_score(y_test, final_pred)
    test_accuracies.append(test_accuracy)

print("Average test set accuracy:", np.mean(test_accuracies))


# <span style="color:red"> Optuna Ensemble Original+Comp Data</span>

## <span style="color:orange"> Optuna Search XGB + LGBM + CB + HGB [91.77%]</span>


In [None]:
# Original dataset plus the competition training file (its 'id' column dropped).
df = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')
comp_df = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv').drop(columns=['id'])

# Stack both sources, remove exact duplicate rows and renumber the index.
combined_df = (
    pd.concat([df, comp_df], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Object-typed feature columns; the target is excluded because it is
# label-encoded separately at the end of this cell.
categorical_features = [
    col
    for col in combined_df.select_dtypes(include=['object']).columns
    if col != 'NObeyesdad'
]
non_categorical_features = combined_df.select_dtypes(exclude=['object'])

# Fold the 'Always' level of CALC into 'Frequently'.
combined_df['CALC'] = combined_df['CALC'].mask(combined_df['CALC'] == 'Always', 'Frequently')

ordinal_encoder = OrdinalEncoder()
scaler = StandardScaler()
combined_df[categorical_features] = ordinal_encoder.fit_transform(combined_df[categorical_features])
combined_df[non_categorical_features.columns] = scaler.fit_transform(combined_df[non_categorical_features.columns])

X = combined_df.drop(columns=['NObeyesdad'])

# Encode the string class labels as integers; `y` ends up as a NumPy array.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(combined_df['NObeyesdad'])

# One mean CV accuracy is appended here per Optuna trial.
mean_accuracies = []

def objective(trial):
    """Score one Optuna trial: mean 5-fold accuracy of the averaged ensemble.

    One set of hyperparameters is sampled from ``trial`` for LightGBM,
    XGBoost, CatBoost and HistGradientBoosting. All four models are then
    trained on every fold of a stratified 5-fold split of the module-level
    ``X`` / ``y``; their class-probability outputs are averaged and the
    argmax is scored against the held-out fold.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The mean accuracy over the five folds. As a side effect the same
        value is appended to the module-level ``mean_accuracies`` list.
    """
    # The parameter dicts do not depend on the fold, so they are built once
    # per trial instead of once per fold. Optuna hands back the already
    # sampled value when a name is suggested again inside the same trial, so
    # the sampled values are the same as before.

    # LightGBM parameters
    lgb_params = {
        'objective': 'multiclass',
        'num_class': 7,
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'lambda_l1': trial.suggest_float('lgb_lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lgb_lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('lgb_num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('lgb_feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('lgb_bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('lgb_bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('lgb_min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.01, 0.3),
    }

    # XGBoost parameters
    xgb_params = {
        'objective': 'multi:softprob',
        'num_class': 7,
        'verbosity': 0,
        'random_state': 42,
        'lambda': trial.suggest_float('xgb_lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('xgb_alpha', 1e-8, 10.0, log=True),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
        'eta': trial.suggest_float('xgb_eta', 0.01, 0.3),
        'subsample': trial.suggest_float('xgb_subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.4, 1.0),
    }

    # CatBoost parameters
    cb_params = {
        'objective': 'MultiClass',
        'verbose': 0,
        'random_seed': 42,
        'iterations': trial.suggest_int('iterations', 50, 500),
        'l2_leaf_reg': trial.suggest_float('cb_l2_leaf_reg', 1e-8, 10.0, log=True),
        'depth': trial.suggest_int('cb_depth', 3, 10),
        'learning_rate': trial.suggest_float('cb_learning_rate', 0.01, 0.3),
        'colsample_bylevel': trial.suggest_float('cb_colsample_bylevel', 0.4, 1.0),
    }

    # HistGradientBoosting parameters
    hgb_params = {
        'max_iter': 100,
        'random_state': random_state,
        'learning_rate': trial.suggest_float('hgb_learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('hgb_max_depth', 3, 10),
        'min_samples_leaf': trial.suggest_int('hgb_min_samples_leaf', 5, 100),
        'max_bins': trial.suggest_int('hgb_max_bins', 50, 255),
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the four base learners on the training part of the fold.
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)

        xgb_train = xgb.DMatrix(X_train, label=y_train)
        xgb_model = xgb.train(xgb_params, xgb_train, num_boost_round=100)

        cb_model = cb.CatBoostClassifier(**cb_params)
        cb_model.fit(X_train, y_train)

        hgb_model = HistGradientBoostingClassifier(**hgb_params)
        hgb_model.fit(X_train, y_train)

        # Per-class probabilities from each learner on the held-out part.
        lgb_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
        xgb_pred = xgb_model.predict(xgb.DMatrix(X_test))
        cb_pred = cb_model.predict_proba(X_test)
        hgb_pred = hgb_model.predict_proba(X_test)

        # Soft vote: unweighted mean of the four probability matrices.
        ensemble_pred = np.mean([lgb_pred, xgb_pred, cb_pred, hgb_pred], axis=0)
        final_pred = np.argmax(ensemble_pred, axis=1)

        accuracies.append(accuracy_score(y_test, final_pred))

    mean_accuracy = np.mean(accuracies)
    # Recorded so the accuracy-over-trials curve can be plotted afterwards.
    mean_accuracies.append(mean_accuracy)
    return mean_accuracy

# Silence per-trial log lines; only warnings and above are shown.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Maximise the mean CV accuracy returned by `objective` over 100 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print("Best trial:", study.best_params)
print("\n Best score:", study.best_value, "\n")

# Accuracy recorded by `objective`, in the order the trials finished.
plt.figure(figsize=(12, 6))
plt.plot(mean_accuracies, label='Accuracy')
plt.gca().set(title='Accuracy over Trials', xlabel='Trial', ylabel='Accuracy')
plt.legend()
plt.show()

## <span style="color:orange"> Testing Original Params on Original+Comp, [91.47%]  </span>


In [None]:
# --- Rebuild the combined (original + competition) training frame ---
df = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')
comp_df = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv').drop(columns=['id'])

# Concatenate, remove exact duplicate rows, reset to a clean 0..n-1 index.
combined_df = (
    pd.concat([df, comp_df], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Column groups: object-typed feature names (target excluded) and the
# non-object part of the frame (later cells read its `.columns`).
categorical_features = [
    name
    for name in combined_df.select_dtypes(include=['object']).columns
    if name != 'NObeyesdad'
]
non_categorical_features = combined_df.select_dtypes(exclude=['object'])

# Fold the 'Always' level of CALC into 'Frequently'.
combined_df.loc[combined_df['CALC'] == 'Always', 'CALC'] = 'Frequently'

# Fit the encoders on the full combined frame; the test cell reuses them.
ordinal_encoder = OrdinalEncoder().fit(combined_df[categorical_features])
combined_df[categorical_features] = ordinal_encoder.transform(combined_df[categorical_features])

scaler = StandardScaler().fit(combined_df[non_categorical_features.columns])
combined_df[non_categorical_features.columns] = scaler.transform(combined_df[non_categorical_features.columns])

# Integer-encoded target and feature matrix.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(combined_df['NObeyesdad'])
X = combined_df.drop(columns=['NObeyesdad'])


# Flat Optuna parameter set; keys carry a per-model prefix
# ('lgb_', 'xgb_', 'cb_', 'hgb_'), except CatBoost's 'iterations'.
best_trial_params = dict(
    lgb_lambda_l1=6.428285788433516e-05,
    lgb_lambda_l2=5.023813910951442e-06,
    lgb_num_leaves=95,
    lgb_feature_fraction=0.9090165394440615,
    lgb_bagging_fraction=0.941177308780672,
    lgb_bagging_freq=1,
    lgb_min_child_samples=49,
    lgb_learning_rate=0.17493106395036273,
    xgb_lambda=0.0002340403415754626,
    xgb_alpha=0.03382506356447083,
    xgb_max_depth=10,
    xgb_eta=0.056646106213682795,
    xgb_subsample=0.6136751692955198,
    xgb_colsample_bytree=0.5094177734897046,
    iterations=313,
    cb_l2_leaf_reg=2.1202520834506392e-07,
    cb_depth=8,
    cb_learning_rate=0.06816277412483977,
    cb_colsample_bylevel=0.919245284773473,
    hgb_learning_rate=0.2156436115943959,
    hgb_max_depth=4,
    hgb_min_samples_leaf=88,
    hgb_max_bins=65,
)

# LightGBM: fixed settings first, then the tuned values (prefix stripped).
lgb_params = {
    'objective': 'multiclass',
    'num_class': 7,
    'metric': 'multi_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'random_state': random_state,
}
lgb_params.update(
    (key, best_trial_params['lgb_' + key])
    for key in (
        'lambda_l1', 'lambda_l2', 'num_leaves', 'feature_fraction',
        'bagging_fraction', 'bagging_freq', 'min_child_samples', 'learning_rate',
    )
)

# XGBoost
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 7,
    'verbosity': 0,
    'random_state': 42,
}
xgb_params.update(
    (key, best_trial_params['xgb_' + key])
    for key in ('lambda', 'alpha', 'max_depth', 'eta', 'subsample', 'colsample_bytree')
)

# CatBoost ('iterations' is stored without the 'cb_' prefix)
cb_params = {
    'objective': 'MultiClass',
    'verbose': 0,
    'random_seed': random_state,
    'iterations': best_trial_params['iterations'],
}
cb_params.update(
    (key, best_trial_params['cb_' + key])
    for key in ('learning_rate', 'depth', 'l2_leaf_reg', 'colsample_bylevel')
)

# HistGradientBoosting
hgb_params = {
    'max_iter': 100,
    'random_state': random_state,
}
hgb_params.update(
    (key, best_trial_params['hgb_' + key])
    for key in ('learning_rate', 'max_depth', 'min_samples_leaf', 'max_bins')
)

# Stratified 5-fold evaluation of the averaged four-model ensemble.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

m_test_accuracies = []
s_test_accuracies = []
h_test_accuracies = []


for train_index, test_index in kf.split(X, y):
    X_train, y_train = X.iloc[train_index], y[train_index]
    X_test, y_test = X.iloc[test_index], y[test_index]

    # LightGBM (native API)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)

    # XGBoost (native API)
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    xgb_model = xgb.train(xgb_params, xgb_train, num_boost_round=100)

    # CatBoost and HistGradientBoosting (estimator API)
    cb_model = cb.CatBoostClassifier(**cb_params)
    cb_model.fit(X_train, y_train)
    hgb_model = HistGradientBoostingClassifier(**hgb_params)
    hgb_model.fit(X_train, y_train)

    # Class-probability matrices on the held-out fold.
    lgb_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
    xgb_pred = xgb_model.predict(xgb.DMatrix(X_test))
    cb_pred = cb_model.predict_proba(X_test)
    hgb_pred = hgb_model.predict_proba(X_test)

    # Unweighted mean of the four matrices, then the most probable class.
    ensemble_pred = np.stack([lgb_pred, xgb_pred, cb_pred, hgb_pred]).mean(axis=0)
    final_pred = ensemble_pred.argmax(axis=1)
    test_accuracy = accuracy_score(y_test, final_pred)
    m_test_accuracies.append(test_accuracy)

print(f"Mean ensemble accuracy: {np.mean(m_test_accuracies)}")

In [None]:
# Competition test split; ids are set aside for the submission file.
test_df = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')
test_ids = test_df.pop('id')

# Same preprocessing as the training frame, reusing the already-fitted
# encoder and scaler (transform only, no refit).
test_df.loc[test_df['CALC'] == 'Always', 'CALC'] = 'Frequently'
test_df[categorical_features] = ordinal_encoder.transform(test_df[categorical_features])
test_df[non_categorical_features.columns] = scaler.transform(test_df[non_categorical_features.columns])

X_test = test_df
X_test_dmatrix = xgb.DMatrix(test_df)

# The model objects are whatever `lgb_model` / `xgb_model` / `cb_model` /
# `hgb_model` currently hold -- when cells run in order, the ones fitted in
# the last iteration of the preceding CV loop.
lgb_pred_test = lgb_model.predict(test_df, num_iteration=lgb_model.best_iteration)
xgb_pred_test = xgb_model.predict(X_test_dmatrix)
cb_pred_test = cb_model.predict_proba(X_test)
hgb_pred_test = hgb_model.predict_proba(X_test)

# Average the probability matrices, pick the top class, map back to labels.
ensemble_pred_test = np.stack(
    [lgb_pred_test, xgb_pred_test, cb_pred_test, hgb_pred_test]
).mean(axis=0)
final_pred_test = ensemble_pred_test.argmax(axis=1)
y_pred_submission_labels = label_encoder.inverse_transform(final_pred_test)

submission_df = pd.DataFrame({'id': test_ids, 'NObeyesdad': y_pred_submission_labels})
submission_df.to_csv('ensemble_submission_pre01.csv', index=False)

print("Ensemble submission file created successfully.")

## <span style="color:orange"> Best Params on Original+Comp, [91.77%]  </span>


In [None]:
# best_trial_params = {
#     'lgb_lambda_l1': 0.04809795705900344,
#     'lgb_lambda_l2': 8.32132677405249e-05,
#     'lgb_num_leaves': 124,
#     'lgb_feature_fraction': 0.40306668077710556,
#     'lgb_bagging_fraction': 0.5415059760725607,
#     'lgb_bagging_freq': 5,
#     'lgb_min_child_samples': 27,
#     'lgb_learning_rate': 0.0648115973409096,
#     'xgb_lambda': 0.022079309936168762,
#     'xgb_alpha': 1.1861352308254374e-07,
#     'xgb_max_depth': 10,
#     'xgb_eta': 0.24898070594702448,
#     'xgb_subsample': 0.5587544137677529,
#     'xgb_colsample_bytree': 0.42772532158291765,
#     'iterations': 337,
#     'cb_l2_leaf_reg': 0.0022099509714579076,
#     'cb_depth': 5,
#     'cb_learning_rate': 0.08836676515397127,
#     'cb_colsample_bylevel': 0.6368269699495822,
#     'hgb_learning_rate': 0.16627306296115918,
#     'hgb_max_depth': 3,
#     'hgb_min_samples_leaf': 8,
#     'hgb_max_bins': 147
# }

In [None]:
# --- Combined (original + competition) training frame for this section ---
df = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')
comp_df = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv').drop(columns=['id'])

# Row-wise union of both sources without exact duplicates, index reset.
combined_df = (
    pd.concat([df, comp_df], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Object-typed feature names (target left out) and the non-object sub-frame;
# the latter stays a DataFrame because its `.columns` is read further down.
categorical_features = [
    name
    for name in combined_df.select_dtypes(include=['object']).columns
    if name != 'NObeyesdad'
]
non_categorical_features = combined_df.select_dtypes(exclude=['object'])

# Fold the 'Always' level of CALC into 'Frequently'.
combined_df.loc[combined_df['CALC'] == 'Always', 'CALC'] = 'Frequently'

# Ordinal-encode the categorical features, standardise the rest.
ordinal_encoder = OrdinalEncoder().fit(combined_df[categorical_features])
combined_df[categorical_features] = ordinal_encoder.transform(combined_df[categorical_features])

scaler = StandardScaler().fit(combined_df[non_categorical_features.columns])
combined_df[non_categorical_features.columns] = scaler.transform(combined_df[non_categorical_features.columns])

# Integer-encoded target and feature matrix.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(combined_df['NObeyesdad'])
X = combined_df.drop(columns=['NObeyesdad'])



# Flat Optuna parameter set; keys carry a per-model prefix
# ('lgb_', 'xgb_', 'cb_', 'hgb_'), except CatBoost's 'iterations'.
best_trial_params = dict(
    lgb_lambda_l1=0.04809795705900344,
    lgb_lambda_l2=8.32132677405249e-05,
    lgb_num_leaves=124,
    lgb_feature_fraction=0.40306668077710556,
    lgb_bagging_fraction=0.5415059760725607,
    lgb_bagging_freq=5,
    lgb_min_child_samples=27,
    lgb_learning_rate=0.0648115973409096,
    xgb_lambda=0.022079309936168762,
    xgb_alpha=1.1861352308254374e-07,
    xgb_max_depth=10,
    xgb_eta=0.24898070594702448,
    xgb_subsample=0.5587544137677529,
    xgb_colsample_bytree=0.42772532158291765,
    iterations=337,
    cb_l2_leaf_reg=0.0022099509714579076,
    cb_depth=5,
    cb_learning_rate=0.08836676515397127,
    cb_colsample_bylevel=0.6368269699495822,
    hgb_learning_rate=0.16627306296115918,
    hgb_max_depth=3,
    hgb_min_samples_leaf=8,
    hgb_max_bins=147,
)

# LightGBM: fixed settings followed by the tuned values (prefix stripped).
lgb_params = {
    'objective': 'multiclass',
    'num_class': 7,
    'metric': 'multi_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'random_state': 42,
    **{
        key: best_trial_params[f'lgb_{key}']
        for key in (
            'lambda_l1', 'lambda_l2', 'num_leaves', 'feature_fraction',
            'bagging_fraction', 'bagging_freq', 'min_child_samples', 'learning_rate',
        )
    },
}

# XGBoost
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 7,
    'verbosity': 0,
    'random_state': 42,
    **{
        key: best_trial_params[f'xgb_{key}']
        for key in ('lambda', 'alpha', 'max_depth', 'eta', 'subsample', 'colsample_bytree')
    },
}

# CatBoost ('iterations' is stored without the 'cb_' prefix)
cb_params = {
    'objective': 'MultiClass',
    'verbose': 0,
    'random_seed': 42,
    'iterations': best_trial_params['iterations'],
    **{
        key: best_trial_params[f'cb_{key}']
        for key in ('learning_rate', 'depth', 'l2_leaf_reg', 'colsample_bylevel')
    },
}

# HistGradientBoosting
hgb_params = {
    'max_iter': 100,
    'random_state': 42,
    **{
        key: best_trial_params[f'hgb_{key}']
        for key in ('learning_rate', 'max_depth', 'min_samples_leaf', 'max_bins')
    },
}

# Stratified 5-fold evaluation of the averaged four-model ensemble.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

test_accuracies = []

for train_index, test_index in kf.split(X, y):
    X_train, y_train = X.iloc[train_index], y[train_index]
    X_test, y_test = X.iloc[test_index], y[test_index]

    # LightGBM (native API)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)

    # XGBoost (native API)
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    xgb_model = xgb.train(xgb_params, xgb_train, num_boost_round=100)

    # CatBoost and HistGradientBoosting (estimator API)
    cb_model = cb.CatBoostClassifier(**cb_params)
    cb_model.fit(X_train, y_train)
    hgb_model = HistGradientBoostingClassifier(**hgb_params)
    hgb_model.fit(X_train, y_train)

    # Class-probability matrices on the held-out fold.
    lgb_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
    xgb_pred = xgb_model.predict(xgb.DMatrix(X_test))
    cb_pred = cb_model.predict_proba(X_test)
    hgb_pred = hgb_model.predict_proba(X_test)

    # Unweighted mean of the four matrices, then the most probable class.
    ensemble_pred = np.stack([lgb_pred, xgb_pred, cb_pred, hgb_pred]).mean(axis=0)
    final_pred = ensemble_pred.argmax(axis=1)

    test_accuracy = accuracy_score(y_test, final_pred)
    test_accuracies.append(test_accuracy)

print("Average test set accuracy:", np.mean(test_accuracies))


In [None]:
# Competition test split; ids are set aside for the submission file.
test_df = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')
test_ids = test_df.pop('id')

# Same preprocessing as the training frame, reusing the already-fitted
# encoder and scaler (transform only, no refit).
test_df.loc[test_df['CALC'] == 'Always', 'CALC'] = 'Frequently'
test_df[categorical_features] = ordinal_encoder.transform(test_df[categorical_features])
test_df[non_categorical_features.columns] = scaler.transform(test_df[non_categorical_features.columns])

X_test = test_df
X_test_dmatrix = xgb.DMatrix(test_df)

# The model objects are whatever `lgb_model` / `xgb_model` / `cb_model` /
# `hgb_model` currently hold -- when cells run in order, the ones fitted in
# the last iteration of the preceding CV loop.
lgb_pred_test = lgb_model.predict(test_df, num_iteration=lgb_model.best_iteration)
xgb_pred_test = xgb_model.predict(X_test_dmatrix)
cb_pred_test = cb_model.predict_proba(X_test)
hgb_pred_test = hgb_model.predict_proba(X_test)

# Average the probability matrices, pick the top class, map back to labels.
ensemble_pred_test = np.stack(
    [lgb_pred_test, xgb_pred_test, cb_pred_test, hgb_pred_test]
).mean(axis=0)
final_pred_test = ensemble_pred_test.argmax(axis=1)
y_pred_submission_labels = label_encoder.inverse_transform(final_pred_test)

submission_df = pd.DataFrame({'id': test_ids, 'NObeyesdad': y_pred_submission_labels})
submission_df.to_csv('ensemble_submission_pred02.csv', index=False)

print("Ensemble submission file created successfully.")

## <span style="color:orange"> Optuna Ensemble Original+Comp, 100% Train </span>


In [None]:
# --- Combined (original + competition) frame used for the full-data fit ---
df = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')
comp_df = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv').drop(columns=['id'])

# Concatenate both sources, drop exact duplicate rows, reset the index.
combined_df = (
    pd.concat([df, comp_df], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Object-typed feature names (target excluded) and the non-object sub-frame,
# whose `.columns` is what the scaler and the test preprocessing use.
categorical_features = [
    name
    for name in combined_df.select_dtypes(include=['object']).columns
    if name != 'NObeyesdad'
]
non_categorical_features = combined_df.select_dtypes(exclude=['object'])

# Fold the 'Always' level of CALC into 'Frequently'.
combined_df.loc[combined_df['CALC'] == 'Always', 'CALC'] = 'Frequently'

# Ordinal-encode the categorical features, standardise the rest.
ordinal_encoder = OrdinalEncoder().fit(combined_df[categorical_features])
combined_df[categorical_features] = ordinal_encoder.transform(combined_df[categorical_features])

scaler = StandardScaler().fit(combined_df[non_categorical_features.columns])
combined_df[non_categorical_features.columns] = scaler.transform(combined_df[non_categorical_features.columns])

# Integer-encoded target and feature matrix.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(combined_df['NObeyesdad'])
X = combined_df.drop(columns=['NObeyesdad'])

# Fixed LightGBM settings for the full-data fit (7-class softmax objective).
lgb_params = dict(
    objective='multiclass',
    num_class=7,
    metric='multi_logloss',
    verbosity=-1,
    boosting_type='gbdt',
    random_state=42,
    lambda_l1=0.0013018662948233166,
    lambda_l2=4.739507857074739e-06,
    num_leaves=212,
    feature_fraction=0.7326733564705078,
    bagging_fraction=0.8171584187213554,
    bagging_freq=6,
    min_child_samples=24,
    learning_rate=0.028803812378459442,
)

# Fixed XGBoost settings. Built from pairs because 'lambda' is a Python
# keyword and cannot be passed as a keyword argument.
xgb_params = dict([
    ('objective', 'multi:softprob'),
    ('num_class', 7),
    ('verbosity', 0),
    ('random_state', 42),
    ('lambda', 0.33966275620551073),
    ('alpha', 0.345850236628533),
    ('max_depth', 5),
    ('eta', 0.1752332016094693),
    ('subsample', 0.8692386920145995),
    ('colsample_bytree', 0.48796459030657685),
])

# Fit both boosters on the complete training data (no hold-out split).
lgb_train = lgb.Dataset(X, y)
xgb_train = xgb.DMatrix(X, label=y)
lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)
xgb_model = xgb.train(xgb_params, xgb_train, num_boost_round=100)

# Competition test split; ids are set aside for the submission file.
test_df = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')
test_ids = test_df.pop('id')

# Same preprocessing as the training frame, reusing the fitted encoder/scaler.
test_df.loc[test_df['CALC'] == 'Always', 'CALC'] = 'Frequently'
test_df[categorical_features] = ordinal_encoder.transform(test_df[categorical_features])
test_df[non_categorical_features.columns] = scaler.transform(test_df[non_categorical_features.columns])

X_test = test_df
X_test_dmatrix = xgb.DMatrix(test_df)

# Class-probability matrices from each booster.
lgb_pred_test = lgb_model.predict(test_df, num_iteration=lgb_model.best_iteration)
xgb_pred_test = xgb_model.predict(X_test_dmatrix)

# Two-model soft vote: mean probabilities -> top class -> original label.
ensemble_pred_test = np.stack([lgb_pred_test, xgb_pred_test]).mean(axis=0)
final_pred_test = ensemble_pred_test.argmax(axis=1)
y_pred_submission_labels = label_encoder.inverse_transform(final_pred_test)

submission_df = pd.DataFrame({'id': test_ids, 'NObeyesdad': y_pred_submission_labels})
submission_df.to_csv('ensemble_submission_100%.csv', index=False)

print("Ensemble submission file created successfully.")