<a href="https://colab.research.google.com/github/NaniiiGock/ISIC-2024---Skin-Cancer-Detection-with-3D-TBP/blob/main/Xgboost%20and%20LGBM%20model%20fine%20tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
# Attach Google Drive so files stored under it are reachable from the Colab VM.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Both metadata files live in the same Drive folder; name it once.
DATA_DIR = '/content/drive/MyDrive/Machine Learning Project'

# low_memory=False makes pandas read each file in one pass rather than in chunks.
train_data = pd.read_csv(f'{DATA_DIR}/train-metadata.csv', low_memory=False)
test_data = pd.read_csv(f'{DATA_DIR}/test-metadata.csv', low_memory=False)

In [None]:
# Columns that exist only in the training file are dropped, except the label.
useless_cols = set(train_data.columns).difference(test_data.columns)
useless_cols.remove('target')  # raises KeyError if 'target' is not a train-only column
useless_cols = list(useless_cols)
train_data = train_data.drop(useless_cols, axis='columns')

In [None]:
def _metadata_fill_values(reference):
    """Return per-column fill values, using the mean ``age_approx`` of ``reference``."""
    return {'age_approx': reference['age_approx'].mean(), 'anatom_site_general': 'NA', 'sex': 'NA'}


# Separate the label from the features.
y_train = train_data['target']
X_train = train_data.drop(columns='target')

# Fill the training features first, then fill the test frame using the mean
# age of the (already filled) training features.
X_train = X_train.fillna(_metadata_fill_values(X_train))
test_data = test_data.fillna(_metadata_fill_values(X_train))

In [None]:
# Take an independent copy rather than a second name for the same object:
# later cells add columns to all_data in place, and X_train must keep its
# original column set while it is still used as the reference frame
# (for example when fitting imputers on it).
all_data = X_train.copy()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Step 1: Handle Missing Values in Test Data
# Column groups come from the test frame's dtypes; the imputers learn their
# fill values from X_train and are then applied to test_data.
numerical_cols = test_data.select_dtypes(include=["float64", "int64"]).columns.tolist()
categorical_cols = test_data.select_dtypes(include=["object", "category"]).columns.tolist()

num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

imputation_transformer = ColumnTransformer(
    transformers=[
        ('num', num_imputer, numerical_cols),
        ('cat', cat_imputer, categorical_cols)
    ],
    remainder='passthrough',  # Keep other columns as they are
    # Emit plain column names (no "num__"/"cat__" prefixes) so that
    # get_feature_names_out() can label the transformed frame directly.
    verbose_feature_names_out=False
)
imputation_transformer.fit(X_train)

# Apply the transformation.
# The output is labelled with the transformer's own feature names instead of
# a hand-built `numerical_cols + categorical_cols` list: any passthrough
# column would otherwise make the column count disagree and raise.
# The original row index is kept as well.
test_data = pd.DataFrame(
    imputation_transformer.transform(test_data),
    columns=imputation_transformer.get_feature_names_out(),
    index=test_data.index
)

# Step 2: Restore Original Data Types
# The frame is rebuilt from the transformer's array output, so the column
# dtypes are set again explicitly below.
# Ensure numerical columns are numeric (unparseable values become NaN)
for col in numerical_cols:
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

# Ensure categorical columns are plain object columns
for col in categorical_cols:
    test_data[col] = test_data[col].astype('object')

In [None]:
def add_engineered_features(df):
    """Add the hand-crafted lesion features to ``df`` in place and return it.

    The function only reads the raw metadata columns (``tbp_lv_*``,
    ``clin_size_long_diam_mm``, ``age_approx``, ``anatom_site_general``) plus
    two features it has just created, so it can be applied to any frame that
    carries those columns.

    Columns are appended in a fixed order. Note that the ``"color_range "``
    column name really does end with a space.

    Args:
        df: DataFrame holding the raw metadata columns listed above.

    Returns:
        The same ``df`` object, with 40 extra columns.
    """
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["lesion_shape_index"] = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2)
    df["hue_contrast"] = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"] = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    df["3d_position_distance"] = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2)
    df["perimeter_to_area_ratio"] = df["tbp_lv_perimeterMM"] / df["tbp_lv_areaMM2"]
    df["area_to_perimeter_ratio"] = df["tbp_lv_areaMM2"] / df["tbp_lv_perimeterMM"]
    df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    # String concatenation of two categorical columns, not arithmetic.
    df["combined_anatomical_site"] = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["consistency_symmetry_border"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"] / (df["tbp_lv_symm_2axis"] + df["tbp_lv_norm_border"])
    df["color_consistency"] = df["tbp_lv_stdL"] / df["tbp_lv_Lext"]
    df["consistency_color"] = df["tbp_lv_stdL"] * df["tbp_lv_Lext"] / (df["tbp_lv_stdL"] + df["tbp_lv_Lext"])
    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    # Built from two features created above, so it must come after them.
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"] = np.arctan2(df["tbp_lv_y"], df["tbp_lv_x"])
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4
    df["color_variance_ratio"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_stdLExt"]
    df["border_color_interaction"] = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
    df["border_color_interaction_2"] = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"] / (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"])
    df["size_color_contrast_ratio"] = df["clin_size_long_diam_mm"] / df["tbp_lv_deltaLBnorm"]
    df["age_normalized_nevi_confidence_2"] = np.sqrt(df["clin_size_long_diam_mm"]**2 + df["age_approx"]**2)
    df["color_asymmetry_index"] = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
    df["volume_approximation_3d"] = df["tbp_lv_areaMM2"] * np.sqrt((df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2))
    # NOTE: the trailing space in this column name is kept on purpose so the
    # name stays identical wherever this feature is produced or consumed.
    df["color_range "] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs() + (df["tbp_lv_A"] - df["tbp_lv_Aext"]).abs() + (df["tbp_lv_B"] - df["tbp_lv_Bext"]).abs()
    df["shape_color_consistency"] = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
    df["border_length_ratio"] = df["tbp_lv_perimeterMM"] / (2 * np.pi * np.sqrt(df["tbp_lv_areaMM2"] / np.pi))
    df["age_size_symmetry_index"] = df["age_approx"] * df["clin_size_long_diam_mm"] * df["tbp_lv_symm_2axis"]
    df["index_age_size_symmetry"] = df["age_approx"] * df["tbp_lv_areaMM2"] * df["tbp_lv_symm_2axis"]
    return df


all_data = add_engineered_features(all_data)

In [None]:
def _append_test_lesion_features(frame):
    """Add the engineered lesion columns to ``frame`` in place.

    The raw measurement columns are read once into local names and the new
    columns are then written in a fixed order. Nothing is returned; the
    caller's frame is modified directly.
    """
    # Raw measurements used by the formulas below.
    minor_axis = frame["tbp_lv_minorAxisMM"]
    diameter = frame["clin_size_long_diam_mm"]
    area = frame["tbp_lv_areaMM2"]
    perimeter = frame["tbp_lv_perimeterMM"]
    area_perim = frame["tbp_lv_area_perim_ratio"]
    eccentricity = frame["tbp_lv_eccentricity"]
    symmetry = frame["tbp_lv_symm_2axis"]
    border = frame["tbp_lv_norm_border"]
    norm_color = frame["tbp_lv_norm_color"]
    color_std = frame["tbp_lv_color_std_mean"]
    radial_color_std = frame["tbp_lv_radial_color_std_max"]
    hue, hue_ext = frame["tbp_lv_H"], frame["tbp_lv_Hext"]
    lum, lum_ext = frame["tbp_lv_L"], frame["tbp_lv_Lext"]
    a_in, a_ext = frame["tbp_lv_A"], frame["tbp_lv_Aext"]
    b_in, b_ext = frame["tbp_lv_B"], frame["tbp_lv_Bext"]
    std_l, std_l_ext = frame["tbp_lv_stdL"], frame["tbp_lv_stdLExt"]
    d_a, d_b, d_l = frame["tbp_lv_deltaA"], frame["tbp_lv_deltaB"], frame["tbp_lv_deltaL"]
    d_lb_norm = frame["tbp_lv_deltaLBnorm"]
    x, y, z = frame["tbp_lv_x"], frame["tbp_lv_y"], frame["tbp_lv_z"]
    age = frame["age_approx"]
    site, location = frame["anatom_site_general"], frame["tbp_lv_location"]

    # Sub-expressions that appear in more than one feature.
    delta_sq_sum = d_a ** 2 + d_b ** 2 + d_l ** 2
    delta_sum = d_a + d_b + d_l
    position_norm = np.sqrt(x ** 2 + y ** 2 + z ** 2)
    lum_gap = (lum - lum_ext).abs()

    frame["lesion_size_ratio"] = minor_axis / diameter
    frame["lesion_shape_index"] = area / (perimeter ** 2)
    frame["hue_contrast"] = (hue - hue_ext).abs()
    frame["luminance_contrast"] = lum_gap
    frame["lesion_color_difference"] = np.sqrt(delta_sq_sum)
    frame["border_complexity"] = border + symmetry
    frame["3d_position_distance"] = position_norm
    frame["perimeter_to_area_ratio"] = perimeter / area
    frame["area_to_perimeter_ratio"] = area / perimeter
    frame["lesion_visibility_score"] = d_lb_norm + norm_color
    frame["combined_anatomical_site"] = site + "_" + location
    frame["symmetry_border_consistency"] = symmetry * border
    frame["consistency_symmetry_border"] = symmetry * border / (symmetry + border)
    frame["color_consistency"] = std_l / lum_ext
    frame["consistency_color"] = std_l * lum_ext / (std_l + lum_ext)
    frame["size_age_interaction"] = diameter * age
    frame["hue_color_std_interaction"] = hue * color_std
    frame["lesion_severity_index"] = (border + norm_color + eccentricity) / 3
    # Uses two of the columns created a few lines above.
    frame["shape_complexity_index"] = frame["border_complexity"] + frame["lesion_shape_index"]
    frame["color_contrast_index"] = delta_sum + d_lb_norm
    frame["log_lesion_area"] = np.log(area + 1)
    frame["mean_hue_difference"] = (hue + hue_ext) / 2
    frame["std_dev_contrast"] = np.sqrt(delta_sq_sum / 3)
    frame["color_shape_composite_index"] = (color_std + area_perim + symmetry) / 3
    frame["3d_lesion_orientation"] = np.arctan2(y, x)
    frame["overall_color_difference"] = delta_sum / 3
    frame["symmetry_perimeter_interaction"] = symmetry * perimeter
    frame["comprehensive_lesion_index"] = (area_perim + eccentricity + norm_color + symmetry) / 4
    frame["color_variance_ratio"] = color_std / std_l_ext
    frame["border_color_interaction"] = border * norm_color
    frame["border_color_interaction_2"] = border * norm_color / (border + norm_color)
    frame["size_color_contrast_ratio"] = diameter / d_lb_norm
    frame["age_normalized_nevi_confidence_2"] = np.sqrt(diameter ** 2 + age ** 2)
    frame["color_asymmetry_index"] = radial_color_std * symmetry
    frame["volume_approximation_3d"] = area * position_norm
    # The column name below ends with a space; it is reproduced as-is.
    frame["color_range "] = lum_gap + (a_in - a_ext).abs() + (b_in - b_ext).abs()
    frame["shape_color_consistency"] = eccentricity * color_std
    frame["border_length_ratio"] = perimeter / (2 * np.pi * np.sqrt(area / np.pi))
    frame["age_size_symmetry_index"] = age * diameter * symmetry
    frame["index_age_size_symmetry"] = age * area * symmetry


_append_test_lesion_features(test_data)


In [None]:
# The same three columns are removed from both frames.
metadata_cols = ['copyright_license', 'attribution', 'image_type']
all_data = all_data.drop(metadata_cols, axis='columns')
test_data = test_data.drop(metadata_cols, axis='columns')
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Split the feature columns into numeric and categorical groups and give both
# frames matching dtypes. The two identifier columns are left untouched and
# appear in neither list.
col_num = []
col_cat = []
for col in list(all_data.columns):
    if col in ('isic_id', 'patient_id'):
        continue
    col_dtype = all_data[col].dtype
    # Columns that are already `category` are treated as categorical too, so
    # re-running this cell does not try to cast them to float.
    if col_dtype == 'object' or isinstance(col_dtype, pd.CategoricalDtype):
        col_cat.append(col)
        all_data[col] = all_data[col].astype('category')
        test_data[col] = test_data[col].astype('category')
    else:
        col_num.append(col)
        all_data[col] = all_data[col].astype('float')
        test_data[col] = test_data[col].astype('float')

# Snapshot of the fully prepared training features.
X_train = all_data.copy()

In [None]:
from sklearn.pipeline import Pipeline
# Numeric columns are standardised.
num_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising.
cat_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its pipeline.
preprocessor = ColumnTransformer([
    ('num', num_transformer, col_num),
    ('cat', cat_transformer, col_cat),
])

In [None]:
import pandas.api.types
from sklearn.metrics import roc_curve, auc, roc_auc_score

class ParticipantVisibleError(Exception):
    """Raised by `score` when the submission values are not numeric."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80) -> float:
    '''
    2024 ISIC Challenge metric: pAUC

    Computes the partial area under the ROC curve (pAUC) above a minimum
    true positive rate, ``min_tpr`` (0.80 by default), for a solution frame
    and a submission frame.
    https://en.wikipedia.org/wiki/Partial_Area_Under_the_ROC_Curve.

    (c) 2024 Nicholas R Kurtansky, MSKCC

    Args:
        solution: ground truth pd.DataFrame of 1s and 0s
        submission: pd.DataFrame of prediction scores ranging [0, 1]
        row_id_column_name: accepted for signature compatibility; not used
            by the computation
        min_tpr: true positive rate above which the area is measured

    Returns:
        Float value range [0, max_fpr], where max_fpr = |1 - min_tpr|

    Raises:
        ParticipantVisibleError: if the submission values are not numeric
        ValueError: if min_tpr yields a max_fpr outside (0, 1]
    '''
    # Reject non-numeric submissions up front.
    if not pandas.api.types.is_numeric_dtype(submission.values):
        raise ParticipantVisibleError('Submission target column must be numeric')

    # sklearn only offers a max_fpr cut-off, so the problem is mirrored:
    # labels are swapped (0 <-> 1) and the scores are negated.
    flipped_truth = abs(np.asarray(solution.values) - 1)
    negated_scores = -1.0 * np.asarray(submission.values)
    max_fpr = abs(1 - min_tpr)

    # Full ROC curve of the mirrored problem.
    fpr, tpr, _ = roc_curve(flipped_truth, negated_scores, sample_weight=None)
    if max_fpr is None or max_fpr == 1:
        return auc(fpr, tpr)
    if max_fpr <= 0 or max_fpr > 1:
        raise ValueError("Expected min_tpr in range [0, 1), got: %r" % min_tpr)

    # Truncate the curve at max_fpr, adding one linearly interpolated point
    # exactly at the cut-off.
    cut = np.searchsorted(fpr, max_fpr, "right")
    tpr_at_cut = np.interp(
        max_fpr,
        [fpr[cut - 1], fpr[cut]],
        [tpr[cut - 1], tpr[cut]],
    )
    tpr = np.append(tpr[:cut], tpr_at_cut)
    fpr = np.append(fpr[:cut], max_fpr)

    # An equivalent result can be obtained from
    # roc_auc_score(v_gt, 1 - pred, max_fpr=max_fpr), rescaled from
    # [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]:
    #   0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (scaled - 0.5)
    return auc(fpr, tpr)

In [None]:
from sklearn.metrics import make_scorer
import inspect


def custom_auc_scorer(y_true, y_pred_proba):
    """Adapt `score` (pAUC above TPR 0.80) to the ``(y_true, y_score)`` scorer signature.

    Args:
        y_true: ground-truth labels (array-like or Series).
        y_pred_proba: predicted scores, one per label.

    Returns:
        The value returned by `score` for these labels and scores.
    """
    # Build the label column from raw values: constructing a frame from a
    # Series with `columns=["target"]` selects by name, which yields no usable
    # labels when the Series is named anything else.
    solution = pd.DataFrame({"target": np.asarray(y_true).ravel()})
    submission = pd.DataFrame(y_pred_proba, columns=["pred"])
    return score(solution, submission, row_id_column_name="target", min_tpr=0.80)


# `needs_proba` was deprecated in scikit-learn 1.4 and later removed in favour
# of `response_method`. Pick whichever keyword this installation accepts.
# (Older releases swallow unknown keywords via **kwargs, so the signature is
# inspected instead of relying on a TypeError.)
if "response_method" in inspect.signature(make_scorer).parameters:
    custom_scorer = make_scorer(custom_auc_scorer, response_method="predict_proba", greater_is_better=True)
else:
    custom_scorer = make_scorer(custom_auc_scorer, needs_proba=True, greater_is_better=True)



In [None]:
pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-2.0.1-py3-none-any.whl.metadata (8.9 kB)
Collecting colorama<0.5.0,>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading bayesian_optimization-2.0.1-py3-none-any.whl (31 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-2.0.1 colorama-0.4.6


In [None]:
import xgboost as xgb
from bayes_opt import BayesianOptimization
from xgboost import XGBClassifier
from sklearn.model_selection import ParameterGrid, train_test_split, StratifiedKFold
def evaluate_model(n_estimators, learning_rate, max_depth, gamma, reg_alpha, reg_lambda, subsample, colsample_bytree, min_child_weight):
    """Objective for the optimiser: mean 5-fold pAUC of an XGBoost classifier.

    Each argument is one XGBoost hyperparameter; ``n_estimators`` and
    ``max_depth`` are truncated to integers before use. Reads the notebook
    globals ``X_train``, ``y_train`` and ``preprocessor``, and re-fits
    ``preprocessor`` on every training fold.

    Returns:
        The mean of the per-fold `score` values (min_tpr=0.80).
    """
    xgb_kwargs = dict(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        gamma=gamma,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
        eval_metric='logloss',
        random_state=100,
    )

    def _score_fold(fit_rows, holdout_rows):
        """Train on one fold's training rows and score its held-out rows."""
        fit_X, holdout_X = X_train.iloc[fit_rows], X_train.iloc[holdout_rows]
        fit_y, holdout_y = y_train.iloc[fit_rows], y_train.iloc[holdout_rows]

        # Fit the preprocessing on the training part only, then reuse it.
        fit_features = preprocessor.fit_transform(fit_X)
        holdout_features = preprocessor.transform(holdout_X)

        # Oversample the training part only; the held-out rows stay as they are.
        balanced_X, balanced_y = SMOTE(random_state=100).fit_resample(fit_features, fit_y)

        booster = XGBClassifier(**xgb_kwargs)
        booster.fit(balanced_X, balanced_y)

        # Probability of the positive class for each held-out row.
        positive_proba = booster.predict_proba(holdout_features)[:, 1]

        solution = pd.DataFrame(holdout_y, columns=["target"])
        submission = pd.DataFrame(positive_proba, columns=["pred"])
        return score(solution, submission, row_id_column_name="target", min_tpr=0.80)

    # Stratified, shuffled folds with a fixed seed.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    return np.mean([
        _score_fold(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(X_train, y_train)
    ])

# Search space for the XGBoost objective: (lower, upper) per hyperparameter
param_bounds = dict(
    n_estimators=(100, 1000),     # number of trees
    learning_rate=(0.01, 0.15),   # learning rate
    max_depth=(4, 15),            # depth of trees
    gamma=(0, 0.3),               # minimum loss reduction
    reg_alpha=(0, 1),             # L1 regularization
    reg_lambda=(0.1, 1),          # L2 regularization
    subsample=(0.6, 1.0),         # subsample ratio of rows
    colsample_bytree=(0.6, 1.0),  # subsample ratio of columns
    min_child_weight=(1, 20),     # minimum sum of instance weight
)

# Set up the optimiser over the XGBoost objective and its search space
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=param_bounds,
    random_state=100,
    verbose=2,
)

# 10 random points to explore first, then 30 optimisation iterations
optimizer.maximize(init_points=10, n_iter=30)

# Report the best point found
best_xgb_run = optimizer.max
print("Best Parameters:", best_xgb_run['params'])
print("Best Score:", best_xgb_run['target'])

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.1541   [39m | [39m0.8174   [39m | [39m0.08351  [39m | [39m0.06943  [39m | [39m13.29    [39m | [39m1.09     [39m | [39m209.4    [39m | [39m0.6707   [39m | [39m0.8433   [39m | [39m0.6547   [39m |
| [39m2        [39m | [39m0.1468   [39m | [39m0.83     [39m | [39m0.2674   [39m | [39m0.03929  [39m | [39m6.039    [39m | [39m3.059    [39m | [39m297.7    [39m | [39m0.9786   [39m | [39m0.8305   [39m | [39m0.6688   [39m |
| [35m3        [39m | [35m0.1542   [39m | [35m0.9265   [39m | [35m0.08222  [39m | [35m0.07044  [39m | [35m14.34    [39m | [35m16.54    [39m | [35m402.5    [39m | [35m0.1754   [39m | [35m0.4355   [39m | [35m0.6023   [39m |
| [39m4        [39m | [39m0.1404   [39m | [39m0.701    [39m | [39m0.2387   [39m | [39m0.01214  [39m | [39m10.59    [39m | [39m12.47    [39m | [39m194.6    [39m | [39m0.3819   [39m | [39m0.1328   [39m | [39m0.9562   [39m |
| [39m5        [39m | 

In [None]:
# Hand-copied XGBoost hyperparameters; the two integer-valued ones are
# written already truncated.
best_xgb_params = dict(
    colsample_bytree=0.8659,
    gamma=0.1787,
    learning_rate=0.0214,
    max_depth=4,       # int(4.9): XGBoost needs an integer
    min_child_weight=17.38,
    n_estimators=130,  # int(130.67): XGBoost needs an integer
    reg_alpha=0.8864,
    reg_lambda=0.3854,
    subsample=0.9212,
    random_state=42,
)

In [None]:
pip install "dask[dataframe]"

Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.20-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.18-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.16-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.16-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.16


In [None]:
from bayes_opt import BayesianOptimization
from lightgbm import LGBMClassifier
from sklearn.model_selection import ParameterGrid, train_test_split, StratifiedKFold
def evaluate_lgbm(num_leaves, learning_rate, max_depth, reg_alpha, reg_lambda, subsample, colsample_bytree, min_child_weight, n_estimators):
    """Objective for the optimiser: cross-validated score of a LightGBM classifier.

    Each argument is one LightGBM hyperparameter; ``n_estimators``,
    ``max_depth`` and ``num_leaves`` are truncated to integers before use.

    Returns:
        Whatever `cross_validate_model` returns for ``LGBMClassifier`` with
        these parameters.
    """
    params = {
        'n_estimators': int(n_estimators),
        'learning_rate': learning_rate,
        'max_depth': int(max_depth),
        'num_leaves': int(num_leaves),
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'subsample': subsample,
        # LightGBM applies `subsample` (bagging_fraction) only when bagging is
        # enabled through a non-zero frequency. With the default of 0 the
        # value above is ignored and the optimiser tunes a dimension that has
        # no effect.
        'subsample_freq': 1,
        'colsample_bytree': colsample_bytree,
        'min_child_weight': min_child_weight,
        'random_state': 100,
        'verbose': -1
    }
    return cross_validate_model(LGBMClassifier, params)

def cross_validate_model(ModelClass, params):
    """Return the mean 5-fold `score` (min_tpr=0.80) of ``ModelClass(**params)``.

    Reads the notebook globals ``X_train``, ``y_train`` and ``preprocessor``.
    For every stratified fold, ``preprocessor`` is re-fitted on the training
    part, SMOTE is applied to that part only, and the model is scored on the
    untouched held-out part.

    Args:
        ModelClass: estimator class exposing ``fit`` and ``predict_proba``.
        params: keyword arguments used to construct the estimator.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    per_fold = []

    for fit_rows, holdout_rows in splitter.split(X_train, y_train):
        fit_X = X_train.iloc[fit_rows]
        holdout_X = X_train.iloc[holdout_rows]
        fit_y = y_train.iloc[fit_rows]
        holdout_y = y_train.iloc[holdout_rows]

        # Preprocessing is learned from the training part of the fold only.
        fit_features = preprocessor.fit_transform(fit_X)
        holdout_features = preprocessor.transform(holdout_X)

        # Balance the classes of the training part.
        sampler = SMOTE(random_state=100)
        balanced_X, balanced_y = sampler.fit_resample(fit_features, fit_y)

        estimator = ModelClass(**params)
        estimator.fit(balanced_X, balanced_y)

        # Second column of predict_proba is the positive class.
        positive_proba = estimator.predict_proba(holdout_features)[:, 1]

        per_fold.append(
            score(
                pd.DataFrame(holdout_y, columns=["target"]),
                pd.DataFrame(positive_proba, columns=["pred"]),
                row_id_column_name="target",
                min_tpr=0.80,
            )
        )

    return np.mean(per_fold)

# Search space for the LightGBM objective: (lower, upper) per hyperparameter
lgbm_param_bounds = dict(
    num_leaves=(10, 50),
    learning_rate=(0.01, 0.3),
    max_depth=(3, 15),
    reg_alpha=(0, 1),
    reg_lambda=(0.1, 1),
    subsample=(0.6, 1.0),
    colsample_bytree=(0.6, 1.0),
    min_child_weight=(1, 20),
    n_estimators=(100, 1000),
)

# Set up the optimiser over the LightGBM objective and its search space
lgbm_optimizer = BayesianOptimization(
    f=evaluate_lgbm,
    pbounds=lgbm_param_bounds,
    random_state=100,
    verbose=2,
)

# 10 initial points, then 30 optimisation iterations
lgbm_optimizer.maximize(
    init_points=10,
    n_iter=30,
)

# Report the best point found
best_lgbm_run = lgbm_optimizer.max
print("Best Parameters for LightGBM:", best_lgbm_run['params'])
print("Best Score for LightGBM:", best_lgbm_run['target'])

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.1508   [39m | [39m0.8174   [39m | [39m0.09073  [39m | [39m8.094    [39m | [39m17.05    [39m | [39m104.2    [39m | [39m14.86    [39m | [39m0.6707   [39m | [39m0.8433   [39m | [39m0.6547   [39m |
| [39m2        [39m | [39m0.1447   [39m | [39m0.83     [39m | [39m0.2685   [39m | [39m5.51     [39m | [39m4.521    [39m | [39m197.5    [39m | [39m18.79    [39m | [39m0.9786   [39m | [39m0.8305   [39m | [39m0.6688   [39m |
| [35m3        [39m | [35m0.1541   [39m | [35m0.9265   [39m | [35m0.08948  [39m | [35m8.18     [39m | [35m18.86    [39m | [35m835.9    [39m | [35m23.44    [39m | [35m0.1754   [39m | [35m0.4355   [39m | [35m0.6023   [39m |
| [39m4        [39m | [39m0.1538   [39m | [39m0.701    [39m | [39m0.2407   [39m | [39m3.183    [39m | [39m12.38    [39m | [39m643.4    [39m | [39m14.21    [39m | [39m0.3819   [39m | [39m0.1328   [39m | [39m0.9562   [39m |
| [39m5        [39m | 