# <center> **Kaggle’s Spaceship Titanic Competition**
# <center> **Machine Learning**

# **Libraries**

In [6]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from bayes_opt import BayesianOptimization

import warnings
import time

import functions
import importlib
importlib.reload(functions)

<module 'functions' from 'c:\\Users\\Dell\\Documents\\AI\\Titanic\\Notebooks\\functions.py'>

# **Load Data**

In [19]:
data = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Titanic\Data\Data\data.csv",
    index_col=False
)

train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Titanic\Data\Data\train.csv",
    index_col=False
)

test = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Titanic\Data\Data\test.csv",
    index_col=False
)

random_state = 101
target = 'Transported'

In [20]:
train=data[data['PassengerId'].isin(train['PassengerId'].values)].copy()
test=data[data['PassengerId'].isin(test['PassengerId'].values)].copy()

In [21]:
train.drop(['PassengerId', 'Group', 'CabinNumber'], axis=1, inplace=True)
test.drop(['PassengerId', 'Group', 'CabinNumber'], axis=1, inplace=True)

In [10]:
columns_to_transform = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalExpenditure']

for col in columns_to_transform:
    train = functions.log_transform(train, col)
    test = functions.log_transform(test, col)

In [11]:
numerical_cols = [cname for cname in train.columns if train[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in train.columns if train[cname].dtype in ["object", "bool"]]
categorical_cols.remove(target)

In [22]:
X = train.drop(target, axis=1)
y = train[target]
y = y.astype(bool)


X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

In [27]:
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

preprocessor  = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)],
        remainder='passthrough')

In [29]:
# Separate features and target
X = train.drop(columns=[target])
y = train[target]

# Fit and transform the features
X_transformed = preprocessor.fit_transform(X)

# Convert to DataFrame
X_transformed_df = pd.DataFrame(X_transformed, columns=preprocessor.get_feature_names_out())

# Combine with target
final_df = pd.concat([X_transformed_df, y.reset_index(drop=True)], axis=1)

## **Bayesian Optimization**

### **LGBM**

In [31]:
def lgb_eval(num_leaves, max_depth, learning_rate, min_child_samples, subsample, colsample_bytree, min_data_in_leaf):
    params = {
        'objective': 'binary',
        'num_leaves': int(num_leaves),  # must be integer
        'max_depth': int(max_depth),  # must be integer
        'learning_rate': learning_rate,
        'min_child_samples': int(min_child_samples),  # must be integer
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'min_data_in_leaf': int(min_data_in_leaf),  # new hyperparameter added
        'metric': 'auc',  # AUC metric for evaluation
        'verbose': -1,
        'boosting_type': 'gbdt',
        'n_jobs': -1,
        'seed': 42
    }

    # Perform cross-validation
    lgb_train = lgb.Dataset(X_train, y_train)
    cv_result = lgb.cv(params, lgb_train, nfold=5, metrics=['auc'], seed=42, stratified=True)

    # Accessing the correct key for AUC mean
    best_score = np.max(cv_result['valid auc-mean'])  # Use the correct key format

    return best_score

# Set the parameter bounds, including min_data_in_leaf
param_bounds = {
    'num_leaves': (20, 50),
    'max_depth': (5, 15),
    'learning_rate': (0.01, 0.3),
    'min_child_samples': (5, 30),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'min_data_in_leaf': (10, 50)  # bounds for the new parameter
}

# Run Bayesian Optimization
optimizer = BayesianOptimization(
    f=lgb_eval,
    pbounds=param_bounds,
    random_state=42,
    verbose=2
)

# Optimize
optimizer.maximize(init_points=10, n_iter=30)

# Print the best parameters
print("Best parameters found: ", optimizer.max)

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | min_da... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------


ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: HomePlanet: object, CabinDeck: object, CabinSide: object, Destination: object, AgeGroup: object, LastName: object

In [24]:
train.dtypes

GroupSize             int64
SoloTraveler          int64
HomePlanet           object
CabinDeck            object
CabinSide            object
CryoSleep              bool
Destination          object
Age                 float64
AgeGroup             object
VIP                    bool
RoomService         float64
FoodCourt           float64
ShoppingMall        float64
Spa                 float64
VRDeck              float64
TotalExpenditure    float64
Expenditure           int64
LastName             object
FamilySize            int64
LoneTraveler          int64
Transported          object
dtype: object

In [5]:
# Import necessary libraries
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization
import numpy as np

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the objective function to be optimized
def lgb_eval(num_leaves, max_depth, learning_rate, min_child_samples, subsample, colsample_bytree, min_data_in_leaf):
    params = {
        'objective': 'binary',
        'num_leaves': int(num_leaves),  # must be integer
        'max_depth': int(max_depth),  # must be integer
        'learning_rate': learning_rate,
        'min_child_samples': int(min_child_samples),  # must be integer
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'min_data_in_leaf': int(min_data_in_leaf),  # new hyperparameter added
        'metric': 'auc',  # AUC metric for evaluation
        'verbose': -1,
        'boosting_type': 'gbdt',
        'n_jobs': -1,
        'seed': 42
    }

    # Perform cross-validation
    lgb_train = lgb.Dataset(X_train, y_train)
    cv_result = lgb.cv(params, lgb_train, nfold=5, metrics=['auc'], seed=42, stratified=True)

    # Accessing the correct key for AUC mean
    best_score = np.max(cv_result['valid auc-mean'])  # Use the correct key format

    return best_score

# Set the parameter bounds, including min_data_in_leaf
param_bounds = {
    'num_leaves': (20, 50),
    'max_depth': (5, 15),
    'learning_rate': (0.01, 0.3),
    'min_child_samples': (5, 30),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'min_data_in_leaf': (10, 50)  # bounds for the new parameter
}

# Run Bayesian Optimization
optimizer = BayesianOptimization(
    f=lgb_eval,
    pbounds=param_bounds,
    random_state=42,
    verbose=2
)

# Optimize
optimizer.maximize(init_points=10, n_iter=30)

# Print the best parameters
print("Best parameters found: ", optimizer.max)


|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | min_da... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.9942   [39m | [39m0.6873   [39m | [39m0.2857   [39m | [39m12.32    [39m | [39m19.97    [39m | [39m16.24    [39m | [39m24.68    [39m | [39m0.529    [39m |
| [39m2        [39m | [39m0.9939   [39m | [39m0.9331   [39m | [39m0.1843   [39m | [39m12.08    [39m | [39m5.515    [39m | [39m48.8     [39m | [39m44.97    [39m | [39m0.6062   [39m |
| [39m3        [39m | [39m0.9919   [39m | [39m0.5909   [39m | [39m0.06319  [39m | [39m8.042    [39m | [39m18.12    [39m | [39m27.28    [39m | [39m28.74    [39m | [39m0.8059   [39m |
| [39m4        [39m | [39m0.9928   [39m | [39m0.5697   [39m | [39m0.09472  [39m | [39m8.664    [39m | [39m16.4     [39m | [39m41.41    [39m | [39m25.99    [39m | [