In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier 
from sklearn import preprocessing
from sklearn.svm import SVC
import optuna
from catboost import CatBoostClassifier

In [13]:
train = pd.read_csv("train_folds.csv")
test = pd.read_csv("Test.csv")
sample = pd.read_csv("SampleSubmission.csv")


useful_features = [i for i in train.columns if i not in("region_area_", "Potability", "kfold")]
test = test[useful_features]

for col in test.columns:
    train[col] = np.log1p(train[col])
test[col] = np.log1p(test[col])

def run(trial):
            fold=0
            learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
            reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
            reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
            sub_sample = trial.suggest_float("sub_sample", 0.1, 1.0)
            max_depth = trial.suggest_int("max_depth", 1,7)
            colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)

            x_train = train[train.kfold != fold].reset_index(drop=True)
            x_valid = train[train.kfold == fold].reset_index(drop=True)

            y_train = x_train.Potability
            y_valid = x_valid.Potability

            x_train = x_train[useful_features]
            x_valid = x_valid[useful_features]

            sc = preprocessing.Normalizer()
            scaled_x_train = pd.DataFrame(sc.fit_transform(x_train))
            scaled_x_valid = pd.DataFrame(sc.transform(x_valid))
            #scaled_test = pd.DataFrame(sc.transform(df_test))

            scaled_x_train.columns = x_train.columns
            scaled_x_valid.columns = x_valid.columns
            #scaled_test.columns = df_test.columns


            model = XGBClassifier(random_state=42,
                n_estimators=7000,
                learning_rate = learning_rate,
                reg_lambda = reg_lambda,
                reg_alpha = reg_alpha,
                sub_sample = sub_sample,
                max_depth = max_depth,
                colsample_bytree = colsample_bytree,
             )
            model.fit(scaled_x_train, y_train)
            preds = model.predict(scaled_x_valid)
            score = model.score(scaled_x_valid, y_valid)
            #score = model.score(scaled_x_train, y_train)
        
            return(score)

In [14]:
study = optuna.create_study(direction="maximize")
study.optimize(run,n_trials=5)

[32m[I 2021-09-22 07:32:26,301][0m A new study created in memory with name: no-name-64c249e7-4841-40e9-9022-aa59fbb79bcd[0m


Parameters: { sub_sample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-09-22 07:32:38,373][0m Trial 0 finished with value: 0.6482300884955752 and parameters: {'learning_rate': 0.18575843133189782, 'reg_lambda': 28.40017935663664, 'reg_alpha': 0.0024694384631797654, 'sub_sample': 0.9772611014683914, 'max_depth': 6, 'colsample_bytree': 0.28508665839493885}. Best is trial 0 with value: 0.6482300884955752.[0m


Parameters: { sub_sample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-09-22 07:32:49,030][0m Trial 1 finished with value: 0.6150442477876106 and parameters: {'learning_rate': 0.05793570035650512, 'reg_lambda': 5.655237322849009e-07, 'reg_alpha': 3.125582722504243e-07, 'sub_sample': 0.9726727032735669, 'max_depth': 4, 'colsample_bytree': 0.14034063960129942}. Best is trial 0 with value: 0.6482300884955752.[0m


Parameters: { sub_sample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-09-22 07:33:01,752][0m Trial 2 finished with value: 0.661504424778761 and parameters: {'learning_rate': 0.04569642602233985, 'reg_lambda': 0.5775506658770446, 'reg_alpha': 2.7447735238679504e-07, 'sub_sample': 0.13633748376033483, 'max_depth': 7, 'colsample_bytree': 0.4198261718388938}. Best is trial 2 with value: 0.661504424778761.[0m


Parameters: { sub_sample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-09-22 07:33:17,101][0m Trial 3 finished with value: 0.584070796460177 and parameters: {'learning_rate': 0.07966649103206408, 'reg_lambda': 18.936054828839115, 'reg_alpha': 2.4861953594811386e-05, 'sub_sample': 0.4253022009777645, 'max_depth': 6, 'colsample_bytree': 0.11092049950389965}. Best is trial 2 with value: 0.661504424778761.[0m


Parameters: { sub_sample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-09-22 07:33:22,773][0m Trial 4 finished with value: 0.6415929203539823 and parameters: {'learning_rate': 0.016502208837650974, 'reg_lambda': 0.013599644757561705, 'reg_alpha': 65.92024764142482, 'sub_sample': 0.7194132110802006, 'max_depth': 6, 'colsample_bytree': 0.5612523460496128}. Best is trial 2 with value: 0.661504424778761.[0m


In [15]:
study.best_params

{'learning_rate': 0.04569642602233985,
 'reg_lambda': 0.5775506658770446,
 'reg_alpha': 2.7447735238679504e-07,
 'sub_sample': 0.13633748376033483,
 'max_depth': 7,
 'colsample_bytree': 0.4198261718388938}

In [None]:
#with 5 folds
{'learning_rate': 0.058937452875490244,
 'reg_lambda': 0.27685962282866644,
 'reg_alpha': 5.1246361644341025,
 'sub_sample': 0.7649184695897516,
 'max_depth': 2,
 'colsample_bytree': 0.7450191595678853}


In [None]:
{'learning_rate': 0.04569642602233985,
 'reg_lambda': 0.5775506658770446,
 'reg_alpha': 2.7447735238679504e-07,
 'sub_sample': 0.13633748376033483,
 'max_depth': 7,
 'colsample_bytree': 0.4198261718388938}