In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import power_transform
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.base import clone
from sklearn.impute import SimpleImputer

import lightgbm as lgbm
import optuna  # pip install optuna

In [2]:
from optuna.integration import LightGBMPruningCallback
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def objective(base_model, trial, X, y):
    
    
    if type(base_model) == Pipeline:
        param_grid = {
            "lgbmclassifier__device_type": trial.suggest_categorical("device_type", ['gpu']),
            "lgbmclassifier__n_estimators": trial.suggest_int("n_estimators", 10, 10000),
            "lgbmclassifier__learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "lgbmclassifier__num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
            "lgbmclassifier__max_depth": trial.suggest_int("max_depth", 3, 12),
            "lgbmclassifier__min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
            "lgbmclassifier__lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
            "lgbmclassifier__lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
            "lgbmclassifier__min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
            "lgbmclassifier__bagging_fraction": trial.suggest_float(
                "bagging_fraction", 0.2, 0.95, step=0.1
            ),
            "lgbmclassifier__bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
            "lgbmclassifier__feature_fraction": trial.suggest_float(
                "feature_fraction", 0.2, 0.95, step=0.1
            ),
        }
    else:
        param_grid = {
            "device_type": trial.suggest_categorical("device_type", ['gpu']),
            "n_estimators": trial.suggest_int("n_estimators", 10, 10000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
            "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
            "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
            "bagging_fraction": trial.suggest_float(
                "bagging_fraction", 0.2, 0.95, step=0.1
            ),
            "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
            "feature_fraction": trial.suggest_float(
                "feature_fraction", 0.2, 0.95, step=0.1
            ),
        }
    
    #cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1121218)

    #cv_scores = np.empty(5)
    #for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    #model = lgbm.LGBMClassifier(objective="binary", **param_grid)
    
    model = clone(base_model)
    model.set_params(**param_grid)
    
    if type(base_model) == Pipeline:
        model.fit(
            X_train,
            y_train,
            lgbmclassifier__eval_set=[(X_test, y_test)],
            lgbmclassifier__eval_metric="auc",
            lgbmclassifier__early_stopping_rounds=100,
            lgbmclassifier__callbacks=[
                LightGBMPruningCallback(trial, "auc")
            ],  # Add a pruning callback
            lgbmclassifier__verbose = 0
        )
    else: 
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc",
            early_stopping_rounds=100,
            callbacks=[
                LightGBMPruningCallback(trial, "auc")
            ],  # Add a pruning callback
            verbose = 0
        )
    preds = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, preds)

In [3]:
def tune_and_train_lightgbm_model(base_model, X, y, study_name = "LGBM Classifier"):
    
    study = optuna.create_study(direction="maximize", study_name=study_name)
    func = lambda trial: objective(base_model, trial, X, y)
    study.optimize(func, n_trials=100)
    
    print(f"\tBest value (AUC): {study.best_value:.5f}")
    print(f"\tBest params:")

    for key, value in study.best_params.items():
        print(f"\t\t{key}: {value}")
        
    model = clone(base_model)
    model.set_params(**study.best_params)
    model.fit(X, y)
    
    return model, study

# Get Data

In [4]:
train = pd.read_csv("Data/train.csv", index_col = 0)
train.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,86.489,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,9953.6,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,15827.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1
3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,-36.837,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1
4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,144.12,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,,0.23049,1


In [5]:
train.shape

(957919, 119)

In [6]:
test = pd.read_csv("Data/test.csv", index_col = 0)
test.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
957919,0.16585,0.48705,1295.0,0.0231,0.319,0.90188,573.29,3743.7,2705700000000.0,6221.0,...,0.16253,-22.189,2.0655,0.43088,-10.741,81606.0,1.194,198040000000000.0,2017.1,0.46357
957920,0.12965,0.37348,1763.0,0.72884,0.33247,-1.2631,875.55,554370.0,595570000000000.0,934.43,...,0.81528,-1.6342,1.5736,-1.0712,11.832,90114.0,1.1507,4.388e+16,6638.9,0.28125
957921,0.12019,0.44521,736.26,0.04615,0.29605,0.31665,2659.5,317140.0,397780000000000.0,131.81,...,0.81831,-32.78,2.1364,-1.9312,-3.2804,37739.0,1.1548,171810000000000.0,5844.0,0.13797
957922,0.054008,0.39596,996.14,0.85934,0.36678,-0.1706,386.56,325680.0,-34322000000000.0,-26.473,...,0.86559,-2.4162,1.5199,-0.011633,1.384,26849.0,1.149,2.1388e+17,6173.3,0.3291
957923,0.079947,-0.006919,10574.0,0.34845,0.45008,-1.842,3027.0,428150.0,929150000000.0,5999.4,...,0.2519,-18.63,3.7387,0.75708,-4.9405,50336.0,1.2488,2.1513e+17,2250.1,0.33796


In [None]:
sample_submission = pd.read_csv("Data/sample_solution.csv")

In [7]:
def feature_generation(data):
    
    data = data.copy()

    # add feature with number of na's per row + standard deviation
    features = [col for col in data.columns if 'f' in col] # if 'f' is in the name
    data['count_na'] = data[features].isna().sum(axis=1)

    features += ['count_na']

    # now do imputation with mean
    data[features] = data[features].fillna(data[features].mean())

    return data

train_w_nan = feature_generation(train)

In [8]:
train_w_nan.to_csv("Data/train_w_nan.csv")

In [13]:
test_w_nan = feature_generation(test)
test_w_nan.to_csv("Data/test_w_nan.csv")

# Fill in missing values with mean

In [9]:
X = train_w_nan.drop("claim", axis = 1)
y = train_w_nan.claim

In [10]:
model = make_pipeline(SimpleImputer(strategy = 'mean'), lgbm.LGBMClassifier(objective="binary"))

trained_model, study = tune_and_train_lightgbm_model(model, X, y, study_name = "LGBM Classifier with mean imputation")

[32m[I 2021-09-11 22:42:56,609][0m A new study created in memory with name: LGBM Classifier with mean imputation[0m




[32m[I 2021-09-11 22:43:25,159][0m Trial 0 finished with value: 0.8093150083264089 and parameters: {'device_type': 'gpu', 'n_estimators': 6681, 'learning_rate': 0.28441310612188986, 'num_leaves': 1180, 'max_depth': 9, 'min_data_in_leaf': 7100, 'lambda_l1': 5, 'lambda_l2': 15, 'min_gain_to_split': 3.7415490955668744, 'bagging_fraction': 0.2, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 0 with value: 0.8093150083264089.[0m




[32m[I 2021-09-11 22:55:04,856][0m Trial 1 finished with value: 0.8149850101625152 and parameters: {'device_type': 'gpu', 'n_estimators': 6127, 'learning_rate': 0.013771215541806329, 'num_leaves': 2640, 'max_depth': 8, 'min_data_in_leaf': 9900, 'lambda_l1': 65, 'lambda_l2': 40, 'min_gain_to_split': 4.012947696524615, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 1 with value: 0.8149850101625152.[0m




[32m[I 2021-09-11 22:56:15,854][0m Trial 2 finished with value: 0.8133727861349842 and parameters: {'device_type': 'gpu', 'n_estimators': 3519, 'learning_rate': 0.17975341755229668, 'num_leaves': 140, 'max_depth': 7, 'min_data_in_leaf': 9900, 'lambda_l1': 55, 'lambda_l2': 10, 'min_gain_to_split': 11.81083721878871, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 1 with value: 0.8149850101625152.[0m




[32m[I 2021-09-11 22:57:47,807][0m Trial 3 finished with value: 0.8131168854631725 and parameters: {'device_type': 'gpu', 'n_estimators': 8544, 'learning_rate': 0.13658494912038044, 'num_leaves': 1100, 'max_depth': 3, 'min_data_in_leaf': 2300, 'lambda_l1': 60, 'lambda_l2': 20, 'min_gain_to_split': 7.345772677859825, 'bagging_fraction': 0.2, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 1 with value: 0.8149850101625152.[0m




[32m[I 2021-09-11 22:59:21,913][0m Trial 4 finished with value: 0.8126882532269217 and parameters: {'device_type': 'gpu', 'n_estimators': 8302, 'learning_rate': 0.14037840244868893, 'num_leaves': 720, 'max_depth': 9, 'min_data_in_leaf': 9800, 'lambda_l1': 90, 'lambda_l2': 100, 'min_gain_to_split': 8.65920550356534, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 1 with value: 0.8149850101625152.[0m




[32m[I 2021-09-11 22:59:28,731][0m Trial 5 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 22:59:36,337][0m Trial 6 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-11 23:01:22,406][0m Trial 7 finished with value: 0.8144162485505355 and parameters: {'device_type': 'gpu', 'n_estimators': 9674, 'learning_rate': 0.18836060056024664, 'num_leaves': 1260, 'max_depth': 8, 'min_data_in_leaf': 6000, 'lambda_l1': 95, 'lambda_l2': 100, 'min_gain_to_split': 8.463900829991958, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 1 with value: 0.8149850101625152.[0m




[32m[I 2021-09-11 23:01:30,500][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:01:57,042][0m Trial 9 finished with value: 0.8121348690652839 and parameters: {'device_type': 'gpu', 'n_estimators': 2555, 'learning_rate': 0.2712419261590967, 'num_leaves': 700, 'max_depth': 4, 'min_data_in_leaf': 5300, 'lambda_l1': 0, 'lambda_l2': 0, 'min_gain_to_split': 0.1252668019709191, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 1 with value: 0.8149850101625152.[0m




[32m[I 2021-09-11 23:02:06,332][0m Trial 10 pruned. Trial was pruned at iteration 4.[0m
[32m[I 2021-09-11 23:02:14,060][0m Trial 11 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:02:22,416][0m Trial 12 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:02:32,527][0m Trial 13 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:02:42,308][0m Trial 14 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:02:53,794][0m Trial 15 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:03:05,213][0m Trial 16 pruned. Trial was pruned at iteration 4.[0m
[32m[I 2021-09-11 23:03:15,074][0m Trial 17 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:03:25,973][0m Trial 18 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:03:35,022][0m Trial 19 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:03:45,143][0m Trial 20 pruned. Trial was pruned at iteration 4.[0m



[32m[I 2021-09-11 23:06:58,775][0m Trial 28 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:07:07,505][0m Trial 29 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:08:53,014][0m Trial 30 finished with value: 0.8157342880402558 and parameters: {'device_type': 'gpu', 'n_estimators': 8017, 'learning_rate': 0.2484019052000147, 'num_leaves': 1300, 'max_depth': 10, 'min_data_in_leaf': 8700, 'lambda_l1': 25, 'lambda_l2': 20, 'min_gain_to_split': 6.644633325828346, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 30 with value: 0.8157342880402558.[0m




[32m[I 2021-09-11 23:09:01,460][0m Trial 31 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-11 23:11:05,733][0m Trial 32 finished with value: 0.815217456274008 and parameters: {'device_type': 'gpu', 'n_estimators': 9294, 'learning_rate': 0.2376045951560918, 'num_leaves': 1200, 'max_depth': 9, 'min_data_in_leaf': 9100, 'lambda_l1': 10, 'lambda_l2': 35, 'min_gain_to_split': 4.976758064262973, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 30 with value: 0.8157342880402558.[0m




[32m[I 2021-09-11 23:11:13,299][0m Trial 33 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:11:20,208][0m Trial 34 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:11:27,593][0m Trial 35 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:11:37,185][0m Trial 36 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-11 23:11:44,797][0m Trial 37 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:13:02,353][0m Trial 38 finished with value: 0.8151839398568492 and parameters: {'device_type': 'gpu', 'n_estimators': 8540, 'learning_rate': 0.29977939232440975, 'num_leaves': 1300, 'max_depth': 11, 'min_data_in_leaf': 7500, 'lambda_l1': 40, 'lambda_l2': 50, 'min_gain_to_split': 7.613392343307048, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 30 with value: 0.8157342880402558.[0m




[32m[I 2021-09-11 23:13:09,905][0m Trial 39 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:13:19,732][0m Trial 40 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:13:28,092][0m Trial 41 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:13:36,760][0m Trial 42 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:13:46,627][0m Trial 43 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:13:53,912][0m Trial 44 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:14:02,680][0m Trial 45 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-11 23:14:10,869][0m Trial 46 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:14:18,709][0m Trial 47 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:14:27,503][0m Trial 48 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:14:36,772][0m Trial 49 pruned. Trial was pruned at iteration 0.[0m



[32m[I 2021-09-11 23:21:35,994][0m Trial 88 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:21:43,739][0m Trial 89 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:21:50,760][0m Trial 90 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:21:57,586][0m Trial 91 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:22:05,269][0m Trial 92 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-11 23:22:12,696][0m Trial 93 pruned. Trial was pruned at iteration 2.[0m
[32m[I 2021-09-11 23:22:19,745][0m Trial 94 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:22:27,961][0m Trial 95 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:22:36,344][0m Trial 96 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:22:45,709][0m Trial 97 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:22:54,671][0m Trial 98 pruned. Trial was pruned at iteration 0.[0m

	Best value (AUC): 0.81573
	Best params:
		device_type: gpu
		n_estimators: 8017
		learning_rate: 0.2484019052000147
		num_leaves: 1300
		max_depth: 10
		min_data_in_leaf: 8700
		lambda_l1: 25
		lambda_l2: 20
		min_gain_to_split: 6.644633325828346
		bagging_fraction: 0.8
		bagging_freq: 1
		feature_fraction: 0.30000000000000004


ValueError: Invalid parameter device_type for estimator Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('lgbmclassifier', LGBMClassifier(objective='binary'))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [12]:
best_params = {
            "lgbmclassifier__device_type": 'gpu',
            "lgbmclassifier__n_estimators": 8017,
            "lgbmclassifier__learning_rate": 0.2484019052000147,
            "lgbmclassifier__num_leaves": 1300,
            "lgbmclassifier__max_depth": 10,
            "lgbmclassifier__min_data_in_leaf": 8700,
            "lgbmclassifier__lambda_l1": 25,
            "lgbmclassifier__lambda_l2": 20,
            "lgbmclassifier__min_gain_to_split": 6.644633325828346,
            "lgbmclassifier__bagging_fraction": 0.8,
            "lgbmclassifier__bagging_freq": 1,
            "lgbmclassifier__feature_fraction": 0.3
        }

model.set_params(**best_params)
model.fit(X, y)

Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('lgbmclassifier',
                 LGBMClassifier(bagging_fraction=0.8, bagging_freq=1,
                                device_type='gpu', feature_fraction=0.3,
                                lambda_l1=25, lambda_l2=20,
                                learning_rate=0.2484019052000147, max_depth=10,
                                min_data_in_leaf=8700,
                                min_gain_to_split=6.644633325828346,
                                n_estimators=8017, num_leaves=1300,
                                objective='binary'))])

In [17]:
predictions = model.predict_proba(test_w_nan)

In [19]:
sample_submission.claim = predictions[:, 1]
sample_submission

Unnamed: 0,id,claim
0,957919,0.603748
1,957920,0.126944
2,957921,0.649066
3,957922,0.100918
4,957923,0.130741
...,...,...
493469,1451388,0.824863
493470,1451389,0.116021
493471,1451390,0.694402
493472,1451391,0.132100


In [20]:
sample_submission.to_csv("Submissions/lightgbm_w_mean_imputation.csv", index = False)

# Drop missing values

In [21]:
temp = train_w_nan.dropna().copy()
X = temp.drop("claim", axis = 1)
y = temp.claim

In [23]:
base_model = lgbm.LGBMClassifier(objective="binary")
study = optuna.create_study(direction="maximize", study_name="LightGBM dropping missing values")
func = lambda trial: objective(base_model, trial, X, y)
study.optimize(func, n_trials=100)

print(f"\tBest value (AUC): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

model = clone(base_model)
model.set_params(**study.best_params)
model.fit(X, y)

[32m[I 2021-09-11 23:44:17,876][0m A new study created in memory with name: LightGBM dropping missing values[0m




[32m[I 2021-09-11 23:47:08,969][0m Trial 0 finished with value: 0.815019711281604 and parameters: {'device_type': 'gpu', 'n_estimators': 9218, 'learning_rate': 0.22573906357492232, 'num_leaves': 2000, 'max_depth': 7, 'min_data_in_leaf': 3400, 'lambda_l1': 90, 'lambda_l2': 55, 'min_gain_to_split': 2.6564271439803417, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 0 with value: 0.815019711281604.[0m




[32m[I 2021-09-11 23:48:03,621][0m Trial 1 finished with value: 0.8134557544000776 and parameters: {'device_type': 'gpu', 'n_estimators': 773, 'learning_rate': 0.2757912114441294, 'num_leaves': 1240, 'max_depth': 3, 'min_data_in_leaf': 9100, 'lambda_l1': 75, 'lambda_l2': 5, 'min_gain_to_split': 11.032217259948618, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 0 with value: 0.815019711281604.[0m




[32m[I 2021-09-11 23:50:08,554][0m Trial 2 finished with value: 0.8153911644695946 and parameters: {'device_type': 'gpu', 'n_estimators': 3052, 'learning_rate': 0.10349950719079135, 'num_leaves': 540, 'max_depth': 4, 'min_data_in_leaf': 5700, 'lambda_l1': 55, 'lambda_l2': 85, 'min_gain_to_split': 2.4288895631404035, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 2 with value: 0.8153911644695946.[0m




[32m[I 2021-09-11 23:51:39,887][0m Trial 3 finished with value: 0.8141437492288247 and parameters: {'device_type': 'gpu', 'n_estimators': 7329, 'learning_rate': 0.10611240175696472, 'num_leaves': 1580, 'max_depth': 8, 'min_data_in_leaf': 7200, 'lambda_l1': 30, 'lambda_l2': 60, 'min_gain_to_split': 13.164760621375674, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 2 with value: 0.8153911644695946.[0m




[32m[I 2021-09-11 23:53:03,973][0m Trial 4 finished with value: 0.8133397251727377 and parameters: {'device_type': 'gpu', 'n_estimators': 2224, 'learning_rate': 0.2813898759545546, 'num_leaves': 2300, 'max_depth': 7, 'min_data_in_leaf': 3800, 'lambda_l1': 95, 'lambda_l2': 60, 'min_gain_to_split': 7.36129419584745, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 2 with value: 0.8153911644695946.[0m




[32m[I 2021-09-11 23:53:10,825][0m Trial 5 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:53:18,237][0m Trial 6 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:53:25,061][0m Trial 7 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:53:32,177][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:53:40,502][0m Trial 9 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-11 23:53:47,905][0m Trial 10 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:53:55,578][0m Trial 11 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-11 23:54:04,629][0m Trial 12 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-11 23:54:11,314][0m Trial 13 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:54:18,249][0m Trial 14 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-11 23:54:25,821][0m Trial 15 pruned. Trial was pruned at iteration 0.[0m
[32

	Best value (AUC): 0.81539
	Best params:
		device_type: gpu
		n_estimators: 3052
		learning_rate: 0.10349950719079135
		num_leaves: 540
		max_depth: 4
		min_data_in_leaf: 5700
		lambda_l1: 55
		lambda_l2: 85
		min_gain_to_split: 2.4288895631404035
		bagging_fraction: 0.4
		bagging_freq: 1
		feature_fraction: 0.4


LGBMClassifier(bagging_fraction=0.4, bagging_freq=1, device_type='gpu',
               feature_fraction=0.4, lambda_l1=55, lambda_l2=85,
               learning_rate=0.10349950719079135, max_depth=4,
               min_data_in_leaf=5700, min_gain_to_split=2.4288895631404035,
               n_estimators=3052, num_leaves=540, objective='binary')

In [24]:
predictions = model.predict_proba(test_w_nan)

In [25]:
sample_submission.claim = predictions[:, 1]
sample_submission

Unnamed: 0,id,claim
0,957919,0.566248
1,957920,0.121081
2,957921,0.660913
3,957922,0.137200
4,957923,0.157628
...,...,...
493469,1451388,0.832642
493470,1451389,0.103087
493471,1451390,0.794027
493472,1451391,0.142980


In [26]:
sample_submission.to_csv("Submissions/lightgbm_w_dropping_missing.csv", index = False)

# Generate new features as required

In [72]:
def split_multimodal_features(data):
    
    df = data.copy()
    
    def split_feature(df, feature, threshold):

        df.loc[df[feature] <= threshold, feature + '_0'] = df[feature]
        df.loc[df[feature] > threshold, feature + '_1'] = df[feature]
        df = df.drop(feature, axis = 1)

        return df
    
    df = split_feature(df, 'f2', 0.025)
    df = split_feature(df, 'f5', 0.05)
    df = split_feature(df, 'f11', 0)
    df = split_feature(df, 'f13', 0.04)
    df = split_feature(df, 'f23', 6.2)
    df = split_feature(df, 'f26', 1e13)
    df = split_feature(df, 'f29', 0.05)
    df = split_feature(df, 'f29_1', 0.5)
    df = split_feature(df, 'f40', 0.05)
    df = split_feature(df, 'f40_1', 0.5)
    df = split_feature(df, 'f42', 0.3)
    df = split_feature(df, 'f42_1', 0.7)
    df = split_feature(df, 'f47', 0)
    df = split_feature(df, 'f49', -0.05)
    df = split_feature(df, 'f49_1', 0.05)
    df = split_feature(df, 'f50', 0.05)
    df = split_feature(df, 'f55', 0.035)
    df = split_feature(df, 'f58', -0.97)
    df = split_feature(df, 'f65', 40000)
    df = split_feature(df, 'f66', 4)
    df = split_feature(df, 'f70', 0.5)
    df = split_feature(df, 'f75', 0.5)
    df = split_feature(df, 'f75_0', 0.05)
    df = split_feature(df, 'f75_1', 0.95)
    df = split_feature(df, 'f91', 0.1)
    df = split_feature(df, 'f112', 30)
    
    return df

In [73]:
train_w_nan_new_features = split_multimodal_features(train_w_nan)
test_w_nan_new_features = split_multimodal_features(test_w_nan)

In [75]:
X = train_w_nan_new_features.drop('claim', axis = 1)
y = train_w_nan_new_features.claim

trained_model, study = tune_and_train_lightgbm_model(model, X, y, study_name = "LGBM Classifier with new features")

[32m[I 2021-09-12 22:26:17,852][0m A new study created in memory with name: LGBM Classifier with new features[0m




[32m[I 2021-09-12 22:27:48,859][0m Trial 0 finished with value: 0.8144281010472123 and parameters: {'device_type': 'gpu', 'n_estimators': 2668, 'learning_rate': 0.28861785824133335, 'num_leaves': 1020, 'max_depth': 7, 'min_data_in_leaf': 6300, 'lambda_l1': 50, 'lambda_l2': 90, 'min_gain_to_split': 2.1303387759183003, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 0 with value: 0.8144281010472123.[0m




[32m[I 2021-09-12 22:28:45,091][0m Trial 1 finished with value: 0.8112886733473452 and parameters: {'device_type': 'gpu', 'n_estimators': 9982, 'learning_rate': 0.24661482314007654, 'num_leaves': 1400, 'max_depth': 8, 'min_data_in_leaf': 500, 'lambda_l1': 10, 'lambda_l2': 0, 'min_gain_to_split': 4.043964359011708, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 0 with value: 0.8144281010472123.[0m




[32m[I 2021-09-12 22:39:32,432][0m Trial 2 finished with value: 0.8146275167101759 and parameters: {'device_type': 'gpu', 'n_estimators': 3128, 'learning_rate': 0.07695989927796215, 'num_leaves': 80, 'max_depth': 5, 'min_data_in_leaf': 4100, 'lambda_l1': 75, 'lambda_l2': 40, 'min_gain_to_split': 0.40377713956309735, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 2 with value: 0.8146275167101759.[0m




[32m[I 2021-09-12 22:40:39,824][0m Trial 3 finished with value: 0.8134124005442734 and parameters: {'device_type': 'gpu', 'n_estimators': 6884, 'learning_rate': 0.25847915600640237, 'num_leaves': 900, 'max_depth': 12, 'min_data_in_leaf': 5200, 'lambda_l1': 25, 'lambda_l2': 70, 'min_gain_to_split': 4.936864200075311, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 2 with value: 0.8146275167101759.[0m




[32m[I 2021-09-12 22:42:18,038][0m Trial 4 finished with value: 0.8130472744712434 and parameters: {'device_type': 'gpu', 'n_estimators': 6623, 'learning_rate': 0.2887802783157874, 'num_leaves': 1820, 'max_depth': 8, 'min_data_in_leaf': 8600, 'lambda_l1': 85, 'lambda_l2': 25, 'min_gain_to_split': 10.805884812613042, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 2 with value: 0.8146275167101759.[0m




[32m[I 2021-09-12 22:42:27,447][0m Trial 5 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 22:42:36,520][0m Trial 6 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 22:44:29,562][0m Trial 7 pruned. Trial was pruned at iteration 803.[0m
[32m[I 2021-09-12 22:44:38,686][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 22:44:48,364][0m Trial 9 pruned. Trial was pruned at iteration 2.[0m
[32m[I 2021-09-12 22:44:57,603][0m Trial 10 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 22:45:06,789][0m Trial 11 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 22:45:16,169][0m Trial 12 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 22:45:26,158][0m Trial 13 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 22:45:35,690][0m Trial 14 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 22:45:44,643][0m Trial 15 pruned. Trial was pruned at iteration 0.[0m
[



[32m[I 2021-09-12 22:52:02,398][0m Trial 45 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 22:52:12,278][0m Trial 46 pruned. Trial was pruned at iteration 4.[0m
[32m[I 2021-09-12 22:52:22,252][0m Trial 47 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 22:52:33,538][0m Trial 48 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 22:52:43,879][0m Trial 49 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 22:52:53,707][0m Trial 50 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 22:53:04,930][0m Trial 51 pruned. Trial was pruned at iteration 2.[0m
[32m[I 2021-09-12 22:55:28,709][0m Trial 52 finished with value: 0.8140839817666394 and parameters: {'device_type': 'gpu', 'n_estimators': 2068, 'learning_rate': 0.20056783586339566, 'num_leaves': 1740, 'max_depth': 9, 'min_data_in_leaf': 2100, 'lambda_l1': 20, 'lambda_l2': 15, 'min_gain_to_split': 3.277502220486123, 'bagging_fraction': 0.7, 'bagging_freq': 1



[32m[I 2021-09-12 22:55:40,337][0m Trial 53 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 22:56:33,157][0m Trial 54 pruned. Trial was pruned at iteration 102.[0m
[32m[I 2021-09-12 22:57:57,074][0m Trial 55 finished with value: 0.8147929088363105 and parameters: {'device_type': 'gpu', 'n_estimators': 1963, 'learning_rate': 0.28949477374533344, 'num_leaves': 1840, 'max_depth': 8, 'min_data_in_leaf': 3200, 'lambda_l1': 20, 'lambda_l2': 15, 'min_gain_to_split': 5.040330880834378, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 55 with value: 0.8147929088363105.[0m




[32m[I 2021-09-12 22:59:52,425][0m Trial 56 finished with value: 0.8149619377296574 and parameters: {'device_type': 'gpu', 'n_estimators': 1992, 'learning_rate': 0.2863920495934517, 'num_leaves': 2020, 'max_depth': 8, 'min_data_in_leaf': 3300, 'lambda_l1': 20, 'lambda_l2': 15, 'min_gain_to_split': 5.662464051179183, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 56 with value: 0.8149619377296574.[0m




[32m[I 2021-09-12 23:00:02,833][0m Trial 57 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:01:19,786][0m Trial 58 finished with value: 0.8147283493947851 and parameters: {'device_type': 'gpu', 'n_estimators': 1484, 'learning_rate': 0.2822924641722981, 'num_leaves': 2500, 'max_depth': 7, 'min_data_in_leaf': 4000, 'lambda_l1': 5, 'lambda_l2': 15, 'min_gain_to_split': 4.700686636630476, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 56 with value: 0.8149619377296574.[0m




[32m[I 2021-09-12 23:03:38,489][0m Trial 59 finished with value: 0.8145748872528858 and parameters: {'device_type': 'gpu', 'n_estimators': 1526, 'learning_rate': 0.28788658107605114, 'num_leaves': 2720, 'max_depth': 8, 'min_data_in_leaf': 4100, 'lambda_l1': 5, 'lambda_l2': 10, 'min_gain_to_split': 4.632330930462926, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 56 with value: 0.8149619377296574.[0m




[32m[I 2021-09-12 23:04:02,393][0m Trial 60 pruned. Trial was pruned at iteration 45.[0m
[32m[I 2021-09-12 23:05:16,350][0m Trial 61 finished with value: 0.8150314787081024 and parameters: {'device_type': 'gpu', 'n_estimators': 2535, 'learning_rate': 0.2871793266707897, 'num_leaves': 2600, 'max_depth': 8, 'min_data_in_leaf': 4500, 'lambda_l1': 5, 'lambda_l2': 10, 'min_gain_to_split': 4.611289417704253, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 61 with value: 0.8150314787081024.[0m




[32m[I 2021-09-12 23:06:08,776][0m Trial 62 pruned. Trial was pruned at iteration 172.[0m
[32m[I 2021-09-12 23:06:19,496][0m Trial 63 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:06:35,033][0m Trial 64 pruned. Trial was pruned at iteration 15.[0m
[32m[I 2021-09-12 23:07:14,660][0m Trial 65 pruned. Trial was pruned at iteration 130.[0m
[32m[I 2021-09-12 23:07:30,725][0m Trial 66 pruned. Trial was pruned at iteration 27.[0m
[32m[I 2021-09-12 23:07:38,044][0m Trial 67 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:09:29,585][0m Trial 68 finished with value: 0.8149163307040406 and parameters: {'device_type': 'gpu', 'n_estimators': 2562, 'learning_rate': 0.24024634258079844, 'num_leaves': 2520, 'max_depth': 7, 'min_data_in_leaf': 5300, 'lambda_l1': 15, 'lambda_l2': 5, 'min_gain_to_split': 4.800229361019209, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 61 with value: 0.8150314787081024.[0m




[32m[I 2021-09-12 23:09:43,425][0m Trial 69 pruned. Trial was pruned at iteration 26.[0m
[32m[I 2021-09-12 23:12:11,558][0m Trial 70 finished with value: 0.8154161967790677 and parameters: {'device_type': 'gpu', 'n_estimators': 1625, 'learning_rate': 0.26486470340395635, 'num_leaves': 2820, 'max_depth': 8, 'min_data_in_leaf': 5300, 'lambda_l1': 15, 'lambda_l2': 0, 'min_gain_to_split': 6.398418703243587, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 70 with value: 0.8154161967790677.[0m




[32m[I 2021-09-12 23:13:03,830][0m Trial 71 pruned. Trial was pruned at iteration 247.[0m
[32m[I 2021-09-12 23:13:52,563][0m Trial 72 pruned. Trial was pruned at iteration 198.[0m
[32m[I 2021-09-12 23:14:08,290][0m Trial 73 pruned. Trial was pruned at iteration 30.[0m
[32m[I 2021-09-12 23:14:18,981][0m Trial 74 pruned. Trial was pruned at iteration 15.[0m
[32m[I 2021-09-12 23:14:26,721][0m Trial 75 pruned. Trial was pruned at iteration 3.[0m
[32m[I 2021-09-12 23:14:33,623][0m Trial 76 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 23:14:47,556][0m Trial 77 pruned. Trial was pruned at iteration 28.[0m
[32m[I 2021-09-12 23:14:55,998][0m Trial 78 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-12 23:15:02,691][0m Trial 79 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:15:09,334][0m Trial 80 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:15:16,137][0m Trial 81 pruned. Trial was pruned at iteration



[32m[I 2021-09-12 23:18:31,417][0m Trial 86 finished with value: 0.8151973370504768 and parameters: {'device_type': 'gpu', 'n_estimators': 688, 'learning_rate': 0.29151074823703976, 'num_leaves': 2120, 'max_depth': 9, 'min_data_in_leaf': 4200, 'lambda_l1': 0, 'lambda_l2': 15, 'min_gain_to_split': 6.116344835914157, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 70 with value: 0.8154161967790677.[0m




[32m[I 2021-09-12 23:18:44,289][0m Trial 87 pruned. Trial was pruned at iteration 16.[0m
[32m[I 2021-09-12 23:19:12,387][0m Trial 88 finished with value: 0.8142447520204859 and parameters: {'device_type': 'gpu', 'n_estimators': 72, 'learning_rate': 0.27591049809559104, 'num_leaves': 2100, 'max_depth': 9, 'min_data_in_leaf': 5100, 'lambda_l1': 0, 'lambda_l2': 15, 'min_gain_to_split': 6.215525867093817, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 70 with value: 0.8154161967790677.[0m




[32m[I 2021-09-12 23:19:19,647][0m Trial 89 pruned. Trial was pruned at iteration 4.[0m
[32m[I 2021-09-12 23:19:26,523][0m Trial 90 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:20:50,406][0m Trial 91 finished with value: 0.8150337310627702 and parameters: {'device_type': 'gpu', 'n_estimators': 1549, 'learning_rate': 0.28488128568706805, 'num_leaves': 2260, 'max_depth': 8, 'min_data_in_leaf': 4200, 'lambda_l1': 5, 'lambda_l2': 10, 'min_gain_to_split': 4.199809413052516, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 70 with value: 0.8154161967790677.[0m




[32m[I 2021-09-12 23:21:10,804][0m Trial 92 pruned. Trial was pruned at iteration 44.[0m
[32m[I 2021-09-12 23:21:17,611][0m Trial 93 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 23:22:04,800][0m Trial 94 pruned. Trial was pruned at iteration 187.[0m
[32m[I 2021-09-12 23:22:11,867][0m Trial 95 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:22:18,551][0m Trial 96 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:22:26,085][0m Trial 97 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-12 23:22:32,805][0m Trial 98 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:22:41,521][0m Trial 99 pruned. Trial was pruned at iteration 8.[0m


	Best value (AUC): 0.81542
	Best params:
		device_type: gpu
		n_estimators: 1625
		learning_rate: 0.26486470340395635
		num_leaves: 2820
		max_depth: 8
		min_data_in_leaf: 5300
		lambda_l1: 15
		lambda_l2: 0
		min_gain_to_split: 6.398418703243587
		bagging_fraction: 0.9
		bagging_freq: 1
		feature_fraction: 0.8


In [81]:
func = lambda trial: objective(base_model, trial, X, y)
study.optimize(func, n_trials=100)





[32m[I 2021-09-12 23:31:52,459][0m Trial 100 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:32:01,213][0m Trial 101 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-12 23:32:10,798][0m Trial 102 pruned. Trial was pruned at iteration 8.[0m
[32m[I 2021-09-12 23:32:19,093][0m Trial 103 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-12 23:33:42,399][0m Trial 104 finished with value: 0.8147761772949084 and parameters: {'device_type': 'gpu', 'n_estimators': 1471, 'learning_rate': 0.27547996376473605, 'num_leaves': 2880, 'max_depth': 8, 'min_data_in_leaf': 3700, 'lambda_l1': 10, 'lambda_l2': 25, 'min_gain_to_split': 4.594579659992838, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 70 with value: 0.8154161967790677.[0m




[32m[I 2021-09-12 23:33:49,468][0m Trial 105 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:33:56,411][0m Trial 106 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:34:04,411][0m Trial 107 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 23:34:11,178][0m Trial 108 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:34:18,176][0m Trial 109 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:35:35,767][0m Trial 110 pruned. Trial was pruned at iteration 256.[0m
[32m[I 2021-09-12 23:35:52,379][0m Trial 111 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-09-12 23:36:01,650][0m Trial 112 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 23:36:13,318][0m Trial 113 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-09-12 23:36:25,162][0m Trial 114 pruned. Trial was pruned at iteration 6.[0m
[32m[I 2021-09-12 23:36:34,406][0m Trial 115 pruned. Trial was pruned at i



[32m[I 2021-09-12 23:39:54,916][0m Trial 132 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-09-12 23:40:04,605][0m Trial 133 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 23:40:17,454][0m Trial 134 pruned. Trial was pruned at iteration 9.[0m
[32m[I 2021-09-12 23:40:28,694][0m Trial 135 pruned. Trial was pruned at iteration 3.[0m
[32m[I 2021-09-12 23:40:37,730][0m Trial 136 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:40:46,890][0m Trial 137 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:40:56,802][0m Trial 138 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:41:06,200][0m Trial 139 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:41:16,873][0m Trial 140 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-12 23:41:30,652][0m Trial 141 pruned. Trial was pruned at iteration 8.[0m
[32m[I 2021-09-12 23:42:56,067][0m Trial 142 finished with value: 0.815655852



[32m[I 2021-09-12 23:43:08,433][0m Trial 143 pruned. Trial was pruned at iteration 8.[0m
[32m[I 2021-09-12 23:43:17,955][0m Trial 144 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-09-12 23:43:26,883][0m Trial 145 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:43:36,483][0m Trial 146 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:43:46,211][0m Trial 147 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:43:54,486][0m Trial 148 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:46:08,903][0m Trial 149 finished with value: 0.8151369949889253 and parameters: {'device_type': 'gpu', 'n_estimators': 2054, 'learning_rate': 0.2947353478610325, 'num_leaves': 1100, 'max_depth': 9, 'min_data_in_leaf': 3100, 'lambda_l1': 5, 'lambda_l2': 0, 'min_gain_to_split': 5.265720889056503, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 142 with value: 0.8156558528905802.[0m




[32m[I 2021-09-12 23:46:24,939][0m Trial 150 pruned. Trial was pruned at iteration 16.[0m
[32m[I 2021-09-12 23:46:36,263][0m Trial 151 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-12 23:46:45,506][0m Trial 152 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:47:43,117][0m Trial 153 pruned. Trial was pruned at iteration 226.[0m
[32m[I 2021-09-12 23:47:52,716][0m Trial 154 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:48:01,893][0m Trial 155 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:48:13,190][0m Trial 156 pruned. Trial was pruned at iteration 6.[0m
[32m[I 2021-09-12 23:48:23,145][0m Trial 157 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:48:32,854][0m Trial 158 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:48:41,595][0m Trial 159 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:48:51,174][0m Trial 160 pruned. Trial was pruned at it



[32m[I 2021-09-12 23:55:13,204][0m Trial 188 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:55:22,914][0m Trial 189 pruned. Trial was pruned at iteration 2.[0m
[32m[I 2021-09-12 23:55:32,001][0m Trial 190 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:55:40,742][0m Trial 191 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:55:50,152][0m Trial 192 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:55:59,639][0m Trial 193 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:56:08,555][0m Trial 194 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:56:17,248][0m Trial 195 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:56:26,443][0m Trial 196 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-09-12 23:56:37,079][0m Trial 197 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-09-12 23:56:46,728][0m Trial 198 pruned. Trial was pruned at itera

In [82]:
    print(f"\tBest value (AUC): {study.best_value:.5f}")
    print(f"\tBest params:")

    for key, value in study.best_params.items():
        print(f"\t\t{key}: {value}")
        
    model = clone(base_model)
    model.set_params(**study.best_params)
    model.fit(X, y)

	Best value (AUC): 0.81566
	Best params:
		device_type: gpu
		n_estimators: 431
		learning_rate: 0.27510378410487624
		num_leaves: 2140
		max_depth: 9
		min_data_in_leaf: 6000
		lambda_l1: 5
		lambda_l2: 15
		min_gain_to_split: 6.583518223586768
		bagging_fraction: 0.9
		bagging_freq: 1
		feature_fraction: 0.8


LGBMClassifier(bagging_fraction=0.9, bagging_freq=1, device_type='gpu',
               feature_fraction=0.8, lambda_l1=5, lambda_l2=15,
               learning_rate=0.27510378410487624, max_depth=9,
               min_data_in_leaf=6000, min_gain_to_split=6.583518223586768,
               n_estimators=431, num_leaves=2140, objective='binary')

In [None]:
predictions = trained_model.predict_proba(test_w_nan_new_features)

In [83]:
sample_submission.claim = predictions[:, 1]
sample_submission

Unnamed: 0,id,claim
0,957919,0.560970
1,957920,0.121897
2,957921,0.602786
3,957922,0.130965
4,957923,0.131961
...,...,...
493469,1451388,0.850729
493470,1451389,0.115525
493471,1451390,0.754884
493472,1451391,0.138521


In [84]:
sample_submission.to_csv("Submissions/lightgbm_new_features_extra_tuning.csv", index = False)