In [None]:
import pandas as pd 
import numpy as np

from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import xgboost as xgb
import xgboost as XGBClassifier
import optuna
from functools import partial
from optuna import create_study
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback
from optuna.integration import LightGBMPruningCallback
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time
import gc
import psutil

import warnings 
warnings.filterwarnings('ignore')
from pyarrow import csv

In [None]:
# Read the pre-imputed training features with pyarrow's CSV reader and convert them to pandas.
X_imp = csv.read_csv("../input/imputed/X_imp_br.csv").to_pandas()
# Loading of the imputed test features is disabled here; cells that reference `test` need it re-enabled.
#test_imp = csv.read_csv("../input/imputed/test_imp_br.csv").to_pandas()
# The raw competition training file is read only to extract the `claim` target column.
train = csv.read_csv("../input/tabular-playground-series-sep-2021/train.csv").to_pandas()
# NOTE(review): assumes the rows of X_imp_br.csv are in the same order as train.csv — confirm.
y=train["claim"].copy()

In [None]:
# The target has been copied into `y`, so the raw training frame is no longer referenced.
del train

In [None]:
# Rescale every feature with MinMaxScaler and rebuild a DataFrame that keeps the original column names.
# NOTE(review): the scaler is fitted on all training rows, i.e. before the train/validation split below.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X_imp), columns=X_imp.columns)
# Test-set scaling is disabled together with the test-set loading.
#test = pd.DataFrame(scaler.transform(test_imp), columns=test_imp.columns)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the smallest dtype that fits.

    Integer columns (int16/int32/int64) are narrowed to the first of
    int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive). Float columns are narrowed to float32 when their
    min and max fit the float32 range. A column is only re-cast when the
    target dtype is strictly smaller than its current dtype, so float16
    columns are never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Casting float64 to float32 reduces precision. Columns whose dtype is not
    in the ``numerics`` list (object, category, unsigned, int8, ...) are
    left untouched.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        target = None
        if str(col_type)[:3] == 'int':
            # Pick the narrowest integer type whose inclusive range holds the data.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
        else:
            limits = np.finfo(np.float32)
            # NaN or infinite extremes fail this comparison and leave the column at its current width.
            if limits.min <= c_min and c_max <= limits.max:
                target = np.float32

        # Only ever shrink: skip no-op casts and casts that would widen (e.g. float16 -> float32).
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame that reports zero memory cannot divide by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Downcast the scaled features. The function modifies X in place and also returns it,
# so as the last expression of the cell the returned frame is what gets displayed.
reduce_mem_usage(X, verbose=True)
#reduce_mem_usage(test, verbose=True)

In [None]:
# Hold out 20% of the rows for validation, stratified on the target with a fixed seed.
# The tuning objectives further down read these four names from the notebook namespace.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
    stratify=y,
)

In [None]:
# Run a garbage-collection pass before the tuning cells.
gc.collect()

In [None]:
def optimization(trial):
    """Optuna objective for XGBoost: train one model and return its validation ROC AUC.

    Hyper-parameters are sampled from ``trial``; the model is fitted on the
    notebook-level ``X_train`` / ``y_train`` and scored on ``X_valid`` /
    ``y_valid``. Those four names are read from the notebook namespace (the
    cross-validation cells also assign to them), so the data used depends on
    which cells ran before this one.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the search space.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the validation set.
    """
    xgb_params = {
        "objective": "binary:logistic",
        "booster": "gbtree",

        "max_depth": trial.suggest_int("max_depth", 1, 12),
        # 496 = 1 + 99 * 5 is the largest value a step of 5 can reach from 1.
        # With the former upper bound of 500 Optuna truncated the range to
        # [1, 496] and only said so in a warning (warnings are silenced above).
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 496, step=5),

        "gamma": trial.suggest_float('gamma', 0, 1),

        "subsample": trial.suggest_float('subsample', 0.1, 1),
        "colsample_bytree": trial.suggest_float('colsample_bytree', 0.1, 1),
        "colsample_bylevel": trial.suggest_float('colsample_bylevel', 0.1, 1),

        # suggest_float(log=True) replaces the deprecated suggest_loguniform
        # and matches how learning_rate is sampled below.
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-5, 100.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-5, 100.0, log=True),

        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),

        # Upper limit on boosting rounds; early stopping below may end training sooner.
        'n_estimators': trial.suggest_int("n_estimators", 2000, 10000, step=100),

        "eval_metric": "auc",
        "random_state": 0,
        'tree_method': 'gpu_hist',
        'gpu_id': 0,
        'predictor': 'gpu_predictor',
    }

    model = xgb.XGBClassifier(**xgb_params,
                              use_label_encoder=False)

    # The validation set is listed last in eval_set.
    # NOTE(review): XGBoost is expected to use the last eval_set entry for early stopping — confirm for the installed version.
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              early_stopping_rounds=100,
              verbose=1000)

    preds = model.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, preds)

    return score

In [None]:

# Search the XGBoost space by maximising the validation ROC AUC returned by `optimization`.
# The sampler is seeded so that its proposals are repeatable for identical objective values;
# without a seed every run explores a different sequence of trials.
study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=0),
)

study.optimize(optimization, n_trials=100)

In [None]:
# Report the sampled parameters and objective value of the best trial of the current `study`.
print('Best trial parameters:', study.best_trial.params)
print('Best score:', study.best_value)

In [None]:
# Fixed XGBoost hyper-parameters consumed by the cross-validation cell that follows.
xgb_params = dict(
    objective="binary:logistic",
    booster="gbtree",
    # tree shape
    max_depth=4,
    min_child_weight=244,
    gamma=0.9937125923442126,
    # row / column sampling
    subsample=0.6975967311388934,
    colsample_bytree=0.7348557038212125,
    colsample_bylevel=0.9002630834386368,
    # regularisation
    reg_lambda=0,
    reg_alpha=20,
    # boosting schedule
    learning_rate=0.02193656821803904,
    n_estimators=7377,
    eval_metric="auc",
    random_state=0,
    # GPU execution
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
)


In [None]:
# Run a garbage-collection pass before cross-validation.
gc.collect()

In [None]:
# 5-fold stratified cross-validation of XGBoost with the fixed `xgb_params`.
# Collects out-of-fold predictions, per-fold AUC scores and per-fold test predictions.
oof_preds = {}      # positional row index -> out-of-fold predicted probability
y_values = {}       # positional row index -> true label of that row
test_preds = []     # one array of test-set probabilities per fold
mean_scores = []    # validation ROC AUC of each fold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

    # cv.split yields positional row indices. `X[train_idx]` would look the
    # integers up as column labels and raise KeyError, so rows are taken with
    # .iloc. This also rebinds the notebook-level X_train/X_valid/y_train/y_valid
    # that the tuning objectives read.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    model = xgb.XGBClassifier(**xgb_params)

    # No early stopping here: every fold trains the full n_estimators rounds.
    model.fit(X_train, y_train,eval_set=[(X_train, y_train),(X_valid, y_valid)],
              #early_stopping_rounds=200, # FOR LGBM
              verbose=1000)

    pred_valid = model.predict_proba(X_valid)[:,1]
    oof_preds.update(dict(zip(valid_idx, pred_valid)))

    score = roc_auc_score(y_valid, pred_valid)
    mean_scores.append(score)

    # Pair each validation row with its own label. Zipping against the full
    # `y` stored the first len(valid_idx) labels of the dataset instead.
    y_values.update(dict(zip(valid_idx, y_valid)))

    # NOTE(review): `test` is not defined by any active cell in this notebook
    # (its loading and scaling lines are commented out), so this line raises
    # NameError until they are re-enabled.
    pred = model.predict_proba(test)[:,1]
    test_preds.append(pred)

    print(f"Fold: {fold} Score: {score}")

print(f"Overall AUC ROC: {np.mean(mean_scores)}")

## LightGBM

In [None]:
# Run a garbage-collection pass before LightGBM tuning.
gc.collect()

In [None]:
def objective(trial):
    """Optuna objective for LightGBM: train one model and return its validation ROC AUC.

    Hyper-parameters are sampled from ``trial``; the model is fitted on the
    notebook-level ``X_train`` / ``y_train`` and scored on ``X_valid`` /
    ``y_valid``. Those names are read from the notebook namespace (the
    cross-validation cells also assign to them), so the data used depends on
    which cells ran before this one.

    A later cell defines another function named ``objective`` for CatBoost,
    which replaces this one once it is executed.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the search space.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the validation set.
    """
    lg_params = {
        "objective": "binary",
        #'device': 'gpu',
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        # Upper limit on boosting rounds; early stopping below may end training sooner.
        'n_estimators': trial.suggest_int('n_estimators', 2000, 10000, step=100),
        'max_depth': trial.suggest_int("max_depth", 1, 12),
        "num_leaves": trial.suggest_int("num_leaves", 4, 256),
        # Keys use the scikit-learn wrapper's own argument names (as `lgb_params`
        # does further down) rather than the native aliases min_data_in_leaf /
        # bagging_freq. The Optuna parameter names are unchanged.
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 60),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        # NOTE(review): a frequency of 0 is expected to disable bagging, making `subsample` irrelevant for that trial — confirm.
        'subsample_freq': trial.suggest_int('bagging_freq', 0, 5),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'reg_alpha': trial.suggest_float("reg_alpha", 1e-5, 100.0, log=True),
        'reg_lambda': trial.suggest_float("reg_lambda", 1e-5, 100.0, log=True),
    }

    model = LGBMClassifier(**lg_params)

    # NOTE(review): early_stopping_rounds / verbose as fit() keywords depend on
    # the installed LightGBM version — confirm they are still accepted.
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              early_stopping_rounds=100,
              eval_metric='auc', verbose=1000)

    preds = model.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, preds)

    return score

In [None]:
# Search the LightGBM space by maximising the validation ROC AUC returned by `objective`.
# This rebinds `study`, so the previous study object is no longer reachable by that name.
# The sampler is seeded so that its proposals are repeatable for identical objective values.
study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=0),
)

study.optimize(objective, n_trials=100)

In [None]:
# Print the best objective value of the current `study`, then one tab-indented line per best parameter.
print(f"Best ROC AUC value: {study.best_value}")
print(f"Best params: ")
for param, value in study.best_params.items():
    print(f"\t{param} : {value}")

In [None]:
# Recorded LightGBM tuning result. The pasted trial log was not valid Python and made
# this cell fail, so it is kept as plain assignments.
# NOTE(review): the keys ('iterations', 'feature_fraction', 'bagging_fraction',
# 'min_sum_hessian_in_leaf') do not match the search space defined in this notebook,
# so this appears to come from a different version of the objective — confirm before reuse.
lgbm_best_score = 0.816281087297095
lgbm_best_params = {
    "objective": "binary",
    "min_sum_hessian_in_leaf": 256,
    'learning_rate': 0.010685033500506827,
    'iterations': 9800,
    'max_depth': 10,
    'num_leaves': 57,
    'min_child_samples': 19,
    'feature_fraction': 0.2650640588680905,
    'bagging_fraction': 0.373818018663584,
    'bagging_freq': 3,
    'reg_alpha': 0.00020866527711063722,
    'reg_lambda': 8.171304639059403e-06}

## CatBoost

In [None]:
# Run a garbage-collection pass before CatBoost tuning.
gc.collect()

In [None]:
from catboost import CatBoostClassifier

def objective(trial):
    """Optuna objective for CatBoost: train one model and return its validation ROC AUC.

    Hyper-parameters are sampled from ``trial``; the model is fitted on the
    notebook-level ``X_train`` / ``y_train`` and scored on ``X_valid`` /
    ``y_valid``. Those names are read from the notebook namespace (the
    cross-validation cells also assign to them), so the data used depends on
    which cells ran before this one.

    This definition reuses the name ``objective`` from the LightGBM section
    and therefore replaces that function once this cell is executed.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the search space.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the validation set.
    """
    cat_params = {
        # Recorded in the study under the parameter name "iterations".
        'n_estimators': trial.suggest_int("iterations", 1000, 20000),
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'bootstrap_type': 'Bernoulli',
        # NOTE(review): fit() below also passes early_stopping_rounds=100, which
        # may override this overfitting-detector wait — confirm against CatBoost docs.
        'od_wait': trial.suggest_int('od_wait', 500, 2000),
        # suggest_float without log/step samples uniformly; it replaces the
        # deprecated suggest_uniform with the same bounds.
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        'task_type': 'GPU',
        'devices': '0',
        'subsample': trial.suggest_float('subsample', 0.1, 1)
    }

    model = CatBoostClassifier(**cat_params)

    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], use_best_model=True,
              early_stopping_rounds=100, verbose=1000)

    preds = model.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, preds)

    return score

In [None]:
# Search the CatBoost space by maximising the validation ROC AUC returned by `objective`.
# This rebinds `study`, so the previous study object is no longer reachable by that name.
# The sampler is seeded so that its proposals are repeatable for identical objective values.
study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=0),
)

study.optimize(objective, n_trials=100)

In [None]:
# Print the best objective value of the current `study`, then one tab-indented line per best parameter.
print(f"Best ROC AUC value: {study.best_value}")
print(f"Best params: ")
for param, value in study.best_params.items():
    print(f"\t{param} : {value}")

In [None]:
# Recorded CatBoost tuning result. The pasted trial log was not valid Python (and its
# brace was never closed), so it is kept as plain assignments.
catboost_best_score = 0.8161230786606084
catboost_best_params = {
    'iterations': 11266,
    'objective': 'Logloss',
    'od_wait': 1740,
    'learning_rate': 0.020086706701575457,
    'reg_lambda': 14.125082825476843,
    'random_strength': 12.892612974183212,
    'depth': 4,
    'min_data_in_leaf': 9,
    'leaf_estimation_iterations': 13,
    'subsample': 0.7087011651747303,
}
    

In [None]:
# Run a garbage-collection pass after CatBoost tuning.
gc.collect()

## Blending

In [None]:
# Run a garbage-collection pass before building the stacking inputs.
gc.collect()

In [None]:
# XGBoost configuration for the stacking run; rebinds the `xgb_params` name used earlier.
xgb_params = dict(
    objective="binary:logistic",
    booster="gbtree",
    # tree shape
    gamma=0.25,
    max_depth=4,
    min_child_weight=385,
    # row / column sampling
    subsample=0.8300664041335806,
    colsample_bytree=0.5286281769166067,
    # regularisation
    reg_lambda=0,
    reg_alpha=10,
    # boosting schedule
    learning_rate=0.0105496092641524,
    n_estimators=10000,
    eval_metric="auc",
    random_state=0,
    # GPU execution
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
    # Both "random_state" and "seed" are supplied, each set to 0.
    seed=0,
)

In [None]:
# LightGBM configuration for the stacking run.
lgb_params = dict(
    objective="binary",
    device="gpu",
    metric='auc',
    # boosting schedule
    learning_rate=0.012259058331294691,
    n_estimators=10000,
    # tree shape
    max_depth=3,
    num_leaves=213,
    min_child_samples=385,
    # row / column sampling
    subsample=0.9351660199824693,
    colsample_bytree=0.2465835648365681,
    subsample_freq=2,
    # regularisation
    reg_alpha=0.3047277199666851,
    reg_lambda=7.91439327504839e-07,
    min_split_gain=2.9178636636688675,
    random_state=0,
    # n_jobs=-1 is left disabled.
)
   

In [None]:
# CatBoost configuration for the stacking run.
cat_params = dict(
    objective='CrossEntropy',
    bootstrap_type="Bernoulli",
    # GPU execution
    task_type='GPU',
    devices='0',
    # boosting schedule
    n_estimators=17228,
    learning_rate=0.012254007285824443,
    max_depth=4,
    od_wait=1210,
    # regularisation and sampling
    l2_leaf_reg=0.036297649246309474,
    subsample=0.6718599986423817,
    min_data_in_leaf=23,
    random_strength=0.04584072594464698,
    random_state=0,
    eval_metric="AUC",
)

In [None]:
# 5-fold stratified cross-validation of XGBoost for the stacking run.
# Collects out-of-fold predictions, per-fold AUC scores and per-fold test predictions.
# NOTE(review): `X_p` and `test_p` are not defined in any visible cell of this
# notebook — confirm where they come from before running on a fresh kernel.
oof_preds = {}      # positional row index -> out-of-fold predicted probability
y_values = {}       # positional row index -> true label of that row
test_preds = []     # one array of test-set probabilities per fold
mean_scores = []    # validation ROC AUC of each fold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for fold, (train_idx, valid_idx) in enumerate(cv.split(X_p, y)):

    # cv.split yields positional row indices, hence .iloc. This also rebinds
    # the notebook-level X_train/X_valid/y_train/y_valid.
    X_train, y_train = X_p.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X_p.iloc[valid_idx], y.iloc[valid_idx]

    model = xgb.XGBClassifier(**xgb_params)

    model.fit(X_train, y_train,
              eval_set=[(X_valid,y_valid)],
              early_stopping_rounds=100,
              verbose=0  )

    pred_valid = model.predict_proba(X_valid)[:,1]
    oof_preds.update(dict(zip(valid_idx, pred_valid)))

    score = roc_auc_score(y_valid, pred_valid)
    mean_scores.append(score)

    # Pair each validation row with its own label. Zipping against the full
    # `y` stored the first len(valid_idx) labels of the dataset instead.
    y_values.update(dict(zip(valid_idx, y_valid)))

    pred = model.predict_proba(test_p)[:,1]
    test_preds.append(pred)

    print(f"Fold:{fold} Score: {score}")
    

In [None]:
# Average of the per-fold validation ROC AUC scores; displayed as the cell's output.
np.mean(mean_scores)

In [None]:
# Read the competition's sample submission; the export cell overwrites its `claim` column and renames its columns.
sample_solution = csv.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv").to_pandas()

In [None]:
# Write the out-of-fold predictions as a two-column CSV for stacking.
# NOTE(review): the "id" column holds the positional row indices produced by
# cv.split, which are not necessarily the competition's id values — confirm.
# NOTE: `oof_preds` is rebound from a dict to a DataFrame here, so this cell
# cannot be re-run without first re-running the cross-validation cell.
oof_preds = pd.DataFrame.from_dict(oof_preds, orient="index").reset_index()
oof_preds.columns = ["id", "pred_train_xgb_2"]
oof_preds.to_csv("stack_train_pred_xgb_2.csv", index=False)

# Average the per-fold test predictions into one probability per test row.
# Bracket assignment always targets a column. The former attribute assignment
# (`sample_solution.claim = ...`) sets a plain Python attribute when no `claim`
# column exists, and pandas only reports that through a warning, which this
# notebook silences.
sample_solution["claim"] = np.mean(np.column_stack(test_preds), axis=1)
sample_solution.columns = ["id", "test_preds_xgb_2"]
sample_solution.to_csv("stack_test_pred_xgb_2.csv", index=False)