In [1]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier, early_stopping
from sklearn.model_selection import StratifiedKFold

In [2]:
# Input locations (Kaggle dataset mounts).
TRAIN_CSV = '/kaggle/input/playground-series-s4e7/train.csv'
TEST_CSV = '/kaggle/input/playground-series-s4e7/test.csv'
EXTRA_CSV = '/kaggle/input/health-insurance-cross-sell-prediction/train.csv'

# Load the three tables and discard the row identifier, which is not a feature.
main_train = pd.read_csv(TRAIN_CSV).drop(columns='id')
main_test = pd.read_csv(TEST_CSV).drop(columns='id')
# Second labelled table; later cells append its rows to every training fold.
extra_data = pd.read_csv(EXTRA_CSV).drop(columns='id')

In [3]:
def prepare_data(df):
    """Encode the text columns as integers and shrink column dtypes.

    The frame is modified in place: columns are overwritten on ``df`` itself,
    and the very same object is returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Gender``, ``Age``, ``Driving_License``, ``Region_Code``,
        ``Previously_Insured``, ``Vehicle_Age``, ``Vehicle_Damage``,
        ``Annual_Premium``, ``Policy_Sales_Channel`` and ``Vintage``.
        ``Response`` is optional and is cast to bool when present.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    # Text label -> integer code. Values that are not listed are left as-is,
    # so calling this on an already-encoded frame leaves the codes untouched.
    label_codes = {
        'Vehicle_Age': {'< 1 Year':0, '1-2 Year':1, '> 2 Years':2},
        'Vehicle_Damage': {'No':0, 'Yes':1},
        'Gender': {'Male': 0, 'Female': 1},
    }
    for column, codes in label_codes.items():
        df[column] = df[column].replace(codes)

    # (column selection, target dtype) pairs, applied in this order.
    dtype_plan = [
        ('Vehicle_Age', 'int8'),
        (['Gender', 'Vehicle_Damage'], 'bool'),
        (['Driving_License', 'Previously_Insured'], 'bool'),
        (['Age', 'Region_Code', 'Policy_Sales_Channel', 'Vintage'], 'int16'),
        ('Annual_Premium', 'int32'),
    ]
    # The target column only exists in labelled frames.
    if 'Response' in df.columns:
        dtype_plan.append(('Response', 'bool'))

    for selection, dtype in dtype_plan:
        df[selection] = df[selection].astype(dtype)

    return df

In [4]:
# Encode and downcast all three frames. prepare_data modifies its argument in
# place and returns it, so each result is bound back to the name that later
# cells actually read (main_train, extra_data, main_test).
main_train = prepare_data(main_train)
extra_data = prepare_data(extra_data)
main_test = prepare_data(main_test)

# Show dtypes and memory footprint after conversion.
print('Train')
main_train.info()
print('External\n')
extra_data.info()
print('Test\n')
main_test.info()

In [5]:
# Separate the target column from the feature matrix of the competition data.
y = main_train['Response']
X = main_train.drop(columns='Response')

# XGB

In [6]:
# XGBoost level-0 model for stacking.
#
# For each of 5 stratified folds: train on the fold's training rows plus all
# rows of extra_data, early-stop on the held-out fold, then
#   * write the held-out predictions into xgb_stacking (out-of-fold feature), and
#   * accumulate the fold-averaged test predictions into xgb_stacking_test.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
xgb_stacking, xgb_stacking_test = np.zeros(len(X)), np.zeros(len(main_test))

# Hyper-parameters are identical for every fold, so build them once.
# 'device': "cuda" means this cell needs a GPU runtime.
xgb_params = {
    'n_estimators': 10000,
    'eta': 0.05,
    'alpha':  0.2545607592482198,
    'subsample': 0.8388163485383147,
    'colsample_bytree': 0.2732499701466825,
    'max_depth': 16,
    'min_child_weight': 5,
    'gamma': 0.0017688666476104672,
    'eval_metric': 'auc',
    'max_bin': 262143,
    'tree_method': "hist",
    'device': "cuda",
}

# The external rows appended to each fold do not depend on the fold either.
extra_X = extra_data.drop('Response', axis=1)
extra_y = extra_data['Response']

for train_idx, val_idx in skf.split(X, y):
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    eval_set = [(X_val, y_val)]

    # External data is used for training only; validation stays competition-only.
    X_tr = pd.concat([X_tr, extra_X], ignore_index=True)
    y_tr = pd.concat([y_tr, extra_y], ignore_index=True)

    xgb_model = XGBClassifier(**xgb_params, early_stopping_rounds=50, random_state=42)
    xgb_model.fit(X_tr, y_tr, eval_set=eval_set, verbose=False)
    best_iteration = xgb_model.best_iteration

    # best_iteration is a 0-based index while iteration_range is half-open
    # [begin, end), so the end must be best_iteration + 1 to include the best
    # tree. It also avoids (0, 0), which XGBoost treats as "use every tree".
    best_range = (0, best_iteration + 1)

    stack_preds = xgb_model.predict_proba(X_val, iteration_range=best_range)[:, 1]
    test_preds = xgb_model.predict_proba(main_test, iteration_range=best_range)[:, 1]

    xgb_stacking[val_idx] = stack_preds
    xgb_stacking_test += test_preds / skf.get_n_splits()

# LGBM

In [7]:
# LightGBM level-0 model for stacking.
#
# Same protocol as the other base learners: 5 stratified folds, each fold's
# training rows are extended with every row of extra_data, the held-out fold
# drives early stopping, held-out predictions fill lgbm_stacking and the test
# predictions are averaged over folds into lgbm_stacking_test.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lgbm_stacking = np.zeros(len(X))
lgbm_stacking_test = np.zeros(len(main_test))

lgbm_params = {
    'n_estimators': 2000,
    "verbose": -1,
    # NOTE(review): LightGBM's documented tree-depth key is `max_depth`;
    # `depth` may not be a recognised alias and could be ignored - confirm.
    'depth': 6,
    "eval_metric": "auc",
    "max_bin": 262143,
    'num_leaves': 223,
    'learning_rate': 0.3,
    'min_child_samples': 54,
    # NOTE(review): no bagging frequency (`subsample_freq`) is set; LightGBM's
    # docs say bagging needs one, so `subsample` may have no effect - confirm.
    'subsample': 0.5395472919165504,
    'colsample_bytree': 0.547518064129546,
    'lambda_l1': 3.4444245446562,
    'lambda_l2': 2.87490408088595e-05,
}

# Fold-independent pieces, computed once.
extra_features = extra_data.drop('Response', axis=1)
extra_target = extra_data['Response']
n_folds = skf.get_n_splits()

for fit_rows, holdout_rows in skf.split(X, y):
    X_val, y_val = X.iloc[holdout_rows], y.iloc[holdout_rows]

    # Training part of the fold followed by the external rows.
    X_tr = pd.concat([X.iloc[fit_rows], extra_features], ignore_index=True)
    y_tr = pd.concat([y.iloc[fit_rows], extra_target], ignore_index=True)

    lgbm_model = LGBMClassifier(**lgbm_params, random_state=42)
    lgbm_model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[early_stopping(50)],
    )
    best_iteration = lgbm_model.best_iteration_

    # Predict with the early-stopped number of boosting rounds.
    holdout_proba = lgbm_model.predict_proba(X_val, num_iteration=best_iteration)[:, 1]
    test_proba = lgbm_model.predict_proba(main_test, num_iteration=best_iteration)[:, 1]

    lgbm_stacking[holdout_rows] = holdout_proba
    lgbm_stacking_test += test_proba / n_folds

# CatBoost

In [8]:
# CatBoost level-0 model for stacking.
#
# For each of 5 stratified folds: train on the fold's training rows plus all
# rows of extra_data, early-stop on the held-out fold, store the held-out
# predictions in cat_stacking and average the test predictions over folds
# into cat_stacking_test.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cat_stacking, cat_stacking_test = np.zeros(len(X)), np.zeros(len(main_test))

# Hyper-parameters are the same for every fold, so build them once.
# 'task_type': 'GPU' means this cell needs a GPU runtime.
cat_params = {
    'iterations': 10000,
    'eval_metric': 'AUC',
    'task_type': 'GPU',
    'learning_rate': 0.05,
    'depth': 9,
    'l2_leaf_reg': 55.37964307854247,
    'max_bin': 404,
    'bagging_temperature': 0.017138393608280057,
    'random_strength': 9.256288011643901,
}

# The external rows appended to each fold do not depend on the fold either.
extra_X = extra_data.drop('Response', axis=1)
extra_y = extra_data['Response']

for train_idx, val_idx in skf.split(X, y):
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # External data is used for training only; validation stays competition-only.
    X_tr = pd.concat([X_tr, extra_X], ignore_index=True)
    y_tr = pd.concat([y_tr, extra_y], ignore_index=True)

    # Every feature column is declared categorical for CatBoost.
    cat_model = CatBoostClassifier(
        **cat_params,
        random_state=42,
        logging_level='Silent',
        cat_features=main_test.columns.values,
    )
    cat_model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=False)

    stack_preds = cat_model.predict_proba(X_val)[:, 1]
    test_preds = cat_model.predict_proba(main_test)[:, 1]

    cat_stacking[val_idx] = stack_preds
    cat_stacking_test += test_preds / skf.get_n_splits()

In [9]:
# Level-1 feature tables: one probability column per base learner.
# Train rows hold out-of-fold predictions; test rows hold fold-averaged ones.
# Both frames use the same column names in the same order.
meta_feature_names = ('xgb_proba', 'lgbm_proba', 'cat_proba')

stacking_df = pd.DataFrame(
    dict(zip(meta_feature_names, (xgb_stacking, lgbm_stacking, cat_stacking)))
)

stacking_test_df = pd.DataFrame(
    dict(zip(meta_feature_names, (xgb_stacking_test, lgbm_stacking_test, cat_stacking_test)))
)

In [10]:
# Level-1 (meta) learner: a small XGBoost classifier trained on the
# out-of-fold probabilities of the three base models.
meta_model_params = {
    'n_estimators': 43,
    'alpha': 0.000759453839369262,
    'subsample': 0.8635904939859487,
    'colsample_bytree': 0.7579443772400538,
    'max_depth': 7,
    'min_child_weight': 5,
    'learning_rate': 0.13688008280542863,
    'gamma': 0.19965095682630274,
}

meta_model = XGBClassifier(random_state=42, **meta_model_params)
meta_model.fit(stacking_df, y)

# Positive-class probability for every test row.
meta_test_proba = meta_model.predict_proba(stacking_test_df)
test_preds = meta_test_proba[:, 1]

In [11]:
# Build the submission: the meta-model's test probabilities averaged 50/50
# with a previously saved blend.
sub = pd.read_csv("/kaggle/input/playground-series-s4e7/sample_submission.csv")
blend = pd.read_parquet("/kaggle/input/stacking-xgb-lgbm-catb-ann/submission.parquet")

sub['Response'] = test_preds

# NOTE(review): the average is positional, so it assumes `blend` has the same
# rows in the same order as the sample submission - confirm.
averaged_response = np.mean([sub['Response'], blend['Response']], axis=0)
sub['Response'] = averaged_response

sub.to_csv('submission.csv', index=False)