In [1]:
# !pip install -U -q kaggle
# !mkdir -p ~/.kaggle
# !echo '{"username":"harrierdb","key":"<KAGGLE_API_KEY>"}' > ~/.kaggle/kaggle.json  # key redacted: never commit a real API token; supply it via an environment variable or Kaggle secrets
# !chmod 600 ~/.kaggle/kaggle.json
 
# !kaggle competitions download -c tabular-playground-series-oct-2021 --force

In [2]:
import pandas as pd
import numpy as np
!pip install catboost
import gc

In [3]:
# Read the competition train/test tables; the first CSV column is used as the index.
train_csv_path = '../input/tabular-playground-series-oct-2021/train.csv'
test_csv_path = '../input/tabular-playground-series-oct-2021/test.csv'

data = pd.read_csv(train_csv_path, index_col=0)
test = pd.read_csv(test_csv_path, index_col=0)


In [4]:
# Separate the label column from the feature columns.
feats = data.drop(columns='target')
target = data['target']

# The combined frame is not needed once it has been split; release its memory.
del data
gc.collect()

In [5]:
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Shared 5-fold shuffled splitter; the fixed seed keeps fold membership
# identical for every model that is trained with it.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=2021,
)


In [6]:
# Hyper-parameters passed to LGBMClassifier.
lgb_params = {
    'objective': 'binary',
    'n_estimators': 20000,
    'random_state': 42,
    'learning_rate': 8e-3,
    # row / column sampling
    'subsample': 0.6,
    'subsample_freq': 1,
    'colsample_bytree': 0.4,
    # regularisation
    'reg_alpha': 10.0,
    'reg_lambda': 1e-1,
    'min_child_weight': 256,
    'min_child_samples': 500,
    'device': 'gpu',
}

# Hyper-parameters passed to XGBClassifier.
xgb_params = {
    'n_estimators': 10000,
    'learning_rate': 0.03689407512484644,
    'max_depth': 8,
    'colsample_bytree': 0.3723914688159835,
    'subsample': 0.780714581166012,
    'eval_metric': 'auc',
    'use_label_encoder': False,
    'gamma': 0,
    'reg_lambda': 50.0,
    # GPU training / prediction settings
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'predictor': 'gpu_predictor',
    'random_state': 42,
}

# Hyper-parameters passed to CatBoostClassifier.
cat_params = {
    'iterations': 17298,
    'learning_rate': 0.03429054860458741,
    'reg_lambda': 0.3242286463210283,
    'subsample': 0.9433911589913944,
    'random_strength': 22.4849972385133,
    'depth': 8,
    'min_data_in_leaf': 4,
    'leaf_estimation_iterations': 8,
    'task_type': 'GPU',
    'bootstrap_type': 'Poisson',
    'verbose': 500,
    'early_stopping_rounds': 200,
    'eval_metric': 'AUC',
}
# Build one estimator per boosting library from its parameter dict.
lgb, xgb, cat = (
    LGBMClassifier(**lgb_params),
    XGBClassifier(**xgb_params),
    CatBoostClassifier(**cat_params),
)

In [7]:


def get_oof(feats, target, test, kfold, clf):
    """Cross-validate ``clf`` and collect out-of-fold and averaged test predictions.

    Parameters
    ----------
    feats : pandas.DataFrame
        Training features, one row per sample.
    target : pandas.Series
        Labels aligned row-for-row with ``feats``.
    test : pandas.DataFrame
        Features that every fold model predicts on.
    kfold : splitter
        Object exposing ``split(feats)`` (yielding positional train/validation
        index arrays) and an ``n_splits`` attribute.
    clf : estimator
        Classifier whose ``fit`` accepts ``eval_set``, ``verbose`` and
        ``early_stopping_rounds`` keyword arguments and which implements
        ``predict_proba``. The same instance is refit on every fold.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_preds, sub_preds)``: the column-1 ``predict_proba`` score for
        each training row (predicted by the fold that held it out), and the
        test scores summed over folds and divided by ``kfold.n_splits``.

    Side effects
    ------------
    Prints the ROC AUC of the out-of-fold predictions.
    """
    oof_preds = np.zeros(feats.shape[0])
    sub_preds = np.zeros(test.shape[0])

    for train_idx, valid_idx in kfold.split(feats):
        # The splitter yields *positions*, so rows must be selected with
        # ``iloc``. Label-based ``loc`` only works when the index happens to
        # be exactly 0..n-1 and otherwise raises or picks the wrong rows.
        train_X, train_y = feats.iloc[train_idx], target.iloc[train_idx]
        valid_X, valid_y = feats.iloc[valid_idx], target.iloc[valid_idx]

        # NOTE(review): passing ``early_stopping_rounds``/``verbose`` to
        # ``fit`` depends on the installed library versions — confirm before
        # upgrading lightgbm/xgboost.
        clf.fit(train_X, train_y, eval_set=[(valid_X, valid_y)], verbose=500, early_stopping_rounds=500)

        oof_preds[valid_idx] = clf.predict_proba(valid_X)[:, 1]
        sub_preds += clf.predict_proba(test)[:, 1]

        # Free the fold slices before the next (potentially large) fit.
        del train_X, train_y, valid_X, valid_y
        gc.collect()

    evaluation_result = roc_auc_score(target, oof_preds)
    print('*'*10)
    print('roc auc score:', evaluation_result)
    print('*'*20)

    sub_preds_result = sub_preds / kfold.n_splits
    return oof_preds, sub_preds_result


In [8]:
# Out-of-fold and test predictions for the LightGBM and XGBoost models
# (CatBoost is run in its own cell).
oof_preds_1, sub_preds_1 = get_oof(feats=feats, target=target, test=test, kfold=kfold, clf=lgb)
oof_preds_2, sub_preds_2 = get_oof(feats=feats, target=target, test=test, kfold=kfold, clf=xgb)


In [9]:
# Out-of-fold and test predictions for the CatBoost model.
# Author's note: adjust eval_set = [(valid_X, valid_y)]
oof_preds_3, sub_preds_3 = get_oof(feats=feats, target=target, test=test, kfold=kfold, clf=cat)

In [10]:
from sklearn.linear_model import RidgeClassifier

def _positive_class_proba(clf, X):
    """Squash ``clf.decision_function(X)`` through a logistic sigmoid.

    For a binary target this equals column 1 of scikit-learn's private
    ``_predict_proba_lr`` but relies only on the public API.
    """
    scores = np.asarray(clf.decision_function(X), dtype=float)
    # sigmoid(s) = exp(-log(1 + exp(-s))); logaddexp avoids overflow for large |s|.
    return np.exp(-np.logaddexp(0.0, -scores))


def stack_model(train_stack, test_stack, y, cv=None):
    """Fit a cross-validated ``RidgeClassifier`` meta-model on stacked predictions.

    Parameters
    ----------
    train_stack : array-like, shape (n_train, n_models)
        Out-of-fold predictions of the base models, one column per model
        (``[oof_1, oof_2, ..., oof_n]`` stacked column-wise).
    test_stack : array-like, shape (n_test, n_models)
        Test predictions of the base models in the same column order
        (``[predictions_1, ..., predictions_n]`` stacked column-wise).
    y : array-like, shape (n_train,)
        Training labels; assumed binary (a 1-D decision function is expected).
    cv : splitter, optional
        Object exposing ``split(X, y)`` and ``n_splits``. Defaults to the
        notebook-level ``kfold`` so existing calls keep their behaviour.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof, predictions)``: out-of-fold meta-model scores for the training
        rows and the fold-averaged meta-model scores for the test rows.

    Side effects
    ------------
    Prints the mean per-fold ROC AUC.
    """
    splitter = kfold if cv is None else cv

    # Work on plain arrays so the positional fold indices are never
    # interpreted as index labels (``y`` is typically a pandas Series whose
    # index is the sample id, not the row position).
    train_stack = np.asarray(train_stack)
    test_stack = np.asarray(test_stack)
    y = np.asarray(y)

    oof = np.zeros((train_stack.shape[0],))
    predictions = np.zeros((test_stack.shape[0],))
    scores = []

    for trn_idx, val_idx in splitter.split(train_stack, y):
        trn_data, trn_y = train_stack[trn_idx], y[trn_idx]
        val_data, val_y = train_stack[val_idx], y[val_idx]

        clf = RidgeClassifier(random_state=2099)
        clf.fit(trn_data, trn_y)

        oof[val_idx] = _positive_class_proba(clf, val_data)
        predictions += _positive_class_proba(clf, test_stack) / splitter.n_splits

        scores.append(roc_auc_score(val_y, oof[val_idx]))
    print('mean: ', np.mean(scores))

    return oof, predictions



In [11]:
# One column per base model: test predictions and out-of-fold predictions,
# in the same model order.
pred_matrix = np.column_stack([sub_preds_1, sub_preds_2, sub_preds_3])
oof_matrix = np.column_stack([oof_preds_1, oof_preds_2, oof_preds_3])

# Fit the meta-model on the out-of-fold matrix and score the test matrix.
oof_stack, predictions_stack = stack_model(oof_matrix, pred_matrix, target)

In [12]:
# Assemble the submission table (test index as 'id', stacked score as 'target')
# and write it without the DataFrame's own row index.
submission_columns = {'id': test.index, 'target': predictions_stack}
result = pd.DataFrame(submission_columns)
result.to_csv('./stack_model.csv', index=False)