In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import scipy.stats as st
from scipy.stats import probplot, ks_2samp
from typing import List, Optional
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
import xgboost as xgb
import lightgbm as lgbm
import catboost as catb
from scipy.stats import gmean, rankdata

from sklearn.preprocessing import LabelEncoder

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion

In [2]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]

In [3]:
def feature_selector(X):
    continuous_columns = X.select_dtypes(include=[np.number]).drop(['APPLICATION_NUMBER'], 1)
    continuous_columns = continuous_columns.columns.to_list()
    #continuous_columns = continuous_columns[1:]

    cat_feature_num = [
        feature for feature in continuous_columns
        if len(X[feature].unique())< 50
    ]
    categorical_columns = X.select_dtypes(include=[np.object]).columns.to_list()
    categorical_columns = list(categorical_columns + cat_feature_num)
    #categorical_columns = categorical_columns[1:]

    continuous_columns = list(set(continuous_columns) - set(categorical_columns))
    
    dummies_features = pd.get_dummies(X[categorical_columns])
    X = pd.concat([X, dummies_features], axis=1)
    dummies_features = dummies_features.columns.tolist()
    X = X.drop(categorical_columns, axis=1)

    return X

In [4]:
def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: Optional[str] = None,
                          suffix: Optional[str] = None,
                          ) -> pd.DataFrame:
    """
    Построение агрегаций для числовых признаков.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Выборка для построения агрегаций.

    groupby_id: str
        Название ключа, по которому нужно произвести группировку.

    aggs: dict
        Словарь с названием признака и списка функций.
        Ключ словаря - название признака, который используется для
        вычисления агрегаций, значение словаря - список с названием
        функций для вычисления агрегаций.

    prefix: str, optional, default = None
        Префикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    suffix: str, optional, default = None
        Суффикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    stats: pandas.core.frame.DataFrame
        Выборка с рассчитанными агрегациями.

    """
    if not prefix:
        prefix = ""
    if not suffix:
        suffix = ""

    data_grouped = data.groupby(groupby_id)
    stats = data_grouped.agg(aggs)
    stats.columns = [f"{prefix}{feature}_{stat}{suffix}".upper() for feature, stat in stats]
    stats = stats.reset_index()

    return stats

In [5]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)

                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [6]:
def lightgbm_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    if not categorical:
        categorical = "auto"

    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        model = lgbm.LGBMClassifier(**params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="auc", verbose=50, early_stopping_rounds=100,
            categorical_feature=categorical
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds


def xgboost_cross_validation(params, X, y, cv, categorical=None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    encoders: dict
        Список с объектами LabelEncoders.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, encoders = [], {}
    oof_preds = np.zeros(X.shape[0])

    if categorical:
        for feature in categorical:
            encoder = LabelEncoder()
            X[feature] = encoder.fit_transform(X[feature].astype("str").fillna("NA"))
            encoders[feature] = encoder

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]
        dtrain = xgb.DMatrix(x_train, y_train)
        dvalid = xgb.DMatrix(x_valid, y_valid)

        model = xgb.train(
            params=params,
            dtrain=dtrain,
            maximize=True,
            num_boost_round=10000,
            early_stopping_rounds=25,
            evals=[(dtrain, "train"), (dvalid, "valid")],
            verbose_eval=10,
        )
        oof_preds[valid_idx] = model.predict(dvalid)
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        estimators.append(model)

    return estimators, encoders, oof_preds

def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    encoders: dict
        Список с объектами LabelEncoders.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    if categorical:
        for feature in categorical:
            encoder = LabelEncoder()
            X[feature] = encoder.fit_transform(X[feature].astype("str").fillna("NA"))
            encoders[feature] = encoder

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]
        dtrain = catb.Pool(x_train, y_train)
        dvalid = catb.Pool(x_valid, y_valid)

        model = catb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            verbose=50, early_stopping_rounds=100
            #categorical_feature=categorical
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

    return estimators, encoders, oof_preds

In [7]:
train = pd.read_csv('./prepared_data/train.csv')
test = pd.read_csv('./prepared_data/test.csv')

In [8]:
reduce_mem_usage(train)
reduce_mem_usage(test)

Memory usage of dataframe is 225.10 MB
Memory usage after optimization is: 111.92 MB
Decreased by 50.3%
Memory usage of dataframe is 336.40 MB
Memory usage after optimization is: 167.73 MB
Decreased by 50.1%


Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE,AMOUNT_ANNUITY_x,AMT_APPLICATION,AMOUNT_CREDIT_x,AMOUNT_PAYMENT,AMOUNT_GOODS_PAYMENT,DAYS_DECISION,SELLERPLACE_AREA,CNT_PAYMENT,...,NUM_INSTALMENT_VERSION_count_APPLICATION_NUMBER,DAYS_ENTRY_PAYMENT_mean-APPLICATION_NUMBER,DAYS_ENTRY_PAYMENT_std_APPLICATION_NUMBER,DAYS_ENTRY_PAYMENT_count_APPLICATION_NUMBER,DAYS_INSTALMENT_mean-APPLICATION_NUMBER,DAYS_INSTALMENT_std_APPLICATION_NUMBER,DAYS_INSTALMENT_count_APPLICATION_NUMBER,PREV_APPLICATION_NUMBER_mean-APPLICATION_NUMBER_y,PREV_APPLICATION_NUMBER_std_APPLICATION_NUMBER,PREV_APPLICATION_NUMBER_count_APPLICATION_NUMBER_y
0,123724268,Cash,16837.806641,143007.750000,143007.750000,0.000000,286015.500000,514.500,36.750000,6.000000,...,638510.00000,123605608.0,102651.070312,624.500,123600960.0,102610.109375,646.500,123724272.0,0.0,2.000000
1,123456549,Cash,26243.730469,321183.593750,393309.000000,1.170000,321183.593750,729.500,67.000000,24.000000,...,321397.00000,123602512.0,103926.343750,572.000,123606848.0,102817.453125,641.000,123456552.0,,1.000000
2,123428178,Credit Card,32940.507812,386057.250000,438194.250000,3304.125000,386057.250000,1078.000,214.625000,15.500000,...,638510.00000,123601048.0,100741.906250,606.000,123592176.0,100686.375000,563.000,123428176.0,,1.000000
3,123619984,Cash,13925.384766,397755.000000,357979.500000,39775.500000,397755.000000,637.000,25.000000,36.000000,...,,,,,,,,,,
4,123671104,Cash,7451.520020,50146.957031,45373.429688,7230.285156,62683.695312,1294.000,1176.599976,9.335938,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165136,123487967,Cash,13829.872070,82500.000000,98340.000000,,247500.000000,219.625,-1.000000,24.000000,...,,,,,,,,,,
165137,123536402,Cash,18183.599609,181498.500000,163347.750000,18150.750000,181498.500000,269.000,65.000000,10.000000,...,638510.00000,123603992.0,103860.562500,722.000,123607064.0,102914.617188,734.000,123536400.0,0.0,2.000000
165138,123718238,Cash,19887.287109,231750.000000,260080.656250,0.000000,257500.000000,1835.000,-0.700000,16.796875,...,410275.34375,123600680.0,102069.203125,415.250,123598784.0,102830.515625,402.750,123718240.0,0.0,8.296875
165139,123631557,Cash,20090.880859,260456.781250,303534.000000,2322.000000,260456.781250,1149.000,18.571428,23.140625,...,638510.00000,123600240.0,102042.789062,531.000,123601992.0,102946.281250,508.750,123631560.0,0.0,4.199219


In [9]:
def type_selector(df):

    continuous_columns = df.select_dtypes(include=[np.number]).drop(['APPLICATION_NUMBER'], 1)
    continuous_columns = continuous_columns.columns.to_list()
    #continuous_columns = continuous_columns[1:]

    cat_feature_num = [
        feature for feature in continuous_columns
        if len(df[feature].unique())< 40
    ]
    categorical_columns = df.select_dtypes(include=[np.object]).columns.to_list()
    categorical_columns = list(categorical_columns + cat_feature_num)
    #categorical_columns = categorical_columns[1:]

    continuous_columns = list(set(continuous_columns) - set(categorical_columns))
    
    return categorical_columns, continuous_columns



In [10]:
cat_col, con_col =  type_selector(train.drop("TARGET", 1))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_columns = df.select_dtypes(include=[np.object]).columns.to_list()


In [11]:
dummies_features = pd.get_dummies(train['NAME_CONTRACT_TYPE'])
train = pd.concat([train, dummies_features], axis=1)
dummies_features = dummies_features.columns.tolist()
train = train.drop("NAME_CONTRACT_TYPE", axis=1)

In [12]:
dummies_features = pd.get_dummies(test['NAME_CONTRACT_TYPE'])
test = pd.concat([test, dummies_features], axis=1)
dummies_features = dummies_features.columns.tolist()
test = test.drop("NAME_CONTRACT_TYPE", axis=1)

In [13]:
train

Unnamed: 0,APPLICATION_NUMBER,TARGET,AMOUNT_ANNUITY_x,AMT_APPLICATION,AMOUNT_CREDIT_x,AMOUNT_PAYMENT,AMOUNT_GOODS_PAYMENT,DAYS_DECISION,SELLERPLACE_AREA,CNT_PAYMENT,...,DAYS_ENTRY_PAYMENT_std_APPLICATION_NUMBER,DAYS_ENTRY_PAYMENT_count_APPLICATION_NUMBER,DAYS_INSTALMENT_mean-APPLICATION_NUMBER,DAYS_INSTALMENT_std_APPLICATION_NUMBER,DAYS_INSTALMENT_count_APPLICATION_NUMBER,PREV_APPLICATION_NUMBER_mean-APPLICATION_NUMBER_y,PREV_APPLICATION_NUMBER_std_APPLICATION_NUMBER,PREV_APPLICATION_NUMBER_count_APPLICATION_NUMBER_y,Cash,Credit Card
0,123687442,0,7703.805176,68787.179688,67592.101562,4813.080078,68787.179688,1221.0,86.666664,10.664062,...,104850.492188,153.000,123601368.0,104226.812500,161.000,123687440.0,0.0,2.0,1,0
1,123597908,1,27919.001953,331908.750000,434949.750000,0.000000,331908.750000,659.0,38.750000,26.500000,...,99689.250000,721.000,123599568.0,100609.742188,689.000,123597904.0,,1.0,1,0
2,123526683,0,32538.476562,353857.500000,402818.250000,18814.500000,707715.000000,1423.0,16.833334,18.000000,...,101979.968750,357.500,123602040.0,103538.539062,328.500,123526680.0,0.0,1.5,1,0
3,123710391,1,4237.694824,61206.750000,59661.000000,2250.000000,61206.750000,1152.0,2058.500000,14.000000,...,104155.031250,231.000,123593160.0,105341.585938,212.000,123710392.0,,1.0,1,0
4,123590329,1,14583.622070,266842.000000,308073.500000,1462.500000,300197.250000,741.0,55.777779,30.750000,...,102476.609375,279.000,123598328.0,102099.812500,285.250,123590328.0,0.0,4.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110088,123458312,0,13051.197266,66922.867188,74327.398438,0.832500,66922.867188,1308.0,1126.599976,7.199219,...,103644.585938,244.375,123601632.0,102128.742188,243.625,123458312.0,0.0,3.0,1,0
110089,123672463,0,9349.533203,68150.523438,67411.382812,6643.649902,88595.687500,585.0,26.692308,10.203125,...,101471.093750,439.750,123599232.0,101529.312500,460.500,123672464.0,0.0,2.0,1,0
110090,123723001,0,4000.297607,30363.750000,28244.250000,4281.750000,30363.750000,464.5,156.000000,9.000000,...,104741.265625,614.000,123604768.0,101463.195312,710.000,123723000.0,,1.0,1,0
110091,123554358,0,24760.710938,299340.000000,327861.000000,0.000000,299340.000000,789.0,66.000000,21.000000,...,,,,,,,,,1,0


In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165141 entries, 0 to 165140
Columns: 268 entries, APPLICATION_NUMBER to Credit Card
dtypes: float32(265), int32(1), uint8(2)
memory usage: 167.9 MB


In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110093 entries, 0 to 110092
Columns: 269 entries, APPLICATION_NUMBER to Credit Card
dtypes: float32(265), int32(1), int8(1), uint8(2)
memory usage: 112.0 MB


In [16]:
X_train, X_test, y_train, y_test = train_test_split(train.drop("TARGET",1), 
                                                    train['TARGET'], random_state=0)

In [17]:
data = pd.DataFrame()

In [18]:
params_lgbm = {"boosting_type": "gbdt",
                  "objective": "binary",
                  "metric": "auc",
                  "num_boost_round": 10000,  
                  "learning_rate": 0.01,          
                  "class_weight": 'balanced',
                  "max_depth": 4,
                  "n_estimators": 5000,
                  "n_jobs": -1,
                  "seed": 27}

cv = KFold(n_splits=10, random_state=435, shuffle=True)
target, data = train["TARGET"], train.drop("TARGET", 1)

In [19]:
estimators_lgbm_basic, oof_preds_lgbm_basic = lightgbm_cross_validation(
    params_lgbm, data, target, cv
)

Mon Apr 12 15:29:50 2021, Cross-Validation, 110093 rows, 268 cols




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.69897
[100]	valid_0's auc: 0.708889
[150]	valid_0's auc: 0.714404
[200]	valid_0's auc: 0.718817
[250]	valid_0's auc: 0.720377
[300]	valid_0's auc: 0.720784
[350]	valid_0's auc: 0.723016
[400]	valid_0's auc: 0.724107
[450]	valid_0's auc: 0.724832
[500]	valid_0's auc: 0.724053
Early stopping, best iteration is:
[449]	valid_0's auc: 0.725064
Fold 1, Valid score = 0.72506




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.701526
[100]	valid_0's auc: 0.70953
[150]	valid_0's auc: 0.711478
[200]	valid_0's auc: 0.715993
[250]	valid_0's auc: 0.718286
[300]	valid_0's auc: 0.71945
[350]	valid_0's auc: 0.720837
[400]	valid_0's auc: 0.721456
[450]	valid_0's auc: 0.722854
[500]	valid_0's auc: 0.722936
[550]	valid_0's auc: 0.722562
Early stopping, best iteration is:
[477]	valid_0's auc: 0.723081
Fold 2, Valid score = 0.72308




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.70047
[100]	valid_0's auc: 0.709165
[150]	valid_0's auc: 0.71345
[200]	valid_0's auc: 0.716442
[250]	valid_0's auc: 0.720072
[300]	valid_0's auc: 0.722655
[350]	valid_0's auc: 0.724705
[400]	valid_0's auc: 0.725338
[450]	valid_0's auc: 0.726119
[500]	valid_0's auc: 0.72564
[550]	valid_0's auc: 0.725577
Early stopping, best iteration is:
[465]	valid_0's auc: 0.726454
Fold 3, Valid score = 0.72645




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.699979
[100]	valid_0's auc: 0.707562
[150]	valid_0's auc: 0.712409
[200]	valid_0's auc: 0.716095
[250]	valid_0's auc: 0.719034
[300]	valid_0's auc: 0.721542
[350]	valid_0's auc: 0.72432
[400]	valid_0's auc: 0.725556
[450]	valid_0's auc: 0.726567
[500]	valid_0's auc: 0.727045
[550]	valid_0's auc: 0.726067
Early stopping, best iteration is:
[482]	valid_0's auc: 0.727232
Fold 4, Valid score = 0.72723




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.683143
[100]	valid_0's auc: 0.690708
[150]	valid_0's auc: 0.695327
[200]	valid_0's auc: 0.6981
[250]	valid_0's auc: 0.701586
[300]	valid_0's auc: 0.702477
[350]	valid_0's auc: 0.704137
[400]	valid_0's auc: 0.705913
[450]	valid_0's auc: 0.706699
[500]	valid_0's auc: 0.706864
[550]	valid_0's auc: 0.708168
[600]	valid_0's auc: 0.708112
Early stopping, best iteration is:
[548]	valid_0's auc: 0.708211
Fold 5, Valid score = 0.70821




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.670138
[100]	valid_0's auc: 0.679458
[150]	valid_0's auc: 0.684414
[200]	valid_0's auc: 0.687354
[250]	valid_0's auc: 0.690088
[300]	valid_0's auc: 0.693979
[350]	valid_0's auc: 0.696831
[400]	valid_0's auc: 0.697783
[450]	valid_0's auc: 0.698025
[500]	valid_0's auc: 0.697789
[550]	valid_0's auc: 0.698295
[600]	valid_0's auc: 0.698132
[650]	valid_0's auc: 0.69861
[700]	valid_0's auc: 0.698616
[750]	valid_0's auc: 0.698598
Early stopping, best iteration is:
[686]	valid_0's auc: 0.698813
Fold 6, Valid score = 0.69881




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.696836
[100]	valid_0's auc: 0.704256
[150]	valid_0's auc: 0.709541
[200]	valid_0's auc: 0.71311
[250]	valid_0's auc: 0.716396
[300]	valid_0's auc: 0.717695
[350]	valid_0's auc: 0.718964
[400]	valid_0's auc: 0.719349
[450]	valid_0's auc: 0.720119
[500]	valid_0's auc: 0.721009
[550]	valid_0's auc: 0.721448
[600]	valid_0's auc: 0.720942
[650]	valid_0's auc: 0.720953
Early stopping, best iteration is:
[550]	valid_0's auc: 0.721448
Fold 7, Valid score = 0.72145




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.674664
[100]	valid_0's auc: 0.678734
[150]	valid_0's auc: 0.685441
[200]	valid_0's auc: 0.690558
[250]	valid_0's auc: 0.692071
[300]	valid_0's auc: 0.694886
[350]	valid_0's auc: 0.69662
[400]	valid_0's auc: 0.697451
[450]	valid_0's auc: 0.698626
[500]	valid_0's auc: 0.69997
[550]	valid_0's auc: 0.700796
[600]	valid_0's auc: 0.70134
[650]	valid_0's auc: 0.701445
[700]	valid_0's auc: 0.701562
[750]	valid_0's auc: 0.701657
[800]	valid_0's auc: 0.701885
[850]	valid_0's auc: 0.702275
[900]	valid_0's auc: 0.70259
[950]	valid_0's auc: 0.702685
[1000]	valid_0's auc: 0.702732
[1050]	valid_0's auc: 0.702681
Early stopping, best iteration is:
[970]	valid_0's auc: 0.702793
Fold 8, Valid score = 0.70279




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.701214
[100]	valid_0's auc: 0.706243
[150]	valid_0's auc: 0.71127
[200]	valid_0's auc: 0.714173
[250]	valid_0's auc: 0.716077
[300]	valid_0's auc: 0.719194
[350]	valid_0's auc: 0.719383
[400]	valid_0's auc: 0.719854
[450]	valid_0's auc: 0.720018
[500]	valid_0's auc: 0.720227
[550]	valid_0's auc: 0.720391
[600]	valid_0's auc: 0.720831
[650]	valid_0's auc: 0.722357
[700]	valid_0's auc: 0.722593
[750]	valid_0's auc: 0.722436
Early stopping, best iteration is:
[684]	valid_0's auc: 0.722668
Fold 9, Valid score = 0.72267




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.691292
[100]	valid_0's auc: 0.695462
[150]	valid_0's auc: 0.698847
[200]	valid_0's auc: 0.702587
[250]	valid_0's auc: 0.703415
[300]	valid_0's auc: 0.705426
[350]	valid_0's auc: 0.708653
[400]	valid_0's auc: 0.709393
[450]	valid_0's auc: 0.709647
[500]	valid_0's auc: 0.710585
[550]	valid_0's auc: 0.711133
[600]	valid_0's auc: 0.711262
[650]	valid_0's auc: 0.711608
[700]	valid_0's auc: 0.710897
Early stopping, best iteration is:
[643]	valid_0's auc: 0.711716
Fold 10, Valid score = 0.71172
Score by each fold: [0.72506, 0.72308, 0.72645, 0.72723, 0.70821, 0.69881, 0.72145, 0.70279, 0.72267, 0.71172]


In [20]:
xgb_params = {
    "booster": "gbtree", 
    "objective": "binary:logistic", 
    "eval_metric": "auc", 
    "learning_rate": 0.05,  
    "reg_lambda": 100, 
    "max_depth": 4, 
    "gamma": 10, 
    "nthread": -1, 
    "seed": 27} 


xgb_cv = KFold(n_splits=10, random_state=42, shuffle=True)

In [21]:
xgb_estimators_basic, _, xgb_oof_basic = xgboost_cross_validation(
    xgb_params, data, target, xgb_cv)

Mon Apr 12 15:39:16 2021, Cross-Validation, 110093 rows, 268 cols
[0]	train-auc:0.58923	valid-auc:0.58548
[10]	train-auc:0.65897	valid-auc:0.66865
[20]	train-auc:0.68246	valid-auc:0.69386
[30]	train-auc:0.68896	valid-auc:0.70070
[40]	train-auc:0.69584	valid-auc:0.70655
[50]	train-auc:0.70219	valid-auc:0.71368
[60]	train-auc:0.70807	valid-auc:0.71874
[70]	train-auc:0.71297	valid-auc:0.72447
[80]	train-auc:0.71659	valid-auc:0.72834
[90]	train-auc:0.72013	valid-auc:0.73039
[100]	train-auc:0.72193	valid-auc:0.73226
[110]	train-auc:0.72398	valid-auc:0.73418
[120]	train-auc:0.72563	valid-auc:0.73625
[130]	train-auc:0.72677	valid-auc:0.73729
[140]	train-auc:0.72810	valid-auc:0.73743
[150]	train-auc:0.72954	valid-auc:0.73760
[160]	train-auc:0.73073	valid-auc:0.73765
[166]	train-auc:0.73145	valid-auc:0.73766
Fold 1, Valid score = 0.73767
[0]	train-auc:0.58851	valid-auc:0.59281
[10]	train-auc:0.67187	valid-auc:0.68161
[20]	train-auc:0.68369	valid-auc:0.69088
[30]	train-auc:0.69029	valid-auc:0.69

In [22]:
params_catb = {
    "n_estimators": 10000,
    "loss_function": "Logloss",
    "task_type": "CPU",
    "learning_rate":0.01,
    "max_bin": 30,
    "verbose": 200,
    "max_depth": 7,
    "l2_leaf_reg": 40,
    "early_stopping_rounds": 50,
    "thread_count": -1,
    "random_seed": 42,
    "eval_metric":"AUC",
}


cv = KFold(n_splits=10, random_state=435, shuffle=True)
target, data = train["TARGET"], train.drop("TARGET", 1)

In [23]:
catb_estimators_basic, catb_oof_basic = catboost_cross_validation(
    params_catb, data, target, cv)

Mon Apr 12 16:11:16 2021, Cross-Validation, 110093 rows, 268 cols
0:	test: 0.4845687	best: 0.4845687 (0)	total: 119ms	remaining: 19m 46s
50:	test: 0.6713744	best: 0.6713744 (50)	total: 3.58s	remaining: 11m 38s
100:	test: 0.6979830	best: 0.6987738 (92)	total: 7.21s	remaining: 11m 46s
150:	test: 0.7050875	best: 0.7052856 (148)	total: 10.5s	remaining: 11m 27s
200:	test: 0.7098751	best: 0.7098751 (200)	total: 14.2s	remaining: 11m 30s
250:	test: 0.7138888	best: 0.7138888 (250)	total: 17.8s	remaining: 11m 32s
300:	test: 0.7162795	best: 0.7163543 (299)	total: 21.5s	remaining: 11m 31s
350:	test: 0.7182079	best: 0.7182079 (350)	total: 25.1s	remaining: 11m 29s
400:	test: 0.7193122	best: 0.7193122 (400)	total: 28.7s	remaining: 11m 27s
450:	test: 0.7209347	best: 0.7209347 (450)	total: 32.4s	remaining: 11m 25s
500:	test: 0.7219107	best: 0.7219107 (500)	total: 35.9s	remaining: 11m 20s
550:	test: 0.7223847	best: 0.7223938 (548)	total: 39.5s	remaining: 11m 17s
600:	test: 0.7229536	best: 0.7229930 (591

In [24]:
cat_col, con_col =  type_selector(train.drop("TARGET", 1))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_columns = df.select_dtypes(include=[np.object]).columns.to_list()


In [25]:
def freq_encoder(df,features_list):
    
    for item in features_list:
        freq_encoder = df[item].value_counts(normalize=True)
        df[item+"_freq_enc"] = df[item].map(freq_encoder)
    
    return df

In [26]:
train_freq_encoder = freq_encoder(train.drop('TARGET',1), cat_col+con_col)
test_freq_encoder = freq_encoder(test, cat_col+con_col)
train_freq_encoder["TARGET"] = train["TARGET"]

cv = KFold(n_splits=10, random_state=435, shuffle=True)
target, data = train_freq_encoder["TARGET"], train_freq_encoder.drop("TARGET", 1)

In [27]:
estimators_lgbm_freg_enc, oof_preds_lgbm_freg_enc = lightgbm_cross_validation(
    params_lgbm, data, target, cv
)

Mon Apr 12 16:27:47 2021, Cross-Validation, 110093 rows, 535 cols




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.699523
[100]	valid_0's auc: 0.708728
[150]	valid_0's auc: 0.713855
[200]	valid_0's auc: 0.719379
[250]	valid_0's auc: 0.721175
[300]	valid_0's auc: 0.721769
[350]	valid_0's auc: 0.722755
[400]	valid_0's auc: 0.723439
[450]	valid_0's auc: 0.724294
[500]	valid_0's auc: 0.724835
[550]	valid_0's auc: 0.724859
[600]	valid_0's auc: 0.725141
[650]	valid_0's auc: 0.725479
[700]	valid_0's auc: 0.725837
[750]	valid_0's auc: 0.725659
Early stopping, best iteration is:
[698]	valid_0's auc: 0.725893
Fold 1, Valid score = 0.72589




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.700582
[100]	valid_0's auc: 0.70871
[150]	valid_0's auc: 0.710933
[200]	valid_0's auc: 0.715259
[250]	valid_0's auc: 0.717256
[300]	valid_0's auc: 0.718297
[350]	valid_0's auc: 0.720377
[400]	valid_0's auc: 0.721074
[450]	valid_0's auc: 0.721609
[500]	valid_0's auc: 0.722234
[550]	valid_0's auc: 0.723198
[600]	valid_0's auc: 0.723661
[650]	valid_0's auc: 0.723272
Early stopping, best iteration is:
[580]	valid_0's auc: 0.723769
Fold 2, Valid score = 0.72377




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.701393
[100]	valid_0's auc: 0.70872
[150]	valid_0's auc: 0.712527
[200]	valid_0's auc: 0.716386
[250]	valid_0's auc: 0.718977
[300]	valid_0's auc: 0.721286
[350]	valid_0's auc: 0.723703
[400]	valid_0's auc: 0.723988
[450]	valid_0's auc: 0.7252
[500]	valid_0's auc: 0.725279
[550]	valid_0's auc: 0.7253
Early stopping, best iteration is:
[483]	valid_0's auc: 0.725864
Fold 3, Valid score = 0.72586




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.700368
[100]	valid_0's auc: 0.707404
[150]	valid_0's auc: 0.711939
[200]	valid_0's auc: 0.715209
[250]	valid_0's auc: 0.718148
[300]	valid_0's auc: 0.719758
[350]	valid_0's auc: 0.721803
[400]	valid_0's auc: 0.72307
[450]	valid_0's auc: 0.72343
[500]	valid_0's auc: 0.723254
Early stopping, best iteration is:
[438]	valid_0's auc: 0.723642
Fold 4, Valid score = 0.72364




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.683093
[100]	valid_0's auc: 0.690672
[150]	valid_0's auc: 0.694594
[200]	valid_0's auc: 0.699146
[250]	valid_0's auc: 0.701535
[300]	valid_0's auc: 0.703067
[350]	valid_0's auc: 0.704815
[400]	valid_0's auc: 0.706288
[450]	valid_0's auc: 0.707231
[500]	valid_0's auc: 0.707815
[550]	valid_0's auc: 0.708247
[600]	valid_0's auc: 0.708135
[650]	valid_0's auc: 0.706863
Early stopping, best iteration is:
[564]	valid_0's auc: 0.708454
Fold 5, Valid score = 0.70845




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.670503
[100]	valid_0's auc: 0.680569
[150]	valid_0's auc: 0.684714
[200]	valid_0's auc: 0.687978
[250]	valid_0's auc: 0.691135
[300]	valid_0's auc: 0.693422
[350]	valid_0's auc: 0.69563
[400]	valid_0's auc: 0.698156
[450]	valid_0's auc: 0.698989
[500]	valid_0's auc: 0.699782
[550]	valid_0's auc: 0.700044
[600]	valid_0's auc: 0.700616
[650]	valid_0's auc: 0.700563
Early stopping, best iteration is:
[597]	valid_0's auc: 0.700678
Fold 6, Valid score = 0.70068




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.695901
[100]	valid_0's auc: 0.703913
[150]	valid_0's auc: 0.708458
[200]	valid_0's auc: 0.712934
[250]	valid_0's auc: 0.715151
[300]	valid_0's auc: 0.716517
[350]	valid_0's auc: 0.717235
[400]	valid_0's auc: 0.7174
[450]	valid_0's auc: 0.718938
[500]	valid_0's auc: 0.719366
[550]	valid_0's auc: 0.719939
[600]	valid_0's auc: 0.719723
Early stopping, best iteration is:
[547]	valid_0's auc: 0.720106
Fold 7, Valid score = 0.72011




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.672949
[100]	valid_0's auc: 0.678448
[150]	valid_0's auc: 0.684852
[200]	valid_0's auc: 0.689231
[250]	valid_0's auc: 0.690864
[300]	valid_0's auc: 0.694223
[350]	valid_0's auc: 0.696715
[400]	valid_0's auc: 0.697103
[450]	valid_0's auc: 0.69865
[500]	valid_0's auc: 0.699649
[550]	valid_0's auc: 0.700628
[600]	valid_0's auc: 0.701486
[650]	valid_0's auc: 0.701559
[700]	valid_0's auc: 0.701846
[750]	valid_0's auc: 0.701579
Early stopping, best iteration is:
[689]	valid_0's auc: 0.701912
Fold 8, Valid score = 0.70191




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.70094
[100]	valid_0's auc: 0.707324
[150]	valid_0's auc: 0.710472
[200]	valid_0's auc: 0.713919
[250]	valid_0's auc: 0.716656
[300]	valid_0's auc: 0.717731
[350]	valid_0's auc: 0.718153
[400]	valid_0's auc: 0.72007
[450]	valid_0's auc: 0.720156
[500]	valid_0's auc: 0.719893
[550]	valid_0's auc: 0.720615
[600]	valid_0's auc: 0.721375
[650]	valid_0's auc: 0.721098
[700]	valid_0's auc: 0.721142
Early stopping, best iteration is:
[600]	valid_0's auc: 0.721375
Fold 9, Valid score = 0.72137




Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.691379
[100]	valid_0's auc: 0.695735
[150]	valid_0's auc: 0.698138
[200]	valid_0's auc: 0.701605
[250]	valid_0's auc: 0.705006
[300]	valid_0's auc: 0.707578
[350]	valid_0's auc: 0.708579
[400]	valid_0's auc: 0.709253
[450]	valid_0's auc: 0.709448
[500]	valid_0's auc: 0.709643
Early stopping, best iteration is:
[437]	valid_0's auc: 0.710116
Fold 10, Valid score = 0.71012
Score by each fold: [0.72589, 0.72377, 0.72586, 0.72364, 0.70845, 0.70068, 0.72011, 0.70191, 0.72137, 0.71012]


In [None]:
xgb_estimators_freg_enc, _, xgb_oof_freg_enc = xgboost_cross_validation(
    xgb_params, data, target, xgb_cv)

Mon Apr 12 16:40:50 2021, Cross-Validation, 110093 rows, 535 cols
[0]	train-auc:0.58923	valid-auc:0.58548
[10]	train-auc:0.67261	valid-auc:0.68255
[20]	train-auc:0.68313	valid-auc:0.69435
[30]	train-auc:0.68853	valid-auc:0.70132
[40]	train-auc:0.69521	valid-auc:0.70576
[50]	train-auc:0.70247	valid-auc:0.71271
[60]	train-auc:0.70762	valid-auc:0.71930
[70]	train-auc:0.71288	valid-auc:0.72451
[80]	train-auc:0.71725	valid-auc:0.72832
[90]	train-auc:0.72030	valid-auc:0.73060
[100]	train-auc:0.72229	valid-auc:0.73286
[110]	train-auc:0.72437	valid-auc:0.73493
[120]	train-auc:0.72585	valid-auc:0.73591
[130]	train-auc:0.72709	valid-auc:0.73638
[140]	train-auc:0.72892	valid-auc:0.73695
[150]	train-auc:0.73053	valid-auc:0.73672
[160]	train-auc:0.73201	valid-auc:0.73638
[166]	train-auc:0.73273	valid-auc:0.73671
Fold 1, Valid score = 0.73671
[0]	train-auc:0.58851	valid-auc:0.59281
[10]	train-auc:0.67187	valid-auc:0.68161
[20]	train-auc:0.68369	valid-auc:0.69077
[30]	train-auc:0.69108	valid-auc:0.69

In [None]:
catb_estimators_freg_enc, catb_oof_freg_enc = catboost_cross_validation(
    params_catb, data, target, cv)

In [None]:
def group_by_stat(df,features_list, num_featires):
    
    for feature in num_featires:
        for item in features_list:
            df[item+"_mean-"+feature] = df.groupby(item)[feature].transform('mean')
            df[item+"_count_"+feature] = df.groupby(item)[feature].transform('count')
            df[item+"_std_"+feature] = df.groupby(item)[feature].transform('std')
            df[item+"_min-"+feature] = df.groupby(item)[feature].transform('min')
            df[item+"_max_"+feature] = df.groupby(item)[feature].transform('max')
    
    return df

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_freq_encoder.drop("TARGET", 1), 
                                                    train_freq_encoder['TARGET'], random_state=0)

In [None]:
%%time

model = lgbm.LGBMClassifier(**params_lgbm)
model.fit(X_train, y_train, verbose=200 , eval_set=[(X_train, y_train), (X_test, y_test)])

In [None]:
feature_importances = pd.DataFrame(zip(X_train.columns, 
                                       model.feature_importances_ / model.feature_importances_.sum()), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False, inplace=True)

In [None]:
feature_above_zero = feature_importances[feature_importances['importance'] > 0]
feature_above_zero_filtred = feature_above_zero['feature_name'].tolist()
feature_above_zero

In [None]:
target, data = train_freq_encoder["TARGET"], train_freq_encoder[feature_above_zero_filtred]

In [None]:
estimators_lgbm_above_zero_, oof_preds_lgbm_above_zero_ = lightgbm_cross_validation(
    params_lgbm, data, target, cv
)

In [None]:
xgb_estimators_above_zero, _, xgb_oof_above_zero = xgboost_cross_validation(
    xgb_params, data, target, xgb_cv)

In [None]:
catb_estimators_above_zero, catb_oof_above_zero = catboost_cross_validation(
    params_catb, data, target, cv)

In [None]:
cat_col, con_col =  type_selector(train_freq_encoder[feature_above_zero_filtred])

In [None]:
feature_importances = pd.DataFrame(zip(X_train.columns, 
                                       model.feature_importances_ / model.feature_importances_.sum()), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False, inplace=True)

In [None]:
feature_above_zero = feature_importances[feature_importances['importance'] > 0.02]
feature_above_zero_agg_002 = feature_above_zero['feature_name'].tolist()

In [None]:
feature_above_zero_agg_002

In [None]:
train_feature_imp = train_freq_encoder[feature_above_zero_filtred]
test_feature_imp = test_freq_encoder[feature_above_zero_filtred]

In [None]:
cat_col, con_col =  type_selector(train_feature_imp)

In [None]:
train_feature_imp.info()

In [None]:
train_all_in_one = group_by_stat(train_feature_imp, cat_col ,feature_above_zero_agg_002)
test_all_in_one = group_by_stat(test_feature_imp, cat_col ,feature_above_zero_agg_002)

In [None]:
train_all_in_one.shape

In [None]:
test_all_in_one.shape

In [None]:
train_all_in_one["TARGET"] = train["TARGET"]
target, data = train_all_in_one["TARGET"], train_all_in_one.drop('TARGET',1)

In [None]:
estimators_lgbm_all_in_one, oof_preds_lgbm_all_in_one = lightgbm_cross_validation(
    params_lgbm, data, target, cv
)

In [None]:
xgb_estimators_all_in_one, _, xgb_oof_all_in_one = xgboost_cross_validation(
    xgb_params, data, target, xgb_cv)

In [None]:
catb_estimators_all_in_one, catb_oof_all_in_one = catboost_cross_validation(
    params_catb, data, target, cv)

In [None]:
dsfsdfs

In [None]:
train_all_in_one.info()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_all_in_one.drop("TARGET", 1), 
                                                    train_all_in_one['TARGET'], random_state=0)

In [None]:
%%time

model = lgbm.LGBMClassifier(**params_lgbm)
model.fit(X_train, y_train, verbose=200 , eval_set=[(X_train, y_train), (X_test, y_test)])

In [None]:
feature_importances = pd.DataFrame(zip(X_train.columns, 
                                       model.feature_importances_ / model.feature_importances_.sum()), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False, inplace=True)

In [None]:
feature_above_zero = feature_importances[feature_importances['importance'] > 0]
feature_above_zero_agg_002 = feature_above_zero['feature_name'].tolist()

In [None]:
feature_above_zero

In [None]:
train_all_in_one["TARGET"] = train["TARGET"]
target, data = train_all_in_one["TARGET"], train_all_in_one[feature_above_zero_agg_002]

In [None]:
estimators_lgbm_all_in_one_above_zero, oof_preds_lgbm_all_in_one_above_zero = lightgbm_cross_validation(
    params_lgbm, data, target, cv)


In [None]:
xgb_estimators_all_in_one_above_zero, _, xgb_oof_all_in_one_above_zero = xgboost_cross_validation(
    xgb_params, data, target, xgb_cv)

In [None]:
catb_estimators_all_in_one_above_zero, catb_oof_all_in_one_above_zero = catboost_cross_validation(
    params_catb, data, target, cv)

In [None]:

test__all = test_all_in_one[feature_above_zero_agg_002]

In [None]:
train__all = train_all_in_one[feature_above_zero_agg_002]

In [None]:
train__all["TARGET"] = train["TARGET"]

In [None]:
test__all.shape

In [None]:
train__all.shape

In [None]:
ghjg

In [None]:
train__all["TARGET"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train__all.drop("TARGET", 1), 
                                                    train__all['TARGET'], random_state=0)

In [None]:
params_catb = {
    "n_estimators": 10000,
    "loss_function": "Logloss",
    "task_type": "CPU",
    "learning_rate":0.005,
    "max_bin": 30,
    "verbose": 200,
    "max_depth": 7,
    "l2_leaf_reg": 40,
    "early_stopping_rounds": 50,
    "thread_count": -1,
    "random_seed": 42,
    "eval_metric":"AUC",
}

In [None]:
%%time

model = catb.CatBoostClassifier(**params_catb)
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

In [None]:
roc_auc = pd.DataFrame()
for y_pred in OOF_results.columns:
    roc_auc.loc[y_pred, 'roc_auc_score'] = roc_auc_score(train[TARGET_NAME], OOF_results[y_pred])
    
roc_auc 

In [None]:
catb_oof_all_in_one_above_zero

In [None]:
scores = pd.DataFrame({
    "oof_preds_lgbm_basic": oof_preds_lgbm_basic,
    "xgb_oof_basic": xgb_oof_basic,
    "catb_oof_basic": catb_oof_basic,
    "oof_preds_lgbm_freg_enc": oof_preds_lgbm_freg_enc,
    "xgb_oof_freg_enc": xgb_oof_freg_enc,
    "catb_oof_freg_enc": catb_oof_freg_enc,
    "oof_preds_lgbm_above_zero_": oof_preds_lgbm_above_zero_,
    "xgb_oof_above_zero": xgb_oof_above_zero,
    "catb_oof_above_zero": catb_oof_above_zero,
    "oof_preds_lgbm_all_in_one": oof_preds_lgbm_all_in_one,
    "xgb_oof_all_in_one": xgb_oof_all_in_one,
    "catb_oof_all_in_one": catb_oof_all_in_one,
    "oof_preds_lgbm_all_in_one_above_zero": oof_preds_lgbm_all_in_one_above_zero,
    "xgb_oof_all_in_one_above_zero": xgb_oof_all_in_one_above_zero,
    "catb_oof_all_in_one_above_zero": catb_oof_all_in_one_above_zero,
})

corr = scores.corr()
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(10, 10))
sns.heatmap(corr, mask=mask, annot=True, fmt=".4g", square=True, cmap="viridis", ax=axes)

In [None]:
scores_mean = scores.mean(axis=1)
score = roc_auc_score(target, scores_mean)
print(f"Score = {round(score, 5)}")

In [None]:
scores_mean = gmean(scores, axis=1)
score = roc_auc_score(target, scores_mean)
print(f"Score = {round(score, 5)}")

In [None]:
# ROC-AUC / GINI
scores_mean = scores.rank().mean(axis=1)
score = roc_auc_score(target, scores_mean)
print(f"Score = {round(score, 5)}")

In [None]:
scores_mean = gmean(scores.rank(), axis=1)
score = roc_auc_score(target, scores_mean)
print(f"Score = {round(score, 5)}")

In [None]:
scores

In [None]:
roc_auc = pd.DataFrame()
for target in scores.columns:
    roc_auc.loc[target, 'roc_auc_score'] = roc_auc_score(train["TARGET"], scores[target])
    
roc_auc 

In [None]:
results_train, results_valid, results_y_train, results_y_valid = \
train_test_split(scores, train__all["TARGET"], 
                 test_size=0.3, stratify=train__all["TARGET"], random_state=42)

results_valid, results_test, results_y_valid,  results_y_test = \
train_test_split(results_valid, results_y_valid, 
                 test_size=0.5, stratify=results_y_valid, random_state=42)

#params_rf = {"n_estimators": 300,
#                     "max_depth": 6,
#                     "class_weight": 'balanced_subsample', 
#                     "random_state": 42}  



model_cb = catb.CatBoostClassifier(**params_catb)
model_cb.fit(results_train, results_y_train, eval_set=[(results_valid, results_y_valid), (results_test, results_y_test)])




#model_cb = RandomForestClassifier(**params_rf)
#model_cb.fit(results_train, results_y_train)
y_pred_ = model_cb.predict_proba(results_valid)[:, 1] 

y_train_pred = model_cb.predict_proba(results_train)[:, 1]
train_score = roc_auc_score(results_y_train, y_train_pred) 

y_valid_pred = model_cb.predict_proba(results_valid)[:, 1]
valid_score = roc_auc_score(results_y_valid, y_valid_pred)
        
y_test_pred = model_cb.predict_proba(results_test)[:, 1]
test_score = roc_auc_score(results_y_test, y_test_pred)

print(f'train score = {round(train_score, 5)}, valid score = {round(valid_score, 5)}, test score = {round(test_score, 5)}')

In [None]:
preds_final = pd.DataFrame()
preds_final['APPLICATION_NUMBER'] = test__all.APPLICATION_NUMBER.copy()

In [None]:
test__all.shape

In [None]:
train__all.shape

In [None]:
preds_final['TARGET'] = model_cb.predict_proba(test__all)[:, 1]
preds_final.to_csv('predictions.csv', index=False)

preds_final.describe()

In [None]:
df = test.copy()
#y_pred_final = model_rf.predict_proba(predictions)[:, 1]
#df.loc[:, 'TARGET'] = y_pred_final
y_pred_final = predictions['cb'].values
df.loc[:, 'TARGET'] = y_pred_final / y_pred_final.max()
y_final = df[['APPLICATION_NUMBER', 'TARGET']].groupby('APPLICATION_NUMBER').mean()
y_final.to_csv(PATH + 'predictions.csv')
y_final.head(2)

In [None]:
preds_final.head()