## Отбор признаков, обучение модели, предсказание и оценка на лидерборд

In [5]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from utils import create_client_profile_features, get_input, one_hot_encode, calculate_feature_separating_ability, lightgbm_cross_validation

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold

In [49]:
from datetime import timedelta, datetime, date

### загрузка предварительно подготовленных датасетов с признаками

In [8]:
# Load the pre-built feature tables; get_input (from utils) prints each
# file's shape as it loads it.
history, bki, client_profile, payments, prev_count = map(
    get_input,
    (
        '../data/history_prep.csv',
        '../data/bki_prep.csv',
        '../data/client_prep.csv',
        '../data/payments_prep.csv',
        '../data/app_prev_number.csv',
    ),
)

../data/history_prep.csv: shape = 1670214 rows, 89 cols
../data/bki_prep.csv: shape = 945234 rows, 36 cols
../data/client_prep.csv: shape = 250000 rows, 59 cols
../data/payments_prep.csv: shape = 1023932 rows, 8 cols
../data/app_prev_number.csv: shape = 338857 rows, 2 cols


In [17]:
# Keep exactly one row per application_number in each auxiliary table, so that
# merging on application_number (done for payments and bki further down)
# cannot multiply rows.
#
# The sort has to be stable: pandas' default quicksort may reorder rows that
# share an application_number, which would make keep="last" retain an
# arbitrary duplicate instead of the last one in the original row order.
#
# NOTE(review): `history` is de-duplicated here but is not merged into `data`
# in the cells visible in this notebook — confirm whether that is intended.
history, bki, payments = (
    frame.sort_values(by="application_number", kind="mergesort")
         .drop_duplicates(subset=["application_number"], keep="last")
    for frame in (history, bki, payments)
)

### подготовка  train, test

In [18]:
# Load the raw competition splits.
train = get_input("../data/train.csv")
test = get_input("../data/test.csv")

# Stack train on top of test with a fresh 0..n-1 index so the feature tables
# can be attached to both splits in a single pass.
data = pd.concat([train, test], axis=0, ignore_index=True)

../data/train.csv: shape = 110093 rows, 3 cols
../data/test.csv: shape = 165141 rows, 2 cols


In [19]:
# Sanity check: the row count should equal train rows + test rows.
data.shape

(275234, 3)

In [20]:
# Attach the feature tables one at a time; left joins on application_number
# keep every train/test row.
for feature_table in (client_profile, payments, bki, prev_count):
    data = data.merge(feature_table, on='application_number', how='left')

In [21]:
# Run the merged frame through the utils one-hot helper and keep the result.
# NOTE(review): presumably this expands the categorical columns — confirm in
# utils.one_hot_encode.
data = one_hot_encode(data)

In [22]:
# Frame dimensions after merging and encoding.
data.shape

(275234, 105)

In [23]:
# Split back into train/test: rows whose target is missing belong to test.
mask = data["target"].isnull()
features_to_drop = ["application_number", "target"]

train = data.loc[~mask]
test = data.loc[mask]

# Keep the labels and the submission ids before those columns are dropped.
target = train["target"]
test_id = test["application_number"]
train = train.drop(columns=features_to_drop)
test = test.drop(columns=features_to_drop)

# Column bookkeeping: object-dtype columns count as categorical, the rest as
# numerical.
categorial = train.dtypes[train.dtypes == "object"].index
numerical = list(set(train.columns).difference(categorial))

# Infinite values of either sign are treated as missing.
infinities = [np.inf, -np.inf]
train = train.replace(infinities, np.nan)
test = test.replace(infinities, np.nan)

In [24]:
# Number of columns in each group (numerical, categorical).
len(numerical), len(categorial)

(103, 0)

In [25]:
# Replace missing values with 0 in both splits.
train, test = train.fillna(0), test.fillna(0)

### оценка разделяющей способности признаков по GINI и отбор наиболее значимых

In [26]:
# Score every training column against the target with the utils helper.
# NOTE(review): the section heading says the score is GINI-based — confirm in
# utils.calculate_feature_separating_ability.
scores = calculate_feature_separating_ability(train, target)

In [27]:
# Inspect the per-feature scores.
scores

x0_m                                 0.071453
x1_secondary / secondary special     0.068214
ratio_annuity_to_age                 0.052216
childrens                            0.035298
x0_Cash                              0.033864
                                       ...   
external_scoring_rating_2           -0.208180
external_scoring_rating_max         -0.250446
external_scoring_rating_min         -0.255171
external_scoring_rating_nanmedian   -0.281325
external_scoring_rating_mean        -0.288431
Length: 103, dtype: float64

In [28]:
# Keep the features whose absolute score exceeds 0.01, in the order they
# appear in `scores`.
feats = [name for name, score in scores.items() if abs(score) > 0.01]

In [29]:
# Selected feature count versus the total number of training columns.
len(feats), len(train.columns)

(53, 103)

### обучение LightGBM с кросс-валидацией

In [32]:
# LightGBM hyper-parameters handed to the cross-validation helper.
lgbm_params = dict(
    boosting_type='gbdt',
    n_estimators=10000,
    learning_rate=0.05,
    num_leaves=54,
    max_depth=10,
    subsample_for_bin=240000,
    reg_alpha=0.4,
    reg_lambda=0.5,
    colsample_bytree=0.5,
    min_split_gain=0.02,
    subsample=0.7,
    is_unbalance=False,
    random_state=27,
    silent=-1,
    verbose=-1,
)

# Shuffled 5-fold split with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=27)

# The helper hands back the fitted estimators (iterated over at prediction
# time) and the out-of-fold predictions for the training rows.
lgb_estimators, lgb_oof_preds = lightgbm_cross_validation(
    params=lgbm_params,
    X=train[feats],
    y=target,
    cv=cv,
)

# ROC AUC of the out-of-fold predictions against the training target.
lgb_oof_score = roc_auc_score(target, lgb_oof_preds)
print(f"OOF-score = {round(lgb_oof_score, 5)}")

Tue Nov  2 14:50:55 2021, Cross-Validation, 110093 rows, 53 cols
Fold 1, Valid score = 0.73076
Fold 2, Valid score = 0.72515
Fold 3, Valid score = 0.72135
Fold 4, Valid score = 0.73746
Fold 5, Valid score = 0.72109
Score by each fold: [0.73076, 0.72515, 0.72135, 0.73746, 0.72109]
OOF-score = 0.72679


In [41]:
# LightGBM test prediction: average the positive-class probability over the
# estimators returned by the cross-validation run.
y_pred_lgb = np.zeros(test.shape[0])

for estimator in lgb_estimators:
    y_pred_lgb += estimator.predict_proba(test[feats])[:, 1]

# Divide by the number of models actually summed above. The previous divisor,
# len(estimators), referred to a name that is never defined in this notebook
# and only worked through leftover kernel state.
y_pred_lgb /= len(lgb_estimators)

In [42]:
# Peek at the LightGBM test predictions.
y_pred_lgb

array([0.05792781, 0.26173485, 0.16489891, ..., 0.08586763, 0.0142263 ,
       0.04528375])

In [46]:
# The submission uses the LightGBM predictions as they are.
y_pred = y_pred_lgb

In [47]:
# Submission frame: application ids next to their predicted scores.
submission_columns = {"APPLICATION_NUMBER": test_id, "TARGET": y_pred}
df_pred = pd.DataFrame(submission_columns)

In [50]:
# Write the submission to a timestamped CSV so earlier files are not overwritten.
# NOTE(review): the file name is tagged "lgb_cb_mean", but y_pred is assigned
# from y_pred_lgb alone in this notebook — confirm the label is still accurate.
now = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"

file_name = f'../data/TchobanouSG_submission_lgb_cb_mean_{now}.csv'
print('File name: ', file_name)

df_pred.to_csv(file_name, index=False, encoding='utf-8')
print('\n File saved to disk!')

File name:  ../data/TchobanouSG_submission_lgb_cb_mean_2021-11-02_15-16-46.csv

 File saved to disk!


In [51]:
# Submit the saved file to Kaggle; the rounded OOF score is used as the
# submission message.
!kaggle competitions submit -c geekbrains-competitive-data-analysis \
    -f {file_name} -m {round(lgb_oof_score, 5)} -q

Successfully submitted to GeekBrains Competitive Data Analysis

In [52]:
# List this account's own submissions to the competition.
!kaggle competitions submissions -c geekbrains-competitive-data-analysis

fileName                                                    date                 description  status    publicScore  privateScore  
----------------------------------------------------------  -------------------  -----------  --------  -----------  ------------  
TchobanouSG_submission_lgb_cb_mean_2021-11-02_15-16-46.csv  2021-11-02 12:16:59  0.72679      complete  0.73108      None          
TchobanouSG_submission_lgb_2021-11-01_23-07-35.csv          2021-11-01 20:07:49  0.72679      complete  0.73200      None          
TchobanouSG_submission_cb_2021-11-01_23-03-37.csv           2021-11-01 20:03:44  0.72676      complete  0.72811      None          
TchobanouSG_submission_cb_2021-11-01_22-59-54.csv           2021-11-01 20:00:22  0.72682      complete  0.72779      None          
TchobanouSG_submission_cb_2021-11-01_22-59-54.csv           2021-11-01 20:00:06  0.72645      complete  0.72779      None          
TchobanouSG_submission_lgb_2021-11-01_22-54-44.csv          2021-11-0

In [36]:
# Show the competition leaderboard.
!kaggle competitions leaderboard geekbrains-competitive-data-analysis -s

 teamId  teamName                          submissionDate       score    
-------  --------------------------------  -------------------  -------  
6314536  Andrey Korzhun [поток 3]          2021-04-28 09:44:21  0.73682  
5864347  Rostislav Ilyk [поток 2]          2021-05-11 08:35:03  0.73648  
7533793  Anokiro [поток 4]                 2021-11-01 18:51:56  0.73600  
6473429  Vadim Pavlov [поток 3]            2021-05-03 10:30:21  0.73592  
6828867  Alexey Tankov [поток 4]           2021-11-02 11:24:05  0.73546  
6521064  Ainar (поток 3)                   2021-04-11 15:42:33  0.73479  
6476602  Irina Yakupova [поток 3]          2021-05-04 20:44:32  0.73468  
6547930  Denis Akatov [поток 3]            2021-04-27 21:39:41  0.73457  
5874170  Vasilii Sitdikov  [поток 2]       2020-12-19 22:50:14  0.73453  
7555087  Natalia Lapteva                   2021-10-18 10:29:28  0.73450  
5500634  Aleksey Ushakov                   2020-10-05 22:50:09  0.73402  
6579938  Eugene Shenk  [п