# 处理步骤
特征工程实际上就是对数据的处理，在将数据送入分类模型之前，需要将数据转换成合适的形式

- 数据预处理：对数据的基本处理操作
    - 填充缺失值
    - 处理时间格式的数据
    - 转换对象（非数值）类型特征到数值特征
- 异常值处理：？
    - 基于3segama原则
    - 基于箱型图
- 数据分箱：将数据按照预定的格式划分取值
    - 固定宽度分箱
    - 分位数分箱
        - 离散数值型数据分箱
        - 连续数值型数据分箱
    - 卡方分箱？
- 特征交互：？
    - 特征和特征之间组合
    - 特征和特征之间衍生
- 特征编码
    - one-hot编码
    - label-encode编码
- 特征选择：？
    - Filter
    - Wrapper（RFE）
    - Embedded


# 数据预处理

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder  
from sklearn.preprocessing import MinMaxScaler  # ?
from sklearn.feature_selection import SelectKBest  # ?
from sklearn.feature_selection import chi2  # ?
from sklearn.model_selection import StratifiedKFold  # ?
from sklearn.model_selection import KFold  # ?
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss

import xgboost as xgb  # ?
import lightgbm as lgb  # ?
from catboost import CatBoostRegressor  # ?

import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None

In [70]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('testA.csv')
df_train.shape, df_test.shape

((800000, 47), (200000, 48))

In [71]:
df_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n2.1,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
0,0,35000.0,5,19.52,917.97,E,E2,320.0,2 years,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,Aug-2001,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0
1,1,18000.0,5,18.49,461.9,D,D2,219843.0,5 years,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,May-2002,1723.0,1.0,,,,,10.0,,,,,,13.0,,,,
2,2,12000.0,5,16.99,298.17,D,D3,31698.0,8 years,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,May-2006,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0
3,3,11000.0,3,7.26,340.96,A,A4,46854.0,10+ years,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,May-1999,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0
4,4,3000.0,3,12.99,101.07,C,C2,54.0,,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,Aug-1977,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0


## 特征预处理
- 将特征按数值和对象划分
- 填充缺失值
- 处理时间格式数据
- 对象特征转数值
- 类别特征处理

In [72]:
# 将特征按类型划分
numerical_feats = df_train.select_dtypes(exclude=['object']).columns.tolist()
category_feats = df_train.select_dtypes(include=['object']).columns.tolist()
label = 'isDefault'
numerical_feats.remove(label)
print('numerical_feats:{}\ncategory_feats:{}'.format(numerical_feats, category_feats))

numerical_feats:['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']
category_feats:['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']


### 填充缺失值
因为各个特征的缺失比例较低，选择填充的方式弥补缺失值，填充方法有：
- 固定填充
    ```
    df_train = df_train.fillna(0)  # 也可以对指定列进行固定填充，也可以采用均值填充
    ```
- 以上填充：用缺失值上面的值填充缺失值
    ```
    df_train = df_train.fillna(axis=0, method='ffill')
    ```
- 以下填充：用缺失值下面的值填充缺失值
    ```
    df_train = df_train.fillna(axis=0, method='bfill',limit=2)  # 限制最多只填充两个连续缺失值
    ```

In [73]:
df_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n2.1                

In [74]:
# 对数值类型用平均数填充
df_train[numerical_feats] = df_train[numerical_feats].fillna(df_train[numerical_feats].mean())
df_test[numerical_feats] = df_test[numerical_feats].fillna(df_test[numerical_feats].mean())
# 按众数填充类别特征
df_train[category_feats] = df_train[category_feats].fillna(df_train[category_feats].mode())
df_test[category_feats] = df_test[category_feats].fillna(df_test[category_feats].mode())

In [75]:
df_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           0
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  0
regionCode                0
dti                       0
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies        0
revolBal                  0
revolUtil                 0
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     0
policyCode                0
n0                        0
n1                        0
n2                        0
n2.1                

这里`employmentLength`没有被填充，原因是？

###  处理时间格式数据
pandas中处理时间格式数据的方法是`pd.to_datetime(data, format='%Y-%m-%d)`，具体的内容参考[我的博客](https://www.gentlecp.com/articles/934.html)，找到最早的时间用于作为起始时间，计算`issueDate`的相对时间

In [76]:
df_train.sort_values(by=['issueDate'])['issueDate'].head()

647313    2007-06-01
739876    2007-07-01
762487    2007-07-01
590926    2007-07-01
212568    2007-07-01
Name: issueDate, dtype: object

In [77]:
df_test.sort_values(by=['issueDate'])['issueDate'].head() 

40344     2007-07-01
77231     2007-07-01
165280    2007-07-01
15654     2007-07-01
48479     2007-08-01
Name: issueDate, dtype: object

In [78]:
for data in [df_train, df_test]:
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    start_date = pd.to_datetime('2007-06-01',format='%Y-%m-%d')
    # 以相对日期转换,按天计算
    data['issueDate'] = data['issueDate'].apply(lambda x: x-start_date).dt.days

df_train['issueDate'].head()

0    2587
1    1888
2    3044
3    2983
4    3196
Name: issueDate, dtype: int64

### 对象特征转数值
在该例子中主要有
- `employmentLength`这个特征,其中包含`< 1 year`和`10+ years`这两个特别的特征
- `earliesCreditLine`: 包含月-年的数据，仅取年份作为特征值

In [79]:
df_train['employmentLength'].value_counts(dropna=False).sort_index()

1 year        52489
10+ years    262753
2 years       72358
3 years       64152
4 years       47985
5 years       50102
6 years       37254
7 years       35407
8 years       36192
9 years       30272
< 1 year      64237
NaN           46799
Name: employmentLength, dtype: int64

In [80]:
# 定义一个应用函数，用于对employmentLength中的数据提取数值
def get_employment_length(value):
    if pd.isnull(value):
        return value
    else:
        return np.int8(value.split()[0])  # 去掉year，只取数字

for data in [df_train, df_test]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace(to_replace='< 1 year', value='0 year', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(get_employment_length)

df_train['employmentLength'].value_counts(dropna=False).sort_index()

0.0      64237
1.0      52489
2.0      72358
3.0      64152
4.0      47985
5.0      50102
6.0      37254
7.0      35407
8.0      36192
9.0      30272
10.0    262753
NaN      46799
Name: employmentLength, dtype: int64

In [81]:
df_train['earliesCreditLine'].head()

0    Aug-2001
1    May-2002
2    May-2006
3    May-1999
4    Aug-1977
Name: earliesCreditLine, dtype: object

In [82]:
for data in [df_train, df_test]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))

df_train['earliesCreditLine'].head()

0    2001
1    2002
2    2006
3    1999
4    1977
Name: earliesCreditLine, dtype: int64

### 类别特征处理
- 对数量少，有优先级的特征可直接映射或labelencode
- 对特征取值多，可以采用one-hot编码，使用`pandas.get_dummies`,该函数的效果如下实例所示，采用`drop_first`相当于用全0的情况代表一种取值,有什么好处吗？

In [83]:
test_data = pd.DataFrame({
    'system':['mac','windows','linux'],
    'software':['ali pay','qq','wechat']
})
pd.get_dummies(test_data, columns=['system','software'], drop_first=True)

Unnamed: 0,system_mac,system_windows,software_qq,software_wechat
0,1,0,0,0
1,0,1,1,0
2,0,0,0,1


In [84]:
pd.get_dummies(test_data, columns=['system','software'], drop_first=False)

Unnamed: 0,system_linux,system_mac,system_windows,software_ali pay,software_qq,software_wechat
0,0,1,0,1,0,0
1,0,0,1,0,1,0
2,1,0,0,0,0,1


In [85]:
cat_feats = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 
                 'applicationType', 'initialListStatus', 'title']
for feat in cat_feats:
    print('feat:{}, 类别数:{}'.format(feat, df_train[feat].nunique()))

feat:grade, 类别数:7
feat:subGrade, 类别数:35
feat:employmentTitle, 类别数:248684
feat:homeOwnership, 类别数:6
feat:verificationStatus, 类别数:3
feat:purpose, 类别数:14
feat:postCode, 类别数:933
feat:regionCode, 类别数:51
feat:applicationType, 类别数:2
feat:initialListStatus, 类别数:2
feat:title, 类别数:39645


In [86]:
# 自映射确定grade取值
for data in [df_train, df_test]:
    data['grade'] = data['grade'].map({'A':1,'B':2,'C':3, 'D':4, 'E':5, 'F':6, 'G':7})

In [30]:
# # one-hot编码取值较多的特征
# df_train = pd.get_dummies(df_train, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
# df_test = pd.get_dummies(df_test, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

In [54]:
df_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n2.1,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
0,0,35000.0,5,19.52,917.97,5,E2,320.0,2.0,2,110000.0,2,2587,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0
1,1,18000.0,5,18.49,461.9,4,D2,219843.0,5.0,0,46000.0,2,1888,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002,1723.0,1.0,0.511932,3.64233,5.642648,5.642648,10.0,8.107937,8.575994,8.282953,14.622488,5.592345,13.0,0.000815,0.003384,0.089366,2.178606
2,2,12000.0,5,16.99,298.17,4,D3,31698.0,8.0,0,74000.0,2,3044,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0
3,3,11000.0,3,7.26,340.96,1,A4,46854.0,10.0,1,118000.0,1,2983,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0
4,4,3000.0,3,12.99,101.07,3,C2,54.0,,1,29000.0,2,3196,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0


## 异常值处理
- 无规律的偶发异常值可删除
- 真实场景中的异常值需纳入考量

通常的异常值检测方法有
- 均方差：分布接近正态分布的数据，大部分数值会在均值的有限个标准差之内
- 箱型图:?

In [87]:
def find_outliers_by_3segema(data, feat):
    """
    查找在均值附近，3个标准差的数据分布
    """
    std = np.std(data[feat])
    mean = np.mean(data[feat])
    std_3 = std * 3
    lower_bound = mean - std_3
    upper_bound = mean + std_3
    data[feat+'_outliers'] = data[feat].apply(lambda x:str('异常值') if x > upper_bound or x < lower_bound else '正常值')
    return data

In [88]:
df_train_copy = df_train.copy()

for feat in numerical_feats:
    df_train_copy = find_outliers_by_3segema(df_train_copy, feat)
    print('正常值与异常值统计：\n{}'.format(df_train_copy[feat+'_outliers'].value_counts()))
    # 根据变量异常值的分布和对应预测变量的分布查看两者的关系
    print(df_train_copy.groupby(feat+'_outliers')['isDefault'].sum())
    print('-'*20)

正常值与异常值统计：
正常值    800000
Name: id_outliers, dtype: int64
id_outliers
正常值    159610
Name: isDefault, dtype: int64
--------------------
正常值与异常值统计：
正常值    800000
Name: loanAmnt_outliers, dtype: int64
loanAmnt_outliers
正常值    159610
Name: isDefault, dtype: int64
--------------------
正常值与异常值统计：
正常值    800000
Name: term_outliers, dtype: int64
term_outliers
正常值    159610
Name: isDefault, dtype: int64
--------------------
正常值与异常值统计：
正常值    794259
异常值      5741
Name: interestRate_outliers, dtype: int64
interestRate_outliers
异常值      2916
正常值    156694
Name: isDefault, dtype: int64
--------------------
正常值与异常值统计：
正常值    792046
异常值      7954
Name: installment_outliers, dtype: int64
installment_outliers
异常值      2152
正常值    157458
Name: isDefault, dtype: int64
--------------------
正常值与异常值统计：
正常值    800000
Name: employmentTitle_outliers, dtype: int64
employmentTitle_outliers
正常值    159610
Name: isDefault, dtype: int64
--------------------
正常值与异常值统计：
正常值    799701
异常值       299
Name: homeOwnership_o

In [89]:
df_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n2.1,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
0,0,35000.0,5,19.52,917.97,5,E2,320.0,2.0,2,110000.0,2,2587,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0
1,1,18000.0,5,18.49,461.9,4,D2,219843.0,5.0,0,46000.0,2,1888,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002,1723.0,1.0,0.511932,3.64233,5.642648,5.642648,10.0,8.107937,8.575994,8.282953,14.622488,5.592345,13.0,0.000815,0.003384,0.089366,2.178606
2,2,12000.0,5,16.99,298.17,4,D3,31698.0,8.0,0,74000.0,2,3044,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0
3,3,11000.0,3,7.26,340.96,1,A4,46854.0,10.0,1,118000.0,1,2983,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0
4,4,3000.0,3,12.99,101.07,3,C2,54.0,,1,29000.0,2,3196,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0


In [90]:
# 删除异常值
for feat in numerical_feats:
    df_train = df_train_copy[df_train_copy[feat+'_outliers'] == '正常值']
    df_train = df_train.reset_index(drop=True)


## 数据分桶
- 目的：降低变量复杂性，减少变量噪音对模型的影响，提高自变量和因变量的相关度，使模型更加稳定
- 对象
    - 将连续变量离散化
    - 将多状态的离散变量合并成少状态
- 原因：数据特征内值跨度较大，以欧式距离作为相似度函数的（k-means）方法容易出现“大吃小”的影响，用分箱实现区间量化，实际上是进行合群？
- 优点
    - 处理缺失值：将null单独作为一个分箱
    - 处理异常值：当数据中存在离群点时，可以把其通过分箱离散化处理，从而提高变量的鲁棒性（抗干扰能力）。例如，age若出现200这种异常值，可分入“age > 60”这个分箱里，排除影响
    - 业务解释性：我们习惯于线性判断变量的作用，当x越来越大，y就越来越大。但实际x与y之间经常存在着非线性关系，此时可经过WOE变换？？
- 基本原则
    - 最小分箱占比不低于5%
    - 箱内不能全部是好客户？
    - 连续箱单调？


In [59]:
pass

## 特征交互
从多个特征中衍生出新的特征方法

In [60]:
pass

## 特征编码
labelEncode

In [61]:
pass

## 特征选择
精简掉无用特征，降低模型的复杂度，主要方法包括：？
1. Filter
    - 方差选择法
    - 相关系数法（pearson 相关系数）
    - 卡方检验
    - 互信息法
2. Wrapper（RFE）
    - 递归特征消除法
3. Embedded
    - 基于惩罚项的特征选择法
    - 基于树模型的特征选择法

In [62]:
pass

# 模型训练
一个基础的模型

In [119]:
feats = [f for f in df_train.columns if f not in ['id','issueDate','isDefault', 'subGrade'] and '_outliers' not in f]
feats

['loanAmnt',
 'term',
 'interestRate',
 'installment',
 'grade',
 'employmentTitle',
 'employmentLength',
 'homeOwnership',
 'annualIncome',
 'verificationStatus',
 'purpose',
 'postCode',
 'regionCode',
 'dti',
 'delinquency_2years',
 'ficoRangeLow',
 'ficoRangeHigh',
 'openAcc',
 'pubRec',
 'pubRecBankruptcies',
 'revolBal',
 'revolUtil',
 'totalAcc',
 'initialListStatus',
 'applicationType',
 'earliesCreditLine',
 'title',
 'policyCode',
 'n0',
 'n1',
 'n2',
 'n2.1',
 'n4',
 'n5',
 'n6',
 'n7',
 'n8',
 'n9',
 'n10',
 'n11',
 'n12',
 'n13',
 'n14']

In [120]:
x_train = df_train[feats]
x_test = df_test[feats]
y_train = df_train['isDefault']
x_train.shape, x_test.shape

((788884, 43), (200000, 43))

In [121]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_x , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        train[valid_index] = val_pred
        test = test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

In [122]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")

In [123]:
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds


KeyboardInterrupt: 

In [104]:
result = pd.DataFrame(data={'id':df_test['id'].tolist(),'isDefault':lgb_test})
result.head()

Unnamed: 0,id,isDefault
0,800000,0.019574
1,800001,0.059731
2,800002,0.128015
3,800003,0.055564
4,800004,0.08561


In [124]:
result.to_csv('baseline_lgb_200921v1.csv')