
# **数据挖掘——Home Credit Default Risk**

Authors：李林（3120220938）、杨洋（3220211141）、敬甲男（3220221052）、李翰杰（3120220936）

github地址：https://github.com/leealim/kaggle-Home-Credit-Default-Risk

---

## 特征工程——特征增加

通过以下方式增加特征：
> - 通过简单计算得到的特征
> - 融合原有数据表中的数据得到的特征

---


In [9]:
# 引入本部分所需要的包，并定义需要的值和函数

import pandas as pd
import numpy as np
import os
from sklearn import preprocessing

# Input/output locations. Paths are assembled with os.path.join so the
# separator matches the host OS; the previous hard-coded "\\" separators only
# resolved to real directories on Windows.
source_dir = os.path.join("..", "data", "outlier_handling")
result_dir = os.path.join("..", "data", "table_merge")

app_tr_path = os.path.join(source_dir, "application_train.csv")
app_te_path = os.path.join(source_dir, "application_test.csv")
bur_path = os.path.join(source_dir, "bureau.csv")
bur_bal_path = os.path.join(source_dir, "bureau_balance.csv")
pos_path = os.path.join(source_dir, "POS_CASH_balance.csv")
cre_path = os.path.join(source_dir, "credit_card_balance.csv")
pre_path = os.path.join(source_dir, "previous_application.csv")
ins_path = os.path.join(source_dir, "installments_payments.csv")
# Column description table shipped with the raw data set.
hom_path = os.path.join(
    "..", "data", "home-credit-default-risk", "HomeCredit_columns_description.csv"
)
hom = pd.read_csv(hom_path)

# exist_ok=True replaces the separate exists() check and creates the output
# directory only when it is missing.
os.makedirs(result_dir, exist_ok=True)

def agg_numeric(df, parent_var, df_name):
    """Aggregate the numeric columns of ``df`` for each ``parent_var`` value.

    Args:
        df: Child table (pandas DataFrame) that contains ``parent_var``.
        parent_var: Name of the key column to group by.
        df_name: Prefix used when naming the generated columns.

    Returns:
        DataFrame indexed by ``parent_var`` whose columns are named
        ``<df_name>_<variable>_<stat>`` with ``stat`` in count, mean, max,
        min, sum. Columns whose values duplicate another column are removed.
    """
    # Drop every other identifier column so ids are not aggregated as if they
    # were measurements. ``drop`` returns a new frame, so the caller's frame
    # is left untouched.
    other_id_cols = [
        col for col in df.columns if col != parent_var and 'SK_ID' in col
    ]
    df = df.drop(columns=other_id_cols)

    # Keep the numeric columns and make sure the grouping key is present even
    # when it is not numeric itself.
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[parent_var] = df[parent_var]
    agg = numeric_df.groupby(parent_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Build each flat name from the column's own (variable, stat) label.
    # Rebuilding names from ``columns.levels`` is unsafe: level values are not
    # guaranteed to be stored in column order, which would mislabel statistics.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    # Remove columns with identical values. np.unique returns the kept columns
    # in sorted-by-value order, so the output column order can differ from the
    # aggregation order.
    _, idx = np.unique(agg, axis=1, return_index=True)
    agg = agg.iloc[:, idx]

    return agg

def agg_categorical(df, parent_var, df_name):
    """Aggregate one-hot encoded object columns of ``df`` per ``parent_var``.

    Args:
        df: Child table (pandas DataFrame) that contains ``parent_var``.
        parent_var: Name of the key column to group by.
        df_name: Prefix used when naming the generated columns.

    Returns:
        DataFrame indexed by ``parent_var`` whose columns are named
        ``<df_name>_<dummy column>_<stat>`` with ``stat`` in sum, count, mean.
        Columns whose values duplicate another column are removed.
    """
    # One-hot encode only the object-dtype columns, then attach the key.
    categorical = pd.get_dummies(df.select_dtypes('object'))
    categorical[parent_var] = df[parent_var]
    categorical = categorical.groupby(parent_var).agg(['sum', 'count', 'mean'])

    # Build each flat name from the column's own (variable, stat) label.
    # Rebuilding names from ``columns.levels`` is unsafe: level values are not
    # guaranteed to be stored in column order, which would mislabel statistics.
    categorical.columns = [
        '%s_%s_%s' % (df_name, var, stat) for var, stat in categorical.columns
    ]

    # Remove columns with identical values (for example, the per-group count
    # is the same for every dummy column). np.unique returns the kept columns
    # in sorted-by-value order.
    _, idx = np.unique(categorical, axis=1, return_index=True)
    categorical = categorical.iloc[:, idx]

    return categorical

def aggregate_client(df, group_vars, df_names):
    """Aggregate a two-level child table first per loan, then per client.

    Args:
        df: Child table (pandas DataFrame) containing both key columns.
        group_vars: Two key column names, ``[loan-level key, client-level key]``.
        df_names: Two name prefixes, used for the loan-level and the
            client-level aggregation respectively.

    Returns:
        DataFrame indexed by ``group_vars[1]`` holding numeric aggregates of
        the loan-level aggregates.
    """
    loan_key, client_key = group_vars[0], group_vars[1]

    # Loan-level numeric statistics.
    df_by_loan = agg_numeric(df, parent_var=loan_key, df_name=df_names[0])

    # agg_categorical only processes object-dtype columns, so that is the
    # dtype to test for. The previous 'category' test never matched frames
    # loaded straight from CSV, which silently skipped the categorical
    # features.
    if (df.dtypes == 'object').any():
        df_counts = agg_categorical(df, parent_var=loan_key, df_name=df_names[0])
        df_by_loan = df_counts.merge(df_by_loan, on=loan_key, how='outer')

    # Attach the client key to every loan, then discard the loan key so it is
    # not aggregated.
    # NOTE(review): the key pairs are not de-duplicated, so each loan appears
    # once per row of ``df`` in the client-level statistics - confirm this
    # weighting is intended.
    df_by_loan = df_by_loan.merge(df[[loan_key, client_key]], on=loan_key, how='left')
    df_by_loan = df_by_loan.drop(columns=[loan_key])

    return agg_numeric(df_by_loan, parent_var=client_key, df_name=df_names[1])




In [10]:
# Load the two main application tables and the previous-application table,
# in that order, from the paths configured in the first cell.
app_tr, app_te, pre = (
    pd.read_csv(csv_path) for csv_path in (app_tr_path, app_te_path, pre_path)
)

In [11]:

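# Converting the main tables' categorical data to numeric data is done in a later cell
# (LabelEncoder / get_dummies), after the previous_application aggregates are merged.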


In [12]:
# Show (rows, columns) of the train and test tables, one per line.
print(app_tr.shape, app_te.shape, sep="\n")

(290621, 80)
(48744, 79)


In [13]:
# Build per-client aggregates of previous_application with the helper
# functions, then left-join them straight onto both main tables.

pre_agg = agg_numeric(pre, parent_var='SK_ID_CURR', df_name='previous')
pre_count = agg_categorical(pre, parent_var='SK_ID_CURR', df_name='previous')


app_tr = (
    app_tr
    .merge(pre_agg, on='SK_ID_CURR', how='left')
    .merge(pre_count, on='SK_ID_CURR', how='left')
)
app_te = (
    app_te
    .merge(pre_agg, on='SK_ID_CURR', how='left')
    .merge(pre_count, on='SK_ID_CURR', how='left')
)



In [14]:
# NOTE(review): everything in this cell is commented out, so the POS_CASH,
# credit-card and installments aggregates are NOT merged into the output.
# The disabled code also reads the raw CSVs under ../data/home-credit-default-risk
# instead of the outlier-handled copies addressed by pos_path, cre_path and
# ins_path defined in the first cell - confirm which source is intended before
# re-enabling.
# cash = pd.read_csv('../data/home-credit-default-risk/POS_CASH_balance.csv')
# cash_by_client = aggregate_client(cash, group_vars = ['SK_ID_PREV', 'SK_ID_CURR'], df_names = ['cash', 'client'])
# app_tr = app_tr.merge(cash_by_client, on = 'SK_ID_CURR', how = 'left')
# app_te = app_te.merge(cash_by_client, on = 'SK_ID_CURR', how = 'left')

# credit = pd.read_csv('../data//home-credit-default-risk/credit_card_balance.csv')
# credit_by_client = aggregate_client(credit, group_vars = ['SK_ID_PREV', 'SK_ID_CURR'], df_names = ['credit', 'client'])
# app_tr = app_tr.merge(credit_by_client, on = 'SK_ID_CURR', how = 'left')
# app_te = app_te.merge(credit_by_client, on = 'SK_ID_CURR', how = 'left')

# installments = pd.read_csv('../data//home-credit-default-risk/installments_payments.csv')
# installments_by_client = aggregate_client(installments, group_vars = ['SK_ID_PREV', 'SK_ID_CURR'], df_names = ['installments', 'client'])
# app_tr = app_tr.merge(installments_by_client, on = 'SK_ID_CURR', how = 'left')
# app_te = app_te.merge(installments_by_client, on = 'SK_ID_CURR', how = 'left')

In [15]:
# Label-encode every object column of the training table that has at most two
# distinct values; the encoder fitted on the training column is reused for the
# test column.
# NOTE(review): le.transform presumably fails if the test column holds a value
# not seen in training - verify against the data.
le = preprocessing.LabelEncoder()
le_count = 0
for col in app_tr:
    train_column = app_tr[col]
    if train_column.dtype != 'object' or len(train_column.unique()) > 2:
        continue
    app_tr[col] = le.fit(train_column).transform(train_column)
    app_te[col] = le.transform(app_te[col])
    le_count += 1

# One-hot encode whatever categorical columns remain in each table.
app_tr, app_te = (pd.get_dummies(table) for table in (app_tr, app_te))

# Keep only the columns common to both tables. The inner join drops TARGET
# (it is selected from the training table only), so it is set aside first
# and put back afterwards.
train_labels = app_tr['TARGET']
app_tr, app_te = app_tr.align(app_te, join='inner', axis=1)
app_tr['TARGET'] = train_labels

In [16]:
# Show (rows, columns) of the train and test tables after encoding/alignment.
print(app_tr.shape, app_te.shape, sep="\n")

(290621, 572)
(48744, 571)


In [17]:
# Save the results.
# os.path.join inserts the separator of the host OS; concatenating "\\" only
# produced a path inside result_dir on Windows.

app_tr.to_csv(os.path.join(result_dir, "application_train.csv"), index=False)
app_te.to_csv(os.path.join(result_dir, "application_test.csv"), index=False)