# 任务二 - 特征工程

## 1. 特征选择

In [2]:
import numpy as np
import pandas as pd

In [4]:
train_data = pd.read_csv('./data/train.csv')
train_data.head()

Unnamed: 0,low_volume_percent,middle_volume_percent,take_amount_in_later_12_month_highest,trans_amount_increase_rate_lately,trans_activity_month,trans_activity_day,transd_mcc,trans_days_interval_filter,trans_days_interval,regional_mobility,...,loans_max_limit,loans_avg_limit,consfin_credit_limit,consfin_credibility,consfin_org_count_current,consfin_product_count,consfin_max_limit,consfin_avg_limit,latest_query_day,loans_latest_day
0,0.01,0.99,0,0.9,0.55,0.313,17.0,27.0,26.0,3.0,...,2900.0,1688.0,1200.0,75.0,1.0,2.0,1200.0,1200.0,12.0,18.0
1,0.02,0.94,2000,1.28,1.0,0.458,19.0,30.0,14.0,4.0,...,3500.0,1758.0,15100.0,80.0,5.0,6.0,22800.0,9360.0,4.0,2.0
2,0.0,0.96,2000,0.13,0.57,0.777,22.0,14.0,6.0,3.0,...,3200.0,1541.0,16300.0,80.0,5.0,5.0,30000.0,12180.0,2.0,4.0
3,0.03,0.65,0,0.31,0.76,0.472,15.0,21.0,14.0,2.0,...,5300.0,4750.0,5500.0,79.0,8.0,11.0,19200.0,7987.0,24.0,7.0
4,0.01,0.99,500,0.8,1.0,0.088,15.0,36.0,35.0,2.0,...,2800.0,1520.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,142.0


### 2.1 IV筛选特征
参考：https://www.jianshu.com/p/9a34892ee9e5

In [35]:
from sklearn.utils.multiclass import type_of_target
from scipy import stats
import math

#求woe值和iv值
def woe(X, y, event):
    res_woe = []  #列表存放woe字典
    res_iv = []  #列表存放iv
    X1 = feature_discretion(X) #对连续型特征进行处理
    for i in range(0, X1.shape[-1]):  #遍历所有特征
        x = X1[:, i]  #单个特征
        woe_dict, iv1 = woe_single_x(x, y, event)  #计算单个特征的woe值
        res_woe.append(woe_dict)
        res_iv.append(iv1)
    return np.array(res_woe), np.array(res_iv)  #返回数组

In [36]:
#求单个特征的woe值
def woe_single_x(x, y, event):
    event_total, non_event_total = count_binary(y, event) #计算好人坏人总数
    x_labels = np.unique(x) #特征中的分段
    woe_dict = {}  #存放每个分段的名称 以及 其对应的woe值
    iv = 0
    for x1 in x_labels: #遍历每个分段
        y1 = y[np.where(x == x1)[0]]
        event_count, non_event_count = count_binary(y1, event=event)
        rate_event = 1.0 * event_count / event_total
        rate_non_event = 1.0 * non_event_count / non_event_total
        #woe无穷大时处理
        if rate_event == 0:
            print()#print("{'",x1,"'}"+":全是好人") #只输出不做处理
        elif rate_non_event == 0:
            print()#print("{'",x1,"'}"+":全是坏人")
        else:
            woe1 = math.log(rate_event / rate_non_event)
            woe_dict[x1] = woe1  
            iv += (rate_event - rate_non_event) * woe1
    return woe_dict, iv

In [37]:
#计算个数
def count_binary(a, event):
    event_count = (a == event).sum()
    non_event_count = a.shape[-1] - event_count
    return event_count, non_event_count

In [38]:
#判断特征数据是否为离散型
def feature_discretion(X):
    temp = []
    for i in range(0, X.shape[-1]):
        x = X[:, i]
        x_type = type_of_target(x)
        if pd.Series(list(x)).dtype != 'O':
            x1 = discrete(x)
            temp.append(x1)
        else:
            temp.append(x)
    return np.array(temp).T

### 目标变量和特征特征

In [43]:
#目标变量
y = train_data['status'].values
# 所有特征
x = train_data.drop(['status'],axis=1).values
y

array([1, 0, 0, ..., 1, 0, 0], dtype=int64)

### 计算所有特征的iv

In [50]:
res_woe, res_iv = woe(x, y, y[0])
dic = dict(zip(train_data.columns.values, res_iv)) #特征列表:IV值列表
dic_sort = sorted(dic.items(),key = lambda x:x[1],reverse = True)
dic_sort

[('trans_fail_top_count_enum_last_1_month', 0.5738251555141427),
 ('history_suc_fee', 0.5032245133297788),
 ('latest_six_month_apply', 0.43866404297963046),
 ('loans_settle_count', 0.35919819707466094),
 ('trans_day_last_12_month', 0.34408240900678505),
 ('trans_fail_top_count_enum_last_12_month', 0.2644876668331204),
 ('trans_fail_top_count_enum_last_6_month', 0.25640227596356635),
 ('latest_one_month_suc', 0.14714140965160658),
 ('history_fail_fee', 0.12851330530009836),
 ('rank_trad_1_month', 0.09658511076715653),
 ('max_cumulative_consume_later_1_month', 0.09410939090142396),
 ('first_transaction_day', 0.0874862341754116),
 ('trans_top_time_last_1_month', 0.0826299051197211),
 ('pawns_auctions_trusts_consume_last_1_month', 0.05951063024820402),
 ('top_trans_count_last_1_month', 0.05823744436103096),
 ('consfin_max_limit', 0.05796154598230204),
 ('trans_amount_3_month', 0.04338806330064936),
 ('loans_avg_limit', 0.04301830985857236),
 ('latest_one_month_loan', 0.03975843902475576),


### 2.2 随机森林