In [5]:
# 0，task1-缺省值处理：
import pandas as pd
import numpy as np
import seaborn
from sklearn.preprocessing import Imputer
from sklearn.model_selection  import train_test_split


data = pd.read_csv('../DataSets/001data.csv', encoding='gbk')

columns_dropna = ['bank_card_no', 'reg_preference_for_trad', 'id_name','first_transaction_time','trade_no',
                 'latest_query_time','loans_latest_time']  # 对于这些列实行删除NA样本的操作
temp_columns = list(data.columns.values)
columns_imputer = []                                       # 对于剩余的列实行填充0值的操作
for i in range(len(temp_columns)):
    if temp_columns[i] not in columns_dropna:
        columns_imputer.append(temp_columns[i])
print(len(columns_imputer), len(columns_dropna))

print(data.shape)
data.dropna(axis=0, how='any', subset=columns_dropna, inplace=True)         # 删除缺省值样本
data = data.replace(np.NaN, 0)                                              # 缺省值填充0

column_headers = list(data.columns.values)
print(column_headers.index('status'), len(column_headers))


# 删掉id数据：
columns_drop = ['bank_card_no','source','id_name','unnameid','custid','student_feature','trade_no']
data.drop(columns_drop, axis=1, inplace=True)
# 日期数据格式化：
data['latest_query_time'] = pd.to_datetime(data['latest_query_time'])
data['loans_latest_time'] = pd.to_datetime(data['loans_latest_time'])
# 将城市特征数值化：
map_dic = {'一线城市':'1','二线城市':'2','三线城市':'3','境外':'4','其他':'0'}
data['reg_preference_for_trad'] = data['reg_preference_for_trad'].map(map_dic)
# 将时间戳数据列丢掉：
data.drop(['latest_query_time','loans_latest_time'], axis=1, inplace=True)
# 丢掉方差为零的列：
data.drop(data.columns[data.std()==0], axis=1, inplace=True)


data.drop(['reg_preference_for_trad'], axis=1, inplace=True)  # 为什么'reg_preference_for_trad'的缺失率会是100%
# 统计缺失值占比：
data_missing = (data.isnull().sum()/len(data))*100    # np.isnan(data):可将他替换成 data.isnull()
data_missing = data_missing.drop(data_missing[data_missing==0].index).sort_values(ascending=False)
miss_data = pd.DataFrame({'缺失百分比':data_missing})
miss_data

83 7
(4754, 90)
44 90


Unnamed: 0,缺失百分比


In [6]:
# 2，使用随机会森林进行特征选择：
from sklearn.ensemble import RandomForestClassifier


x_columns = []  # X的列属性(x变量)
for i in range(len(column_headers)):
    if column_headers[i] not in ['reg_preference_for_trad','latest_query_time','loans_latest_time','status','bank_card_no','source','id_name','unnameid','custid','student_feature','trade_no']:
        x_columns.append(column_headers[i])  
        
X = data[x_columns]  # 获取x变量
Y = data['status']   # 获取y标签(label)

clf = RandomForestClassifier()
clf.fit(X, Y)

importance = clf.feature_importances_
indices = np.argsort(importance)[::-1]
features = X.columns
for f in range(X.shape[1]):
 print(("%2d) %-*s %f" % (f + 1, 30, features[f], importance[indices[f]])))

 1) low_volume_percent             0.051434
 2) middle_volume_percent          0.040662
 3) take_amount_in_later_12_month_highest 0.037493
 4) trans_amount_increase_rate_lately 0.031835
 5) trans_activity_month           0.030872
 6) trans_activity_day             0.027339
 7) transd_mcc                     0.021421
 8) trans_days_interval_filter     0.019528
 9) trans_days_interval            0.018560
10) regional_mobility              0.017795
11) repayment_capability           0.017736
12) is_high_user                   0.017366
13) number_of_trans_from_2011      0.017050
14) first_transaction_time         0.016615
15) historical_trans_amount        0.015945
16) historical_trans_day           0.015517
17) rank_trad_1_month              0.015122
18) trans_amount_3_month           0.014880
19) avg_consume_less_12_valid_month 0.014417
20) abs                            0.014322
21) top_trans_count_last_1_month   0.014062
22) avg_price_last_12_month        0.014060
23) avg_price_top_las



In [3]:
# 1，使用iv值进行特征选择：
import pandas as pd
import numpy as np


def calc_iv(df, feature, target, pr=False):
    """
    Set pr=True to enable printing of output.

    Output:
      * iv: float,
      * data: pandas.DataFrame
    """

    lst = []
    df[feature] = df[feature].fillna('NULL')

    for i in range(df[feature].nunique()):  # nuinque()是查看该序列(axis=0/1对应着列或行)的不同值的数量个数
        val = list(df[feature].unique())[i]
        lst.append([feature,
                    val,  # Value
                    df[df[feature] == val].count()[feature],  # all
                    df[(df[feature] == val) & (df[target] == 0)].count()[feature],  # good rate
                    df[(df[feature] == val) & (df[target] == 1)].count()[feature]])  # bad rate
    data = pd.DataFrame(lst, columns=['Variable', 'Value', 'All', 'Good', 'Bad'])

    data['Share'] = data['All'] / data['All'].sum()
    data['Bad Rate'] = data['Bad'] / data['All']
    data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
    data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()
    data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])

    data = data.replace({'WoE': {np.inf: 0, -np.inf: 0}})

    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True])
    data.index = range(len(data.index))

    if pr:
        print(data)
        print("IV = ", data['IV'].sum())

    iv = data['IV'].sum()

    return iv, data


df = pd.read_csv('../DataSets/001data.csv', encoding='gbk')
column_headers = list(df.columns.values)
# print(column_headers)
del column_headers[11]
for x in column_headers:
    IV_1, data = calc_iv(df, x, 'status')
    print('{}： {}'.format(x, IV_1))

# for example:
# calc_iv(df, 'NET_TM', 'overdue')



unnameid： 0.0
custid： 0.0
trade_no： 0.0
bank_card_no： 0.0
low_volume_percent： 0.03434912946670754
middle_volume_percent： 0.07118473287723781
take_amount_in_later_12_month_highest： 0.08329868129431747
trans_amount_increase_rate_lately： 0.2883857251682308
trans_activity_month： 0.0728243237636469
trans_activity_day： 0.36466249883484175
transd_mcc： 0.03615978377155986
trans_days_interval： 0.09452853184236024
regional_mobility： 0.007908925668911125
student_feature： 0.0009555051797128911
repayment_capability： 0.4390959671989395
is_high_user： 0.005613886645064013
number_of_trans_from_2011： 0.057930909034620885
first_transaction_time： 0.4861222320898152
historical_trans_amount： 0.053932681011240866
historical_trans_day： 0.3525315105292556
rank_trad_1_month： 0.12800012489827373
trans_amount_3_month： 0.24484789224813222
avg_consume_less_12_valid_month： 0.01366176736863671
abs： 0.3727207019325916
top_trans_count_last_1_month： 0.07238437681409188
avg_price_last_12_month： 0.19973803729124645
avg_pr