In [1066]:
# 导入第三方库
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn import metrics
from sklearn.metrics import roc_curve,roc_auc_score
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
# 显示所有列
pd.set_option('display.max_columns', None)

In [1067]:
# Load the training workbook and the test workbook
train_data = pd.read_excel("./data/到期用户分析20210624.xlsx")
# The test workbook's "target" column is discarded, so test rows end up with a
# missing target once both frames are stacked
test_data = pd.read_excel("./data/test_data.xlsx").drop(columns="target")
# Stack train on top of test with a fresh 0..n-1 index so that feature
# engineering is applied to both at once
data = pd.concat([train_data, test_data], ignore_index=True)

In [1068]:
# Replace every missing value in the frame with the sentinel -999
# (this includes the missing "target" of rows that have no label)
data = data.fillna(-999)

In [1069]:
# Gender encoding: the strings "MALE"/"FEMALE" become 1/0 wherever they occur in the frame
data = data.replace(["MALE", "FEMALE"], [1, 0])

# Categorical feature encoding.
# The encoded Series is assigned back to its column. The previous form,
# `data[col].replace(..., inplace=True)`, mutates a temporary Series obtained by
# chained indexing; under pandas Copy-on-Write that leaves `data` unchanged.
# Values that are not keys of a mapping are left as they are.
network_type_codes = {"WIFI":0, "4G":1, "3G":2}
data["network_net_type"] = data["network_net_type"].replace(network_type_codes)

# Spellings that differ only by case ("HUAWEI"/"Huawei", "xiaomi"/"Xiaomi",
# "htc"/"HTC") intentionally share one code.
device_brand_codes = {"motorola":0, "samsung":1, "HUAWEI":2, "ZTE":3, "lge":4, "Lenovo":5, "htc":6,
                      "Nokia":7, "xiaomi":8, "TCL": 9, "LANIX": 10, "Redmi":11, "HONOR":12,
                      "Hisense":13, "Alcatel":14, "OPPO":15, "Huawei":2, "Xiaomi":8,
                      "Sony":16, "ZUUM":17, "BLU":18, "UMIDIGI":19, "KEMPLER_STRAUSS":20,
                      "blackberry":21, "realme":22, "SELFIX":23, "GHIA":24, "POCO":25, "EVOO":26,
                      "Verizon":27, "Mito":28, "TechPad":29, "HTC":6}
data["device_brand"] = data["device_brand"].replace(device_brand_codes)

In [1070]:
# 特征衍生
# label encoding
def encode_LE(data, cols):
    """Label-encode the given columns of ``data``, overwriting them in place.

    Args:
        data: DataFrame holding the columns to encode; it is modified.
        cols: iterable of column names. A fresh ``LabelEncoder`` is fitted
            on each column independently.

    Returns:
        The same ``data`` object, with each listed column replaced by its
        integer label codes. ``cols`` is printed once encoding is finished.
    """
    for col in tqdm(cols, desc='Encode_LE is working:'):
        raw_values = list(data[col].values)
        # Fit and encode in one step; the encoder is not kept afterwards
        data[col] = LabelEncoder().fit_transform(raw_values)
    print(cols)
    return data


# combine category
def encode_CB(df,combine_cols):
    """Add pairwise combination columns and their relative frequencies to ``df``.

    For every unordered pair of columns in ``combine_cols`` (in list order) two
    columns are written to ``df``:

    * ``"<a>_<b>"``    - both values cast to ``str`` and joined with ``"_"``
    * ``"<a>_<b>_FE"`` - share of rows having that combined value, as float32

    Args:
        df: DataFrame to extend; it is modified in place.
        combine_cols: list of column names to combine pairwise.

    Returns:
        ``(df, uids)`` where ``uids`` lists the combined column names (without
        the ``_FE`` columns) in creation order.
    """
    pairs = [(left, right)
             for pos, left in enumerate(combine_cols)
             for right in combine_cols[pos + 1:]]
    uids = []
    for left, right in pairs:
        combo = left + '_' + right
        uids.append(combo)
        df[combo] = df[left].astype(str) + '_' + df[right].astype(str)
        # Relative frequency of every combined value over the whole frame
        share = df[combo].value_counts(dropna=True, normalize=True)
        df[combo + '_FE'] = df[combo].map(share).astype('float32')
    return df, uids


# Build every pairwise combination of the four code columns (plus the matching
# frequency columns); `uid` receives the names of the combined columns
data,uid = encode_CB(data, ["education_code", "marriage_code", "job_code", "income_code"])
# Turn the combined string columns into integer label codes
data = encode_LE(data, uid)

Encode_LE is working:: 100%|████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 668.24it/s]

['education_code_marriage_code', 'education_code_job_code', 'education_code_income_code', 'marriage_code_job_code', 'marriage_code_income_code', 'job_code_income_code']





In [1071]:
# Compare the postal code on the voter ID card with the one entered by the user:
# the feature is the absolute difference, which is 0 when the two values are equal
postal_code_gap = data['identify_postal_code'] - data['base_postal_code']
data['identify_base_postal_code'] = postal_code_gap.abs()


def location_base(x):
    """Return 1 when the two elements of ``x`` compare equal, otherwise 0.

    Args:
        x: an iterable that unpacks into exactly two values (a base value and
           a location value to compare).

    Returns:
        ``1`` if both values are equal, ``0`` if they differ.
    """
    base_value, location_value = x
    return 1 if base_value == location_value else 0

In [1072]:
# Feature engineering
# ("network_net_type" is not part of the categorical list)
cate_cols = ["education_code", "marriage_code", "job_code", "income_code", "device_brand"]
# Every unordered pair of categorical columns, in list order
cate_cols_combine = [[left, right]
                     for pos, left in enumerate(cate_cols)
                     for right in cate_cols[pos + 1:]]


# Categorical features: integer codes, value counts and one-hot columns
for f in tqdm(cate_cols):
    # Number the distinct values in order of first appearance
    code_of_value = dict(zip(data[f].unique(), range(data[f].nunique())))
    data[f] = data[f].map(code_of_value)
    data[f + '_count'] = data[f].map(data[f].value_counts())
    one_hot = pd.get_dummies(data[f], prefix=f"{f}")
    data = pd.concat([data, one_hot], axis=1)


# Categorical x categorical crosses: pair count and the share of the pair
# within each of its two parent categories
for f1, f2 in tqdm(cate_cols_combine):
    pair_count_col = '{}_{}_count'.format(f1, f2)
    data[pair_count_col] = data.groupby([f1, f2])['user_id'].transform('count')
    data['{}_in_{}_prop'.format(f1, f2)] = data[pair_count_col] / data[f2 + '_count']
    data['{}_in_{}_prop'.format(f2, f1)] = data[pair_count_col] / data[f1 + '_count']


# Numeric columns not used here: "extra_duration_second","invalid_phone_number_sms_count_3d_rate"
# Categorical x numeric crosses
num_cols = ["battery_scale", "app_user_count"]
for f1 in tqdm(cate_cols):
    grouped = data.groupby(f1)
    for f2 in num_cols:
        stem = '{}_{}'.format(f1, f2)
        # Group-level statistics broadcast back to every row
        for stat in ('sum', 'mean', 'std', 'max', 'min'):
            data['{}_{}'.format(stem, stat)] = grouped[f2].transform(stat)
        # Arithmetic combinations of the raw value with its group statistic
        for stat in ('sum', 'mean'):
            group_stat = grouped[f2].transform(stat)
            data['{}_{}_add'.format(stem, stat)] = data[f2] + group_stat
            data['{}_{}_diff'.format(stem, stat)] = data[f2] - group_stat
            data['{}_{}_multi'.format(stem, stat)] = data[f2] * group_stat
            data['{}_{}_div'.format(stem, stat)] = data[f2] / group_stat



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 89.53it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 371.35it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 45.17it/s]


In [1073]:
# Split back into training and test rows: rows whose target is the -999
# sentinel are the unlabelled test rows
is_test_row = data["target"] == -999
train_df = data[~is_test_row].reset_index(drop=True)
test_df = data[is_test_row].reset_index(drop=True)

# Drop features that have at most one distinct value in the training rows.
# The list used to be computed but never applied, so constant columns still
# reached the model; they are now excluded from `cols`.
drop_feats = [f for f in train_df.columns if train_df[f].nunique() <= 1]
# Identifier, label, name and location-text columns that must not be model inputs
non_feature_cols = ["user_id", "target", "name", "father_last_name", "mother_last_name", "identify_state",
                    "identify_district", "base_state", "base_city", "location_state", "location_city"]
excluded_cols = set(non_feature_cols) | set(drop_feats)
cols = [col for col in train_df.columns if col not in excluded_cols]
print("----------------------------------衍生特征数:{}----------------------------------".format(len(cols)))

----------------------------------衍生特征数:422----------------------------------


In [1074]:
def ks_score(y_val, pred_val):
    """Return the KS statistic of a set of scores.

    The ROC curve is computed with label 1 as the positive class; the result
    is the largest absolute gap between the false-positive rate and the
    true-positive rate over all thresholds.

    Args:
        y_val: true labels.
        pred_val: predicted scores for the positive class.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_val, pred_val, pos_label=1)
    return np.abs(false_pos_rate - true_pos_rate).max()

In [1102]:
# Hold out 20% of the labelled rows for validation
X = train_df[cols]
y = train_df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# Fit a gradient-boosted tree classifier; `fit` returns the estimator itself,
# so `gbm_model` and `gbm` refer to the same object
gbm = lgb.LGBMClassifier(boosting_type='gbdt', max_depth=-1, num_leaves=31, random_state=2021)
gbm_model = gbm.fit(X_train, y_train)

# Feature-importance table indexed by feature name, most important first
lgb_importance = (
    pd.DataFrame({"feature": X_train.columns,
                  "importance": gbm_model.feature_importances_})
    .sort_values(by=["importance"], ascending=False)
    .set_index('feature')
)
print(lgb_importance.head(50))

                                              importance
feature                                                 
invalid_phone_number_sms_count_3d                     64
ocr_duration_second                                   60
identity_duration_second                              57
location_longitude                                    57
user_contact_people_count                             57
phone_number_area_code                                55
extra_duration_second                                 55
loan_count_rate_15d                                   54
app_install_to_apply_standard_deviation_days          54
invalid_phone_number_sms_count_3d_rate                54
current_apply_loan_time_day_of_month                  54
education_code_app_user_count_sum_diff                52
invalid_phone_number_sms_count                        52
loan_category_distributed_rate_30d                    52
income_code_battery_scale_sum_add                     50
location_latitude              

In [1103]:
# Validation-set predictions: hard labels and positive-class probabilities
y_pred = gbm_model.predict(X_test)
y_predprob = gbm_model.predict_proba(X_test)[:, 1]

ks_value = ks_score(y_test, y_predprob)
print("Ks Score : %.4g" % ks_value, "\n")
print("Accuracy : %.4g" % metrics.accuracy_score(y_test.values, y_pred))
print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

# A sample is labelled bad (1) when its predicted probability is at least 0.5
lgb_test = np.where(y_predprob >= 0.5, 1, 0)
true_labels = list(y_test)
predicted_labels = list(lgb_test)
# Number of samples predicted as bad
print("预测坏样本数：", sum(lgb_test))
# Recall (label 1 is the positive class)
tpr = metrics.recall_score(true_labels, predicted_labels)
print("召回率：", tpr)
# Precision
precision = metrics.precision_score(true_labels, predicted_labels)
print("精确率：", precision)
# Lift: precision relative to the base rate of positives in the validation labels
print("lift值：", precision / y_test.mean())

# test_preds = pd.DataFrame()
# test_preds["user_id"] = data.loc[y_test.index, "user_id"]
# test_preds["target"] = y_test
# test_preds["y_pred"] = y_predprob
# test_preds.to_excel('test_preds.xlsx', index=False)

Ks Score : 0.3537 

Accuracy : 0.7342
AUC Score (Test): 0.687506
预测坏样本数： 54
召回率： 0.46153846153846156
精确率： 0.5555555555555556
lift值： 1.8974358974358976


In [1101]:
# Keep the 70 highest-ranked features of the importance table
feature_nums = 70
all_cols = lgb_importance.index.tolist()
cols = all_cols[:feature_nums]
# Recorded results: lift 1.977, Ks Score 0.3676

In [552]:
# Keep the 30 highest-ranked features of the importance table.
# NOTE(review): when the notebook is run top to bottom this overrides the
# 70-feature selection made in the previous cell — confirm which is intended.
feature_nums = 30
all_cols = lgb_importance.index.tolist()
cols = all_cols[:feature_nums]

In [1118]:
# Print the features currently selected as model inputs
print(cols)

['ocr_duration_second', 'invalid_phone_number_sms_count_3d', 'location_latitude', 'invalid_phone_number_sms_count_3d_rate', 'loan_count_rate_15d', 'location_longitude', 'education_code_app_user_count_sum_diff', 'device_brand_app_user_count_sum_div', 'app_install_to_apply_standard_deviation_days', 'receive_3d_30d_rate', 'send_count', 'education_code_battery_scale_sum_diff', 'identity_duration_second', 'current_apply_loan_time_day_of_month', 'loan_word_count_7d', 'phone_number_area_code', 'sms_create_to_apply_max_days', 'loan_due_sms_count', 'education_code_battery_scale_sum_add', 'user_contact_people_count', 'receive_30d_rate', 'identify_postal_code', 'loan_category_distributed_rate_30d', 'app_install_to_apply_median_days', 'auth_duration_second', 'distinct_valid_phone_number_count_7d', 'age', 'face_compare_duration_second', 'app_total_count', 'loan_due_sms_count_7d', 'extra_duration_second', 'count_30d', 'job_code_battery_scale_sum_multi', 'one_day_max_count', 'job_code_app_user_count_

In [1105]:
# Test-set prediction
# Use exactly the columns the model was fitted on. `cols` is re-assigned by the
# feature-selection cells after training, so `test_df[cols]` can have a different
# number of features than `gbm_model` expects, which makes prediction fail.
model_feature_cols = list(X_train.columns)
test_x = test_df[model_feature_cols]
y_pred = gbm_model.predict(test_x)
y_predprob = gbm_model.predict_proba(test_x)[:, 1]

# One row per test user with its predicted probability of the positive class
test_preds = pd.DataFrame({"user_id": test_df["user_id"], "y_pred": y_predprob})
test_preds.to_excel('test_preds.xlsx', index=False)

In [1121]:
# Export the LightGBM model to PMML
# Approach 1: a DataFrameMapper selects the input columns, followed by the classifier
# NOTE(review): LabelBinarizer and CategoricalDomain are imported but not
# referenced in this cell.
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline

# NOTE(review): cat_columns is not used below, and these names do not match
# any column referenced elsewhere in this notebook — presumably left over from
# an example; confirm before relying on it.
cat_columns = ["Education", "Employment", "Marital", "Occupation"]
# Hard-coded list of the model's input features, all declared as continuous.
# NOTE(review): this list is maintained by hand and is independent of `cols`;
# every name must exist in X_train for the mapper to work.
cont_columns = ['ocr_duration_second', 'invalid_phone_number_sms_count_3d', 'location_latitude', 'invalid_phone_number_sms_count_3d_rate', 'loan_count_rate_15d', 'location_longitude', 'education_code_app_user_count_sum_diff', 'device_brand_app_user_count_sum_div', 'app_install_to_apply_standard_deviation_days', 'receive_3d_30d_rate', 'send_count', 'education_code_battery_scale_sum_diff', 'identity_duration_second', 'current_apply_loan_time_day_of_month', 'loan_word_count_7d', 'phone_number_area_code', 'sms_create_to_apply_max_days', 'loan_due_sms_count', 'education_code_battery_scale_sum_add', 'user_contact_people_count', 'receive_30d_rate', 'identify_postal_code', 'loan_category_distributed_rate_30d', 'app_install_to_apply_median_days', 'auth_duration_second', 'distinct_valid_phone_number_count_7d', 'age', 'face_compare_duration_second', 'app_total_count', 'loan_due_sms_count_7d', 'extra_duration_second', 'count_30d', 'job_code_battery_scale_sum_multi', 'one_day_max_count', 'job_code_app_user_count_sum_div', 'marriage_code_battery_scale_sum_multi', 'sms_create_to_apply_standard_deviation_days', 'loan_count_rate_90d', 'invalid_phone_number_sms_count', 'app_user_rate_90d', 'income_code_battery_scale_mean_div', 'sms_create_to_apply_sum_days', 'education_code_job_code', 'loan_install_to_apply_average_days', 'loan_count_rate_30d', 'app_user_count_90d', 'loan_sms_count_7d', 'job_code_in_income_code_prop', 'device_brand_battery_scale_sum_multi', 'income_code_app_user_count_sum_div', 'loan_category_distributed_rate_7d', 'base_postal_code', 'loan_install_to_apply_max_days', 'job_code_battery_scale_sum_diff', 'app_system_count', 'income_code_in_device_brand_prop', 'income_code_in_job_code_prop', 'loan_sms_count_15d', 'marriage_code_battery_scale_sum_diff', 'device_brand_app_user_count_sum_multi', 'income_code_app_user_count_sum_multi', 'loan_install_to_apply_median_days', 'p3_ip_credit_count_30d', 'income_code_battery_scale_sum_div', 'sms_create_to_apply_median_days', 
'marriage_code_in_job_code_prop', 'income_code_battery_scale_sum_add', 'device_brand_app_user_count_mean_multi', 'job_code_income_code', 'app_user_rate_7d']

# The mapper passes only cont_columns on to the classifier
mapper = DataFrameMapper(
  [(cont_columns, ContinuousDomain())]
)
# NOTE(review): the pipeline wraps the existing `gbm` object, so fitting the
# pipeline presumably refits `gbm` (and therefore `gbm_model`) on the mapped
# columns — confirm this is intended.
pipeline = PMMLPipeline([
  ("mapper", mapper),
  ("classifier", gbm)
])
pipeline.fit(X_train, y_train)

# Write the fitted pipeline to a PMML file
sklearn2pmml(pipeline, "LightGBMAudit.pmml") 

In [1125]:
# Approach 2: a pipeline holding only the classifier, fitted on the selected columns
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

# Restrict the training frame to the currently selected features.
# NOTE(review): this overwrites X_train, so the cell cannot be re-run after
# `cols` changes to columns that were dropped here.
X_train = X_train[cols]
pipeline = PMMLPipeline(steps=[("classifier", gbm)])
pipeline.fit(X_train, y_train)
# NOTE(review): writes to the same file name as approach 1, replacing its output
sklearn2pmml(pipeline, "LightGBMAudit.pmml", with_repr=True)

In [1126]:
# Out-of-time (OOT) prediction with the fitted PMML pipeline
test_x = test_df[cols]
y_predprob = pipeline.predict_proba(test_x)[:, 1]

# One row per test user with its predicted probability of the positive class
test_preds_pipeline = pd.DataFrame({
    "user_id": test_df["user_id"],
    "y_pred": y_predprob,
})
test_preds_pipeline.to_excel('test_preds_pipeline.xlsx', index=False)