In [1]:
import time
from datetime import datetime
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
'''单变量特征选取'''
from sklearn.feature_selection import SelectKBest, chi2
'''去除方差小的特征'''
from sklearn.feature_selection import VarianceThreshold
'''循环特征选取'''
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
'''RFE_CV'''
from sklearn.ensemble import ExtraTreesClassifier

In [3]:
'''训练集'''
train_auth_info = pd.read_csv('./data/AI_risk_train_V3.0/train_auth_info.csv', low_memory=False)
train_bankcard_info = pd.read_csv('./data/AI_risk_train_V3.0/train_bankcard_info.csv', low_memory=False)
train_credit_info = pd.read_csv('./data/AI_risk_train_V3.0/train_credit_info.csv', low_memory=False)
train_order_info = pd.read_csv('./data/AI_risk_train_V3.0/train_order_info.csv', low_memory=False)
train_recieve_addr_info = pd.read_csv('./data/AI_risk_train_V3.0/train_recieve_addr_info.csv', low_memory=False)
train_user_info = pd.read_csv('./data/AI_risk_train_V3.0/train_user_info.csv', low_memory=False)
train_target = pd.read_csv('./data/AI_risk_train_V3.0/train_target.csv', low_memory=False)

In [5]:
'''测试集'''
test_auth_info = pd.read_csv('./data/AI_Risk_BtestData_V1.0/Btest_auth_info.csv', low_memory=False)
test_bankcard_info = pd.read_csv('./data/AI_Risk_BtestData_V1.0/Btest_bankcard_info.csv', low_memory=False)
test_credit_info = pd.read_csv('./data/AI_Risk_BtestData_V1.0/Btest_credit_info.csv', low_memory=False)
test_order_info = pd.read_csv('./data/AI_Risk_BtestData_V1.0/Btest_order_info.csv', low_memory=False)
test_recieve_addr_info = pd.read_csv('./data/AI_Risk_BtestData_V1.0/Btest_recieve_addr_info.csv', low_memory=False)
test_user_info = pd.read_csv('./data/AI_Risk_BtestData_V1.0/Btest_user_info.csv', low_memory=False)
test_list = pd.read_csv('./data/AI_Risk_BtestData_V1.0/Btest_list.csv', low_memory=False)

In [6]:
def return_set(group):
    return set(group)

In [7]:
def cal_auc(list_one, list_two):
    '''计算AUC值'''
    positive = []
    negative = []
    for index in range(len(list_one)):
        if list_one[index] == 1:
            positive.append(index)
        else:
            negative.append(index)
    SUM = 0
    for i in positive:
        for j in negative:
            if list_two[i] > list_two[j]:
                SUM += 1
            elif list_two[i] == list_two[j]:
                SUM += 0.5
            else:
                pass
    return SUM / (len(positive)*len(negative))

In [8]:
'''
用户信用积分
用户额度是否为 0
用户已使用的额度
用户的信用额度
用户剩余的额度
用户额度使用率
用户额度排名
是否所有信息都为空，除了 id
是否所有信息都为 0，除了 id
信用额度是否为 0
信用积分是否为 0
用户是否还有剩余的额度
'''
def extract_credit_info(credit_info):
    '''提取credit_info表 特征'''
    credit_info['credit_score'] = credit_info['credit_score'].fillna(credit_info['credit_score'].mean())
    credit_info['quota_is_zero'] = [1 if i != 0.0 else 0 for i in credit_info.quota]  # 是否有信用额度 #
    credit_info['overdraft'] = credit_info['overdraft'].fillna(0)
    credit_info['quota'] = credit_info['quota'].fillna(0)
    credit_info['quota_surplus'] = credit_info['quota'] - credit_info['overdraft']
    # credit_info['quota_rate'] = (credit_info['overdraft'] / credit_info['quota']).fillna(0)
    credit_info['quota_rate'] = credit_info[['overdraft', 'quota']].apply(lambda x: x.overdraft / x.quota if x.quota != 0 else 0, axis=1)
    credit_info['credit_score_rank'] = credit_info['credit_score'].rank(method='first', ascending=False)

    credit_info.loc[:, 'all_is_null'] = credit_info[['credit_score', 'overdraft', 'quota']].apply(lambda x: 1 if ((x.credit_score is not np.nan) and (x.overdraft is not np.nan) and (x.quota is not np.nan)) else 0, axis=1)
    credit_info.loc[:, 'all_is_zero'] = credit_info[['credit_score', 'overdraft', 'quota']].apply(lambda x: 1 if ((x.credit_score == 0) and (x.overdraft == 0) and (x.quota == 0)) else 0, axis=1)
    credit_info.loc[:, 'quota_is_zero'] = credit_info[['quota']].apply(lambda x: 1 if x.quota == 0 else 0, axis=1)
    credit_info.loc[:, 'credit_score_is_null'] = credit_info[['credit_score']].apply(lambda x: 1 if x.credit_score == 0 else 0, axis=1)
    credit_info.loc[:, 'quota_surplus_is_null'] = credit_info[['quota_surplus', 'quota']].apply(lambda x: 1 if (x.quota_surplus == 0) and (x.quota != 0) else 0, axis=1)

    '''归一化'''
    credit_info[['credit_score', 'overdraft', 'quota', 'quota_surplus', 'credit_score_rank']] = credit_info[['credit_score', 'overdraft', 'quota', 'quota_surplus', 'credit_score_rank']].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
    return credit_info

In [9]:
'''年龄特征'''
def is_valid_date(strdate):
    '''''判断是否是一个有效的日期字符串'''
    try:
        if ":" in strdate:
            time.strptime(strdate, "%Y-%m-%d %H:%M:%S")
        else:
            time.strptime(strdate, "%Y-%m-%d")
        return True
    except:
        return False

In [10]:
'''
用户生日是否是“0000-00-00”
用户性别的 one-hot 编码
用户婚姻状况的 one-hot 编码
用户会员等级的 one-hot 编码
用户是否绑定 QQ
用户是否绑定微信号
用户学历是否是“硕士、其它、博士”
用户身份证号是否为空
用户会员收入的 one-hot 编码
'''
def extract_user_info(user_info):
    '''提取 user_info表 特征'''
    feature = user_info[['id']]
    feature.loc[:, 'birthday_is_zero'] = user_info[['birthday']].apply(lambda x: 1 if x.birthday == '0000-00-00' else 0, axis=1)
    feature.loc[:, 'sex_not_male'] = user_info[['sex']].apply(lambda x: 1 if x.sex != '女' else 0, axis=1)
    feature.loc[:, 'female'] = user_info[['sex']].apply(lambda x: 1 if x.sex == '男' else 0, axis=1)
    feature.loc[:, 'male'] = user_info[['sex']].apply(lambda x: 1 if x.sex == '女' else 0, axis=1)
    feature.loc[:, 'sex_secret'] = user_info[['sex']].apply(lambda x: 1 if x.sex == '保密' else 0, axis=1)    # 0.69504936432
    ##
    feature.loc[:, 'merriage1'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage == '未婚' else 0, axis=1)
    feature.loc[:, 'merriage2'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage == '已婚' else 0, axis=1)
    feature.loc[:, 'merriage3'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage == '保密' else 0, axis=1)
    feature.loc[:, 'merriage_is_null'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage is np.nan else 0, axis=1)   # 0.700624700466
    ####
    feature.loc[:, 'account_grade1'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '注册会员' else 0, axis=1)
    feature.loc[:, 'account_grade2'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '铜牌会员' else 0, axis=1)
    feature.loc[:, 'account_grade3'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '银牌会员' else 0, axis=1)
    feature.loc[:, 'account_grade4'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '金牌会员' else 0, axis=1)
    feature.loc[:, 'account_grade5'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '钻石会员' else 0, axis=1)
    feature.loc[:, 'account_grade_is_null'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade is np.nan else 0, axis=1)
    ###
    feature.loc[:, 'qq_bound_is_null'] = user_info[['qq_bound']].apply(lambda x: 1 if x.qq_bound is np.nan else 0, axis=1)
    feature.loc[:, 'wechat_bound_is_null'] = user_info[['wechat_bound']].apply(lambda x: 1 if x.wechat_bound is np.nan else 0, axis=1)
    feature.loc[:, 'degree'] = user_info[['degree']].apply(lambda x: 1 if (x.degree == '硕士') | (x.degree == '其他') | (x.degree == '博士') else 0, axis=1)
    feature.loc[:, 'id_card_is_null'] = user_info[['id_card']].apply(lambda x: 1 if x.id_card is np.nan else 0, axis=1)
    #####
    feature.loc[:, 'income1'] = [1 if index == '4000-5999元' else 0 for index in user_info['income']]
    feature.loc[:, 'income2'] = [1 if index == '8000元以上' else 0 for index in user_info['income']]
    feature.loc[:, 'income3'] = [1 if index == '2000-3999元' else 0 for index in user_info['income']]
    feature.loc[:, 'income4'] = [1 if index == '6000-7999元' else 0 for index in user_info['income']]
    feature.loc[:, 'income5'] = [1 if index == '2000元以下' else 0 for index in user_info['income']]     # 0.775891365882 #

    ####
    user_info['birthday_two'] = user_info[['birthday']].apply(lambda index: is_valid_date(index.birthday), axis=1)
    user_info['birthday'] = user_info[['birthday']].apply(lambda index: 0 if (index.birthday is np.nan) or (index.birthday == '0000-00-00') else index.birthday[0:4], axis=1)
    user_info['age'] = user_info[['birthday', 'birthday_two']].apply(lambda x: 2018 - int(x.birthday) if x.birthday_two is True else 0, axis=1)
    # print(user_info[['birthday_two', 'age']])
    feature.loc[:, 'age_one'] = user_info[['age']].apply(lambda x: 1 if x.age <= 18 and x.age > 0 else 0, axis=1)
    feature.loc[:, 'age_two'] = user_info[['age']].apply(lambda x: 1 if x.age <= 30 and x.age > 18 else 0, axis=1)
    feature.loc[:, 'age_three'] = user_info[['age']].apply(lambda x: 1 if x.age <= 60 and x.age > 30 else 0, axis=1)
    feature.loc[:, 'age_four'] = user_info[['age']].apply(lambda x: 1 if x.age <= 100 and x.age > 60 else 0, axis=1)
    feature.loc[:, 'age_five'] = user_info[['age']].apply(lambda x: 1 if x.age > 100 and x.age == 0 else 0, axis=1)

    return feature

In [11]:
'''
用户记录中是否有除 id 外都为空
'addr_id', 'region', 'phone', 'fix_phone', 'receiver_md5'是否同时为空
用户的记录数
用户收获地址中的省份离散特征
用户收获地址中有多少不同的省份
'''
def extract_recieve_addr_info(recieve_addr_info):
    '''提取 recieve_addr_info表 特征'''
    recieve_addr_info['all_null'] = recieve_addr_info[['addr_id', 'region', 'phone', 'fix_phone', 'receiver_md5']].apply(lambda x: 1 if (x.addr_id is np.nan) and (x.region is np.nan) and (x.phone is np.nan) and (x.fix_phone is np.nan) | (x.receiver_md5 is np.nan) else 0, axis=1)
    feature = recieve_addr_info.drop_duplicates(['id'])[['id']]
    recieve_addr_info['index'] = recieve_addr_info.index
    all_is_null = pd.pivot_table(recieve_addr_info, index='id', values='all_null', aggfunc='min').reset_index()
    addr_id = pd.pivot_table(recieve_addr_info, index='id', values='index', aggfunc='count').reset_index().rename(columns={'index': 'record_count'})
    feature = feature.merge(all_is_null, on='id', how='left')
    feature = feature.merge(addr_id, on='id', how='left')
    province = {'甘肃', '云南', '贵州', '河南', '黑龙', '香港', '北京', '湖南', '江苏', '青海', '宁夏', '内蒙', '浙江', '吉林', '海南', '福建', '重庆', '台湾', '陕西', '湖北', '江西', '辽宁', '山西', '西藏', '广东', '安徽', '四川', '河北', '山东', '上海', '广西', '新疆', '天津', 'null'}

    train_recieve_addr_info['province'] = train_recieve_addr_info[['region']].apply(lambda x: 'null' if x.region is np.nan else x.region[0:2], axis=1)
    city_set = pd.pivot_table(train_recieve_addr_info, index='id', values='province', aggfunc=return_set).reset_index()
    for string in list(province):
        city_set[string] = [1 if string in index else 0 for index in city_set['province']]
    city_set['province'] = city_set[['province']].apply(lambda x: x.province.clear() if 'null' in x.province else x.province, axis=1)
    city_set['province_len'] = [0 if index is None else len(index) for index in city_set['province']]

    feature = feature.merge(city_set.drop(['province'], axis=1), on='id', how='left')
    # print(feature)
    return feature

In [12]:
'''
用户有多少条记录
用户有多少个不同的手机号码
用户的储蓄卡的数量
用户的信用卡的数量
用户是否有信用卡
用户有几种不同类型的银行卡
用户银行卡数量是否大于 6
用户是否只有一张银行卡
'''
def extract_bankcard_info(bankcard_info):
    ''' 提取 bankcard_info表 特征 '''

    def cal_store_card_num(group):
        flag = 0
        for index in group:
            if index == '储蓄卡':
                flag += 1
        return flag

    def if_have_credit_card(group):
        for index in group:
            if index == '信用卡':
                return 1
            else:
                return 0
        return 0

    def list_set(group):
        return len(set(group))

    bankcard_info = bankcard_info.drop_duplicates()
    feature = bankcard_info.drop_duplicates(['id'])[['id']]
    card_record_count = pd.pivot_table(bankcard_info, index='id', values='phone', aggfunc='count').reset_index().rename(columns={'phone': 'card_record_count'})
    phone_count = pd.pivot_table(bankcard_info, index='id', values='phone', aggfunc=list_set).reset_index().rename(columns={'phone': 'phone_count'})
    store_card_count = pd.pivot_table(bankcard_info, index='id', values='card_type', aggfunc=cal_store_card_num).reset_index().rename(columns={'card_type': 'store_card_count'})
    have_credit_card = pd.pivot_table(bankcard_info, index='id', values='card_type', aggfunc=if_have_credit_card).reset_index().rename(columns={'card_type': 'have_credit_card'})
    card_category_count = pd.pivot_table(bankcard_info, index='id', values='card_type', aggfunc=list_set).reset_index().rename(columns={'card_type': 'card_category_count'})

    feature = feature.merge(phone_count, on='id', how='left')
    feature = feature.merge(card_record_count, on='id', how='left')
    feature = feature.merge(store_card_count, on='id', how='left')
    feature = feature.merge(have_credit_card, on='id', how='left')
    feature = feature.merge(card_category_count, on='id', how='left')
    feature['credit_count'] = feature['card_record_count'] - feature['store_card_count']
    feature['card_count_one'] = feature[['card_record_count']].apply(lambda x: 1 if x.card_record_count > 6 else 0, axis=1)
    feature['record_is_unique'] = feature[['card_record_count']].apply(lambda x: 1 if x.card_record_count == 1 else 0, axis=1)
    # print(feature)

    return feature

In [13]:
'''
身份证账号是否为空
认证时间是否为空
电话号码是否为空
是否所有信息都为空，除了 id
是否所有信息都不为空
认证时间和身份证是否同时为空
认证时间和手机号码是否同时为空
'''
def extract_auth_info(auth_info):
    '''提取 auth_info表 特征'''
    feature = auth_info[['id']]
    feature.loc[:, 'auth_id_card_is_null'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card is not np.nan else 0, axis=1)
    feature.loc[:, 'auth_time_is_null'] = auth_info[['auth_time']].apply(lambda x: 1 if x.auth_time is not np.nan else 0, axis=1)
    feature.loc[:, 'phone_is_null'] = auth_info[['phone']].apply(lambda x: 1 if x.phone is not np.nan else 0, axis=1)
    feature.loc[:, 'all_is_null'] = auth_info[['id_card', 'auth_time', 'phone']].apply(lambda x: 1 if ((x.id_card is np.nan) and (x.auth_time is np.nan) and (x.phone is np.nan)) else 0, axis=1)
    feature.loc[:, 'all_not_null'] = auth_info[['id_card', 'auth_time', 'phone']].apply(lambda x: 1 if ((x.id_card is not np.nan) and (x.auth_time is not np.nan) and (x.phone is not np.nan)) else 0, axis=1)
    feature.loc[:, 'card_time_is_null'] = auth_info[['id_card', 'auth_time']].apply(lambda x: 1 if ((x.id_card is np.nan) and (x.auth_time is np.nan)) else 0, axis=1)
    feature.loc[:, 'time_phone_is_null'] = auth_info[['auth_time', 'phone']].apply(lambda x: 1 if ((x.phone is np.nan) and (x.auth_time is np.nan)) else 0, axis=1)
    
    return feature

In [14]:
'''
用户记录是否有除 id 外都为空
用户关于商品单价的统计特征
用户关于订单金额的统计特征
支付方式的离散特征
订单状态的离散特征
'''
def extract_order_info(order_info):
    '''提取 order_info表 特征'''
    def cal_set(group):
        return len(set(group))

    '''求标准差'''
    def cal_std(group):
        return np.std(group)

    feature = order_info.drop_duplicates(['id'])[['id']]
    # amt_order, type_pay, time_order, sts_order, phone, unit_price, no_order_md5, name_rec_md5, product_id_md5
    order_info['order_all_is_null'] = order_info.apply(lambda x: 1 if ((x.amt_order is np.nan) and (x.type_pay is np.nan) and (x.time_order is np.nan) and (x.sts_order is np.nan)) else 0, axis=1)
    order_all_is_null = pd.pivot_table(order_info[['id', 'order_all_is_null']], index='id', values='order_all_is_null', aggfunc='max').reset_index()

    '''均值填充amt_order属性'''
    order_info_amt = order_info[['amt_order']]
    order_info_amt = order_info_amt[order_info_amt['amt_order'].notnull()]
    order_info_amt = order_info_amt[order_info_amt['amt_order'] != 'null']
    order_info_amt['amt_order'] = [float(index) for index in order_info_amt['amt_order']]
    mean = order_info_amt['amt_order'].mean()
    order_info['amt_order'] = order_info['amt_order'].fillna(mean)
    order_info['amt_order'] = [mean if index == 'null' else index for index in order_info['amt_order']]
    order_info['amt_order'] = [float(index) for index in order_info['amt_order']]

    order_info['unit_price'] = order_info[['amt_order', 'unit_price']].apply(lambda x: x.amt_order if np.isnan(x.unit_price) else x.unit_price, axis=1)
    unit_price_mean = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc='mean').reset_index().rename(columns={'unit_price': 'unit_price_mean'})
    unit_price_max = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc='max').reset_index().rename(columns={'unit_price': 'unit_price_max'})
    unit_price_min = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc='min').reset_index().rename(columns={'unit_price': 'unit_price_min'})
    unit_price_std = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc=cal_std).reset_index().rename(columns={'unit_price': 'unit_price_std'})

    amt_order_mean = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc='mean').reset_index().rename(columns={'amt_order': 'amt_order_mean'})
    amt_order_max = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc='max').reset_index().rename(columns={'amt_order': 'amt_order_max'})
    amt_order_min = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc='min').reset_index().rename(columns={'amt_order': 'amt_order_min'})
    amt_order_std = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc=cal_std).reset_index().rename(columns={'amt_order': 'amt_order_std'})
    type_pay_count = pd.pivot_table(order_info[['id', 'type_pay']], index='id', values='type_pay', aggfunc=cal_set).reset_index().rename(columns={'type_pay': 'type_pay_count'})
    sts_order_count = pd.pivot_table(order_info[['id', 'sts_order']], index='id', values='sts_order', aggfunc=cal_set).reset_index().rename(columns={'sts_order': 'sts_order_count'})
    order_phone_count = pd.pivot_table(order_info[['id', 'phone']], index='id', values='phone', aggfunc=cal_set).reset_index().rename(columns={'phone': 'order_phone_count'})
    name_rec_md5_count = pd.pivot_table(order_info[['id', 'name_rec_md5']], index='id', values='name_rec_md5', aggfunc=cal_set).reset_index().rename(columns={'name_rec_md5': 'name_rec_md5_count'})

    feature = feature.merge(unit_price_mean, on='id', how='left')
    feature = feature.merge(unit_price_max, on='id', how='left')
    feature = feature.merge(unit_price_min, on='id', how='left')
    feature = feature.merge(unit_price_std, on='id', how='left')

    feature = feature.merge(order_all_is_null, on='id', how='left')
    feature = feature.merge(amt_order_mean, on='id', how='left')
    feature = feature.merge(amt_order_max, on='id', how='left')
    feature = feature.merge(amt_order_min, on='id', how='left')
    feature = feature.merge(amt_order_std, on='id', how='left')
    feature = feature.merge(type_pay_count, on='id', how='left')
    feature = feature.merge(sts_order_count, on='id', how='left')
    feature = feature.merge(order_phone_count, on='id', how='left')
    feature = feature.merge(name_rec_md5_count, on='id', how='left')
    '''归一化'''
    feature.iloc[:, 1:] = feature.iloc[:, 1:].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))   # 0.791859501859 #
    '''离散化特征'''
    order_info['type_pay'] = order_info[['type_pay']].apply(lambda x: 'null' if x.type_pay is np.nan else x.type_pay, axis=1)
    type_pay = pd.pivot_table(order_info, index='id', values='type_pay', aggfunc=return_set).reset_index()
    
    type_pay_category = {'定向京券支付', '白条支付', '在线+余额+限品东券', '高校代理-代理支付', '京券全额支付', '分期付款', '积分支付', '在线+限品东券', '定向东券', '东券混合支付', '余额', '京豆东券混合支付', '前台自付', '在线', '在线+东券支付', '上门自提', '公司转账', '在线支付', '在线支付 ', '在线+京豆', '邮局汇款', '在线+全品京券', '货到付款', '分期付款(招行)', '在线+全品东券', '余额+限品东券', '在线+京券支付', '在线+余额', '限品京券', 'null', '京豆支付', '在线预付', '定向京券', '混合支付', '全品京券', '京豆', '在线+定向东券', '京豆混合支付', '在线+限品京券', '高校代理-自己支付', '京券混合支付', '在线+东券'}
    for string in list(type_pay_category):
        type_pay[string] = [1 if string in index else 0 for index in type_pay['type_pay']]

    type_pay['type_pay'] = type_pay[['type_pay']].apply(lambda x: x.type_pay.clear() if 'null' in x.type_pay else x.type_pay, axis=1)
    type_pay['type_pay_len'] = [0 if index is None else len(index) for index in type_pay['type_pay']]
    feature = feature.merge(type_pay.drop(['type_pay'], axis=1), on='id', how='left')

    '''sts_order离散化'''
    order_info['sts_order'] = order_info[['sts_order']].apply(lambda x: 'null' if x.sts_order is np.nan else x.sts_order, axis=1)
    # sts_order_category = set(train_order_info['sts_order'])
    sts_order = pd.pivot_table(order_info, index='id', values='sts_order', aggfunc=return_set).reset_index()
    sts_order_category = {'null', '等待审核', '等待处理', '已退款', '已收货', '购买成功', '付款成功', '失败退款', '已完成', '预订结束', '退款完成', '正在出库', '订单已取消', '充值成功', '商品出库', '下单失败', '请上门自提', '已晒单', '充值失败;退款成功',
                          '退款成功', '未入住', '等待收货', '配送退货', '出票失败', '等待付款确认', '缴费成功', '预约完成', '未抢中', '完成', '已取消', '出票成功', '抢票已取消', '等待付款', '已取消订单', '正在处理', '等待退款', '充值失败', '订单取消'}

    for string in list(sts_order_category):
        sts_order[string] = [1 if string in index else 0 for index in sts_order['sts_order']]

    sts_order['sts_order'] = sts_order[['sts_order']].apply(lambda x: x.sts_order.clear() if 'null' in x.sts_order else x.sts_order, axis=1)
    sts_order['sts_order_len'] = [0 if index is None else len(index) for index in sts_order['sts_order']]
    # print(sts_order)
    feature = feature.merge(sts_order.drop(['sts_order'], axis=1), on='id', how='left')

    # print(feature)
    return feature

In [15]:
'''
用户年龄
用户注册天数
用户借贷日期是否早于注册日期
下订单时间与注册时间的天数差的最大、最小、平均
'''
def extract_time_feature(auth_info, target_list):
    '''提取时间相关特征'''
    feature = target_list[['id']]
    target_list = target_list[['id', 'appl_sbm_tm']].merge(auth_info[['id', 'auth_time']], on='id', how='left')
    target_list.loc[:, 'appl_sbm_tm'] = [index.split(' ')[0] for index in target_list['appl_sbm_tm']]
    target_list['auth_time'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm if x.auth_time == '0000-00-00' else x.auth_time, axis=1)
    target_list['auth_time'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm if x.auth_time is np.nan else x.auth_time, axis=1)
    feature['feature_1'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: 1 if x.appl_sbm_tm < x.auth_time else 0, axis=1)
    feature['register_days'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: (datetime(int(x.appl_sbm_tm.split('-')[0]), int(x.appl_sbm_tm.split('-')[1]), int(x.appl_sbm_tm.split('-')[2])) - datetime(int(x.auth_time.split('-')[0]), int(x.auth_time.split('-')[1]), int(x.auth_time.split('-')[2]))).days, axis=1)
    # print(target_list)
    # print(feature)
    return feature

In [16]:
def extract_order_payment_time(order_info, target_list):
    str_len = len('2016-01-19 22:38:26')
    feature = target_list[['id']]
    target_list = target_list[['id', 'appl_sbm_tm']].merge(order_info[['id', 'time_order']], on='id', how='left')
    target_list.loc[:, 'appl_sbm_tm'] = [index.split(' ')[0] for index in target_list['appl_sbm_tm']]
    target_list['time_order'] = target_list[['appl_sbm_tm', 'time_order']].apply(lambda x: x.appl_sbm_tm if x.time_order is np.nan else x.time_order, axis=1)
    target_list['time_order'] = target_list[['appl_sbm_tm', 'time_order']].apply(lambda x: x.appl_sbm_tm if len(x.time_order) != str_len else x.time_order, axis=1)
    target_list.loc[:, 'time_order'] = [index.split(' ')[0] for index in target_list['time_order']]
    target_list['days'] = target_list[['appl_sbm_tm', 'time_order']].apply(lambda x: (datetime(int(x.appl_sbm_tm.split('-')[0]), int(x.appl_sbm_tm.split('-')[1]), int(x.appl_sbm_tm.split('-')[2])) - datetime(int(x.time_order.split('-')[0]), int(x.time_order.split('-')[1]), int(x.time_order.split('-')[2]))).days, axis=1)
    print(target_list)
    day_mean = pd.pivot_table(target_list, index='id', values='days', aggfunc='mean').reset_index().rename(columns={'days': 'day_mean'})
    day_max = pd.pivot_table(target_list, index='id', values='days', aggfunc='max').reset_index().rename(columns={'days': 'day_max'})
    day_min = pd.pivot_table(target_list, index='id', values='days', aggfunc='min').reset_index().rename(columns={'days': 'day_min'})
    order_record_count = pd.pivot_table(target_list, index='id', values='days', aggfunc='count').reset_index().rename(columns={'days': 'order_record_count'})
    feature = feature.merge(day_mean, on='id', how='left')
    feature = feature.merge(day_max, on='id', how='left')
    feature = feature.merge(day_min, on='id', how='left')
    feature = feature.merge(order_record_count, on='id', how='left')     # 记录数 #
    feature.loc[:, 'order_record_unique'] = [1 if index == 1 else 0 for index in feature['order_record_count']]     # 记录数是否唯一 #
    print(feature)
    return feature

In [17]:
'''Logistic Regression'''
def train_LR_module(store_result=False, store_feature=False, select_feature=False, feature_num='all', OneEncode=False):
    train_feature = pd.read_csv('train_feature.csv', encoding='utf-8')
    validate_feature = pd.read_csv('validate_feature.csv', encoding='utf-8')
    test_feature = pd.read_csv('test_feature.csv', encoding='utf-8')
    train_test_feature = pd.read_csv('train_test_feature.csv', encoding='utf-8')
    print('读取数据完毕。。。')

    validate_label = validate_feature[['target']]
    train_label = train_feature[['target']]
    train_test_label = train_test_feature[['target']]

    train_feature = train_feature.iloc[:, 2:]
    test_feature = test_feature.iloc[:, 1:]
    validate_feature = validate_feature.iloc[:, 2:]
    train_test_feature = train_test_feature.iloc[:, 2:]

    if OneEncode is True:
        features = list(train_feature.columns)
        one_hot = []
        continuous_feature = []
        for name in features:
            if len(set(train_feature[name])) == 2:
                one_hot.append(name)
            else:
                continuous_feature.append(name)

        feature = one_hot[:140] + continuous_feature
        train_feature = train_feature[feature]
        validate_feature = validate_feature[feature]
        test_feature = test_feature[feature]
        train_test_feature = train_test_feature[feature]

    if select_feature is True:
        print('开始特征选择。。。')
        ch2 = SelectKBest(chi2, k=feature_num)
        train_feature = ch2.fit_transform(train_feature, train_label)
        test_feature = ch2.transform(test_feature)
        validate_feature = ch2.transform(validate_feature)
        train_test_feature = ch2.transform(train_test_feature)
        print('特征选择完毕。。。')
    else:
        feature_num = train_feature.shape[1]

    print('开始训练logisticRegression模型。。。')
    module = LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42, n_jobs=4)  # , solver='sag'
    # module = lgb.LGBMClassifier(
    #     num_leaves=64,  # num_leaves = 2^max_depth * 0.6 #
    #     max_depth=6,
    #     n_estimators=80,
    #     learning_rate=0.1
    # )
    '''训练集'''
    module.fit(train_feature, train_label)

    if store_result is True:
        '''测试训练集'''
        module_two = LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42, n_jobs=4)
        # module_two = lgb.LGBMClassifier(
        #     num_leaves=64,  # num_leaves = 2^max_depth * 0.6 #
        #     max_depth=6,
        #     n_estimators=80,
        #     learning_rate=0.1
        # )
        module_two.fit(train_test_feature, train_test_label)

        result = module_two.predict_proba(test_feature)[:, 1]
        result = pd.DataFrame(result)
        result.columns = ['predicted_score']
        sample = test_list[['id']]
        sample['predicted_score'] = [index for index in result['predicted_score']]
        sample.columns = ['ID', 'PROB']
        sample.to_csv('lr_sample.csv', index=None)
        # sample.to_csv(r'lgb_sample.csv', index=None)
        print(sample)
        print('结果已更新。。。')

    print(" Score_offline:", roc_auc_score(validate_label, module.predict_proba(validate_feature)[:, 1]))
    print('特征维数：', feature_num)

In [18]:
class FeatureSelection(object):
    def __init__(self, feature_num):
        self.feature_num = feature_num
        self.train_test, self.label, self.test = self.read_data()    # features #
        self.feature_name = list(self.train_test.columns)     # feature name #

    def read_data(self):
        test = pd.read_csv('test_feature.csv', encoding='utf-8')
        train_test = pd.read_csv('train_test_feature.csv', encoding='utf-8')
        train_test = train_test.drop(['feature_1', 'register_days', 'id_card_one', 'id_card_two', 'id_card_three', 'id_card_four', 'id_card_five', 'id_card_six', 'mobile', 'unicom', 'telecom', 'virtual'], axis=1)
        print('读取数据完毕。。。')
        label = train_test[['target']]
        test = test.iloc[:, 1:]
        train_test = train_test.iloc[:, 2:]
        return train_test, label, test

    def variance_threshold(self):
        sel = VarianceThreshold()
        sel.fit_transform(self.train_test)
        feature_var = list(sel.variances_)    # feature variance #
        features = dict(zip(self.feature_name, feature_var))
        features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
        # print(features)   # 100 cols #
        return set(features)   # return set type #

    def select_k_best(self):
        ch2 = SelectKBest(chi2, k=self.feature_num)
        ch2.fit(self.train_test, self.label)
        feature_var = list(ch2.scores_)  # feature scores #
        features = dict(zip(self.feature_name, feature_var))
        features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
        # print(features)     # 100 cols #
        return set(features)    # return set type #

    def svc_select(self):
        svc = SVC(kernel='rbf', C=1, random_state=2018)    # linear #
        rfe = RFE(estimator=svc, n_features_to_select=self.feature_num, step=1)
        rfe.fit(self.train_test, self.label.ravel())
        print(rfe.ranking_)
        return rfe.ranking_

    def tree_select(self):
        clf = ExtraTreesClassifier(n_estimators=300, max_depth=7, n_jobs=4)
        clf.fit(self.train_test, self.label)
        feature_var = list(clf.feature_importances_)  # feature scores #
        features = dict(zip(self.feature_name, feature_var))
        features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
        # print(features)     # 100 cols #
        return set(features)  # return set type #

    def return_feature_set(self, variance_threshold=False, select_k_best=False, svc_select=False, tree_select=False):
        names = set([])
        if variance_threshold is True:
            name_one = self.variance_threshold()
            names = names.union(name_one)
        if select_k_best is True:
            name_two = self.select_k_best()
            names = names.intersection(name_two)
        if svc_select is True:
            name_three = self.svc_select()
            names = names.intersection(name_three)
        if tree_select is True:
            name_four = self.tree_select()
            names = names.intersection(name_four)

        # print(len(names))
        print(names)
        return list(names)

In [19]:
def train_xgb_module(features_name, store_result=False):
    '''训练模型'''
    train_feature = pd.read_csv('train_feature.csv', encoding='utf-8')
    validate_feature = pd.read_csv('validate_feature.csv', encoding='utf-8')
    test_feature = pd.read_csv('test_feature.csv', encoding='utf-8')
    train_test_feature = pd.read_csv('train_test_feature.csv', encoding='utf-8')

    print('读取数据完毕。。。')

    validate_label = validate_feature[['target']]
    train_label = train_feature[['target']]
    train_test_label = train_test_feature[['target']]

    train_feature = train_feature[features_name]
    test_feature = test_feature[features_name]
    validate_feature = validate_feature[features_name]
    train_test_feature = train_test_feature[features_name]

    print('开始训练xgboost模型。。。')
    '''xgboost分类器'''
    num_round = 500    # 迭代次数 #
    params = {
        'booster': 'gbtree',
        'max_depth': 4,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'eta': 0.03,
        'silent': 1,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'min_child_weight': 1,
        'scale_pos_weight': 1,
        'seed': 27,
        'reg_alpha': 0.01
    }
    '''训练集'''
    dtrain = xgb.DMatrix(train_feature, label=train_label)
    validate_feature = xgb.DMatrix(validate_feature)
    module = xgb.train(params, dtrain, num_round)

    if store_result is True:
        '''测试训练集'''
        dtrain_two = xgb.DMatrix(train_test_feature, label=train_test_label)
        test_feature = xgb.DMatrix(test_feature)
        module_two = xgb.train(params, dtrain_two, num_round)

        features = module_two.get_fscore()
        features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-20:]
        features.reverse()
        print(features)       # 输出特征重要性 #

        result = module_two.predict(test_feature)
        result = pd.DataFrame(result)
        result.columns = ['predicted_score']
        test_list = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_list.csv', low_memory=False)
        sample = test_list[['id']]
        sample['predicted_score'] = [index for index in result['predicted_score']]
        sample.columns = ['ID', 'PROB']
        sample.to_csv(r'xgb_sample.csv', index=None)
        print(sample)
        print('结果已更新。。。')

    print(" Score_offline:", roc_auc_score(validate_label, module.predict(validate_feature)))
    print('特征维数：', len(features_name))

In [20]:
def train_lr_module(features_name, store_result=False):
    train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8')
    validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8')
    test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8')
    train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
    print('读取数据完毕。。。')

    validate_label = validate_feature[['target']]
    train_label = train_feature[['target']]
    train_test_label = train_test_feature[['target']]

    train_feature = train_feature[features_name]
    test_feature = test_feature[features_name]
    validate_feature = validate_feature[features_name]
    train_test_feature = train_test_feature[features_name]

    print('开始训练logisticRegression模型。。。')
    module = LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42, n_jobs=4)  # , solver='sag'
    # module = lgb.LGBMClassifier(
    #     num_leaves=64,  # num_leaves = 2^max_depth * 0.6 #
    #     max_depth=6,
    #     n_estimators=80,
    #     learning_rate=0.1
    # )
    '''训练集'''
    module.fit(train_feature, train_label)

    if store_result is True:
        '''测试训练集'''
        module_two = LogisticRegression(
            penalty='l2',
            solver='sag',
            max_iter=500,
            random_state=42,
            n_jobs=4
        )

        # module_two = lgb.LGBMClassifier(
        #     num_leaves=64,  # num_leaves = 2^max_depth * 0.6 #
        #     max_depth=6,
        #     n_estimators=80,
        #     learning_rate=0.1
        # )
        module_two.fit(train_test_feature, train_test_label)

        result = module_two.predict_proba(test_feature)[:, 1]
        result = pd.DataFrame(result)
        result.columns = ['predicted_score']
        test_list = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_list.csv', low_memory=False)
        sample = test_list[['id']]
        sample['predicted_score'] = [index for index in result['predicted_score']]
        sample.columns = ['ID', 'PROB']
        sample.to_csv(r'lr_sample.csv', index=None)
        # sample.to_csv(r'lgb_sample.csv', index=None)
        print(sample)
        print('结果已更新。。。')

    print(" Score_offline:", roc_auc_score(validate_label, module.predict_proba(validate_feature)[:, 1]))
    print('特征维数：', len(features_name))

In [21]:
'''划分数据集'''
train_target['date'] = [index.replace('-', '') for index in train_target['appl_sbm_tm']]
train_target['date'] = [index.split(' ')[0][0:6] for index in train_target['date']]
'''验证集'''
validate_data = train_target[(train_target['date'] == '201704')][['target', 'id']]
'''训练集'''
train_data = train_target[(train_target['date'] >= '201603') & (train_target['date'] <= '201703')][['target', 'id']]
'''测试集'''
test_data = test_list[['id']]
'''测试训练集'''
train_test_data = train_target[['target', 'id']]

In [22]:
def extract_feature():
    '''credit_info'''
    train_credit_info_feature = extract_credit_info(train_credit_info)
    train_test_feature = train_test_data.merge(train_credit_info_feature, on='id', how='left')    # 训练测试集 #
    train_feature = train_data.merge(train_credit_info_feature, on='id', how='left')
    validate_feature = validate_data.merge(train_credit_info_feature, on='id', how='left')
    test_feature = test_data.merge(extract_credit_info(test_credit_info), on='id', how='left')

    '''order_info'''
    train_order_info_feature = extract_order_info(train_order_info)
    train_feature = train_feature.merge(train_order_info_feature, on='id', how='left')
    train_test_feature = train_test_feature.merge(train_order_info_feature, on='id', how='left')  # 训练测试集 #
    validate_feature = validate_feature.merge(train_order_info_feature, on='id', how='left')
    test_feature = test_feature.merge(extract_order_info(test_order_info), on='id', how='left')

    '''user_info'''
    train_user_info_feature = extract_user_info(train_user_info)
    train_feature = train_feature.merge(train_user_info_feature, on='id', how='left')
    train_test_feature = train_test_feature.merge(train_user_info_feature, on='id', how='left')    # 训练测试集 #
    validate_feature = validate_feature.merge(train_user_info_feature, on='id', how='left')
    test_feature = test_feature.merge(extract_user_info(test_user_info), on='id', how='left')

    '''recieve_addr_info'''
    train_recieve_addr_info_feature = extract_recieve_addr_info(train_recieve_addr_info)
    train_feature = train_feature.merge(train_recieve_addr_info_feature, on='id', how='left')
    train_test_feature = train_test_feature.merge(train_recieve_addr_info_feature, on='id', how='left')  # 训练测试集 #
    validate_feature = validate_feature.merge(train_recieve_addr_info_feature, on='id', how='left')
    test_feature = test_feature.merge(extract_recieve_addr_info(test_recieve_addr_info), on='id', how='left')

    '''bankcard_info'''
    train_bankcard_info_feature = extract_bankcard_info(train_bankcard_info)
    train_feature = train_feature.merge(train_bankcard_info_feature, on='id', how='left')
    train_test_feature = train_test_feature.merge(train_bankcard_info_feature, on='id', how='left')  # 训练测试集 #
    validate_feature = validate_feature.merge(train_bankcard_info_feature, on='id', how='left')
    test_feature = test_feature.merge(extract_bankcard_info(test_bankcard_info), on='id', how='left')

    '''auth_info'''
    train_auth_info_feature = extract_auth_info(train_auth_info)
    train_feature = train_feature.merge(train_auth_info_feature, on='id', how='left').fillna(0)
    train_test_feature = train_test_feature.merge(train_auth_info_feature, on='id', how='left').fillna(0)  # 训练测试集 #
    validate_feature = validate_feature.merge(train_auth_info_feature, on='id', how='left').fillna(0)
    test_feature = test_feature.merge(extract_auth_info(test_auth_info), on='id', how='left').fillna(0)

    '''time relative features one'''
    train_time_feature = extract_time_feature(train_auth_info, train_target)
    train_feature = train_feature.merge(train_time_feature, on='id', how='left').fillna(0)
    train_test_feature = train_test_feature.merge(train_time_feature, on='id', how='left').fillna(0)  # 训练测试集 #
    validate_feature = validate_feature.merge(train_time_feature, on='id', how='left').fillna(0)
    test_feature = test_feature.merge(extract_time_feature(test_auth_info, test_list), on='id', how='left').fillna(0)

    '''time relative features two'''
    train_order_payment_time = extract_order_payment_time(train_order_info, train_target)
    train_feature = train_feature.merge(train_order_payment_time, on='id', how='left').fillna(0)
    train_test_feature = train_test_feature.merge(train_order_payment_time, on='id', how='left').fillna(0)  # 训练测试集 #
    validate_feature = validate_feature.merge(train_order_payment_time, on='id', how='left').fillna(0)
    test_feature = test_feature.merge(extract_order_payment_time(test_order_info, test_list), on='id', how='left').fillna(0)

    print(train_feature.head(5))
    print(validate_feature.head(5))
    print(test_feature.head(5))
    return train_feature, validate_feature, test_feature, train_test_feature

In [23]:
def train_module(store_result=False, store_feature=False, select_feature=False, feature_num='all', one_encode=False):
    '''训练模型'''
    if store_feature is True:
        train_feature, validate_feature, test_feature, train_test_feature = extract_feature()
        ''' 保存特征数据 '''
        train_feature.to_csv(r'train_feature.csv', index=None, encoding='utf-8')
        validate_feature.to_csv(r'validate_feature.csv', index=None, encoding='utf-8')
        test_feature.to_csv(r'test_feature.csv', index=None, encoding='utf-8')
        train_test_feature.to_csv(r'train_test_feature.csv', index=None, encoding='utf-8')
        print('保存数据完毕。。。')

        print('特征提取完毕。。。')
        exit(0)
    else:
        train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8')
        validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8')
        test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8')
        train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
        print('读取数据完毕。。。')

    validate_label = validate_feature[['target']]
    train_label = train_feature[['target']]
    train_test_label = train_test_feature[['target']]

    train_feature = train_feature.iloc[:, 2:]
    test_feature = test_feature.iloc[:, 1:]
    validate_feature = validate_feature.iloc[:, 2:]
    train_test_feature = train_test_feature.iloc[:, 2:]

    train_feature = train_feature.drop(['feature_1', 'register_days'], axis=1)
    test_feature = test_feature.drop(['feature_1', 'register_days'], axis=1)
    validate_feature = validate_feature.drop(['feature_1', 'register_days'], axis=1)
    train_test_feature = train_test_feature.drop(['feature_1', 'register_days'], axis=1)

    if one_encode is True:
        features = list(train_feature.columns)
        continuous_feature = []
        one_hot = []
        for name in features:
            if len(set(train_feature[name])) != 2:
                continuous_feature.append(name)
            else:
                one_hot.append(name)

        feature = continuous_feature + one_hot[:130]
        train_feature = train_feature[feature]
        validate_feature = validate_feature[feature]
        test_feature = test_feature[feature]
        train_test_feature = train_test_feature[feature]

    if select_feature is True:
        print('开始特征选择。。。')
        ch2 = SelectKBest(chi2, k=feature_num)
        train_feature = ch2.fit_transform(train_feature, train_label)
        test_feature = ch2.transform(test_feature)
        validate_feature = ch2.transform(validate_feature)
        train_test_feature = ch2.transform(train_test_feature)
        print('特征选择完毕。。。')
    else:
        feature_num = train_feature.shape[1]

    print('开始训练xgboost模型。。。')
    '''xgboost分类器'''
    num_round = 500    # 迭代次数 #
    params = {
        'booster': 'gbtree',
        'max_depth': 4,
        'colsample_bytree': 0.6,
        'subsample': 0.7,
        'eta': 0.03,
        'silent': 1,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        # 'min_child_weight': 1,
        'scale_pos_weight': 1,
        # 'seed': 27,
        # 'reg_alpha': 0.01
    }
    '''训练集'''
    dtrain = xgb.DMatrix(train_feature, label=train_label)
    validate_feature = xgb.DMatrix(validate_feature)
    module = xgb.train(params, dtrain, num_round)

    if store_result is True:
        '''测试训练集'''
        dtrain_two = xgb.DMatrix(train_test_feature, label=train_test_label)
        test_feature = xgb.DMatrix(test_feature)
        module_two = xgb.train(params, dtrain_two, num_round)

        result = module_two.predict(test_feature)
        result = pd.DataFrame(result)
        result.columns = ['predicted_score']
        sample = test_list[['id']]
        sample['predicted_score'] = [index for index in result['predicted_score']]
        sample.columns = ['ID', 'PROB']
        sample.to_csv(r'xgb_sample.csv', index=None)
        print(sample)
        print('结果已更新。。。')

    print(" Score_offline:", roc_auc_score(validate_label, module.predict(validate_feature)))
    print('特征维数：', feature_num)

In [24]:
''' 模型融合 '''
def module_merge_triple(prob_xgb, prob_lr, prob_lgb):
    xgb_sample = pd.read_csv(r'result_xgb.csv', low_memory=False)   # encode:159:0.790297834417
    lr_sample = pd.read_csv(r'lr_sample.csv', low_memory=False)     # Uncode:0.792171452209
    lgb_sample = pd.read_csv(r'xgb_sample_51.csv', low_memory=False)

    xgb_sample.columns = ['ID', 'PROB_xgb']
    lr_sample.columns = ['ID', 'PROB_lr']
    lgb_sample.columns = ['ID', 'PROB_lgb']
    sample = xgb_sample.merge(lr_sample, on='ID', how='left')
    sample = sample.merge(lgb_sample, on='ID', how='left')
    # print(sample)
    sample['PROB'] = sample['PROB_xgb'] * prob_xgb + sample['PROB_lr'] * prob_lr + sample['PROB_lgb'] * prob_lgb
    sample = sample[['ID', 'PROB']]
    print(sample)
    sample.to_csv(r'sample.csv', index=None)
    print('模型已融合。。。')

In [25]:
def module_merge_double(prob_x, prob_l):
    xgb_sample = pd.read_csv(r'result0501_152.csv', low_memory=False)   # encode:159:0.790297834417
    lr_sample = pd.read_csv(r'xgb_sample_51.csv', low_memory=False)     # Uncode:0.792171452209
    sample = xgb_sample.merge(lr_sample, on='ID', how='left')
    sample['PROB'] = sample['PROB_x'] * prob_x + sample['PROB_y'] * prob_l
    sample = sample[['ID', 'PROB']]
    print(sample)
    sample.to_csv(r'sample.csv', index=None)
    print('模型已融合。。。')

In [26]:
def main():
    '''xgboost单模型'''
    train_module(store_result=False, store_feature=True, select_feature=False, feature_num='all', one_encode=False)

    '''LogisticRegression单模型'''
    # train_LR_module(store_result=False, select_feature=True, feature_num=140, OneEncode=False)
    '''线性融合三个sample'''
    # module_merge_triple(prob_xgb=0.4, prob_lr=0.2, prob_lgb=0.4)
    '''现行融合两个sample'''
    # module_merge_double(prob_x=0.5, prob_l=0.5)
    '''Stacking'''
    # # ensemble = Ensemble(5, xgb_module, [xgb_module, lgb_module, lr_module, rf_module, gb_module])
    # ensemble = Ensemble(4, lr_module, [xgb_module, xgb_module, xgb_module, xgb_module])
    # train_test, label, test = ensemble.read_data()
    # result = ensemble.fit_predict(train_test, label, test)
    # print('模型融合完毕。。。')
    # result = pd.DataFrame(result, columns=['PROB'])
    # sample = pd.read_csv(r'lr_sample.csv', low_memory=False)
    # sample['PROB'] = [index for index in result['PROB']]
    # sample.to_csv(r'stacking.csv', index=None)
    # print(sample)
    # print('数据整合完毕。。。')

    '''multiply_feature_selection  xgboost_module'''
    features_name = ['order_all_is_null', 'feature_1', 'register_days', 'quota', 'quota_surplus',  'all_is_null_y',  'account_grade_is_null', 'all_is_zero', 'account_grade2', 'age_three', 'type_pay_len', 'null_y', '等待付款', 'income1', 'auth_time_is_null', 'record_count', 'qq_bound_is_null', 'card_record_count', 'quota_is_zero', '新疆', '云南', 'account_grade3', '广东', 'card_time_is_null', 'have_credit_card', '充值成功', '已取消', 'credit_count', '在线', '四川', 'wechat_bound_is_null', 'null', 'credit_score_rank', '未抢中', 'null_x', '完成', '天津', 'age_two', 'female', '订单取消', 'quota_rate', '山东', '重庆', 'sts_order_len', 'merriage1', '福建', 'account_grade1', 'phone_count', 'record_is_unique', '上海', 'income3', '湖北', 'phone_is_null', 'time_phone_is_null', 'province_len', 'birthday_is_zero', '混合支付', 'auth_id_card_is_null', 'credit_score', '江西', '货到付款', '吉林', 'credit_score_is_null', '江苏', 'all_not_null', 'sex_secret', '已完成', 'card_category_count', 'card_count_one', '等待收货', '湖南', 'male', 'store_card_count']
    train_xgb_module(features_name, store_result=True)

    # 0.81882083452 seed=27
    # original   ->    0.816853963449
    # colsample_bytree: 0.8   ->  0.818427843445
    # scale_pos_weight: 16   ->   0.82029535496
    # reg_alpha: 0.01  ->   0.820431061402
    # 'quota', 'quota_surplus',  ->   0.820543215061

    '''multiply_feature_selection  LogisticRegression_module'''
    features_name = ['order_all_is_null', 'feature_1', 'record_is_unique', '浙江', '辽宁', 'card_time_is_null', 'income1', 'account_grade2', '黑龙', '江苏', '未抢中', '山东', '内蒙', '上海', '分期付款', '货到付款', 'overdraft', '公司转账', 'null', '订单取消', 'age_two', '充值成功', '在线', '新疆', '完成', 'quota_rate', 'sex_not_male', '湖北', 'quota', 'account_grade_is_null', '安徽', 'card_category_count', 'all_not_null', 'phone_is_null', '河北', 'merriage_is_null', '混合支付', 'quota_surplus_is_null', 'birthday_is_zero', 'income3', '江西', 'store_card_count', 'time_phone_is_null', 'id_card_is_null', 'auth_id_card_is_null', '已取消', '广东', 'record_count', '云南', '等待付款', '已完成', 'card_count_one', 'type_pay_len', 'female', 'sts_order_len', '福建', 'auth_time_is_null', '在线支付', 'null_x', 'income2', 'quota_is_zero', 'credit_score_is_null', 'account_grade3', '四川', '等待审核', '重庆', '河南', 'all_is_null_y', '吉林', '抢票已取消', 'province_len', 'credit_count', 'account_grade1', 'credit_score_rank', 'sts_order_count', '湖南', '充值失败;退款成功', 'wechat_bound_is_null', 'card_record_count', 'male', '邮局汇款', 'merriage1', '山西', 'phone_count', 'sex_secret', '海南', 'merriage2', '等待收货', 'all_is_zero', '天津', 'credit_score', 'age_three', 'null_y', 'qq_bound_is_null', 'have_credit_card', '北京']
    train_lr_module(features_name, store_result=True)

    module_merge_triple(prob_xgb=0.4, prob_lr=0.2, prob_lgb=0.4)

In [27]:
start_time = time.clock()
main()
end_time = time.clock()
print('程序耗时：', end_time - start_time)

  """Entry point for launching an IPython kernel.
  result = method(y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat