### 导入相关库和封装函数

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
from sklearn.metrics import cohen_kappa_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
import os
import itertools
warnings.filterwarnings('ignore')

def drop_correlated_col(df, cutoff):
    # 筛选高度相关特征
    def filter_corr(corr, cutoff=0.9):
        cols = []
        for i,j in feature_group:
            if corr.loc[i, j] > cutoff:
                print(i,j,corr.loc[i, j])
                i_avg = corr[i][corr[i] != 1].mean()
                j_avg = corr[j][corr[j] != 1].mean()
                if i_avg >= j_avg:
                    cols.append(i)
                else:
                    cols.append(j)
        return set(cols)

    corr = df.corr()
    feature_group = list(itertools.combinations(corr.columns, 2))
    drop_cols = filter_corr(corr, cutoff)
    print(list(drop_cols))
    df.drop(list(drop_cols),inplace = True, axis = 1)
    return df


def statistics_feature(train, test, df, agg_stat, cutoff=0.96, fillna=1):

    # 数据处理
    group_df3 = df[(df['mon']<=9)&(df['mon']>=7)].groupby(['cust_no']).agg(agg_stat)
    group_df3.columns = [f[0]+'_'+f[1] for f in group_df3.columns]
    group_df3.reset_index(inplace=True)
    group_df3['season'] = 3

    group_df4 = df[(df['mon']<=12)&(df['mon']>=10)].groupby(['cust_no']).agg(agg_stat)
    col = [f[0]+'_'+f[1] for f in group_df4.columns]
    group_df4.columns = [f[0]+'_'+f[1] for f in group_df4.columns]
    group_df4.reset_index(inplace=True)
    group_df4['season'] = 4

    group_df1 = df[(df['mon']<=15)&(df['mon']>=13)].groupby(['cust_no']).agg(agg_stat)
    group_df1.columns = [f[0]+'_'+f[1] for f in group_df1.columns]
    group_df1.reset_index(inplace=True)
    group_df1['season'] = 5
                           
    stat = pd.concat([pd.concat([group_df3, group_df4], axis=0, ignore_index=True), group_df1], axis=0, ignore_index=True)
    stat.to_pickle('stat.pkl')

    del group_df3, group_df4, group_df1      
    
    # 剔除高度相关特征
    stat = drop_correlated_col(stat, cutoff)
    
    # 将特征合并进去         
    train, test = merge_feat(train, test, stat, fillna)
    return train, test


def merge_feat(train, test, df, fillna):
    
    tmp = df[df['season']==3].copy()
    del tmp['season']
    tmp.columns = ['cust_no'] +  [f+'_1' for f in tmp.columns[1:]]
    col_1 = [f for f in tmp.columns[1:]]
    train = train.merge(tmp, on=['cust_no'], how='left')
    if fillna==1:
        train[col_1] = train[col_1].fillna(value=0)
#     train[col_1].fillna(value=0, inplace=True)


    tmp = df[df['season']==4].copy()
    del tmp['season']
    tmp.columns = ['cust_no'] +  [f+'_2' for f in tmp.columns[1:]]
    col_2 = [f for f in tmp.columns[1:]]
    train = train.merge(tmp, on=['cust_no'], how='left')
    if fillna==1:
        train[col_2] = train[col_2].fillna(value=0)


    for i,j  in zip(col_1, col_2):
        train[j+'_'+i] = train[j] - train[i]

    tmp = df[df['season']==4].copy()
    del tmp['season']
    tmp.columns = ['cust_no'] +  [f+'_1' for f in tmp.columns[1:]]
    col_1 = [f for f in tmp.columns[1:]]
    test = test.merge(tmp, on=['cust_no'], how='left')
    if fillna==1:
        test[col_1] = test[col_1].fillna(value=0)


    tmp = df[df['season']==5].copy()
    del tmp['season']
    tmp.columns = ['cust_no'] +  [f+'_2' for f in tmp.columns[1:]]
    col_2 = [f for f in tmp.columns[1:]]
    test = test.merge(tmp, on=['cust_no'], how='left')
    if fillna==1:
        test[col_2] = test[col_2].fillna(value=0)

    for i,j  in zip(col_1, col_2):
        test[j+'_'+i] = test[j] - test[i]
        
    return train, test

def get_Fluction_Feature(df_fea, train, test, f):
    df_fea = df_fea[~df_fea['cust_no'].isnull()] #去除id为空的行
    stat = pd.DataFrame(df_fea[['cust_no']].drop_duplicates())#去除重复值
    for i in range(7,16):
        tmp = df_fea[(df_fea['mon']==i)][['cust_no', f]].copy()
        stat = stat.merge(tmp, on=['cust_no'], how='left')
        print(stat.shape)
    stat.fillna(value=0, inplace=True)
    stat.columns =['cust_no'] + [f + '_' + str(i) for i in range(7, 16)]
    # 季度内波动
    stat[f+'_3s'] = stat[f +'_9'] - stat[f +'_7']
    stat[f+'_4s'] = stat[f +'_12'] - stat[f +'_10']
    stat[f+'_5s'] = stat[f +'_15'] - stat[f +'_13']
    # 季度间波动
    stat[f+'_34s'] = stat[f +'_12'] - stat[f +'_9']
    stat[f+'_45s'] = stat[f +'_15'] - stat[f +'_12']


    tmp = stat[['cust_no', f+'_9', f+'_12', f+'_3s', f+'_4s', f+'_34s']].copy()
    tmp.columns = ['cust_no'] + [f+'_'+str(i) for i in range(1,6)]
    train = train.merge(tmp, on=['cust_no'], how='left')
    del tmp

    tmp = stat[['cust_no',f+'_12', f+'_15', f+'_4s', f+'_5s',f+'_45s']].copy()
    tmp.columns = ['cust_no'] + [f+'_'+str(i) for i in range(1,6)]
    test = test.merge(tmp, on=['cust_no'], how='left')
    del tmp
    return train, test


## 读取数据

In [2]:
os.getcwd() 
os.chdir('./data')

### 读取数据
y_Q3_3 = pd.read_csv('./y_train_3/y_Q3_3.csv')
y_Q4_3 = pd.read_csv('./y_train_3/y_Q4_3.csv')

aum_fils = os.listdir('x_train/aum_train/')+os.listdir('x_test/aum_test/')
aum = []
for f in aum_fils:
    print(f)
    mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))
    if mon>=7:
        tmp = pd.read_csv('x_train/aum_train/'+f)
        tmp['mon'] = mon
    else:
        tmp = pd.read_csv('x_test/aum_test/'+f)
        tmp['mon'] = mon+12
    aum.append(tmp)
aum = pd.concat(aum, axis=0, ignore_index=True)

behavior_fils = os.listdir('x_train/behavior_train/')+os.listdir('x_test/behavior_test/')
behavior = []
for f in behavior_fils:
    print(f)
    mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))
    if mon>=7:
        tmp = pd.read_csv('x_train/behavior_train/'+f)
        tmp['mon'] = mon
    else:
        tmp = pd.read_csv('x_test/behavior_test/'+f)
        tmp['mon'] = mon+12
    behavior.append(tmp)
behavior = pd.concat(behavior, axis=0, ignore_index=True)

event_fils = os.listdir('x_train/big_event_train/')+os.listdir('x_test/big_event_test/')
event = []
for f in event_fils:
    print(f)
    season = int((f.split('.')[0]).split('_')[-1].replace('Q', ''))
    if season>=3:
        tmp = pd.read_csv('x_train/big_event_train/'+f)
        tmp['season'] = season
    else:
        tmp = pd.read_csv('x_test/big_event_test/'+f)
        tmp['season'] = season + 4
    
    event.append(tmp)
event = pd.concat(event, axis=0, ignore_index=True)
del event['c']

cunkuan_fils = os.listdir('x_train/cunkuan_train/')+os.listdir('x_test/cunkuan_test/')
cunkuan = []
for f in cunkuan_fils:
    print(f)
    mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))
    if mon>=7:
        tmp = pd.read_csv('x_train/cunkuan_train/'+f)
        tmp['mon'] = mon
    else:
        tmp = pd.read_csv('x_test/cunkuan_test/'+f)
        tmp['mon'] = mon+12
    cunkuan.append(tmp)
cunkuan = pd.concat(cunkuan, axis=0, ignore_index=True)

cust_avli_Q3 = pd.read_csv('./x_train/cust_avli_Q3.csv')
cust_avli_Q4 = pd.read_csv('./x_train/cust_avli_Q4.csv')
cust_info_Q3 = pd.read_csv('x_train/cust_info_q3.csv')
cust_info_Q4 = pd.read_csv('x_train/cust_info_q4.csv')
cust_avli_Q1 = pd.read_csv('x_test/cust_avli_Q1.csv')
cust_info_Q1 = pd.read_csv('x_test/cust_info_q1.csv')

aum_m10.csv
aum_m10.csv.baiduyun.uploading.cfg
aum_m11.csv
aum_m11.csv.baiduyun.uploading.cfg
aum_m12.csv
aum_m12.csv.baiduyun.uploading.cfg
aum_m7.csv
aum_m7.csv.baiduyun.uploading.cfg
aum_m8.csv
aum_m8.csv.baiduyun.uploading.cfg
aum_m9.csv
aum_m9.csv.baiduyun.uploading.cfg
aum_m1.csv
aum_m2.csv
aum_m3.csv
behavior_m10.csv
behavior_m10.csv.baiduyun.uploading.cfg
behavior_m11.csv
behavior_m11.csv.baiduyun.uploading.cfg
behavior_m12.csv
behavior_m12.csv.baiduyun.uploading.cfg
behavior_m7.csv
behavior_m7.csv.baiduyun.uploading.cfg
behavior_m8.csv
behavior_m8.csv.baiduyun.uploading.cfg
behavior_m9.csv
behavior_m9.csv.baiduyun.uploading.cfg
behavior_m1.csv
behavior_m2.csv
behavior_m3.csv
behavior_m3.csv.baiduyun.uploading.cfg
big_event_Q3.csv
big_event_Q3.csv.baiduyun.uploading.cfg
big_event_Q4.csv
big_event_Q4.csv.baiduyun.uploading.cfg
big_event_Q1.csv
big_event_Q1.csv.baiduyun.uploading.cfg
cunkuan_m10.csv
cunkuan_m10.csv.baiduyun.uploading.cfg
cunkuan_m11.csv
cunkuan_m11.csv.baiduyun.u

## 特征工程

第一组特征很自然的想到用户历史的label，例如在预测季度4的用户时，使用用户在季度3的label作为特征。可以简单看到这个特征的kappa值可以达到0.238+。

In [3]:
train = y_Q4_3.copy()
y_Q3_3 = y_Q3_3.rename(columns={'label': 'bef_label'})
train = train.merge(y_Q3_3, on=['cust_no'], how='left').copy()

test = cust_avli_Q1.copy()
y_Q4_3 = y_Q4_3.rename(columns={'label': 'bef_label'})
test = test.merge(y_Q4_3, on=['cust_no'], how='left')
test

Unnamed: 0,cust_no,bef_label
0,0x3b9b4615,0.0
1,0x3b9ae61b,1.0
2,0x3b9add69,0.0
3,0x3b9b3601,0.0
4,0x3b9b2599,0.0
...,...,...
76717,0xb2d69017,0.0
76718,0xb2d68153,1.0
76719,0xb2d5bba1,1.0
76720,0xb2d61b9b,1.0


### 用户特征属性处理

In [4]:
# 根据EDA去掉一部分的列
cust_info_Q4.drop(columns = ['I1','I2','I4','I7','I8','I9','I12','I15','I17','I18','I19'], inplace=True)
cust_info_Q3.drop(columns = ['I1','I2','I4','I7','I8','I9','I12','I15','I17','I18','I19'], inplace=True)
cust_info_Q1.drop(columns = ['I1','I2','I4','I7','I8','I9','I12','I15','I17','I18','I19'], inplace=True)
train = train.merge(cust_info_Q4, on=['cust_no'], how='left')
train = train.merge(cust_info_Q3[['cust_no','I3']], on=['cust_no'], how='left')
test = test.merge(cust_info_Q1, on=['cust_no'], how='left')
test = test.merge(cust_info_Q4[['cust_no','I3']], on=['cust_no'], how='left')
print(train.shape, test.shape)

### 特征编码
for col in [f for f in train.select_dtypes('object').columns if f not in ['label', 'cust_no']]:
    print(col)
    train[col].fillna(train[col].mode()[0], inplace=True)### 缺失值处理·
    test[col].fillna(train[col].mode()[0], inplace=True)### 缺失值处理·
    le = LabelEncoder()
    le.fit(pd.concat([train[[col]], test[[col]]], axis=0, ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

(76170, 13) (76722, 12)
I3_x
I5
I10
I13
I14
I3_y


### 存款特征处理

In [6]:
cunkuan = cunkuan.sort_values(by=['cust_no', 'mon']).reset_index(drop=True)
cunkuan_agg_stat = {'C1': ['max', 'min', 'sum', 'last'],
#             'C2': ['max', 'min', 'sum', 'last'],
           }
train, test = statistics_feature(train, test, cunkuan, cunkuan_agg_stat, cutoff=0.96)
# cunkuan_stat = pd.read_pickle('cunkuan_stat.pkl')
# cunkuan_stat = drop_correlated_col(cunkuan_stat, cutoff = 0.96)
# train, test = merge_feat(train, test, cunkuan_stat, fillna=1)
# train, test =  get_Fluction_Feature(cunkuan, train, test, 'C1')
# print(train.shape, test.shape)

C1_max C1_sum 0.9722382981454611
C1_min C1_sum 0.9638184487079668
C1_min C1_last 0.9721794830610042
C1_sum C1_last 0.967627208540242
['C1_last', 'C1_sum']


### 金额特征处理

In [7]:
X_cols = [f for f in aum.columns if f.startswith('X') and f not in ['X7']]
aum['X_sum'] = aum[X_cols].sum(axis=1) - aum['X7']
aum['X_fuzailv'] = aum['X7']/aum['X_sum'] 
aum['X_siqilv'] = aum['X2']/aum['X_sum'] 
aum['X_huoqilv'] = aum['X3']/aum['X_sum'] 
aum_agg_stat = {'X1': [ 'sum', 'last'],
                'X2': [ 'sum', 'min', 'max', 'last'],
                'X3': [ 'sum', 'min', 'max', 'last'],
                'X4': [ 'sum', 'last'],
                'X5': [ 'sum', 'last'],
                'X6': [ 'sum', 'last'],
                'X7': [ 'sum', 'last'],
                'X8': [ 'sum', 'last'],
                'X_sum': [ 'sum', 'min', 'max', 'last'],
                'X_fuzailv': [ 'sum', 'last'],
                'X_siqilv': [ 'sum', 'last'],
                'X_huoqilv': [ 'sum', 'last'],
           }
train, test = statistics_feature(train, test, aum, aum_agg_stat, cutoff=0.96)
train, test =  get_Fluction_Feature(aum, train, test, 'X_sum')
train, test =  get_Fluction_Feature(aum, train, test, 'X2')
train, test =  get_Fluction_Feature(aum, train, test, 'X3')
print(train.shape, test.shape)

X1_sum X1_last 0.9686445268910792
X2_sum X2_max 0.9717797933901305
X2_min X2_last 0.9831454095524654
X7_sum X7_last 0.9723667616496774
X_sum_sum X_sum_min 0.9672674573232287
X_sum_sum X_sum_max 0.9732399867084586
X_sum_sum X_sum_last 0.9706362970747083
['X1_sum', 'X_sum_max', 'X2_last', 'X2_sum', 'X7_last', 'X_sum_sum']
(659624, 2)
(659624, 3)
(659624, 4)
(659624, 5)
(659624, 6)
(659624, 7)
(659624, 8)
(659624, 9)
(659624, 10)
(659624, 2)
(659624, 3)
(659624, 4)
(659624, 5)
(659624, 6)
(659624, 7)
(659624, 8)
(659624, 9)
(659624, 10)
(659624, 2)
(659624, 3)
(659624, 4)
(659624, 5)
(659624, 6)
(659624, 7)
(659624, 8)
(659624, 9)
(659624, 10)
(76170, 106) (76722, 105)


### 行为特征处理

In [8]:
behavior['B5-B3'] = behavior['B5'] - behavior['B3']
behavior['B5/B3'] = behavior['B5'] / behavior['B3']
behavior.loc[((behavior['mon']==9)),'B6'] =  (pd.to_datetime('2019-10-01 00:00:00') -pd.to_datetime(behavior.loc[((behavior['mon']==9)),'B6'])).dt.days
behavior.loc[((behavior['mon']==12)),'B6'] =  (pd.to_datetime('2020-01-01 00:00:00')-pd.to_datetime(behavior.loc[((behavior['mon']==12)),'B6'])).dt.days
behavior.loc[((behavior['mon']==15)),'B6'] =  (pd.to_datetime('2020-04-01 00:00:00') -pd.to_datetime(behavior.loc[((behavior['mon']==15)),'B6'])).dt.days
behavior_agg_stat = {'B5-B3': ['max', 'min', 'sum'],
            'B5/B3': ['max', 'min', 'sum'],
            'B1': ['max', 'min', 'sum'],
            'B2': ['max', 'min', 'sum'],
            'B3': ['max', 'min', 'sum'],
            'B4': ['max', 'min', 'sum'],
            'B5': ['max', 'min', 'sum'],
            'B6': ['last'],
            'B7': ['last'],
           }

train, test = statistics_feature(train, test, behavior, behavior_agg_stat, cutoff=0.97)

B5/B3_max B5/B3_min 0.9968893511806203
B5/B3_max B5/B3_sum 0.9999999985612688
B5/B3_min B5/B3_sum 0.9968893437084174
['B5/B3_sum', 'B5/B3_max']


In [9]:
# behavior_stat = pd.read_pickle('behavior_stat.pkl')
# behavior_stat = drop_correlated_col(behavior_stat, cutoff = 0.97)
# train, test = merge_feat(train, test, behavior_stat, fillna=0)
# print(train.shape, test.shape)

B3/B2_min B3/B2_last 0.9714778417386742
B5/B3_max B5/B3_min 0.9968893511806203
B5/B3_max B5/B3_std 0.99752284106521
B5/B3_max B5/B3_sum 0.9999999985612688
B5/B3_max B5/B3_last 0.9969164322151617
B5/B3_min B5/B3_sum 0.9968893437084174
B5/B3_min B5/B3_last 0.9999728105501697
B5/B3_std B5/B3_sum 0.997522396716309
B5/B3_sum B5/B3_last 0.9969164276516074
['B5/B3_last', 'B3/B2_last', 'B5/B3_sum', 'B5/B3_max']
(76170, 400) (76722, 399)


In [9]:
del event['E3'], event['E11']
## 现将异常值替换成nan
event.loc[pd.to_datetime(event['E1'])<pd.to_datetime('1949-10-01 00:00:00'), 'E1']= np.nan
event.loc[pd.to_datetime(event['E8'])<pd.to_datetime('1949-10-01 00:00:00'), 'E8']= np.nan

## 再用E2填充E1
event.loc[event['E1'].isnull()&event['E2'].notnull(), 'E1'] = event.loc[event['E1'].isnull()&event['E2'].notnull(), 'E2']

## 现转换成时间数据格式
E_cols = [f for f in event.columns if f.startswith('E')]
for col in E_cols:
    if col not in [ 'E15', 'E17']:
        event[col] = pd.to_datetime(event[col])

def get_Dateencoder(x, flag):
    if flag==1:
        #  当季度   
        if x > pd.to_datetime('2019-10-01 00:00:00'): return 1
         #  上一季度  
        elif  x > pd.to_datetime('2019-07-01 00:00:00'): return 2
         #  上上季度  
        elif  x > pd.to_datetime('2019-04-01 00:00:00'): return 3
        #  上上上季度 
        elif  x > pd.to_datetime('2019-01-01 00:00:00'): return 4
        #  长期
        return 5
    elif flag==0:
        #  当季度  
        if x > pd.to_datetime('2020-01-01 00:00:00'): return 1    
        #  上一季度  
        elif x > pd.to_datetime('2019-10-01 00:00:00'): return 2
         #  上上季度  
        elif  x > pd.to_datetime('2019-07-01 00:00:00'): return 3
        #  上上上季度   
        elif  x > pd.to_datetime('2019-04-01 00:00:00'): return 4
        #  长期
        return 5
    
# train = pd.DataFrame(cust_avli_Q4['cust_no'])
train = train.merge(event.loc[event['season']==4], how='inner', on='cust_no')
train.head()
del train['season']

# test = pd.DataFrame(cust_avli_Q1['cust_no'])
test = test.merge(event.loc[event['season']==5], how='inner', on='cust_no')
del test['season']


E_cols = [f for f in event.columns if f.startswith('E')]
for col in E_cols:
    if col not in [ 'E15', 'E17']:
        print(col)
        train[col] = train[col].apply(lambda x: get_Dateencoder(x, 1))
        test[col] = test[col].apply(lambda x: get_Dateencoder(x, 0))        

E1
E2
E4
E5
E6
E7
E8
E9
E10
E12
E13
E14
E16
E18


In [11]:
# drop_cols = train.corr()['label'][abs(train.corr()['label'])<0.01].index
# print(drop_cols)
# test.drop(drop_cols,axis=1,inplace=True)
# train.drop(drop_cols,axis=1,inplace=True)
# print(train.shape, test.shape)

Index(['I10', 'I18', 'C3_std_1', 'C2_skew_2_C2_skew_1', 'C3_min_2_C3_min_1',
       'X2_min_1', 'X2_std_1', 'X2_sum_1', 'X3_max_1', 'X3_std_1', 'X3_sum_1',
       'X5_max_1', 'X5_min_1', 'X5_std_1', 'X5_sum_1', 'X5_last_1', 'X6_std_1',
       'X6_skew_1', 'X8_std_1', 'X_num_skew_1', 'X2_min_2', 'X2_sum_2',
       'X2_skew_2', 'X5_min_2', 'X5_sum_2', 'X6_max_2', 'X6_min_2', 'X6_std_2',
       'X6_sum_2', 'X6_last_2', 'X8_skew_2', 'X1_min_2_X1_min_1',
       'X1_std_2_X1_std_1', 'X1_sum_2_X1_sum_1', 'X1_skew_2_X1_skew_1',
       'X3_min_2_X3_min_1', 'X3_last_2_X3_last_1', 'X6_std_2_X6_std_1',
       'X7_std_2_X7_std_1', 'X7_skew_2_X7_skew_1', 'X8_std_2_X8_std_1',
       'X_num_std_2_X_num_std_1', 'B5-B3_max_1', 'B5-B3_min_1', 'B5-B3_std_1',
       'B5-B3_sum_1', 'B4-B2_sum_1', 'B4-B2_last_1', 'B5/B4_std_1',
       'B4/B2_skew_1', 'B5/B3_min_1', 'B2_std_1', 'B3_min_1', 'B3_last_1',
       'B5_min_1', 'B5_last_1', 'B5-B3_sum_2', 'B5-B3_last_2', 'B5/B4_skew_2',
       'B5/B3_min_2', 'B1_std

In [10]:
train

Unnamed: 0,cust_no,label,bef_label,I3_x,I5,I6,I10,I11,I13,I14,...,E9,E10,E12,E13,E14,E15,E16,E17,E18,c
0,0xb2d8e1f9,1,1.0,3,4,0,3,0.0,2,5,...,5,1,5,5,5,1300.0,1,0.0,5,
1,0xb2da4f54,1,,0,0,0,3,0.0,2,5,...,5,1,5,5,5,0.0,5,0.0,5,
2,0xb2d0f4e5,1,1.0,0,7,0,3,0.0,2,5,...,5,5,5,5,5,0.0,5,0.0,5,
3,0x3b9b3b70,1,1.0,3,0,0,3,0.0,2,5,...,5,5,5,5,5,110000.0,5,9335.0,5,
4,0xb2d8d086,1,1.0,2,5,0,3,0.0,2,5,...,5,2,5,5,5,0.0,5,300000.0,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76165,0xb2d174f1,1,-1.0,1,5,0,3,0.0,2,5,...,5,2,5,2,5,50000000.0,2,50000010.0,2,
76166,0x3b9af14f,0,0.0,1,0,0,3,0.0,2,5,...,5,5,5,5,5,200000.0,5,370000.0,5,
76167,0xb2d1bb5d,0,1.0,3,5,0,3,0.0,2,5,...,5,5,5,5,5,50000.0,3,120000.0,5,
76168,0xb2d9ed26,1,,0,1,0,3,0.0,2,5,...,5,1,5,5,5,0.0,5,0.0,5,


In [14]:
## 加入权重信息，权重系数是直接参考大佬分享https://github.com/BirderEric/XianmenBank，猜想这个系数是通过暴力搜索得到的
train['weight'] = train['label'].map({0:1.03,1:0.58,-1:1})

In [55]:
# cat_col =['bef_label'] + [f for f in train.columns if f.startswith('I') and f not in  ['I11'] ] + [f for f in train.columns if f.startswith('E') and f not in ['E15', 'E17']]
# train.loc[(train['bef_label'].isnull())&(train['E1']==1),'bef_label'] =2
# test.loc[(test['bef_label'].isnull())&(test['E1']==1),'bef_label'] =2
# train.loc[(train['bef_label'].isnull()),'bef_label'] =3
# test.loc[(test['bef_label'].isnull()),'bef_label'] =3
train[cat_col] = train[cat_col].astype(np.int) 
test[cat_col] = test[cat_col].astype(np.int) 

In [50]:
train.loc[(train['bef_label'].isnull()),'bef_label'] =3
test.loc[(test['bef_label'].isnull()),'bef_label'] =3

In [52]:
test['bef_label'].unique()

array([ 0.,  1., -1.,  3.,  2.])

In [17]:
import pandas as pd
# train.to_pickle('train.pkl')
# test.to_pickle('test.pkl')
train = pd.read_pickle('train.pkl')
test = pd.read_pickle('test.pkl')


### 模型训练

In [56]:
import catboost as cat
from catboost import Pool

def kappa(preds, train_data):
    y_true = train_data.label
    preds = np.argmax(preds.reshape(3, -1), axis=0)
    score = cohen_kappa_score(y_true, preds)
    return 'kappa', score, True


def Cat_classfication_model(train, target, test, k):
#     cat_features = [i for i in stat_col if i in train.columns] + [i for i in train.columns[16:24]]
    feats = [f for f in train.columns if f not in ['cust_no', 'label', 'weight']]
    print('Current num of features:', len(feats))
    folds = KFold(n_splits=5, shuffle=False, random_state=2020)
    oof_preds = np.zeros(train.shape[0])
    oof_probs = np.zeros((train.shape[0], 3))
    output_preds = []
    feature_importance_df = pd.DataFrame()
    offline_score = []

    for i, (train_index, test_index) in enumerate(folds.split(train, target)):
        train_y, test_y = target[train_index], target[test_index]
        train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]
        train_weight, valid_weight = train['weight'][train_index], train['weight'][test_index]
        print(train_X.shape, test_X.shape)
        print(cat_col, [train_X.columns.get_loc(col) for col in cat_col])
        train_data = Pool(data=train_X,
                          label=train_y,
                          cat_features=cat_col,
                          weight=train_weight.values.flatten(order='F'),

                         )
        valid_data = Pool(data=test_X,
                          label=test_y,
                          cat_features=cat_col,
                         )
        cat_model =cat.CatBoostClassifier(iterations=2500, learning_rate=0.057, max_depth=7, l2_leaf_reg=2, verbose=100,
                                       early_stopping_rounds=50,loss_function='MultiClass'
                                          , eval_metric='Kappa',
                                     )

        cat_model.fit(train_data,
          eval_set= valid_data,
          use_best_model=True,
         )
        
        oof_probs[test_index] = cat_model.predict_proba(test_X[feats])
        oof_preds[test_index] = np.argmax(oof_probs[test_index], axis=1)
        offline_score.append(cat_model.get_best_score()['validation']['Kappa'])
        output_preds.append(cat_model.predict_proba(test[feats]))
        
        
        # feature importance
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = cat_model.get_feature_importance()
        fold_importance_df["fold"] = i + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('OOF-MEAN-KAPPA score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(15))
    print('confusion matrix:')
    print(confusion_matrix(target, oof_preds))
    print('classfication report:')
    print(classification_report(target, oof_preds))

    return output_preds, oof_probs, np.mean(offline_score)
target = train['label'] + 1
cab_preds, cab_oof, cab_score = Cat_classfication_model(train, target,test, 5)

Current num of features: 183
(60936, 183) (15234, 183)
['bef_label', 'I3_x', 'I5', 'I6', 'I10', 'I13', 'I14', 'I16', 'I20', 'I3_y', 'E1', 'E2', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'E10', 'E12', 'E13', 'E14', 'E16', 'E18'] [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 180, 182]
0:	learn: 0.4235962	test: 0.4220361	best: 0.4220361 (0)	total: 605ms	remaining: 25m 11s
100:	learn: 0.4854554	test: 0.4795510	best: 0.4795510 (100)	total: 49.6s	remaining: 19m 39s
200:	learn: 0.5047862	test: 0.4872705	best: 0.4872826 (199)	total: 1m 36s	remaining: 18m 19s
300:	learn: 0.5230704	test: 0.4914439	best: 0.4924317 (299)	total: 2m 26s	remaining: 17m 47s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4936060427
bestIteration = 329

Shrink model to first 330 iterations.
(60936, 183) (15234, 183)
['bef_label', 'I3_x', 'I5', 'I6', 'I10', 'I13', 'I14', 'I16', 'I20', 'I3_y', 'E1', 'E2', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'E10', 'E12', 'E13', 

In [40]:
sub_df = test[['cust_no']].copy()
sub_df['label'] = np.argmax(np.mean(cab_preds, axis=0), axis=1) - 1
sub_df.to_csv('cab_final.csv', index=False)
print(sub_df['label'].value_counts(normalize=True))

 1    0.620995
 0    0.242668
-1    0.136336
Name: label, dtype: float64


In [57]:
def kappa(preds, train_data):
    y_true = train_data.label
    preds = np.argmax(preds.reshape(3, -1), axis=0)
    score = cohen_kappa_score(y_true, preds)
    return 'kappa', score, True

def LGB_classfication_model(train, target, test, k):
    feats = [f for f in train.columns if f not in ['cust_no', 'label', 'weight']]
    print('Current num of features:', len(feats))
    folds = KFold(n_splits=k, shuffle=False, random_state=2020)
    oof_preds = np.zeros(train.shape[0])
    oof_probs = np.zeros((train.shape[0], 3))
    output_preds = []
    feature_importance_df = pd.DataFrame()
    offline_score = []
    for i, (train_index, test_index) in enumerate(folds.split(train, target)):
        train_y, test_y = target[train_index], target[test_index]
        train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]
        train_weight, valid_weight = train['weight'][train_index], train['weight'][test_index]
        print(train_X.shape, test_X.shape)
        dtrain = lgb.Dataset(train_X,
                             label=train_y,
                             categorical_feature=cat_col, 
                             weight=train_weight.values.flatten(order='F'),
                            )
        dval = lgb.Dataset(test_X,
                           label=test_y,
                          categorical_feature=cat_col, )
        parameters = {
            'learning_rate': 0.05,
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'metric': 'None',
            'num_leaves': 63,
            'num_class': 3,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'min_data_in_leaf': 20,
            'verbose': -1,
            'nthread': 24
        }
        lgb_model = lgb.train(
            parameters,
            dtrain,
            num_boost_round=5000,
            valid_sets=[dval],
            early_stopping_rounds=100,
            verbose_eval=100,
            feval=kappa, 
        )
        oof_probs[test_index] = lgb_model.predict(test_X[feats], num_iteration=lgb_model.best_iteration)
        oof_preds[test_index] = np.argmax(lgb_model.predict(test_X[feats], num_iteration=lgb_model.best_iteration), axis=1)
        offline_score.append(lgb_model.best_score['valid_0']['kappa'])
            

        output_preds.append(lgb_model.predict(test[feats], num_iteration=lgb_model.best_iteration))
        # feature importance
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = lgb_model.feature_importance(importance_type='gain')
        fold_importance_df["fold"] = i + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('OOF-MEAN-KAPPA score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(15))
    print('confusion matrix:')
    print(confusion_matrix(target, oof_preds))
    print('classfication report:')
    print(classification_report(target, oof_preds))

    return output_preds, oof_probs, np.mean(offline_score)
target = train['label'] + 1
lgb_preds, lgb_oof, lgb_score = LGB_classfication_model(train, target, test, 5)

Current num of features: 183
(60936, 183) (15234, 183)
Training until validation scores don't improve for 100 rounds
[100]	valid_0's kappa: 0.498811
[200]	valid_0's kappa: 0.497067
Early stopping, best iteration is:
[118]	valid_0's kappa: 0.499666
(60936, 183) (15234, 183)
Training until validation scores don't improve for 100 rounds
[100]	valid_0's kappa: 0.496294
[200]	valid_0's kappa: 0.490494
Early stopping, best iteration is:
[106]	valid_0's kappa: 0.497513
(60936, 183) (15234, 183)
Training until validation scores don't improve for 100 rounds
[100]	valid_0's kappa: 0.47704
[200]	valid_0's kappa: 0.479691
[300]	valid_0's kappa: 0.479174
[400]	valid_0's kappa: 0.478031
Early stopping, best iteration is:
[311]	valid_0's kappa: 0.481414
(60936, 183) (15234, 183)
Training until validation scores don't improve for 100 rounds
[100]	valid_0's kappa: 0.478329
[200]	valid_0's kappa: 0.480827
Early stopping, best iteration is:
[120]	valid_0's kappa: 0.482114
(60936, 183) (15234, 183)
Traini

In [43]:
sub_df = test[['cust_no']].copy()
sub_df['label'] = np.argmax(np.mean(lgb_preds, axis=0), axis=1) - 1
sub_df.to_csv('lgb_final.csv', index=False)

In [44]:
import pandas as pd
lgb_final = pd.read_csv("lgb_final.csv")
cab_final = pd.read_csv("cab_final.csv")
(lgb_final['label']==cab_final['label']).value_counts()

True     72509
False     4213
Name: label, dtype: int64