In [1]:
import pandas as pd 
import numpy as np 
import lightgbm as lgb 
import random
from tqdm import tqdm 
from category_encoders import TargetEncoder
import matplotlib.pyplot as plt 


# 设置随机种子
def seed_everything(SEED):
    np.random.seed(SEED)
    random.seed(SEED)
SEED = 42 
seed_everything(SEED)

# 定义评价函数
def S(y_true, y_pred):
    return ((y_true * 2 + 1) * (y_true == y_pred)).sum()/(y_true * 2 + 1).sum()

# 读取数据
X1_train = pd.read_csv('./dataset/X1_train.csv')
X2_train = pd.read_csv('./dataset/X2_train.csv')
X3_train = pd.read_csv('./dataset/X3_train.csv')
y_train = pd.read_csv('./dataset/y_train.csv')
y_train = y_train['复购频率']

X1_test = pd.read_csv('./dataset/X1_test.csv')
X2_test = pd.read_csv('./dataset/X2_test.csv')
X3_test = pd.read_csv('./dataset/X3_test.csv')

print('训练集:')
print(X1_train.shape, X2_train.shape, X3_train.shape)
print(y_train.shape)
print('测试集:')
print(X1_test.shape, X2_test.shape, X3_test.shape)

训练集:
(10428, 17) (177924, 20) (10428, 50)
(10428,)
测试集:
(8000, 17) (135478, 20) (8000, 50)


In [2]:
# X1未使用的使用特征(缺失值过多)
col_not_used_1 = ['A1', 'A3', 'A9', 'A10', 'A11', 'A18']
X1_train.drop(columns=col_not_used_1, inplace=True)
X1_test.drop(columns=col_not_used_1, inplace=True)
print(X1_train.shape)
print(X1_test.shape)

# 把类别特征中类别个数小于20的设为空值
cls_cols = ['A8', 'A12', 'A13', 'A14', 'A15', 'A20']
for col in cls_cols:
    col_counts = X1_train[col].value_counts()
    # 这样操作是防止出现未见过的值
    value_to_nan = col_counts[col_counts>=20].index 
    X1_train.loc[~X1_train[col].isin(value_to_nan), col] = np.nan
    X1_test.loc[~X1_test[col].isin(value_to_nan), col] = np.nan

# TargetEncoding会造成信息泄露
# 对分类变量进行TargetEncoding
encoding_cols = ['A8', 'A12', 'A13', 'A14', 'A15', 'A20']
for col in encoding_cols:
    target_encoder = TargetEncoder(handle_missing='return_nan', handle_unknown='return_nan')
    X1_train[col] = target_encoder.fit_transform(X1_train[col].astype(object), y_train)
    X1_test[col] = target_encoder.transform(X1_test[col].astype(object))

# 将对A17中的' days'去掉并转化为float类型
X1_train['A17'] = X1_train['A17'].map(lambda x: float(x.replace(' days', '')))
X1_test['A17'] = X1_test['A17'].map(lambda x: float(x.replace(' days', '')))


X1_train.head(5)

(10428, 11)
(8000, 11)


Unnamed: 0,客户编号,A4,A7,A8,A12,A13,A14,A15,A16,A17,A20
0,0,320000,99.0,0.470681,0.480826,0.510011,0.526854,0.459435,-0.17326,2197.0,0.492703
1,1,410000,30.0,0.470681,0.621495,0.502095,0.486996,0.459435,-0.17326,3291.0,0.492703
2,2,510000,30.0,0.470681,0.532467,0.502095,0.481939,0.416393,5.716708,2355.0,0.492703
3,3,230000,99.0,0.470681,0.480826,0.45257,0.481939,0.459435,-0.17326,2746.0,0.492703
4,4,130000,99.0,0.470681,0.480826,0.45257,0.52193,0.459435,-0.17326,2119.0,0.492703


In [3]:
# B1和B8去掉' days'
X2_train.loc[X2_train['B1'].notna(), 'B1'] = X2_train.loc[X2_train['B1'].notna(), 'B1'].map(lambda x: float(x.replace(' days', '')))
X2_test.loc[X2_test['B1'].notna(), 'B1'] = X2_test.loc[X2_test['B1'].notna(), 'B1'].map(lambda x: float(x.replace(' days', '')))
X2_train.loc[X2_train['B8'].notna(), 'B8'] = X2_train.loc[X2_train['B8'].notna(), 'B8'].map(lambda x: float(x.replace(' days', '')))
X2_test.loc[X2_test['B8'].notna(), 'B8'] = X2_test.loc[X2_test['B8'].notna(), 'B8'].map(lambda x: float(x.replace(' days', '')))

# 货单结束日期-货单开始日期
use_index =  X2_train['B8'].notna()&(X2_train['B1'].notna())
X2_train.loc[use_index, 'B8-B1'] = X2_train.loc[use_index, 'B8'] - X2_train.loc[use_index, 'B1']
X2_train.loc[~use_index, 'B8-B1'] = np.nan 

use_index =  X2_test['B8'].notna()&(X2_test['B1'].notna())
X2_test.loc[use_index, 'B8-B1'] = X2_test.loc[use_index, 'B8'] - X2_test.loc[use_index, 'B1']
X2_test.loc[~use_index, 'B8-B1'] = np.nan 

In [4]:
# 去掉重复的列, B2与B7与B9重复, B10与B11重复, B5与B13与B14重复, B4与B15重复
# 去掉缺失值过多的列, B12与B18与B19缺失值过多
print(X2_train[['B2', 'B7', 'B9']].corr())
print(X2_train[['B10', 'B11']].corr())
print(X2_train[['B5', 'B13', 'B14']].corr())
print(X2_train.groupby(['B4'])['B15'].value_counts())
X2_train.drop(columns=['B7', 'B9', 'B11', 'B13', 'B14', 'B15', 'B12', 'B18', 'B19'], inplace=True)
X2_test.drop(columns=['B7', 'B9', 'B11', 'B13', 'B14', 'B15', 'B12', 'B18', 'B19'], inplace=True)

     B2   B7   B9
B2  1.0  1.0  1.0
B7  1.0  1.0  1.0
B9  1.0  1.0  1.0
     B10  B11
B10  1.0  1.0
B11  1.0  1.0
           B5       B13       B14
B5   1.000000  0.998387  0.999118
B13  0.998387  1.000000  0.999085
B14  0.999118  0.999085  1.000000
B4  B15
t1  A1     177422
t2  A2        502
Name: B15, dtype: int64


In [5]:
# 将X2压缩
X2_train_convert = pd.DataFrame()
X2_train_convert['客户编号'] = X2_train['客户编号'].unique()
X2_test_convert = pd.DataFrame()
X2_test_convert['客户编号'] = X2_test['客户编号'].unique()

# B5(贷款拖欠月数)、B10(贷款总额)、B17(货单分期期数)、B8-B1(货单结束日期-货单开始日期)
# 众数、最大值、最小值、平均值、方差、中位数
# 但以上指标目测相关系数很大
for col in tqdm(['B8', 'B8-B1', 'B1', 'B5', 'B10', 'B17']):
    X2_train_convert[f'{col}_mode'] = X2_train.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).mode()[0] if pd.Series(x).mode().any() else np.nan).reset_index(drop=True)
    X2_test_convert[f'{col}_mode'] = X2_test.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).mode()[0] if pd.Series(x).mode().any() else np.nan).reset_index(drop=True)
    X2_train_convert[f'{col}_unique'] = X2_train.groupby(['客户编号'])[col].agg(lambda x: len(pd.Series(x).unique())).reset_index(drop=True)
    X2_test_convert[f'{col}_unique'] = X2_test.groupby(['客户编号'])[col].agg(lambda x: len(pd.Series(x).unique())).reset_index(drop=True)
    X2_train_convert[f'{col}_sum'] = X2_train.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).sum()).reset_index(drop=True)
    X2_test_convert[f'{col}_sum'] = X2_test.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).sum()).reset_index(drop=True)
    X2_train_convert[f'{col}_median'] = X2_train.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).median()).reset_index(drop=True)
    X2_test_convert[f'{col}_median'] = X2_test.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).median()).reset_index(drop=True)
    X2_train_convert[f'{col}_max'] = X2_train.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).max()).reset_index(drop=True)
    X2_test_convert[f'{col}_max'] = X2_test.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).max()).reset_index(drop=True)
    X2_train_convert[f'{col}_min'] = X2_train.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).min()).reset_index(drop=True)
    X2_test_convert[f'{col}_min'] = X2_test.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).min()).reset_index(drop=True)
    X2_train_convert[f'{col}_mean'] = X2_train.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).mean()).reset_index(drop=True)
    X2_test_convert[f'{col}_mean'] = X2_test.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).mean()).reset_index(drop=True)
    X2_train_convert[f'{col}_std'] = X2_train.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).std()).reset_index(drop=True)
    X2_test_convert[f'{col}_std'] = X2_test.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).std()).reset_index(drop=True)
    X2_train_convert[f'{col}_skew'] = X2_train.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).skew()).reset_index(drop=True)
    X2_test_convert[f'{col}_skew'] = X2_test.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).skew()).reset_index(drop=True)

        
# 对于较少值的特征
for col in tqdm(['B2', 'B3', 'B4', 'B6', 'B16']):
    if X2_train[col].dtype == np.object_:
        X2_train_convert[f'{col}_mode'] = X2_train.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).mode()[0]).reset_index(drop=True)
        X2_test_convert[f'{col}_mode'] = X2_test.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).mode()[0]).reset_index(drop=True)
    if X2_train[col].dtype == np.float64:
        X2_train_convert[f'{col}_mean'] = X2_train.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).mean()).reset_index(drop=True)
        X2_test_convert[f'{col}_mean'] = X2_test.groupby(['客户编号'])[col].agg(lambda x: pd.Series(x).mean()).reset_index(drop=True)

  0%|          | 0/6 [00:00<?, ?it/s]

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

In [6]:
from sklearn.preprocessing import LabelEncoder


X2_train_convert.drop(columns=['B4_mode', 'B6_mode'], inplace=True)
X2_test_convert.drop(columns=['B4_mode', 'B6_mode'], inplace=True)
mode_b16_mode = X2_train_convert['B16_mode'].mode()[0]
X2_train_convert['B16_mode'] = (X2_train_convert['B16_mode']==mode_b16_mode).astype(int)
X2_test_convert['B16_mode'] = (X2_test_convert['B16_mode']==mode_b16_mode).astype(int)
label_encoder = LabelEncoder()
X2_train_convert['B3_mode'] = pd.Series(label_encoder.fit_transform(X2_train_convert['B3_mode'])).astype('category')
X2_test_convert['B3_mode'] = pd.Series(label_encoder.transform(X2_test_convert['B3_mode'])).astype('category')

# 加入count特征
X2_train_convert['id_count'] = X2_train.groupby(['客户编号'])['客户编号'].count()
X2_test_convert['id_count'] = X2_test.groupby(['客户编号'])['客户编号'].count()

print(X2_train_convert.head(5))

   客户编号  B8_mode  B8_unique   B8_sum  B8_median  B8_max  B8_min      B8_mean  \
0     0      NaN          1      0.0        NaN     NaN     NaN          NaN   
1     1   3787.0          8  86211.0     3776.0  3816.0  3625.0  3748.304348   
2     2   3603.0          6  21984.0     3649.5  3816.0  3511.0  3664.000000   
3     3      NaN          1      0.0        NaN     NaN     NaN          NaN   
4     4   3716.0          3   7465.0     3732.5  3749.0  3716.0  3732.500000   

       B8_std   B8_skew  ...  B17_median   B17_max   B17_min  B17_mean  \
0         NaN       NaN  ...    0.736114  0.736114  0.736114  0.736114   
1   68.639662 -0.928907  ...   -1.474298  0.736114 -1.474298 -0.875591   
2  112.481110  0.055359  ...   -1.474298 -1.474298 -1.474298 -1.474298   
3         NaN       NaN  ...    0.736114  0.736114  0.736114  0.736114   
4   23.334524       NaN  ...    0.736114  0.736114  0.736114  0.736114   

    B17_std  B17_skew   B2_mean  B3_mode  B16_mode  id_count  
0  0.000000

In [7]:
for col in X3_train.columns:
    print(f'{col}: {X3_train[col].isna().mean()}')
# X3未使用的使用特征, 空值过多或取值太过一致因此在后续的特征重要程度中特别不重要
col_not_used_3 = ['C9', 'C11', 'C17', 'C22', 'C24', 'C26', 'C33', 'C35', 'C37', 'C38', 'C39', 'C40', 'C41', 'C44', 'C48', 'C49']
X3_train.drop(columns=col_not_used_3, inplace=True)
X3_test.drop(columns=col_not_used_3, inplace=True)

客户编号: 0.0
C1: 0.0005753739930955121
C2: 0.0005753739930955121
C3: 0.09321058688147296
C4: 0.3558688147295742
C5: 0.3558688147295742
C6: 0.0005753739930955121
C7: 0.0005753739930955121
C8: 0.0005753739930955121
C9: 0.3558688147295742
C10: 0.09321058688147296
C11: 0.9800537015726889
C12: 0.09321058688147296
C13: 0.09321058688147296
C14: 0.0005753739930955121
C15: 0.0005753739930955121
C16: 0.00028768699654775604
C17: 0.9800537015726889
C18: 0.09321058688147296
C19: 0.0005753739930955121
C20: 0.0005753739930955121
C21: 0.041714614499424624
C22: 0.0
C23: 0.0005753739930955121
C24: 0.0
C25: 0.0005753739930955121
C26: 0.0005753739930955121
C27: 0.3558688147295742
C28: 0.2761795166858458
C29: 0.3558688147295742
C30: 0.3558688147295742
C31: 0.3558688147295742
C32: 0.3558688147295742
C33: 0.0005753739930955121
C34: 0.3558688147295742
C35: 0.0005753739930955121
C36: 0.3558688147295742
C37: 0.7153816647487533
C38: 0.9800537015726889
C39: 0.7153816647487533
C40: 0.0005753739930955121
C41: 0.715381

In [8]:
from sklearn.preprocessing import MinMaxScaler



# C13(长期内审批查询机构数量)/C10(长期内审批查询次数)
scaler_c13 = MinMaxScaler()
train_c13 = scaler_c13.fit_transform(X3_train[['C13']])
test_c13 = scaler_c13.transform(X3_test[['C13']])
scaler_c10 = MinMaxScaler()
train_c10 = scaler_c10.fit_transform(X3_train[['C10']])
test_c10 = scaler_c10.transform(X3_test[['C10']])

X3_train['C13/C10'] = train_c13 / train_c10
X3_test['C13/C10'] = test_c13 / test_c10
X3_train.loc[X3_train['C10']==X3_train['C10'].min(), 'C13/C10'] = 0
X3_test.loc[X3_test['C10']==X3_train['C10'].min(), 'C13/C10'] = 0

# C31(产品7的订货账户数量——未激活)+C32(产品7的订货账户数量——激活)
X3_train['C31+C32'] = X3_train['C31'] + X3_train['C32']
X3_test['C31+C32'] = X3_test['C31'] + X3_test['C32']

print(X3_train.head(5))

   客户编号        C1        C2        C3        C4        C5        C6        C7  \
0     0 -0.070858  0.487243 -0.439432 -0.334803 -0.422429 -0.397176 -0.397176   
1     1 -0.070858 -1.247584 -0.439432 -0.334803 -0.422429 -0.397176 -0.397176   
2     2 -0.070858  1.166246 -0.439432 -0.334803 -0.422429 -0.397176 -0.397176   
3     3 -0.070858  0.078606 -0.439432 -0.334803 -0.422429 -0.397176 -0.397176   
4     4 -0.070858  0.917253 -0.439432 -0.334803 -0.422429 -0.397176 -0.397176   

         C8       C10  ...       C32       C34       C36       C42       C43  \
0  1.080640 -0.770357  ...  0.817463 -0.701778 -1.328074 -0.286959 -0.334898   
1 -1.179976 -0.074098  ...  0.817463  0.924320  0.031835 -0.286959 -0.334898   
2  1.439786  1.047530  ...  0.817463  0.808186  0.713626  3.746403 -0.334898   
3  0.255290  0.089317  ... -0.993811  0.946517  0.294777 -0.286959 -0.334898   
4  1.257765  0.089317  ... -1.727473 -1.387081 -1.328074 -0.286959 -0.334898   

        C45       C46       C47 

  del sys.path[0]
  


In [9]:
# 拼接数据
X_train = pd.merge(left=X1_train, right=X2_train_convert, on=['客户编号'], how='outer')
X_train = pd.merge(left=X_train, right=X3_train, on=['客户编号'], how='outer')

X_test = pd.merge(left=X1_test, right=X2_test_convert, on=['客户编号'], how='outer')
X_test = pd.merge(left=X_test, right=X3_test, on=['客户编号'], how='outer')

X_train.drop(columns=['客户编号'], inplace=True)
X_test.drop(columns=['客户编号'], inplace=True)

X_train['A4'] = X_train['A4'].astype('category')
X_test['A4'] = X_test['A4'].astype('category')


print(X_train.shape, y_train.shape)
print(X_test.shape)

print(X_train.head(5))

(10428, 103) (10428,)
(8000, 103)
       A4    A7        A8       A12       A13       A14       A15       A16  \
0  320000  99.0  0.470681  0.480826  0.510011  0.526854  0.459435 -0.173260   
1  410000  30.0  0.470681  0.621495  0.502095  0.486996  0.459435 -0.173260   
2  510000  30.0  0.470681  0.532467  0.502095  0.481939  0.416393  5.716708   
3  230000  99.0  0.470681  0.480826  0.452570  0.481939  0.459435 -0.173260   
4  130000  99.0  0.470681  0.480826  0.452570  0.521930  0.459435 -0.173260   

      A17       A20  ...       C32       C34       C36       C42       C43  \
0  2197.0  0.492703  ...  0.817463 -0.701778 -1.328074 -0.286959 -0.334898   
1  3291.0  0.492703  ...  0.817463  0.924320  0.031835 -0.286959 -0.334898   
2  2355.0  0.492703  ...  0.817463  0.808186  0.713626  3.746403 -0.334898   
3  2746.0  0.492703  ... -0.993811  0.946517  0.294777 -0.286959 -0.334898   
4  2119.0  0.492703  ... -1.727473 -1.387081 -1.328074 -0.286959 -0.334898   

        C45       C46 

# 进一步的特征工程

In [10]:
# 两类以上占比极高
two_most = ['A7', 'C5', 'C15', 'C16', 'C42', 'C43', 'C45', ]
# 一类占比极高
one_most = ['A12', 'A16', 'A20', 'B5_mode', 'B5_unique', 'B5_median', 'B5_max', 'B5_min', 'B16_mode', 'C1', 'C3', 'C4', 'C6', 'C7', 'C19', 'C20', 'C23', 'C25', 'C28', 'C47']


feat_to_drop = [
    'B5_max', 'C3', 'C23', 'C6', 'B5_unique', 'A7', 'C16', 'C43', 'C5',
    'C45', 'B5_median', 'C7', 'A16', 'C4', 'C15', 'B5_mode', 'B16_mode',
    'C19', 'C42', 'A20', 'C1', 'B5_min'
]
feat_to_change = [
    'C47', 'C20', 'C28', 'C25', 'A12'
]

# 将这些不重要的特征聚合(这个人有多普通)
funsion_feat_train = pd.concat([X_train[i]==X_train[i].mode()[0] for i in feat_to_drop], axis=1)
funsion_feat_test = pd.concat([X_test[i]==X_test[i].mode()[0] for i in feat_to_drop], axis=1)


from sklearn.decomposition import PCA


pca = PCA(0.85)

pca_data_train = pca.fit_transform(funsion_feat_train)
pca_data_test = pca.fit_transform(funsion_feat_test)
funsion_feat_train = pd.DataFrame(pca_data_train, columns=[f'fusion_feat_pca_{i}' for i in range(pca_data_train.shape[1])])
funsion_feat_test = pd.DataFrame(pca_data_test, columns=[f'fusion_feat_pca_{i}' for i in range(pca_data_test.shape[1])])

X_train = pd.concat([X_train, funsion_feat_train], axis=1)
X_test = pd.concat([X_test, funsion_feat_test], axis=1)


X_train.drop(columns=feat_to_drop, inplace=True)
X_test.drop(columns=feat_to_drop, inplace=True)


for col in feat_to_change:
    X_train[col] = (X_train[col]==X_train[col].mode()[0]).astype(int)
    X_test[col] = (X_test[col]==X_test[col].mode()[0]).astype(int)
    

print(X_train.shape, X_test.shape)

(10428, 88) (8000, 88)


In [11]:
# 删掉不重要特征
feat_to_drop = [
    'B2_mean', 'C29', 'C27', 'A8', 'C31', 'A13', 'B17_unique', 'B17_max',
    'C25', 'C28', 'C20', 'B3_mode', 'B17_min', 'B17_median', 'A12', 'C47', 'B17_mode', 'C46'
]
X_train.drop(columns=feat_to_drop, inplace=True)
X_test.drop(columns=feat_to_drop, inplace=True)

In [15]:
from sklearn.model_selection import StratifiedKFold


kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# LGB参数
params = {
    'objective': 'multiclass', 
    'num_class': 3,
    'metric': 'multi_error', 
    'alpha': 1,
    'boosting_type': 'gbdt',
    'num_leaves': 2**6-1,
    'min_data_in_leaf': 2**2-1,
    'learning_rate': 0.01,
    "feature_fraction": 0.7,
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
    'verbose': -1,
    'seed': SEED,
    'n_jobs': -1,
}

y_pred = []
feat_imp = []
y_val = []
y_pred_val = []
scores = []
models = []
for train_idx, val_idx in kfold.split(X_train, y_train):
    X_train_1, y_train_1 = X_train.loc[train_idx], y_train.loc[train_idx]
    X_val_1, y_val_1 = X_train.loc[val_idx], y_train.loc[val_idx]
    weight_train_1 = (y_train_1==0) * 1 + (y_train_1==1) * 3 + (y_train_1==2) * 5
    weight_val_1 = (y_val_1==0) * 1 + (y_val_1==1) * 3 + (y_val_1==2) * 5
    Dtrain = lgb.Dataset(X_train_1, y_train_1, weight=weight_train_1)
    Dval = lgb.Dataset(X_val_1, y_val_1, weight=weight_val_1)
    model = lgb.train(params=params, train_set=Dtrain, num_boost_round=1000, valid_sets=[Dtrain, Dval], callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)])
    models.append(model)
    y_pred_1 = model.predict(X_val_1).argmax(axis=1)
    print(f'score: {S(y_val_1, y_pred_1)}')
    scores.append(S(y_val_1, y_pred_1))
    feat_imp_ = pd.DataFrame(model.feature_importance(), columns=['importance'], index=model.feature_name())
    feat_imp.append(feat_imp_)
    y_pred_ = model.predict(X_test)
    y_pred.append(y_pred_)
    y_pred_val_1 = model.predict(X_val_1)
    y_val.append(y_val_1)
    y_pred_val.append(y_pred_val_1)

feat_imp_merge = pd.concat(feat_imp, axis=1).mean(axis=1).sort_values(ascending=False)
feat_imp_merge.to_csv('./feat_imp.csv')
y_pred = np.array(y_pred)
y_val = pd.concat(y_val).reset_index(drop=True)
y_pred_val = np.concatenate(y_pred_val)
print(f'score: {S(y_val, y_pred_val.argmax(axis=1))}')

Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.174163	valid_1's multi_error: 0.378927
Early stopping, best iteration is:
[30]	training's multi_error: 0.236445	valid_1's multi_error: 0.371194
score: 0.6288061865635572
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.176831	valid_1's multi_error: 0.36478
Early stopping, best iteration is:
[39]	training's multi_error: 0.2203	valid_1's multi_error: 0.355104
score: 0.6448959845186261
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.172586	valid_1's multi_error: 0.373004
[200]	training's multi_error: 0.129117	valid_1's multi_error: 0.363812
Early stopping, best iteration is:
[199]	training's multi_error: 0.129708	valid_1's multi_error: 0.362361
score: 0.6376390904692791
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.177959	valid_1's multi_error: 0.386551
[200]	

In [13]:
submit = pd.DataFrame(y_pred.mean(axis=0).argmax(axis=1), columns=['复购频率'])
submit.to_excel('./output/省赛_X1_test.xlsx', index=False)

std_1 = y_pred.std(axis=0)[:, 1]
prob_1 = y_pred.mean(axis=0)[:, 1]

# 后处理, 将预测值为1的概率小且方差大的样本改为2
pred_of_1 = pd.DataFrame()
pred_of_1['prob'] = prob_1
pred_of_1['std'] = std_1
pred_of_1 = pred_of_1[y_pred.mean(axis=0).argmax(axis=1)==1]
submit.loc[pred_of_1.sort_values(['prob']).head(20).sort_values(['std'], ascending=False).head(10).index, '复购频率'] = 2


submit.to_excel('./output/省赛_X1_test.xlsx', index=False)
submit['复购频率'].value_counts()

0    4880
2    1611
1    1509
Name: 复购频率, dtype: int64