# 前海征信“好信杯”大数据算法赛（LB top20方案研究）

*赛题：https://www.kesci.com/apps/home/competition/58e46b3b9ed26b1e09bfbbb7/works*  
*参考：https://www.kesci.com/apps/home/project/59ca5ff521100106623f3db3*  
注意：题目本意是想要做一些迁移学习内容，但这里仅在数据集B上进行数据的分析和测试 

 
### 内容包括：数据初探 + 数据探索性分析 + 特征工程 + 单模型训练


### 1、数据初探

In [7]:

# 导入需要的包
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

In [8]:
# Read the competition data: the A and B training sets and the B test set.
# The CSV paths are relative, so the files are looked up in the current
# working directory.
train_A = pd.read_csv('A_train.csv')
train_B = pd.read_csv('B_train.csv')
test_B = pd.read_csv('B_test.csv')

In [16]:
# Inspect the basic structure of dataset B.

train_B.info()
train_B.head()

# Findings:
#  Dataset B has 491 columns in total, and the data contains a large number of missing values.
#  Value ranges differ widely: some entries are 5226.59 while others are 0.
#  The columns are of two dtypes only: float64 and int64.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Columns: 491 entries, no to UserInfo_270
dtypes: float64(488), int64(3)
memory usage: 15.0 MB


Unnamed: 0,no,UserInfo_1,UserInfo_2,UserInfo_3,UserInfo_4,ProductInfo_1,UserInfo_5,UserInfo_6,ProductInfo_2,UserInfo_7,...,UserInfo_264,UserInfo_265,ProductInfo_214,UserInfo_266,UserInfo_267,ProductInfo_215,ProductInfo_216,UserInfo_268,UserInfo_269,UserInfo_270
0,8192,,5226.59,0.0,0.0,0.0,296.0,0.0,0.0,0.0,...,0.0,7.0,0.0,0.0,15.0,0.0,0.0,0.0,2940.0,
1,1,,0.0,0.0,0.0,0.0,,0.0,0.0,46000.0,...,0.0,3.0,0.0,0.0,,0.0,,0.0,0.0,
2,8195,,,,,,,,,,...,,,,,,,,,,
3,8196,,,,,,,,,,...,,,,,,,,,,
4,16387,,,,,,,,,,...,,,,,,,,,,


In [25]:
# Summary statistics.

pd.set_option('display.max_columns',200)
train_B.describe()

# Findings:
#   Features are missing to very different degrees: UserInfo_1 has a single non-null value, UserInfo_3 has 1515 non-null values (heavily missing), while UserInfo_12 has no missing values at all. Depending on how much is missing, we need to consider dropping features or dropping samples [1].
#   The features are not on a common scale: UserInfo_270 ranges from 7000 to 401000, whereas ProductInfo_216 ranges from 0 to 1. If we want to try models that are not tree-based, normalization has to be considered [2].
#   Many columns show very large dispersion, e.g. the std of UserInfo_3 is 11156.96; special handling may be needed, but none is applied in this notebook [3].


Unnamed: 0,no,UserInfo_1,UserInfo_2,UserInfo_3,UserInfo_4,ProductInfo_1,UserInfo_5,UserInfo_6,ProductInfo_2,UserInfo_7,ProductInfo_3,ProductInfo_4,UserInfo_8,UserInfo_9,ProductInfo_5,ProductInfo_6,UserInfo_10,UserInfo_11,ProductInfo_7,UserInfo_12,ProductInfo_8,ProductInfo_9,ProductInfo_10,UserInfo_13,ProductInfo_11,ProductInfo_12,UserInfo_14,ProductInfo_13,ProductInfo_14,UserInfo_15,UserInfo_16,UserInfo_17,UserInfo_18,UserInfo_19,UserInfo_20,UserInfo_21,UserInfo_22,UserInfo_23,UserInfo_24,UserInfo_25,ProductInfo_15,ProductInfo_16,ProductInfo_17,UserInfo_26,ProductInfo_18,ProductInfo_19,UserInfo_27,ProductInfo_20,UserInfo_28,ProductInfo_21,ProductInfo_22,ProductInfo_23,UserInfo_29,UserInfo_30,UserInfo_31,UserInfo_32,UserInfo_33,ProductInfo_24,UserInfo_34,ProductInfo_25,ProductInfo_26,ProductInfo_27,ProductInfo_28,ProductInfo_29,ProductInfo_30,ProductInfo_31,UserInfo_35,ProductInfo_32,ProductInfo_33,UserInfo_36,UserInfo_37,ProductInfo_34,UserInfo_38,UserInfo_39,UserInfo_40,ProductInfo_35,ProductInfo_36,UserInfo_41,ProductInfo_37,UserInfo_42,UserInfo_43,UserInfo_44,UserInfo_45,ProductInfo_38,UserInfo_46,UserInfo_47,UserInfo_48,ProductInfo_39,UserInfo_49,UserInfo_50,UserInfo_51,ProductInfo_40,ProductInfo_41,UserInfo_52,ProductInfo_42,ProductInfo_43,ProductInfo_44,UserInfo_53,UserInfo_54,UserInfo_55,...,ProductInfo_168,UserInfo_220,ProductInfo_169,UserInfo_221,ProductInfo_170,ProductInfo_171,UserInfo_222,ProductInfo_172,UserInfo_223,ProductInfo_173,UserInfo_224,ProductInfo_174,UserInfo_225,UserInfo_226,UserInfo_227,ProductInfo_175,UserInfo_228,ProductInfo_176,UserInfo_229,ProductInfo_177,UserInfo_230,ProductInfo_178,UserInfo_231,UserInfo_232,UserInfo_233,UserInfo_234,ProductInfo_179,ProductInfo_180,UserInfo_235,ProductInfo_181,UserInfo_236,UserInfo_237,ProductInfo_182,UserInfo_238,UserInfo_239,ProductInfo_183,ProductInfo_184,UserInfo_240,ProductInfo_185,UserInfo_241,UserInfo_242,UserInfo_243,UserInfo_244,ProductInfo_186,UserInfo_245,ProductInfo_187,ProductInfo_188,Produc
tInfo_189,UserInfo_246,UserInfo_247,UserInfo_248,UserInfo_249,ProductInfo_190,ProductInfo_191,ProductInfo_192,ProductInfo_193,ProductInfo_194,UserInfo_250,ProductInfo_195,ProductInfo_196,UserInfo_251,UserInfo_252,UserInfo_253,ProductInfo_197,ProductInfo_198,UserInfo_254,ProductInfo_199,UserInfo_255,ProductInfo_200,ProductInfo_201,ProductInfo_202,ProductInfo_203,UserInfo_256,ProductInfo_204,UserInfo_257,UserInfo_258,UserInfo_259,UserInfo_260,ProductInfo_205,ProductInfo_206,UserInfo_261,UserInfo_262,ProductInfo_207,ProductInfo_208,ProductInfo_209,ProductInfo_210,UserInfo_263,ProductInfo_211,ProductInfo_212,ProductInfo_213,UserInfo_264,UserInfo_265,ProductInfo_214,UserInfo_266,UserInfo_267,ProductInfo_215,ProductInfo_216,UserInfo_268,UserInfo_269,UserInfo_270
count,4000.0,1.0,119.0,1515.0,1515.0,1515.0,140.0,1515.0,1515.0,1515.0,1515.0,1515.0,85.0,11.0,1515.0,1515.0,1515.0,116.0,1515.0,4000.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,880.0,740.0,730.0,26.0,182.0,1515.0,11.0,116.0,166.0,1515.0,123.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,6.0,1515.0,1515.0,166.0,95.0,1515.0,7.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,7.0,1515.0,1515.0,152.0,1515.0,1515.0,1515.0,94.0,2150.0,1515.0,1515.0,7.0,1515.0,1515.0,1515.0,1515.0,27.0,1515.0,1515.0,1515.0,46.0,1515.0,469.0,1515.0,1515.0,1515.0,1515.0,321.0,1515.0,1515.0,1515.0,14.0,933.0,1515.0,...,1515.0,1515.0,1515.0,1515.0,880.0,1515.0,2536.0,1515.0,7.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,120.0,94.0,1515.0,1515.0,469.0,94.0,182.0,1515.0,1515.0,1515.0,1515.0,1515.0,11.0,33.0,1515.0,1515.0,1515.0,1515.0,1515.0,27.0,1515.0,1515.0,1996.0,217.0,6.0,1515.0,33.0,1515.0,1515.0,1515.0,47.0,47.0,1515.0,94.0,1515.0,1515.0,1515.0,1515.0,182.0,13.0,1515.0,1515.0,1515.0,116.0,2536.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,120.0,1515.0,1515.0,0.0,7.0,94.0,1515.0,7.0,85.0,805.0,3.0,1515.0,1515.0,1515.0,7.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,1515.0,7.0,1515.0,94.0,730.0,1515.0,24.0
mean,8789.822,2054.47,5272.206555,2412.077228,174.257426,0.0,887.092857,0.176238,0.0,5430.363036,0.0,0.088449,41694.670588,0.0,0.036304,0.0,0.473267,42.25,0.590759,1617.8505,0.00132,0.006601,0.0,0.343234,0.00132,0.093729,0.0,0.108911,0.49769,19478.0,3704.414865,2058.57574,3.076923,3514.25989,1884.810772,228.468182,1.568966,2.807229,0.028383,4629.613577,0.0,0.093729,0.010561,0.440924,0.0,0.006601,5712.453624,0.00066,-1151.652,0.00462,0.0,0.147195,188833.333333,0.0,0.00396,0.753012,0.989474,0.029703,0.571429,0.00264,0.0,0.532673,0.00066,0.0,0.039604,0.083828,1.714286,0.005281,0.40462,152.668882,27.832343,0.026403,0.275908,0.042553,2.312558,0.533333,0.309571,1.285714,0.0,0.029703,0.29571,35.470785,0.296296,0.071947,0.722112,0.00396,2.347826,0.111551,49800.26,1.244884,0.741254,0.006601,0.081188,9407.696,0.0,0.015182,0.066007,0.285714,0.0,0.233003,...,0.011881,0.585479,0.071947,0.00132,0.230682,0.011221,0.576893,0.223102,0.285714,0.018482,0.547855,0.052805,0.070627,2.221122,198.225083,0.00198,0.0,0.038944,1816.397583,0.021277,0.017822,0.009901,55539.445629,0.148936,0.0,7.775578,0.0,0.00132,0.695314,0.50231,1.090909,81.309697,0.508911,-941.345175,5.917492,0.00396,0.036304,4151.283704,0.091749,0.174257,27146.036072,1.769585,3.833333,0.005941,0.559697,0.025743,0.0,0.036304,4.723404,2.489362,1641.254455,0.042553,0.50231,0.021122,0.066007,0.011881,0.0,3099.238462,0.005281,0.0,1.191419,0.008621,1.102524,0.013861,0.0,0.692409,0.211221,0.027723,0.254785,0.012541,0.00396,0.10429,0.0,0.063366,2.249505,,5625.97,0.0,0.00462,0.571429,14406.162471,1.503106,0.666667,0.040264,0.49835,0.043564,5919.465714,0.00066,0.0,0.00462,0.00396,2.343234,0.00198,0.00198,10.571429,0.0,0.010638,7.118315,105.415036,158875.0
std,5063.261352,,8599.716757,11156.962294,6782.621941,0.0,1034.953646,4.31947,0.0,22507.02816,0.0,0.28404,27264.771171,0.0,0.187106,0.0,0.964244,33.995876,0.491856,11011.405744,0.036322,0.081003,0.0,0.622942,0.036322,0.291548,0.0,0.31163,0.50016,66582.19,6823.252975,7307.826761,2.65214,21687.122168,7637.715306,651.584588,1.031876,1.316173,0.166119,2766.0354,0.0,0.291548,0.102257,0.85681,0.0,0.081003,12929.560122,0.025692,41269.14,0.067839,0.0,0.354417,98302.424521,0.0,0.062828,1.075551,1.153115,0.169823,0.534522,0.051333,0.0,0.499096,0.025692,0.0,0.195091,0.277222,1.380131,0.072499,0.490981,171.678549,59.109069,0.160382,0.447118,0.20293,1.568056,0.499052,0.462469,0.48795,0.0,0.169823,0.548519,1239.086153,0.465322,0.258486,2.84194,0.062828,0.848983,0.314917,91202.02,1.400167,2.199221,0.081003,0.273214,122958.7,0.0,0.122315,0.248376,0.468807,0.0,0.720867,...,0.108387,0.492802,0.258486,0.036322,0.421509,0.105369,0.969964,0.416464,0.48795,0.13473,1.483792,0.223718,0.276135,1.867635,1461.422043,0.04447,0.0,0.193525,3009.307583,0.145079,0.150996,0.099043,63101.922375,0.357935,0.0,19.101752,0.0,0.036322,8.782254,0.50016,1.375103,298.226111,0.500086,12901.361697,15.544641,0.062828,0.187106,3320.163748,0.288767,0.59611,44663.141175,1.229422,3.125167,0.076871,2.453341,0.158419,0.0,0.187106,15.646506,7.279729,7096.435851,0.20293,0.50016,0.143839,0.248376,0.108387,0.0,1670.499033,0.072499,0.0,17.45324,0.092848,1.917566,0.116954,0.0,1.101417,0.40831,0.164232,0.435884,0.11132,0.062828,0.305738,0.0,0.243701,1.873772,,7552.537273,0.0,0.067839,0.534522,12279.64972,0.559008,0.57735,0.196643,0.500162,0.204191,7488.212032,0.025692,0.0,0.067839,0.062828,1.921429,0.04447,0.04447,18.972411,0.0,0.103142,103.750148,1246.940391,113541.914306
min,1.0,2054.47,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1560.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-533212.9,0.0,0.0,0.0,42000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.56,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,-283362.96,0.0,0.0,0.0,504.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,100.0,0.0,0.0,0.0,600.0,1.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7000.0
25%,4374.25,2054.47,0.0,0.0,0.0,0.0,228.5,0.0,0.0,0.0,0.0,0.0,23390.0,0.0,0.0,0.0,0.0,20.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,363.75,0.0,0.0,36.975,0.0,0.125,1.0,1.0,0.0,3245.87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,125250.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,66.09,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,9352.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38,0.0,0.0,0.0,0.0,0.0,1806.75,0.0,0.0,5000.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,612.8,0.0,0.0,0.0,4200.0,1.0,0.5,0.0,0.0,0.0,612.8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,40250.0
50%,8798.5,2054.47,624.0,0.0,0.0,0.0,557.5,0.0,0.0,0.0,0.0,0.0,34190.0,0.0,0.0,0.0,0.0,20.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1439.5,0.0,3.5,357.75,0.0,0.91,1.0,3.0,0.0,4045.0,0.0,0.0,0.0,0.0,0.0,0.0,2.88,0.0,0.0,0.0,0.0,0.0,220500.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,90.645,2.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,22473.98,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.01,1.0,0.0,0.0,0.0,0.0,3700.55,0.0,0.0,10500.0,2.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2843.92,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,1633.0,0.0,0.0,1.0,11492.0,1.0,1.0,0.0,0.0,0.0,1633.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,190000.0
75%,13215.5,2054.47,6794.65,0.0,0.0,0.0,1097.5,0.0,0.0,0.0,0.0,0.0,54996.0,0.0,0.0,0.0,1.0,99.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,4250.25,0.0,5.75,1380.0125,0.0,49.3,2.0,4.0,0.0,5636.84,0.0,0.0,0.0,1.0,0.0,0.0,5732.885,0.0,0.0,0.0,0.0,0.0,247500.0,0.0,0.0,1.0,1.5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.5,0.0,1.0,176.36,35.0,0.0,1.0,0.0,3.0,1.0,1.0,1.5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,56385.27,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4213.4025,0.0,0.0,0.0,65000.0,0.0,0.0,9.0,0.0,0.0,0.0,1.0,1.5,8.07,1.0,0.0,6.0,0.0,0.0,5728.865,0.0,0.0,30000.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4800.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,,9470.19,0.0,0.0,1.0,20604.0,2.0,1.0,0.0,1.0,0.0,10497.425,0.0,0.0,0.0,0.0,3.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,240000.0
max,17460.0,2054.47,38100.0,103323.0,264000.0,0.0,5404.0,126.0,0.0,259000.0,0.0,1.0,103323.0,0.0,1.0,0.0,12.0,99.0,1.0,300001.0,1.0,1.0,0.0,5.0,1.0,1.0,0.0,1.0,1.0,1110000.0,113055.0,79544.0,7.0,286808.6,143465.0,2182.23,5.0,6.0,1.0,23209.68,0.0,1.0,1.0,6.0,0.0,1.0,157559.15,1.0,1005375.0,1.0,0.0,1.0,300000.0,0.0,1.0,6.0,5.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,4.0,1.0,1.0,1306.8,994.0,1.0,1.0,1.0,12.0,1.0,1.0,2.0,0.0,1.0,4.0,48056.37,1.0,1.0,12.0,1.0,3.0,1.0,1477488.0,14.0,18.0,1.0,1.0,2153584.0,0.0,1.0,1.0,1.0,0.0,8.0,...,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,7.0,1.0,2.0,28.0,20520.0,1.0,0.0,1.0,11578.19,1.0,2.0,1.0,750000.0,1.0,0.0,331.0,0.0,1.0,200.0,1.0,4.0,1245.05,1.0,50060.38,331.0,1.0,1.0,15863.81,1.0,12.0,606003.0,4.0,7.0,1.0,13.29,1.0,0.0,1.0,108.0,48.0,64600.0,1.0,1.0,1.0,1.0,1.0,0.0,5613.09,1.0,0.0,480.0,1.0,5.0,1.0,0.0,24.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,28.0,,17482.81,0.0,1.0,1.0,49500.0,5.0,1.0,1.0,1.0,1.0,17482.81,1.0,0.0,1.0,1.0,12.0,1.0,1.0,52.0,0.0,1.0,2182.23,32700.0,401000.0


### 2、数据探索性分析 + 特征工程

In [18]:
# 数据分析往往和特征工程的构造结合在一起

In [26]:
# Explore the int64-typed columns: keep only the dtype entries equal to np.int64.
train_B.dtypes[train_B.dtypes == np.int64]

no              int64
flag            int64
UserInfo_170    int64
dtype: object

In [30]:
# Look at UserInfo_170 three ways: summary statistics, first rows, distinct values.
# Only the result of the last expression (unique()) appears in the recorded output.
train_B['UserInfo_170'].describe()
train_B['UserInfo_170'].head()
train_B['UserInfo_170'].unique()

array([0], dtype=int64)

In [None]:
# UserInfo_170 holds the single value 0 on every row (see the unique() result
# above), so it cannot add anything to any model and is dropped outright.
# errors='ignore' makes the cell safe to re-run: without it, a second execution
# raises KeyError because the column is already gone.
train_B = train_B.drop(columns=['UserInfo_170'], errors='ignore')
train_B.shape

In [35]:
# Handle severely missing features.
# The threshold is 1%: a column is kept only when more than 1% of its rows are
# non-null, so columns that are at least 99% missing are left out.
# (The threshold can be tuned.)
meaningful_col = [
    name
    for name in train_B.columns
    if train_B[name].count() > train_B.shape[0] * 0.01
]

In [56]:
# Restrict dataset B to the retained columns; .copy() gives an independent
# frame so later changes to train_B_1 do not touch train_B.
train_B_1 = train_B[meaningful_col].copy()

In [57]:
# Compare shapes before and after the column filter; only the last
# expression's result appears in the recorded output.
train_B.shape
train_B_1.shape

# The dimensionality drops from 490 to 431 columns. This lowers memory use, and later on the model's performance turns out to improve substantially.

(4000, 431)

In [60]:
# Missing-value imputation.
# In lending, many missing values carry real meaning: the user often left the field blank on purpose, or simply has no such information, so the data cannot be collected. Seeing this takes some basic business understanding. For such cases a very effective approach is to fill with one constant (-100, 999 or some other value) [6]; here we fill with -999.

train_B_1 = train_B_1.fillna(-999)

In [61]:
# Goal: remove features that are highly linearly correlated.
# NOTE(review): this matrix is computed after the -999 fill above, so the
# sentinel takes part in every coefficient; columns that are missing on the
# same rows come out as (almost) perfectly correlated - e.g. the many 1.000000
# entries among the ProductInfo_* columns in the output. Consider computing
# the correlation on the unfilled data instead - TODO confirm intent.

train_B_1.corr()

Unnamed: 0,no,UserInfo_2,UserInfo_3,UserInfo_4,ProductInfo_1,UserInfo_5,UserInfo_6,ProductInfo_2,UserInfo_7,ProductInfo_3,ProductInfo_4,UserInfo_8,ProductInfo_5,ProductInfo_6,UserInfo_10,UserInfo_11,ProductInfo_7,UserInfo_12,ProductInfo_8,ProductInfo_9,ProductInfo_10,UserInfo_13,ProductInfo_11,ProductInfo_12,UserInfo_14,ProductInfo_13,ProductInfo_14,UserInfo_15,UserInfo_16,UserInfo_17,UserInfo_19,UserInfo_20,UserInfo_22,UserInfo_23,UserInfo_24,UserInfo_25,ProductInfo_15,ProductInfo_16,ProductInfo_17,UserInfo_26,ProductInfo_18,ProductInfo_19,UserInfo_27,ProductInfo_20,UserInfo_28,ProductInfo_21,ProductInfo_22,ProductInfo_23,UserInfo_30,UserInfo_31,UserInfo_32,UserInfo_33,ProductInfo_24,ProductInfo_25,ProductInfo_26,ProductInfo_27,ProductInfo_28,ProductInfo_29,ProductInfo_30,ProductInfo_31,ProductInfo_32,ProductInfo_33,UserInfo_36,UserInfo_37,ProductInfo_34,UserInfo_38,UserInfo_39,UserInfo_40,ProductInfo_35,ProductInfo_36,ProductInfo_37,UserInfo_42,UserInfo_43,UserInfo_44,ProductInfo_38,UserInfo_46,UserInfo_47,UserInfo_48,ProductInfo_39,UserInfo_49,UserInfo_50,UserInfo_51,ProductInfo_40,ProductInfo_41,UserInfo_52,ProductInfo_42,ProductInfo_43,ProductInfo_44,UserInfo_54,UserInfo_55,UserInfo_56,UserInfo_58,ProductInfo_45,UserInfo_59,UserInfo_60,ProductInfo_46,UserInfo_61,UserInfo_62,UserInfo_63,ProductInfo_47,...,ProductInfo_162,ProductInfo_163,UserInfo_211,UserInfo_212,ProductInfo_164,UserInfo_213,ProductInfo_165,ProductInfo_166,ProductInfo_167,UserInfo_215,UserInfo_216,UserInfo_217,UserInfo_218,UserInfo_219,ProductInfo_168,UserInfo_220,ProductInfo_169,UserInfo_221,ProductInfo_170,ProductInfo_171,UserInfo_222,ProductInfo_172,ProductInfo_173,UserInfo_224,ProductInfo_174,UserInfo_225,UserInfo_226,UserInfo_227,ProductInfo_175,UserInfo_228,ProductInfo_176,UserInfo_229,ProductInfo_177,UserInfo_230,ProductInfo_178,UserInfo_231,UserInfo_232,UserInfo_233,UserInfo_234,ProductInfo_179,ProductInfo_180,UserInfo_235,ProductInfo_181,ProductInfo_182,UserInfo_238,UserInfo_239,ProductI
nfo_183,ProductInfo_184,ProductInfo_185,UserInfo_241,UserInfo_242,UserInfo_243,ProductInfo_186,ProductInfo_187,ProductInfo_188,ProductInfo_189,UserInfo_246,UserInfo_247,UserInfo_248,UserInfo_249,ProductInfo_190,ProductInfo_191,ProductInfo_192,ProductInfo_193,ProductInfo_194,ProductInfo_195,ProductInfo_196,UserInfo_251,UserInfo_252,UserInfo_253,ProductInfo_197,ProductInfo_198,UserInfo_254,ProductInfo_199,UserInfo_255,ProductInfo_200,ProductInfo_201,ProductInfo_202,ProductInfo_203,UserInfo_256,ProductInfo_204,UserInfo_257,UserInfo_260,ProductInfo_205,UserInfo_261,UserInfo_262,ProductInfo_208,ProductInfo_209,ProductInfo_210,ProductInfo_211,ProductInfo_212,ProductInfo_213,UserInfo_264,UserInfo_265,ProductInfo_214,UserInfo_266,ProductInfo_215,ProductInfo_216,UserInfo_268,UserInfo_269
no,1.000000,-0.009470,0.017689,0.017645,0.000778,0.000450,0.000809,0.000778,0.008722,0.000778,0.000780,0.019377,0.000769,0.000778,0.000760,0.010039,0.000782,0.005970,0.000777,0.000778,0.000778,0.000787,0.000777,0.000777,0.000778,0.000776,0.000769,0.006905,0.007909,-0.001672,-0.005141,0.003592,0.009659,-0.000141,0.000781,0.011761,0.000778,0.000781,0.000779,0.000774,0.000778,0.000778,0.005081,0.000778,-0.007499,0.000777,0.000778,0.000785,0.000778,0.000777,-0.000100,-0.010888,0.000781,0.000777,0.000778,0.000783,0.000778,0.000778,0.000773,0.000780,0.000777,0.000776,0.043201,0.000204,0.000777,0.000772,-0.023536,-0.033093,0.000776,0.000774,0.000778,0.000781,0.000773,-0.013881,0.000779,0.000884,0.000777,-0.017573,0.000777,0.011592,0.000784,0.000797,0.000777,0.000778,-0.004326,0.000778,0.000780,0.000784,0.000910,0.000786,0.017665,-0.017567,0.000778,0.000751,0.004462,0.000778,0.000778,-0.000131,0.005477,0.000779,...,0.000780,0.000785,0.009135,0.000781,0.000773,0.015604,0.000777,0.000776,0.000779,0.016026,0.000802,0.028847,0.016001,-0.023544,0.000777,0.000782,0.000779,0.000778,0.027054,0.000779,0.009319,0.000771,0.000777,0.000791,0.000776,0.000785,0.000800,-0.002519,0.000779,0.000778,0.000773,0.012074,-0.023545,0.000779,0.000773,0.021000,-0.023542,-0.002950,0.000949,0.000778,0.000778,0.000752,0.000772,0.000778,0.027806,0.000857,0.000777,0.000777,0.000783,0.000762,-0.003160,0.010657,0.000779,0.000775,0.000778,0.000769,0.022122,0.021929,-0.001566,-0.023539,0.000772,0.000776,0.000778,0.000778,-0.002950,0.000778,0.000778,-0.000041,0.009653,0.009300,0.000777,0.000778,0.000777,0.000784,0.000778,0.000777,0.000777,0.000777,0.000783,0.003756,0.000785,0.000801,-0.023544,0.000777,0.018388,0.004477,0.000773,0.000770,0.000781,0.000777,0.000778,0.000778,0.000777,0.000788,0.000778,0.000778,0.000778,-0.023541,0.020643,0.003622
UserInfo_2,-0.009470,1.000000,0.039423,0.013491,0.131199,0.686113,0.131160,0.131199,0.072412,0.131199,0.131298,0.086100,0.131192,0.131199,0.131282,0.571994,0.131244,0.056604,0.131201,0.131198,0.131199,0.131256,0.131201,0.131305,0.131199,0.131196,0.131201,0.022143,0.075512,0.757056,0.042883,0.521378,0.582989,0.489544,0.131220,0.050744,0.131199,0.131321,0.131206,0.131215,0.131199,0.131198,0.406309,0.131199,-0.034083,0.131203,0.131199,0.131379,0.131199,0.131204,0.489628,0.491373,0.131219,0.131206,0.131199,0.131215,0.131199,0.131199,0.131192,0.131213,0.131198,0.131206,0.012077,0.134193,0.131210,0.131352,0.498880,0.058426,0.131180,0.131201,0.131199,0.131219,0.131203,0.068155,0.131191,0.131423,0.131204,-0.004430,0.131198,0.106622,0.131243,0.132379,0.131198,0.131190,0.009982,0.131199,0.131210,0.131210,0.140322,0.131603,0.011700,-0.004434,0.131199,0.131292,0.124636,0.131199,0.131199,0.489553,0.398306,0.131206,...,0.131363,0.131379,0.745852,0.131349,0.131175,0.071879,0.131201,0.131230,0.131300,0.388707,0.131513,0.098726,0.119837,0.498874,0.131197,0.131245,0.131320,0.131199,0.050466,0.131197,0.066580,0.131210,0.131196,0.131974,0.131298,0.131209,0.131741,0.060183,0.131198,0.131199,0.131222,0.238398,0.498872,0.131209,0.131203,0.115326,0.498910,0.113605,0.131465,0.131199,0.131199,0.131072,0.131200,0.131225,0.015998,0.131350,0.131204,0.131208,0.131340,0.131180,0.039520,0.406308,0.131201,0.131220,0.131199,0.131192,0.108881,0.108706,0.118703,0.498876,0.131200,0.131197,0.131207,0.131255,0.113605,0.131199,0.131199,0.133255,0.582758,0.066632,0.131196,0.131199,0.131222,0.131220,0.131196,0.131196,0.131197,0.131204,0.131321,0.582504,0.131224,0.131749,0.498874,0.131203,0.126632,0.156210,0.131192,0.131201,0.131211,0.131208,0.131199,0.131198,0.131204,0.132081,0.131201,0.131201,0.131199,0.498869,0.217026,0.449022
UserInfo_3,0.017689,0.039423,1.000000,0.026315,0.234346,0.043899,0.234295,0.234346,0.895062,0.234346,0.234362,0.681255,0.234336,0.234346,0.234323,0.054306,0.234327,0.718026,0.234345,0.234356,0.234346,0.234378,0.234345,0.234347,0.234346,0.234332,0.234335,0.113811,0.073711,0.068833,0.032991,0.097999,0.054708,0.047930,0.234341,0.121192,0.234346,0.234342,0.234355,0.234374,0.234346,0.234356,0.463881,0.234345,0.070709,0.234344,0.234346,0.234359,0.234346,0.234345,0.048002,0.054731,0.234340,0.234345,0.234346,0.234344,0.234345,0.234346,0.234335,0.234590,0.234344,0.234364,0.104373,0.233912,0.234377,0.234362,0.030032,0.059955,0.234339,0.234338,0.234346,0.234340,0.234359,0.124611,0.234330,0.234278,0.234345,0.017498,0.234332,0.070364,0.234397,0.234433,0.234347,0.234334,-0.002991,0.234346,0.234344,0.234595,0.203629,0.234383,0.023120,0.017480,0.234346,0.233608,0.048682,0.234345,0.234346,0.047953,0.184523,0.234350,...,0.234360,0.234359,0.042650,0.234364,0.234311,0.177187,0.234345,0.234340,0.234364,0.030362,0.234913,0.004425,0.131131,0.030033,0.234342,0.234324,0.234362,0.234345,0.159691,0.234343,0.120849,0.234354,0.234341,0.234385,0.234345,0.234622,0.234858,0.708456,0.234345,0.234346,0.234342,0.051422,0.030032,0.234343,0.234344,0.105091,0.030035,0.068201,0.233983,0.234346,0.234345,0.234146,0.234333,0.234343,-0.027409,0.234047,0.234345,0.234380,0.234344,0.234354,0.216952,0.101000,0.234344,0.234342,0.234346,0.234336,0.007128,0.007147,0.674836,0.030032,0.234333,0.234340,0.234342,0.234347,0.068201,0.234344,0.234346,0.233971,0.054685,0.120972,0.234342,0.234346,0.234527,0.234336,0.234338,0.234349,0.234342,0.234345,0.234355,0.053550,0.234339,0.234853,0.030033,0.234344,0.555611,0.164513,0.234335,0.234334,0.234481,0.234346,0.234346,0.234344,0.234345,0.234640,0.234345,0.234345,0.234346,0.030033,0.200846,0.131111
UserInfo_4,0.017645,0.013491,0.026315,1.000000,0.135136,0.018684,0.135128,0.135136,0.100985,0.135136,0.135133,0.016332,0.135135,0.135136,0.135185,0.022747,0.135149,0.443706,0.135136,0.135136,0.135136,0.135157,0.135136,0.135133,0.135136,0.135132,0.135120,0.018343,0.041585,0.026723,0.005744,0.032533,0.022759,0.021160,0.135135,0.079708,0.135136,0.135133,0.135135,0.135186,0.135136,0.135136,0.148551,0.135168,0.000317,0.135136,0.135136,0.135131,0.135136,0.135136,0.021164,0.017039,0.135135,0.135136,0.135136,0.135151,0.135136,0.135136,0.135134,0.135165,0.135136,0.135155,0.025879,0.135190,0.135135,0.135159,0.014558,-0.007171,0.135118,0.135158,0.135136,0.135135,0.135191,0.073692,0.135133,0.135112,0.135136,0.003637,0.135132,0.028600,0.135192,0.135111,0.135136,0.135133,-0.001260,0.135136,0.135135,0.135166,0.109694,0.135128,0.999906,0.003637,0.135136,0.134895,0.012605,0.135136,0.135136,0.021156,0.056843,0.135130,...,0.135128,0.135131,0.019824,0.135129,0.135132,0.103625,0.135136,0.135151,0.135132,0.014434,0.135125,0.012132,0.091232,0.014558,0.135135,0.135149,0.135133,0.135168,0.069941,0.135135,0.049599,0.135128,0.135135,0.135118,0.135134,0.135133,0.135225,0.070419,0.135136,0.135136,0.135134,0.015729,0.014558,0.135135,0.135135,0.084229,0.014558,0.028753,0.135393,0.135136,0.135168,0.135105,0.135119,0.135152,0.002335,0.135466,0.135136,0.135135,0.135133,0.135130,0.015582,0.029971,0.135136,0.135135,0.135136,0.135135,0.014358,0.014360,0.196373,0.014558,0.135119,0.135135,0.135134,0.135135,0.028753,0.135136,0.135136,0.135064,0.022759,0.049636,0.135135,0.135136,0.135113,0.135161,0.135135,0.135160,0.135135,0.135136,0.135132,0.023160,0.135134,0.135224,0.014558,0.135136,0.015146,0.033567,0.135134,0.135120,0.135134,0.135136,0.135136,0.135168,0.135136,0.135221,0.135136,0.135136,0.135136,0.014558,0.101850,0.075606
ProductInfo_1,0.000778,0.131199,0.234346,0.135136,1.000000,0.185039,0.999985,1.000000,0.219726,1.000000,1.000000,0.158824,1.000000,1.000000,0.999999,0.221213,1.000000,0.165047,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.178381,0.314815,0.214237,0.055860,0.285325,0.221333,0.212234,1.000000,0.204270,1.000000,1.000000,1.000000,0.999999,1.000000,1.000000,0.378774,1.000000,-0.002916,1.000000,1.000000,1.000000,1.000000,1.000000,0.212267,0.169328,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.251672,0.997345,1.000000,1.000000,0.147649,0.084504,1.000000,1.000000,1.000000,1.000000,1.000000,0.549826,1.000000,0.999994,1.000000,0.046294,1.000000,0.216605,0.999998,0.999996,1.000000,1.000000,-0.007688,1.000000,1.000000,1.000000,0.706384,1.000000,0.121543,0.046296,1.000000,0.998584,0.122587,1.000000,1.000000,0.212205,0.470514,1.000000,...,1.000000,1.000000,0.193109,1.000000,1.000000,0.397820,1.000000,1.000000,1.000000,0.161134,0.999993,0.117986,0.376415,0.147647,1.000000,1.000000,1.000000,1.000000,0.680175,1.000000,0.326886,1.000000,1.000000,0.999998,1.000000,1.000000,0.999997,0.542575,1.000000,1.000000,1.000000,0.152968,0.147648,1.000000,1.000000,0.300786,0.147651,0.279624,0.999711,1.000000,1.000000,0.999938,1.000000,1.000000,0.003523,0.999808,1.000000,1.000000,1.000000,1.000000,0.111934,0.293088,1.000000,1.000000,1.000000,1.000000,0.139634,0.139647,0.281486,0.147649,1.000000,1.000000,1.000000,1.000000,0.279624,1.000000,1.000000,0.999755,0.221333,0.327073,1.000000,1.000000,0.999999,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.225233,1.000000,0.999997,0.147647,1.000000,0.147290,0.360087,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.999997,1.000000,1.000000,1.000000,0.147648,0.601232,0.572545
UserInfo_5,0.000450,0.686113,0.043899,0.018684,0.185039,1.000000,0.184981,0.185039,0.050463,0.185039,0.185158,0.064960,0.185032,0.185039,0.185059,0.791874,0.185084,0.022824,0.185044,0.185039,0.185039,0.185093,0.185044,0.185196,0.185039,0.185031,0.185024,0.012230,0.132260,0.726044,0.037664,0.493779,0.792268,0.799602,0.185056,0.055897,0.185039,0.185217,0.185045,0.185032,0.185039,0.185039,0.375416,0.185039,-0.020794,0.185044,0.185039,0.185279,0.185039,0.185044,0.799665,0.629883,0.185056,0.185041,0.185039,0.185061,0.185039,0.185039,0.185032,0.185044,0.185037,0.185046,-0.007768,0.189423,0.185048,0.185266,0.690493,0.063487,0.185000,0.185047,0.185039,0.185056,0.185022,0.095801,0.185045,0.185256,0.185044,0.009774,0.185037,0.170821,0.185163,0.186372,0.185037,0.185047,0.016221,0.185039,0.185047,0.185047,0.165191,0.185456,0.016157,0.009790,0.185039,0.185480,0.287736,0.185039,0.185039,0.799689,0.402532,0.185040,...,0.185283,0.185279,0.907118,0.185243,0.185008,0.077609,0.185044,0.185050,0.185162,0.635064,0.185275,0.112349,0.179248,0.690477,0.185039,0.185081,0.185140,0.185039,0.068394,0.185041,0.084208,0.185051,0.185033,0.185961,0.185133,0.185045,0.185566,0.093453,0.185038,0.185039,0.185061,0.427472,0.690477,0.185046,0.185041,0.182065,0.690501,0.130158,0.185810,0.185039,0.185039,0.185042,0.185023,0.185045,0.031764,0.185666,0.185044,0.185045,0.185233,0.185039,0.035247,0.476948,0.185046,0.185055,0.185039,0.185032,0.100547,0.100131,0.075037,0.690497,0.185023,0.185038,0.185051,0.185080,0.130158,0.185041,0.185039,0.186469,0.792252,0.084296,0.185035,0.185039,0.185054,0.185046,0.185033,0.185023,0.185037,0.185044,0.185201,0.770896,0.185062,0.185573,0.690477,0.185044,0.070803,0.219526,0.185032,0.185024,0.185045,0.185046,0.185039,0.185037,0.185044,0.186013,0.185043,0.185043,0.185039,0.690470,0.307124,0.372699
UserInfo_6,0.000809,0.131160,0.234295,0.135128,0.999985,0.184981,1.000000,0.999985,0.219670,0.999985,0.999985,0.158777,0.999985,0.999985,0.999984,0.221147,0.999985,0.164995,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.178493,0.315105,0.214189,0.055844,0.285778,0.221267,0.212164,0.999985,0.204209,0.999985,0.999985,0.999985,0.999984,0.999985,0.999985,0.379028,0.999985,-0.002910,0.999985,0.999985,0.999985,0.999985,0.999985,0.212198,0.169274,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.251953,0.997468,0.999985,0.999985,0.147600,0.084340,0.999985,0.999985,0.999985,0.999985,0.999985,0.553234,0.999985,0.999978,0.999985,0.046271,0.999985,0.216540,0.999984,0.999982,0.999985,0.999985,-0.007690,0.999985,0.999985,0.999985,0.706364,0.999985,0.121536,0.046272,0.999985,0.998568,0.122551,0.999985,0.999985,0.212136,0.470677,0.999985,...,0.999985,0.999985,0.193050,0.999985,0.999985,0.397906,0.999985,0.999985,0.999985,0.161067,0.999978,0.117951,0.376326,0.147598,0.999985,0.999985,0.999985,0.999985,0.680304,0.999985,0.326811,0.999985,0.999985,0.999984,0.999985,0.999985,0.999983,0.542541,0.999985,0.999985,0.999985,0.152922,0.147599,0.999985,0.999985,0.300696,0.147602,0.279541,0.999725,0.999985,0.999985,0.999923,0.999985,0.999985,0.001455,0.999793,0.999985,0.999985,0.999985,0.999985,0.112028,0.293286,0.999985,0.999985,0.999985,0.999985,0.139592,0.139605,0.281432,0.147600,0.999985,0.999985,0.999985,0.999985,0.279541,0.999985,0.999985,0.999772,0.221267,0.326997,0.999985,0.999985,0.999984,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.225165,0.999985,0.999983,0.147598,0.999985,0.147246,0.360249,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.999985,0.999983,0.999985,0.999985,0.999985,0.147598,0.601501,0.572521
ProductInfo_2,0.000778,0.131199,0.234346,0.135136,1.000000,0.185039,0.999985,1.000000,0.219726,1.000000,1.000000,0.158824,1.000000,1.000000,0.999999,0.221213,1.000000,0.165047,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.178381,0.314815,0.214237,0.055860,0.285325,0.221333,0.212234,1.000000,0.204270,1.000000,1.000000,1.000000,0.999999,1.000000,1.000000,0.378774,1.000000,-0.002916,1.000000,1.000000,1.000000,1.000000,1.000000,0.212267,0.169328,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.251672,0.997345,1.000000,1.000000,0.147649,0.084504,1.000000,1.000000,1.000000,1.000000,1.000000,0.549826,1.000000,0.999994,1.000000,0.046294,1.000000,0.216605,0.999998,0.999996,1.000000,1.000000,-0.007688,1.000000,1.000000,1.000000,0.706384,1.000000,0.121543,0.046296,1.000000,0.998584,0.122587,1.000000,1.000000,0.212205,0.470514,1.000000,...,1.000000,1.000000,0.193109,1.000000,1.000000,0.397820,1.000000,1.000000,1.000000,0.161134,0.999993,0.117986,0.376415,0.147647,1.000000,1.000000,1.000000,1.000000,0.680175,1.000000,0.326886,1.000000,1.000000,0.999998,1.000000,1.000000,0.999997,0.542575,1.000000,1.000000,1.000000,0.152968,0.147648,1.000000,1.000000,0.300786,0.147651,0.279624,0.999711,1.000000,1.000000,0.999938,1.000000,1.000000,0.003523,0.999808,1.000000,1.000000,1.000000,1.000000,0.111934,0.293088,1.000000,1.000000,1.000000,1.000000,0.139634,0.139647,0.281486,0.147649,1.000000,1.000000,1.000000,1.000000,0.279624,1.000000,1.000000,0.999755,0.221333,0.327073,1.000000,1.000000,0.999999,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.225233,1.000000,0.999997,0.147647,1.000000,0.147290,0.360087,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.999997,1.000000,1.000000,1.000000,0.147648,0.601232,0.572545
UserInfo_7,0.008722,0.072412,0.895062,0.100985,0.219726,0.050463,0.219670,0.219726,1.000000,0.219726,0.219742,0.605215,0.219715,0.219726,0.219688,0.049403,0.219700,0.696994,0.219725,0.219733,0.219726,0.219753,0.219725,0.219732,0.219726,0.219708,0.219707,0.119898,0.070640,0.079047,0.033681,0.100807,0.050447,0.043910,0.219724,0.091848,0.219726,0.219719,0.219731,0.219734,0.219726,0.219733,0.577616,0.219728,0.034847,0.219724,0.219726,0.219737,0.219726,0.219725,0.043983,0.054363,0.219723,0.219726,0.219726,0.219726,0.219726,0.219726,0.219714,0.220000,0.219724,0.219740,0.124285,0.218772,0.219759,0.219747,0.025697,0.048325,0.219712,0.219709,0.219726,0.219723,0.219729,0.116279,0.219710,0.219663,0.219725,0.009094,0.219707,0.063907,0.219675,0.219801,0.219727,0.219713,-0.003096,0.219726,0.219723,0.219979,0.185093,0.219763,0.098125,0.009079,0.219726,0.218963,0.049182,0.219726,0.219726,0.043932,0.179032,0.219731,...,0.219749,0.219737,0.051899,0.219748,0.219687,0.177794,0.219725,0.219714,0.219744,0.027296,0.220089,0.011425,0.112292,0.025698,0.219722,0.219697,0.219741,0.219728,0.168661,0.219722,0.120662,0.219737,0.219720,0.219753,0.219729,0.220000,0.220155,0.581051,0.219725,0.219726,0.219723,0.041395,0.025697,0.219722,0.219725,0.093081,0.025696,0.070118,0.219210,0.219726,0.219728,0.219506,0.219706,0.219716,-0.060784,0.219213,0.219725,0.219769,0.219723,0.219716,0.264235,0.096372,0.219724,0.219721,0.219726,0.219715,0.018302,0.018343,0.881494,0.025697,0.219706,0.219721,0.219725,0.219732,0.070118,0.219724,0.219726,0.219331,0.050425,0.120856,0.219722,0.219726,0.219830,0.219721,0.219718,0.219719,0.219722,0.219725,0.219725,0.053760,0.219720,0.220150,0.025698,0.219724,0.630573,0.163639,0.219714,0.219707,0.219867,0.219729,0.219726,0.219727,0.219725,0.219987,0.219725,0.219725,0.219726,0.025698,0.192779,0.141057
ProductInfo_3,0.000778,0.131199,0.234346,0.135136,1.000000,0.185039,0.999985,1.000000,0.219726,1.000000,1.000000,0.158824,1.000000,1.000000,0.999999,0.221213,1.000000,0.165047,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.178381,0.314815,0.214237,0.055860,0.285325,0.221333,0.212234,1.000000,0.204270,1.000000,1.000000,1.000000,0.999999,1.000000,1.000000,0.378774,1.000000,-0.002916,1.000000,1.000000,1.000000,1.000000,1.000000,0.212267,0.169328,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.251672,0.997345,1.000000,1.000000,0.147649,0.084504,1.000000,1.000000,1.000000,1.000000,1.000000,0.549826,1.000000,0.999994,1.000000,0.046294,1.000000,0.216605,0.999998,0.999996,1.000000,1.000000,-0.007688,1.000000,1.000000,1.000000,0.706384,1.000000,0.121543,0.046296,1.000000,0.998584,0.122587,1.000000,1.000000,0.212205,0.470514,1.000000,...,1.000000,1.000000,0.193109,1.000000,1.000000,0.397820,1.000000,1.000000,1.000000,0.161134,0.999993,0.117986,0.376415,0.147647,1.000000,1.000000,1.000000,1.000000,0.680175,1.000000,0.326886,1.000000,1.000000,0.999998,1.000000,1.000000,0.999997,0.542575,1.000000,1.000000,1.000000,0.152968,0.147648,1.000000,1.000000,0.300786,0.147651,0.279624,0.999711,1.000000,1.000000,0.999938,1.000000,1.000000,0.003523,0.999808,1.000000,1.000000,1.000000,1.000000,0.111934,0.293088,1.000000,1.000000,1.000000,1.000000,0.139634,0.139647,0.281486,0.147649,1.000000,1.000000,1.000000,1.000000,0.279624,1.000000,1.000000,0.999755,0.221333,0.327073,1.000000,1.000000,0.999999,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.225233,1.000000,0.999997,0.147647,1.000000,0.147290,0.360087,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.999997,1.000000,1.000000,1.000000,0.147648,0.601232,0.572545


In [63]:
# Greedy removal of highly correlated columns: walk the columns in order,
# keep each one that has not been discarded yet, and discard every later
# column whose correlation with it exceeds 0.98.
# NOTE(review): only positive correlation is tested (no abs()), so strongly
# negatively correlated pairs are both kept — confirm this is intended.
relation = train_B_1.corr()
length = relation.shape[0]
corr_values = relation.values        # read the matrix once instead of per-element .iloc
col_names = list(relation.columns)
final_cols = []
del_cols = []
dropped = set()                      # O(1) membership mirror of del_cols
for i, name in enumerate(col_names):
    if name in dropped:
        continue
    final_cols.append(name)
    for j in range(i + 1, length):
        other = col_names[j]
        if corr_values[i, j] > 0.98 and other not in dropped:
            dropped.add(other)
            del_cols.append(other)
            

In [65]:
# Restrict train_B_1 to the columns collected in final_cols.
train_B_1 = train_B_1[final_cols]

### 3、模型训练与测试

In [70]:
# XGBoost is used directly here.
# To make a score reproducible, fix the random seed that each library exposes;
# the parameter set used for training is an arbitrary choice and can be tuned [9].
# NOTE(review): the original note names XGBoost's seed key as `random_seed`;
# the native parameter appears to be `seed` — confirm against the XGBoost docs.

# Pull out the label, then remove the id and label columns from the features.
train_B_flag = train_B_1['flag']
train_B_1.drop(['no', 'flag'], axis=1, inplace=True)

In [71]:
# Show the first rows of the feature frame after 'no' and 'flag' were dropped.
train_B_1.head()

Unnamed: 0,UserInfo_2,UserInfo_3,UserInfo_4,ProductInfo_1,UserInfo_5,UserInfo_7,UserInfo_8,UserInfo_11,UserInfo_12,UserInfo_15,UserInfo_16,UserInfo_17,UserInfo_19,UserInfo_20,UserInfo_23,UserInfo_25,UserInfo_27,UserInfo_28,UserInfo_33,UserInfo_36,UserInfo_39,UserInfo_40,UserInfo_44,UserInfo_48,UserInfo_49,UserInfo_52,UserInfo_54,UserInfo_60,UserInfo_63,UserInfo_66,ProductInfo_49,ProductInfo_50,UserInfo_67,UserInfo_68,WebInfo_1,UserInfo_72,UserInfo_73,UserInfo_75,UserInfo_76,UserInfo_80,UserInfo_82,UserInfo_83,UserInfo_85,UserInfo_92,UserInfo_94,UserInfo_96,UserInfo_97,UserInfo_100,UserInfo_101,UserInfo_104,UserInfo_105,UserInfo_106,UserInfo_107,UserInfo_108,ProductInfo_89,UserInfo_113,UserInfo_115,UserInfo_118,UserInfo_121,UserInfo_122,UserInfo_125,UserInfo_127,UserInfo_130,UserInfo_132,UserInfo_133,UserInfo_134,UserInfo_135,UserInfo_136,UserInfo_147,UserInfo_149,UserInfo_153,UserInfo_155,UserInfo_160,UserInfo_169,UserInfo_179,WebInfo_3,UserInfo_184,UserInfo_186,UserInfo_190,UserInfo_193,UserInfo_196,UserInfo_197,UserInfo_200,UserInfo_210,UserInfo_217,UserInfo_222,UserInfo_227,UserInfo_229,UserInfo_231,UserInfo_238,UserInfo_242,UserInfo_243,UserInfo_246,UserInfo_248,UserInfo_261,UserInfo_262,UserInfo_269
0,5226.59,0.0,0.0,0.0,296.0,0.0,-999.0,20.0,2.0,-999.0,-999.0,5226.59,5463.65,5226.59,4.0,-999.0,35360.16,0.0,0.0,-999.0,0.0,1.0,0.0,-999.0,-999.0,131379.6,0.0,0.0,5226.59,0.0,0.0,-999.0,-999.0,0.0,-999.0,-999.0,1798.16,-999.0,0.0,5226.59,74.0,0.0,-999.0,-20009.32,0.0,33009.63,4683.03,-999.0,2977.01,9630.4,-999.0,0.0,-999.0,-999.0,5226.59,-999.0,1.0,0.0,-999.0,1.0,0.0,0.0,-999.0,0.0,0.0,2450.53,0.0,31.0,26900.0,0.0,0.0,0.0,-999.0,-999.0,33009.63,-999.0,33009.63,0.0,0.0,-999.0,0.0,-999.0,1.0,-999.0,-999.0,0.0,0.0,0.0,-999.0,0.0,10000.0,2.0,-999.0,0.0,-999.0,1.0,2940.0
1,0.0,0.0,0.0,0.0,-999.0,46000.0,-999.0,-999.0,1.0,0.0,-999.0,736.0,3064.5,1472.0,-999.0,-999.0,32382.47,0.0,-999.0,-999.0,-999.0,5.0,0.0,-999.0,-999.0,0.0,-999.0,200000.0,736.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,1094.41,-999.0,-999.0,736.0,62.0,0.0,-999.0,738.38,0.0,1472.0,1448.61,31901.59,0.0,0.0,-999.0,0.0,0.0,2.0,736.0,-999.0,1.0,0.0,-999.0,-999.0,2603.53,0.0,39.0,0.0,0.0,0.0,0.0,-999.0,-999.0,1.0,0.0,0.0,-999.0,707609.0,1472.0,-999.0,1472.0,0.0,-999.0,-999.0,621.45,14084.0,1.0,-999.0,-999.0,3.0,0.0,736.0,-999.0,0.0,86500.0,0.0,-999.0,30910.47,-999.0,2.0,0.0
2,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,2.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,2.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,37.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,2.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,1.0,-999.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.0,14072.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,2.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,2.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,1.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,2.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,3.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,5.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,2.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,2358.0,1.0,-999.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


In [72]:
# XGBoost model
dtrain_B = xgb.DMatrix(data=train_B_1, label=train_B_flag)
Trate = 0.25  # passed to XGBoost as base_score
params = {
    'booster': 'gbtree',
    'eta': 0.1,
    'max_depth': 4,
    'max_delta_step': 0,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'base_score': Trate,
    'objective': 'binary:logistic',
    'lambda': 5,
    'alpha': 8,
    # The native API reads the seed from `seed`; the previous key `random_seed`
    # is not a recognised parameter, so the intended seed was never applied.
    'seed': 100,
    'eval_metric': 'auc',
}
# No `evals` list is passed, so maximize / verbose_eval presumably have no
# visible effect here; they are kept to leave the call unchanged.
xgb_model = xgb.train(params, dtrain_B, num_boost_round=200, maximize=True, verbose_eval=True)


In [73]:
# Model testing: score the test set with the trained booster and write the submission.

# Use the same columns as the training frame; missing values are filled with -999.
xgb_test_features = test_B[train_B_1.columns].fillna(-999)
res = xgb_model.predict(xgb.DMatrix(xgb_test_features))
test_B['pred'] = res
test_B[['no', 'pred']].to_csv('submit.csv', index=None)

In [74]:
# 线上AUC = 0.602393

## 测试LGB

In [76]:
import lightgbm as lgb
def lgb_feature_selection(tr_x, tr_y, model_seed=666, num_rounds=500):
    """Train a LightGBM binary classifier and return the fitted booster.

    Despite the name, no feature selection happens here: the function only
    builds a Dataset from the inputs and trains on it.

    Args:
        tr_x: training features passed to ``lgb.Dataset``.
        tr_y: training labels passed to ``lgb.Dataset``.
        model_seed: value used for the ``random_state`` parameter.
        num_rounds: number of boosting rounds.

    Returns:
        The model returned by ``lgb.train``.
    """
    lgb_tr = lgb.Dataset(tr_x, tr_y)
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'random_state': model_seed,
    }
    # `verbose_eval` was dropped from this call: no validation set is supplied,
    # so there was nothing for it to log, and LightGBM 4.x no longer accepts
    # the argument.
    model = lgb.train(lgb_params, lgb_tr, num_boost_round=num_rounds)
    return model

In [77]:
# Fit the LightGBM model on the training features and labels (default seed and rounds).
f_model = lgb_feature_selection(train_B_1, train_B_flag)


In [81]:
# Score the test set with the LightGBM model and write the submission file.
# Same column subset and -999 missing-value fill as the training frame.
lgb_test_features = test_B[train_B_1.columns].fillna(-999)
lgb_pred = f_model.predict(lgb_test_features)
test_B['pred'] = lgb_pred
test_B[['no', 'pred']].to_csv('submitLGB.csv', index=None)

In [82]:
# 线上AUC = 0.571084
# 应该是参数的问题，导致lgb测评小于xgboost

In [84]:
# 改变参数
def lgb_feature_selection2(tr_x, tr_y, model_seed=666, num_rounds=500):
    """Train a LightGBM binary classifier with an adjusted parameter set.

    Args:
        tr_x: training features passed to ``lgb.Dataset``.
        tr_y: training labels passed to ``lgb.Dataset``.
        model_seed: accepted for signature parity but not used; see the note
            on ``random_state`` below.
        num_rounds: number of boosting rounds.

    Returns:
        The model returned by ``lgb.train``.
    """
    lgb_tr = lgb.Dataset(tr_x, tr_y)
    lgb_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 17,
        'feature_fraction': 0.80,
        'lambda_l1': 0.6,
        # 'scale_pos_weight': 1.1,
        # NOTE(review): the seed is hard-coded to 0 and `model_seed` is ignored.
        # Left as-is so existing results stay reproducible — confirm whether
        # `model_seed` should be honoured instead.
        'random_state': 0,
    }
    # `verbose_eval` was dropped from this call: no validation set is supplied,
    # so there was nothing for it to log, and LightGBM 4.x no longer accepts
    # the argument.
    model = lgb.train(lgb_params, lgb_tr, num_boost_round=num_rounds)
    return model
# Retrain with the adjusted parameters, score the test set and write the submission.
# Note that this writes to 'submitLGB.csv' again, replacing any earlier file of that name.
f_model = lgb_feature_selection2(train_B_1, train_B_flag)
lgb2_test_features = test_B[train_B_1.columns].fillna(-999)
lgb_pred = f_model.predict(lgb2_test_features)
test_B['pred'] = lgb_pred
test_B[['no', 'pred']].to_csv('submitLGB.csv', index=None)

In [None]:
# 线上AUC = 0.580101