### 一、引入相关包（从baseline 开始做起，不断提高）

In [1]:
import numpy as np
import pandas as pd

# 用于数据拆分
from sklearn.model_selection import StratifiedKFold
# 用于对应评分标准
from sklearn.metrics import f1_score
# 选择机器学习库
import lightgbm as lgb
# auc 用于评判
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
%matplotlib inline

### 二、读取train_feature/test_feature/train_labels

In [2]:
trainDF = pd.read_csv('data/train_features_3.csv') # 105列
testDF = pd.read_csv('data/test_features_3.csv') # 105 列
trainLabel = pd.read_csv('train_labels.csv') # 2 列

In [3]:
len(trainDF)

63817

In [4]:
len(testDF)

52250

In [5]:
# 数据初探
trainDF.head(3)

Unnamed: 0,file_name,低压开关,分配压力_max,分配压力_mean,分配压力_median,分配压力_min,分配压力_std,分配压力range,反泵,发动机转速_max,...,流量档位_min,流量档位_std,流量档位range,液压油温_max,液压油温_mean,液压油温_median,液压油温_min,液压油温_std,液压油温range,高压开关
0,0001c9f874fa7f7a5d5d21eb4aca55cd.csv,0.0,75.58,60.766058,61.27,44.75,5.026009,30.83,1.0,4989.44,...,9.86,2.706142,8.24,97.3,89.095839,88.9,73.31,4.531313,23.99,1.0
1,0001cd6beea6ec459f2171f58f029217.csv,0.0,76.63,69.228435,69.31,47.75,4.665739,28.88,0.0,4429.11,...,0.12,1.973907,6.92,80.47,69.811739,70.19,60.57,4.027943,19.9,1.0
2,000247cf38d9c373e79a43121fd6b3f6.csv,0.0,83.6,63.588103,64.8,37.26,7.429722,46.34,0.0,5328.43,...,6.28,2.847176,11.11,72.61,62.118707,61.855,53.3,4.893397,19.31,1.0


In [6]:
testDF.head(3)

Unnamed: 0,file_name,低压开关,分配压力_max,分配压力_mean,分配压力_median,分配压力_min,分配压力_std,分配压力range,反泵,发动机转速_max,...,流量档位_min,流量档位_std,流量档位range,液压油温_max,液压油温_mean,液压油温_median,液压油温_min,液压油温_std,液压油温range,高压开关
0,0000c0df1049027f16115f76a35a9859.csv,0.0,84.97,67.377937,66.98,51.72,8.062021,33.25,0.0,4664.38,...,2.12,1.621375,9.34,79.53,69.669524,69.375,60.77,4.93009,18.76,1.0
1,0002edc18cd1df5fae01acd823596185.csv,0.0,70.85,62.093643,64.2,3.31,9.69734,67.54,1.0,5297.24,...,5.38,6.884416,24.1,73.73,59.058682,59.41,46.07,6.048316,27.66,1.0
2,00041c3a10b84c87e236c65ce5fce6da.csv,0.0,68.4,58.062158,57.61,43.65,4.048956,24.75,1.0,5051.28,...,0.34,4.232711,24.83,123.93,112.386619,113.16,99.88,5.455524,24.05,1.0


In [7]:
trainLabel.head(3)

Unnamed: 0,sample_file_name,label
0,0001c9f874fa7f7a5d5d21eb4aca55cd.csv,1
1,0001cd6beea6ec459f2171f58f029217.csv,1
2,000247cf38d9c373e79a43121fd6b3f6.csv,1


In [5]:
len(trainDF) == len(trainLabel)

True

In [6]:
trainLable_columns = ['file_name', 'label']
trainLabel.columns = trainLable_columns
trainLabel.head(3)

Unnamed: 0,file_name,label
0,0001c9f874fa7f7a5d5d21eb4aca55cd.csv,1
1,0001cd6beea6ec459f2171f58f029217.csv,1
2,000247cf38d9c373e79a43121fd6b3f6.csv,1


### 将训练集特征与标签相融合：为了保证最后参与训练的特征都具有对应的标签，采用left join，同样，测试集也是如此。

In [7]:
train_label_df = pd.merge(trainLabel, trainDF, on='file_name', how='left')
features_x = train_label_df.drop('label', 1).drop('file_name', 1).get_values()
features_y = train_label_df['label'].get_values()

### 构造测试集特征，并保留预测的index

In [8]:
predID = testDF['file_name']
test = testDF.drop('file_name', 1).get_values()

In [9]:
columns = trainDF.columns

### 构造模型，使用5折交叉验证

In [10]:
# N  代表交叉验证个数
N = 5
skf = StratifiedKFold(n_splits=N, shuffle=False, random_state=17)

# 寻找相对最佳的分割阈值， 不同的阈值放入不同的list
xx_cv = []
xx_f1score = []
xx_f1score27 = []
xx_f1score35 = []
xx_f1score38 = []

# 预测结果
xx_pred = []

sort_feature_list = []
for train_in, test_in in skf.split(features_x, features_y):
    # 构造训练集和验证集
    X_train, X_test, y_train, y_test = features_x[train_in], features_x[test_in], features_y[train_in], features_y[test_in]
    
    # 创建lightGBM 输入数据， 以及验证集
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    
    
    # lgb 输入参数
    params = {
        # step1: learning_rate/boosting_type/num_boost_round
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss'},
        'learning_rate': 0.01,
        
        # step2:max_depth/num_leaves
        'max_depth': 20,
        'num_leaves': 128,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'verbose': 0,
        'lambda_l2':0.05,
        'lambda_l1':0.05
    }
    params['is_unbalance']='false'
    params['max_bin'] = 100
    
    # step3:min_data_in_leaf:过大会导致欠拟合
    params['min_data_in_leaf'] = 80
    
    print('I begin to train.....')
    
    
    # 开始训练模型
    gbm = lgb.train(params, 
                    lgb_train, 
                    num_boost_round=10000, 
                    valid_sets=lgb_eval, 
                    verbose_eval=500, 
                    early_stopping_rounds=100)
    
    sort_feature = gbm.feature_importance().argsort()
    sort_feature_list.append(columns[sort_feature])
    print('I begin to predict.....')
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    xx_cv.append(roc_auc_score(y_test, y_pred))
    
    
    # 预测测试集， 并将验证集的分数存入list
    xx_pred.append(gbm.predict(test, num_iteration=gbm.best_iteration))
    xx_f1score.append(f1_score(y_test, pd.Series(y_pred).map(lambda x: 1 if x>0.3 else 0), average='binary'))
    xx_f1score27.append(f1_score(y_test, pd.Series(y_pred).map(lambda x: 1 if x>0.27 else 0), average='binary'))
    xx_f1score35.append(f1_score(y_test, pd.Series(y_pred).map(lambda x: 1 if x>0.35 else 0), average='binary'))
    xx_f1score38.append(f1_score(y_test, pd.Series(y_pred).map(lambda x: 1 if x>0.38 else 0), average='binary'))

I begin to train.....
Training until validation scores don't improve for 100 rounds.
[500]	valid_0's binary_logloss: 0.61468
[1000]	valid_0's binary_logloss: 0.606308
[1500]	valid_0's binary_logloss: 0.601473
[2000]	valid_0's binary_logloss: 0.598494
[2500]	valid_0's binary_logloss: 0.596532
[3000]	valid_0's binary_logloss: 0.595474
Early stopping, best iteration is:
[3332]	valid_0's binary_logloss: 0.594734
I begin to predict.....
I begin to train.....
Training until validation scores don't improve for 100 rounds.
[500]	valid_0's binary_logloss: 0.612611
[1000]	valid_0's binary_logloss: 0.603777
[1500]	valid_0's binary_logloss: 0.598198
[2000]	valid_0's binary_logloss: 0.595439
[2500]	valid_0's binary_logloss: 0.592852
[3000]	valid_0's binary_logloss: 0.591992
[3500]	valid_0's binary_logloss: 0.591006
Early stopping, best iteration is:
[3433]	valid_0's binary_logloss: 0.590939
I begin to predict.....
I begin to train.....
Training until validation scores don't improve for 100 rounds.


In [11]:
sort_feature_list[0][:20]

Index(['搅拌超压信号diya', '排量电流range', '样本量', '液压油温range', '分配压力range', 'file_name',
       '油泵转速_std', '液压油温_max', '发动机转速_std', '油泵转速_max', '流量档位_std', '排量电流_std',
       '液压油温_mean', '发动机转速_max', '泵送压力range', '泵送压力_std', '油泵转速_min',
       '泵送压力_max', '流量档位_median', '油泵转速_mean'],
      dtype='object')

In [12]:
print('xx_cv:', np.mean(xx_cv))
print('xx_f1score:', np.mean(xx_f1score))
print('xx_f1score27', np.mean(xx_f1score27))
print('xx_f1score35', np.mean(xx_f1score35))
print('xx_f1score38', np.mean(xx_f1score38))

xx_cv: 0.7417468499320157
xx_f1score: 0.6900993944117266
xx_f1score27 0.6866299965939867
xx_f1score35 0.6929768712160405
xx_f1score38 0.6923420732383911


xx_cv: 0.7417468499320157
xx_f1score: 0.6900993944117266
xx_f1score27 0.6866299965939867
xx_f1score35 0.6929768712160405
xx_f1score38 0.6923420732383911

### 根据上面的预测结果选择最优阈值：0.35

In [16]:
s = 0
for i in xx_pred:
    s += pd.Series(i).map(lambda x:1 if x > 0.35 else 0)
s  = s.map(lambda x: 1 if x>3 else 0)

In [17]:
res = pd.DataFrame()
res['sample_file_name'] = predID
res['label'] = s

In [18]:
res.label.value_counts()

1    31913
0    20337
Name: label, dtype: int64

In [19]:
res.to_csv('submit_2.csv', header=True, index=False)