In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV1.csv')

In [3]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [4]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'max_bin': 10,
    'is_unbalance': True,
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 84,
    'learning_rate': 0.04,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [5]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 本地CV

In [6]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

  'precision', 'predicted', average, warn_for)


[5]	cv_agg's res: 0.550879 + 0.00681451
[10]	cv_agg's res: 0.554428 + 0.00608791
[15]	cv_agg's res: 0.752439 + 0.0140967
[20]	cv_agg's res: 0.814819 + 0.0137344
[25]	cv_agg's res: 0.828137 + 0.0114722
[30]	cv_agg's res: 0.835806 + 0.0133856
[35]	cv_agg's res: 0.840827 + 0.0155765
[40]	cv_agg's res: 0.843581 + 0.0169484
[45]	cv_agg's res: 0.845893 + 0.0152568
[50]	cv_agg's res: 0.847244 + 0.0154753
[55]	cv_agg's res: 0.849239 + 0.015634
[60]	cv_agg's res: 0.851325 + 0.0158735
[65]	cv_agg's res: 0.850906 + 0.0162103
[70]	cv_agg's res: 0.851626 + 0.0156691
[75]	cv_agg's res: 0.853119 + 0.0162766
[80]	cv_agg's res: 0.852591 + 0.0174087
[85]	cv_agg's res: 0.851825 + 0.0174281
[90]	cv_agg's res: 0.851999 + 0.0179103
[95]	cv_agg's res: 0.853913 + 0.0165198
[100]	cv_agg's res: 0.853947 + 0.0161616
[105]	cv_agg's res: 0.853139 + 0.0160521
[110]	cv_agg's res: 0.853323 + 0.0159994
[115]	cv_agg's res: 0.852636 + 0.0160243
[120]	cv_agg's res: 0.853156 + 0.0156392
[125]	cv_agg's res: 0.85432 + 0.015

{'res-mean': [0.522138848397152,
  0.5333231463739694,
  0.5415750843543141,
  0.5483221824266695,
  0.5508791331471844,
  0.5541695952813708,
  0.5545660532391664,
  0.5541213218716429,
  0.5544630313714863,
  0.5544277676801322,
  0.5550126084306829,
  0.5570248114786488,
  0.6613474135424022,
  0.7115280950448231,
  0.7524394072703809,
  0.7753063368925016,
  0.7912776426437235,
  0.8018809907339598,
  0.8106801082169125,
  0.8148193904718263,
  0.8201004763141583,
  0.823948235553177,
  0.8250982283622541,
  0.8273003848885351,
  0.8281368458908481,
  0.8295158948192348,
  0.8305583184406117,
  0.8328113138155935,
  0.8338177399002333,
  0.835806412333303,
  0.8375357807894916,
  0.838130682147819,
  0.8387305072566876,
  0.8410893250564108,
  0.8408273049089056,
  0.8416426599482286,
  0.8427532210590144,
  0.8432443306624539,
  0.8447408871425548,
  0.8435807747899026,
  0.845135728173991,
  0.8446653206708442,
  0.845348987294999,
  0.8465119551334771,
  0.8458925614124967,
  0.

## 训练

In [7]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=350,valid_sets=[dtrain])

  'precision', 'predicted', average, warn_for)


[5]	training's res: 0.585185
[10]	training's res: 0.587809
[15]	training's res: 0.895734
[20]	training's res: 0.943397
[25]	training's res: 0.951538
[30]	training's res: 0.952871
[35]	training's res: 0.953736
[40]	training's res: 0.954183
[45]	training's res: 0.955261
[50]	training's res: 0.957043
[55]	training's res: 0.957413
[60]	training's res: 0.960334
[65]	training's res: 0.962737
[70]	training's res: 0.964391
[75]	training's res: 0.965938
[80]	training's res: 0.96785
[85]	training's res: 0.970586
[90]	training's res: 0.973762
[95]	training's res: 0.976137
[100]	training's res: 0.979925
[105]	training's res: 0.983989
[110]	training's res: 0.985847
[115]	training's res: 0.987304
[120]	training's res: 0.990032
[125]	training's res: 0.992799
[130]	training's res: 0.994304
[135]	training's res: 0.995822
[140]	training's res: 0.996913
[145]	training's res: 0.99757
[150]	training's res: 0.998671
[155]	training's res: 0.999113
[160]	training's res: 0.999334
[165]	training's res: 0.999556

### 预测

In [8]:
pred=model.predict(test.drop(['uid'],axis=1))

In [9]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [10]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [11]:
res.to_csv('../result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])