In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV1.csv')

In [3]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [4]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'max_bin': 10,
    'is_unbalance': True,
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 45,
    'num_leaves': 84,
    'learning_rate': 0.06,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [5]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 本地CV

In [6]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

  'precision', 'predicted', average, warn_for)


[5]	cv_agg's res: 0.532076 + 0.00656879
[10]	cv_agg's res: 0.66736 + 0.0249002
[15]	cv_agg's res: 0.776856 + 0.0159484
[20]	cv_agg's res: 0.793352 + 0.0129779
[25]	cv_agg's res: 0.798481 + 0.0134013
[30]	cv_agg's res: 0.801592 + 0.0132262
[35]	cv_agg's res: 0.804545 + 0.0108797
[40]	cv_agg's res: 0.803477 + 0.0113134
[45]	cv_agg's res: 0.805241 + 0.0113293
[50]	cv_agg's res: 0.806787 + 0.0111546
[55]	cv_agg's res: 0.806818 + 0.0085637
[60]	cv_agg's res: 0.80841 + 0.00806579
[65]	cv_agg's res: 0.808085 + 0.00873363
[70]	cv_agg's res: 0.80931 + 0.0108592
[75]	cv_agg's res: 0.810291 + 0.0095475
[80]	cv_agg's res: 0.807991 + 0.0134097
[85]	cv_agg's res: 0.808291 + 0.0126103
[90]	cv_agg's res: 0.809188 + 0.011212
[95]	cv_agg's res: 0.808374 + 0.0117471
[100]	cv_agg's res: 0.808346 + 0.0112915
[105]	cv_agg's res: 0.808351 + 0.011493
[110]	cv_agg's res: 0.808531 + 0.0116089
[115]	cv_agg's res: 0.806874 + 0.0125816
[120]	cv_agg's res: 0.808436 + 0.0118868
[125]	cv_agg's res: 0.807357 + 0.01244

{'res-mean': [0.4944317527096736,
  0.5265434686679641,
  0.5297530597293879,
  0.5328255242891513,
  0.532076194500288,
  0.534570075220021,
  0.5353874918198361,
  0.5623493322870697,
  0.6101280736504392,
  0.6673597835722799,
  0.7174758676845702,
  0.7462743098436642,
  0.7603074652910563,
  0.767662131500209,
  0.7768563421341822,
  0.7788540349076757,
  0.7827563073463848,
  0.7861669919314135,
  0.787304593820759,
  0.793352485303969,
  0.7952378524632667,
  0.797693481347263,
  0.7988883567556551,
  0.7998625387123447,
  0.7984813315392,
  0.799191029362559,
  0.8013087576963148,
  0.8029389151810089,
  0.802003227665239,
  0.801592046360606,
  0.8036599122884963,
  0.8035923240552388,
  0.803532176056863,
  0.8037593280421872,
  0.8045447698890013,
  0.8059723791414299,
  0.8042080162194317,
  0.8034948347362217,
  0.8044980720463162,
  0.8034773899326999,
  0.8034168346009191,
  0.8044710408574574,
  0.8052049951598329,
  0.8056137240665606,
  0.8052411813154295,
  0.8050801

## 训练

In [7]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

  'precision', 'predicted', average, warn_for)


[5]	training's res: 0.568088
[10]	training's res: 0.758842
[15]	training's res: 0.876726
[20]	training's res: 0.884498
[25]	training's res: 0.890475
[30]	training's res: 0.891861
[35]	training's res: 0.899045
[40]	training's res: 0.901831
[45]	training's res: 0.908916
[50]	training's res: 0.912328
[55]	training's res: 0.919076
[60]	training's res: 0.923947
[65]	training's res: 0.928734
[70]	training's res: 0.931679
[75]	training's res: 0.936524
[80]	training's res: 0.941042
[85]	training's res: 0.944484
[90]	training's res: 0.948526
[95]	training's res: 0.954637
[100]	training's res: 0.958813
[105]	training's res: 0.964097
[110]	training's res: 0.968026
[115]	training's res: 0.970594
[120]	training's res: 0.974912
[125]	training's res: 0.978498
[130]	training's res: 0.979919
[135]	training's res: 0.982142
[140]	training's res: 0.983777
[145]	training's res: 0.985433
[150]	training's res: 0.986886
[155]	training's res: 0.98814
[160]	training's res: 0.989399
[165]	training's res: 0.99045

### 预测

In [8]:
pred=model.predict(test.drop(['uid'],axis=1))

In [9]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [10]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [11]:
res.to_csv('../result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])