In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV1.csv')

In [3]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [4]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'max_bin': 10,
    'is_unbalance': True,
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 84,
    'learning_rate': 0.06,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [5]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 本地CV

In [6]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

  'precision', 'predicted', average, warn_for)


[5]	cv_agg's res: 0.532553 + 0.00237825
[10]	cv_agg's res: 0.687682 + 0.00669678
[15]	cv_agg's res: 0.767398 + 0.0120484
[20]	cv_agg's res: 0.788201 + 0.0117589
[25]	cv_agg's res: 0.793356 + 0.0117568
[30]	cv_agg's res: 0.7968 + 0.0105036
[35]	cv_agg's res: 0.797758 + 0.0109273
[40]	cv_agg's res: 0.797752 + 0.010425
[45]	cv_agg's res: 0.800466 + 0.00976344
[50]	cv_agg's res: 0.8004 + 0.0108276
[55]	cv_agg's res: 0.800468 + 0.0112744
[60]	cv_agg's res: 0.801999 + 0.0100814
[65]	cv_agg's res: 0.802938 + 0.00945136
[70]	cv_agg's res: 0.801146 + 0.0106368
[75]	cv_agg's res: 0.798958 + 0.0112715
[80]	cv_agg's res: 0.796775 + 0.013597
[85]	cv_agg's res: 0.798424 + 0.0130415
[90]	cv_agg's res: 0.796063 + 0.0141493
[95]	cv_agg's res: 0.796732 + 0.0142432
[100]	cv_agg's res: 0.799454 + 0.0157992
[105]	cv_agg's res: 0.79785 + 0.01656
[110]	cv_agg's res: 0.795762 + 0.0167144
[115]	cv_agg's res: 0.795839 + 0.0143263
[120]	cv_agg's res: 0.796363 + 0.0147508
[125]	cv_agg's res: 0.79361 + 0.0152039
[

{'res-mean': [0.4879859081615276,
  0.5200680230833247,
  0.5273777807648243,
  0.5310211422204276,
  0.5325527750436186,
  0.5337715484171093,
  0.5352484979380452,
  0.5396498492327391,
  0.6310508095514048,
  0.687682163384725,
  0.7190010749493871,
  0.7366123253656905,
  0.7489407804142031,
  0.7617573940109955,
  0.767397545418051,
  0.7717924334042552,
  0.7767698171981007,
  0.7811038268662668,
  0.7846872870003483,
  0.7882010981928277,
  0.7892917896209101,
  0.7885907868678146,
  0.7914140420540807,
  0.7922654010814609,
  0.7933564181870111,
  0.7943586227491992,
  0.7951400424747845,
  0.7947332889761031,
  0.7954304166447194,
  0.796799564319823,
  0.7976297401852285,
  0.7978405124522151,
  0.7963213694863801,
  0.7981668960491911,
  0.7977579268146756,
  0.799075460435683,
  0.798261433182435,
  0.798105528169593,
  0.7998186222989463,
  0.7977521979525592,
  0.7981225248527601,
  0.8002999635276685,
  0.7989055367008518,
  0.7998974302995384,
  0.8004662675509743,
  0.

## 训练

In [7]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

  'precision', 'predicted', average, warn_for)


[5]	training's res: 0.581648
[10]	training's res: 0.82683
[15]	training's res: 0.927991
[20]	training's res: 0.928469
[25]	training's res: 0.929449
[30]	training's res: 0.930171
[35]	training's res: 0.933117
[40]	training's res: 0.935895
[45]	training's res: 0.938802
[50]	training's res: 0.944034
[55]	training's res: 0.94809
[60]	training's res: 0.954298
[65]	training's res: 0.957591
[70]	training's res: 0.962431
[75]	training's res: 0.965601
[80]	training's res: 0.969498
[85]	training's res: 0.972087
[90]	training's res: 0.976637
[95]	training's res: 0.979243
[100]	training's res: 0.98231
[105]	training's res: 0.985808
[110]	training's res: 0.987714
[115]	training's res: 0.990027
[120]	training's res: 0.991728
[125]	training's res: 0.992584
[130]	training's res: 0.993872
[135]	training's res: 0.996039
[140]	training's res: 0.996912
[145]	training's res: 0.996913
[150]	training's res: 0.99779
[155]	training's res: 0.99845
[160]	training's res: 0.998892
[165]	training's res: 0.998892
[1

### 预测

In [8]:
pred=model.predict(test.drop(['uid'],axis=1))

In [9]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [10]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [11]:
res.to_csv('../result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])