In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV1.csv')

In [3]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [4]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 84,
    'learning_rate': 0.06,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [5]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 本地CV

In [6]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

  'precision', 'predicted', average, warn_for)


[5]	cv_agg's res: 0.529634 + 0.00653209
[10]	cv_agg's res: 0.642583 + 0.00514065
[15]	cv_agg's res: 0.702066 + 0.0122767
[20]	cv_agg's res: 0.728373 + 0.0197974
[25]	cv_agg's res: 0.744952 + 0.0155806
[30]	cv_agg's res: 0.75683 + 0.0172481
[35]	cv_agg's res: 0.761578 + 0.0169686
[40]	cv_agg's res: 0.760835 + 0.0174664
[45]	cv_agg's res: 0.763316 + 0.0178766
[50]	cv_agg's res: 0.767587 + 0.0187443
[55]	cv_agg's res: 0.769855 + 0.0164224
[60]	cv_agg's res: 0.771625 + 0.0169042
[65]	cv_agg's res: 0.769969 + 0.0122571
[70]	cv_agg's res: 0.769567 + 0.0129578
[75]	cv_agg's res: 0.771167 + 0.0140236
[80]	cv_agg's res: 0.769632 + 0.0119173
[85]	cv_agg's res: 0.769561 + 0.0130891
[90]	cv_agg's res: 0.772438 + 0.0150975
[95]	cv_agg's res: 0.770168 + 0.0132747
[100]	cv_agg's res: 0.772735 + 0.0138687
[105]	cv_agg's res: 0.770068 + 0.0133036
[110]	cv_agg's res: 0.769702 + 0.0136725
[115]	cv_agg's res: 0.77033 + 0.0122568
[120]	cv_agg's res: 0.771707 + 0.0132237
[125]	cv_agg's res: 0.772194 + 0.012

{'res-mean': [0.4847720030789191,
  0.5157911226165955,
  0.5224307366378161,
  0.5285257507810651,
  0.5296335200891972,
  0.5540041970605484,
  0.5839585794003117,
  0.6083260874736881,
  0.6235898470286084,
  0.642583475812745,
  0.6614482242281294,
  0.6762946238820721,
  0.6860214901621683,
  0.692312013429507,
  0.702065855453205,
  0.7091564000814965,
  0.7155888500333955,
  0.7223434081182197,
  0.7274112052082599,
  0.7283726227930122,
  0.7332023172977653,
  0.735960754793413,
  0.7373340089051105,
  0.7411461094629029,
  0.7449521659294254,
  0.7492025134522232,
  0.7522462055692717,
  0.7535739951997803,
  0.756261848926485,
  0.7568304895536865,
  0.7577409929049458,
  0.7583197940112102,
  0.7594878698452087,
  0.7604478925878712,
  0.7615777589562697,
  0.7626411730937392,
  0.7616461102817391,
  0.762894558937754,
  0.7617374908977542,
  0.7608345467533358,
  0.7642544669560869,
  0.7632578119520277,
  0.7642948763445322,
  0.7655585827182106,
  0.7633161448152546,
  0.

## 训练

In [7]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

  'precision', 'predicted', average, warn_for)


[5]	training's res: 0.580168
[10]	training's res: 0.796436
[15]	training's res: 0.895343
[20]	training's res: 0.929084
[25]	training's res: 0.949959
[30]	training's res: 0.963436
[35]	training's res: 0.973406
[40]	training's res: 0.981669
[45]	training's res: 0.987504
[50]	training's res: 0.991184
[55]	training's res: 0.994711
[60]	training's res: 0.997056
[65]	training's res: 0.998429
[70]	training's res: 0.999329
[75]	training's res: 0.999555
[80]	training's res: 1
[85]	training's res: 1
[90]	training's res: 1
[95]	training's res: 1
[100]	training's res: 1
[105]	training's res: 1
[110]	training's res: 1
[115]	training's res: 1
[120]	training's res: 1
[125]	training's res: 1
[130]	training's res: 1
[135]	training's res: 1
[140]	training's res: 1
[145]	training's res: 1
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	training's res: 1
[180]	training's res: 1
[185]	training's res: 1
[190]	training's res: 1
[19

### 预测

In [8]:
pred=model.predict(test.drop(['uid'],axis=1))

In [9]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [10]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [11]:
res.to_csv('../result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])