In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_featureV2.csv')
test = pd.read_csv('../data/test_featureV2.csv')

In [3]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [4]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 24,
    'num_leaves': 32,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [5]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 本地CV

In [6]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

  'precision', 'predicted', average, warn_for)


[5]	cv_agg's res: 0.525577 + 0.0039519
[10]	cv_agg's res: 0.531116 + 0.00540206
[15]	cv_agg's res: 0.635791 + 0.00773845
[20]	cv_agg's res: 0.689343 + 0.00171947
[25]	cv_agg's res: 0.720879 + 0.0102147
[30]	cv_agg's res: 0.734244 + 0.00511144
[35]	cv_agg's res: 0.744501 + 0.00999875
[40]	cv_agg's res: 0.74931 + 0.0103026
[45]	cv_agg's res: 0.752181 + 0.012369
[50]	cv_agg's res: 0.760762 + 0.0132479
[55]	cv_agg's res: 0.764217 + 0.0120439
[60]	cv_agg's res: 0.768407 + 0.0128001
[65]	cv_agg's res: 0.771194 + 0.0139532
[70]	cv_agg's res: 0.77316 + 0.0119519
[75]	cv_agg's res: 0.773164 + 0.0118396
[80]	cv_agg's res: 0.774791 + 0.0145869
[85]	cv_agg's res: 0.774439 + 0.0137582
[90]	cv_agg's res: 0.775086 + 0.012668
[95]	cv_agg's res: 0.774297 + 0.0138291
[100]	cv_agg's res: 0.775758 + 0.0145594
[105]	cv_agg's res: 0.774617 + 0.0139532
[110]	cv_agg's res: 0.775312 + 0.0137933
[115]	cv_agg's res: 0.77456 + 0.0140501
[120]	cv_agg's res: 0.774165 + 0.0135441
[125]	cv_agg's res: 0.775362 + 0.012

{'res-mean': [0.5103190988663623,
  0.5186897885492344,
  0.5220508915976998,
  0.5236196538857966,
  0.5255769038940972,
  0.527102694125598,
  0.5280781611312886,
  0.5284539329585364,
  0.5290912213676412,
  0.5311157432944077,
  0.5456817181682289,
  0.5798049523271248,
  0.5994863415988039,
  0.6177718427606168,
  0.6357911806446948,
  0.6447208529422305,
  0.6575458637678272,
  0.672529512901939,
  0.6822807119948616,
  0.6893428224926718,
  0.6939957761779875,
  0.7007410574097364,
  0.7089573063320763,
  0.7139480934955351,
  0.7208789626352261,
  0.7230451024791806,
  0.726277746237018,
  0.7290119486419053,
  0.7332009115357584,
  0.7342442579424744,
  0.7395807809900729,
  0.7381890154872509,
  0.7421019288223422,
  0.7439699921736803,
  0.7445011037885164,
  0.7466310271002948,
  0.7467836201049499,
  0.7475091245490108,
  0.7485884874808507,
  0.7493098379694697,
  0.7519108444470491,
  0.7502120700985184,
  0.7520216935301827,
  0.7525028573385653,
  0.7521810139009831,
 

## 训练

In [7]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.556255
[10]	training's res: 0.562612
[15]	training's res: 0.69127
[20]	training's res: 0.764924
[25]	training's res: 0.815168
[30]	training's res: 0.843963
[35]	training's res: 0.860799
[40]	training's res: 0.87675
[45]	training's res: 0.888256
[50]	training's res: 0.897138
[55]	training's res: 0.905546
[60]	training's res: 0.911003
[65]	training's res: 0.917587
[70]	training's res: 0.924769
[75]	training's res: 0.928785
[80]	training's res: 0.934608
[85]	training's res: 0.939462
[90]	training's res: 0.942975
[95]	training's res: 0.948748
[100]	training's res: 0.951558
[105]	training's res: 0.95617
[110]	training's res: 0.959732
[115]	training's res: 0.963586
[120]	training's res: 0.967326
[125]	training's res: 0.970824
[130]	training's res: 0.973479
[135]	training's res: 0.97495
[140]	training's res: 0.977615
[145]	training's res: 0.979665
[150]	training's res: 0.981889
[155]	training's res: 0.982797
[160]	training's res: 0.983193
[165]	training's res: 0.984725
[

### 预测

In [8]:
pred=model.predict(test.drop(['uid'],axis=1))

In [9]:
res =pd.DataFrame({'uid':test.uid,'label':pred})
res.head()

Unnamed: 0,label,uid
0,0.893715,u7000
1,0.442315,u7001
2,0.009293,u7002
3,0.001795,u7003
4,0.019196,u7004


In [10]:
res=res.sort_values(by='label',ascending=False)
# res.preds=res.preds.map(lambda x: 1 if x>=0.5 else 0)
res['label']=res['label'].map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [11]:
res.to_csv('../result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])