In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV1.csv')

In [3]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [4]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'max_bin': 10,
    'is_unbalance': True,
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 84,
    'learning_rate': 0.03,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [5]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 本地CV

In [6]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

  'precision', 'predicted', average, warn_for)


[5]	cv_agg's res: 0.551332 + 0.00673857
[10]	cv_agg's res: 0.559037 + 0.00541301
[15]	cv_agg's res: 0.561591 + 0.00506333
[20]	cv_agg's res: 0.752794 + 0.00212423
[25]	cv_agg's res: 0.819963 + 0.00953812
[30]	cv_agg's res: 0.837705 + 0.00676748
[35]	cv_agg's res: 0.844389 + 0.00738321
[40]	cv_agg's res: 0.850464 + 0.00901257
[45]	cv_agg's res: 0.852194 + 0.00643418
[50]	cv_agg's res: 0.854779 + 0.00806238
[55]	cv_agg's res: 0.857471 + 0.00956261
[60]	cv_agg's res: 0.85716 + 0.0104975
[65]	cv_agg's res: 0.859485 + 0.0105673
[70]	cv_agg's res: 0.860133 + 0.0100083
[75]	cv_agg's res: 0.861911 + 0.00893253
[80]	cv_agg's res: 0.86103 + 0.00971641
[85]	cv_agg's res: 0.862787 + 0.00961695
[90]	cv_agg's res: 0.861789 + 0.0095992
[95]	cv_agg's res: 0.8621 + 0.00870883
[100]	cv_agg's res: 0.863174 + 0.00841004
[105]	cv_agg's res: 0.862987 + 0.00796397
[110]	cv_agg's res: 0.863271 + 0.0086225
[115]	cv_agg's res: 0.864115 + 0.00875237
[120]	cv_agg's res: 0.865192 + 0.00898565
[125]	cv_agg's res: 0

{'res-mean': [0.5305997092806346,
  0.5419029499643517,
  0.5462342702544071,
  0.5483928488677011,
  0.5513323156548969,
  0.5526781587035695,
  0.5549500812750381,
  0.5564770719779449,
  0.5566958892645903,
  0.5590366801583586,
  0.5590878811117382,
  0.5595583013177873,
  0.5606315936226675,
  0.5606530807220179,
  0.5615908368597738,
  0.5675258301031393,
  0.6497928871553894,
  0.687416016186052,
  0.7275564666428802,
  0.7527936863447691,
  0.7764496951377854,
  0.7998361939446496,
  0.8083137755573176,
  0.817308241301555,
  0.8199634846767877,
  0.827268370597649,
  0.8293666472511179,
  0.832571286976236,
  0.8353579246441809,
  0.8377052555534487,
  0.8382619943473081,
  0.8404214522347347,
  0.8423387526113114,
  0.8442261477339018,
  0.8443894393496008,
  0.8459432331543694,
  0.8466541616382776,
  0.8484848896870535,
  0.8492405306183457,
  0.85046402527936,
  0.85051132479604,
  0.8500826552795638,
  0.8516380765341264,
  0.8517843129796581,
  0.8521935281582812,
  0.85

## 训练

In [7]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=350,valid_sets=[dtrain])

  'precision', 'predicted', average, warn_for)


[5]	training's res: 0.588187
[10]	training's res: 0.59078
[15]	training's res: 0.592538
[20]	training's res: 0.916719
[25]	training's res: 0.951852
[30]	training's res: 0.958047
[35]	training's res: 0.959293
[40]	training's res: 0.95792
[45]	training's res: 0.957101
[50]	training's res: 0.956152
[55]	training's res: 0.957622
[60]	training's res: 0.959031
[65]	training's res: 0.960222
[70]	training's res: 0.961952
[75]	training's res: 0.963142
[80]	training's res: 0.9647
[85]	training's res: 0.966586
[90]	training's res: 0.968603
[95]	training's res: 0.970016
[100]	training's res: 0.972366
[105]	training's res: 0.974333
[110]	training's res: 0.9775
[115]	training's res: 0.978506
[120]	training's res: 0.980921
[125]	training's res: 0.98338
[130]	training's res: 0.984849
[135]	training's res: 0.987601
[140]	training's res: 0.9889
[145]	training's res: 0.990856
[150]	training's res: 0.991939
[155]	training's res: 0.992583
[160]	training's res: 0.99517
[165]	training's res: 0.996913
[170]	t

### 预测

In [8]:
pred=model.predict(test.drop(['uid'],axis=1))

In [9]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [10]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [11]:
res.to_csv('../result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])