In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV1.csv')

In [3]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [4]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 64,
    'learning_rate': 0.08,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [5]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 本地CV

In [6]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

  'precision', 'predicted', average, warn_for)


[5]	cv_agg's res: 0.527603 + 0.00828904
[10]	cv_agg's res: 0.638823 + 0.0147401
[15]	cv_agg's res: 0.697774 + 0.014801
[20]	cv_agg's res: 0.725381 + 0.0151415
[25]	cv_agg's res: 0.737411 + 0.0157205
[30]	cv_agg's res: 0.748324 + 0.0199827
[35]	cv_agg's res: 0.75122 + 0.0193976
[40]	cv_agg's res: 0.759311 + 0.016527
[45]	cv_agg's res: 0.762123 + 0.0150032
[50]	cv_agg's res: 0.762837 + 0.0145641
[55]	cv_agg's res: 0.767502 + 0.015413
[60]	cv_agg's res: 0.767884 + 0.0130821
[65]	cv_agg's res: 0.768021 + 0.0121449
[70]	cv_agg's res: 0.769143 + 0.0110887
[75]	cv_agg's res: 0.769768 + 0.0159048
[80]	cv_agg's res: 0.768273 + 0.014145
[85]	cv_agg's res: 0.76629 + 0.0128077
[90]	cv_agg's res: 0.768633 + 0.0120366
[95]	cv_agg's res: 0.767113 + 0.00930545
[100]	cv_agg's res: 0.767846 + 0.00972059
[105]	cv_agg's res: 0.769352 + 0.00861048
[110]	cv_agg's res: 0.768729 + 0.00739414
[115]	cv_agg's res: 0.768213 + 0.0103566
[120]	cv_agg's res: 0.768673 + 0.00969217
[125]	cv_agg's res: 0.769847 + 0.009

{'res-mean': [0.48350757341261974,
  0.497528780431727,
  0.5116135283577229,
  0.5230659150733866,
  0.5276026887703352,
  0.5381722697209533,
  0.5848468177663483,
  0.6044063547999606,
  0.6240882451205835,
  0.6388227966470841,
  0.6528878794431081,
  0.6682033002548405,
  0.6814892908140923,
  0.6907989185869629,
  0.6977743890005498,
  0.7091696315610109,
  0.710020225273761,
  0.7158750150974994,
  0.719187640108156,
  0.7253812713175543,
  0.7292562837676017,
  0.7299101774975586,
  0.730313187583073,
  0.7332888604253621,
  0.7374111236360075,
  0.7404286351054798,
  0.7393157707569479,
  0.7436097004761534,
  0.7463919300256384,
  0.7483236227173894,
  0.746814053489533,
  0.7479020948800142,
  0.7496008575488569,
  0.7526024683019644,
  0.7512204141929105,
  0.753749230628141,
  0.7569822739268589,
  0.7583849981418517,
  0.7581284475528923,
  0.7593110168461047,
  0.7606477638940632,
  0.761795053117608,
  0.7604094338791402,
  0.7623709664950638,
  0.7621226623646726,
  0.

## 训练

In [7]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

  'precision', 'predicted', average, warn_for)


[5]	training's res: 0.579808
[10]	training's res: 0.784924
[15]	training's res: 0.89353
[20]	training's res: 0.928722
[25]	training's res: 0.947548
[30]	training's res: 0.962023
[35]	training's res: 0.973139
[40]	training's res: 0.981353
[45]	training's res: 0.986774
[50]	training's res: 0.991198
[55]	training's res: 0.994305
[60]	training's res: 0.995929
[65]	training's res: 0.997981
[70]	training's res: 0.999331
[75]	training's res: 0.999556
[80]	training's res: 1
[85]	training's res: 1
[90]	training's res: 1
[95]	training's res: 1
[100]	training's res: 1
[105]	training's res: 1
[110]	training's res: 1
[115]	training's res: 1
[120]	training's res: 1
[125]	training's res: 1
[130]	training's res: 1
[135]	training's res: 1
[140]	training's res: 1
[145]	training's res: 1
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	training's res: 1
[180]	training's res: 1
[185]	training's res: 1
[190]	training's res: 1
[195

### 预测

In [8]:
pred=model.predict(test.drop(['uid'],axis=1))

In [9]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [10]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [11]:
res.to_csv('../result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])