In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV1.csv')

In [3]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [4]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'max_bin': 10,
    'is_unbalance': True,
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 84,
    'learning_rate': 0.06,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [5]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 本地CV

In [6]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

  'precision', 'predicted', average, warn_for)


[5]	cv_agg's res: 0.527858 + 0.0070957
[10]	cv_agg's res: 0.592762 + 0.0156297
[15]	cv_agg's res: 0.663102 + 0.0132074
[20]	cv_agg's res: 0.702072 + 0.00563239
[25]	cv_agg's res: 0.720582 + 0.00903161
[30]	cv_agg's res: 0.731924 + 0.0120349
[35]	cv_agg's res: 0.741225 + 0.0141546
[40]	cv_agg's res: 0.747459 + 0.0155156
[45]	cv_agg's res: 0.750798 + 0.0181093
[50]	cv_agg's res: 0.756782 + 0.0152948
[55]	cv_agg's res: 0.758768 + 0.015663
[60]	cv_agg's res: 0.759955 + 0.0144285
[65]	cv_agg's res: 0.762334 + 0.0114004
[70]	cv_agg's res: 0.760244 + 0.0120964
[75]	cv_agg's res: 0.760728 + 0.0117681
[80]	cv_agg's res: 0.760108 + 0.00992901
[85]	cv_agg's res: 0.761873 + 0.0112665
[90]	cv_agg's res: 0.762215 + 0.0127438
[95]	cv_agg's res: 0.764207 + 0.0121565
[100]	cv_agg's res: 0.764182 + 0.0125383
[105]	cv_agg's res: 0.764122 + 0.0121811
[110]	cv_agg's res: 0.763728 + 0.0129817
[115]	cv_agg's res: 0.763046 + 0.0121997
[120]	cv_agg's res: 0.760426 + 0.0116988
[125]	cv_agg's res: 0.762741 + 0.0

{'res-mean': [0.4802140428913706,
  0.510005328843481,
  0.5183892626267278,
  0.5265223521527979,
  0.5278577695758951,
  0.5268601239279209,
  0.5292643466597262,
  0.5455877584304754,
  0.5741938743663758,
  0.5927622694254807,
  0.6113192758795011,
  0.6257390190584428,
  0.6369804218923624,
  0.649797988883318,
  0.6631020738521772,
  0.6785452440038094,
  0.6838615531180663,
  0.6914943234357632,
  0.7001160087833632,
  0.7020721601365526,
  0.7050013828442223,
  0.7088820634336193,
  0.7170198786184319,
  0.7194871802551216,
  0.7205815736518288,
  0.7238563085359812,
  0.7241541766339514,
  0.7275265018394902,
  0.7299159308875036,
  0.7319240621494744,
  0.7360999805620875,
  0.7352480527145967,
  0.7386734724789764,
  0.7408921603188428,
  0.7412245092260386,
  0.744854108819765,
  0.7449921186093876,
  0.7456693034594739,
  0.7459719024567502,
  0.7474594519559088,
  0.748189374674662,
  0.7481335458157465,
  0.750310334562282,
  0.7506336066474931,
  0.750797761409587,
  0.

## 训练

In [7]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

  'precision', 'predicted', average, warn_for)


[5]	training's res: 0.584994
[10]	training's res: 0.707274
[15]	training's res: 0.858745
[20]	training's res: 0.918634
[25]	training's res: 0.945272
[30]	training's res: 0.961615
[35]	training's res: 0.973687
[40]	training's res: 0.978826
[45]	training's res: 0.984356
[50]	training's res: 0.988368
[55]	training's res: 0.993122
[60]	training's res: 0.995692
[65]	training's res: 0.99753
[70]	training's res: 0.998656
[75]	training's res: 0.999331
[80]	training's res: 0.999555
[85]	training's res: 1
[90]	training's res: 1
[95]	training's res: 1
[100]	training's res: 1
[105]	training's res: 1
[110]	training's res: 1
[115]	training's res: 1
[120]	training's res: 1
[125]	training's res: 1
[130]	training's res: 1
[135]	training's res: 1
[140]	training's res: 1
[145]	training's res: 1
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	training's res: 1
[180]	training's res: 1
[185]	training's res: 1
[190]	training's res:

### 预测

In [8]:
pred=model.predict(test.drop(['uid'],axis=1))

In [9]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [10]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [11]:
res.to_csv('../result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])