In [148]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [228]:
def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

def lgb_f1_score_sk(y_hat, y_true):
    y_true = np.round(y_true)
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

In [149]:
pd.set_option('display.expand_frame_repr', False)

In [150]:
train_data = pd.read_table('../data/oppo_round1_train_20180929.txt', 
        names= ['prefix','query_prediction','title','tag','label'], header= None, encoding='utf-8').astype(str)
val_data = pd.read_table('../data/oppo_round1_vali_20180929.txt', 
        names = ['prefix','query_prediction','title','tag','label'], header = None, encoding='utf-8').astype(str)
test_data = pd.read_table('../data/oppo_round1_test_A_20180929.txt',
        names = ['prefix','query_prediction','title','tag'],header = None, encoding='utf-8').astype(str)

  interactivity=interactivity, compiler=compiler, result=result)


In [151]:
# train_data.describe(include='all')
# print(train_data.dtypes)
# print()
# # Focus first on null values
# print(val_data.isna().sum())
# print(val_data.head())

In [152]:
train_data = train_data[train_data['label'] != '音乐' ]

In [153]:
test_data['label'] = -1

In [154]:
# train_data = pd.concat([train_data,val_data.copy()])
train_data['label'] = train_data['label'].apply(lambda x: int(x))
val_data['label'] = val_data['label'].apply(lambda x: int(x))
test_data['label'] = test_data['label'].apply(lambda x: int(x))

In [155]:
items = ['prefix', 'title', 'tag']

## 以下部分用于生成统计特征

共21组，分别为prefix，title，tag的统计特征或是他们任意组合的统计特征。
统计特征为：CTR，CTC，Count

## groupby(items，as_index)[‘target’].agg({name:func,...}) 的用法解释

以items进行分组，然后对每一组（items相同）的target对象进行多个函数操作。
func代表操作的名称，name代表操作后保存的列的名称

In [156]:
temp = train_data.groupby(items, as_index=False)['label'].agg({'_'.join(items)+'_click': 'sum','_'.join(items)+'_count':'count'})
temp['_'.join(items)+'_ctr'] = temp['_'.join(items)+'_click']/(temp['_'.join(items)+'_count'])
train_data = pd.merge(train_data, temp, on=items, how='left')
val_data = pd.merge(val_data, temp, on=items, how='left')
test_data = pd.merge(test_data, temp, on=items, how='left')

for item in items:
    temp = train_data.groupby(item, as_index = False)['label'].agg({item+'_click':'sum', item+'_count':'count'})
    temp[item+'_ctr'] = temp[item+'_click']/(temp[item+'_count'])
    train_data = pd.merge(train_data, temp, on=item, how='left')
    val_data = pd.merge(val_data, temp, on=item, how='left')
    test_data = pd.merge(test_data, temp, on=item, how='left')
    
for i in range(len(items)):
    for j in range(i+1, len(items)):
        item_g = [items[i], items[j]]
        temp = train_data.groupby(item_g, as_index=False)['label'].agg({'_'.join(item_g)+'_click': 'sum','_'.join(item_g)+'_count':'count'})
        temp['_'.join(item_g)+'_ctr'] = temp['_'.join(item_g)+'_click']/(temp['_'.join(item_g)+'_count'])
        train_data = pd.merge(train_data, temp, on=item_g, how='left')
        val_data = pd.merge(val_data, temp, on=item_g, how='left')
        test_data = pd.merge(test_data, temp, on=item_g, how='left')

In [157]:
# train_data_.head()
# train_data_.describe()
# train_data_.dtypes
# train_data_.isna().sum()

In [158]:
train_data_ = train_data.drop(['prefix', 'query_prediction', 'title', 'tag'], axis = 1)
val_data_ = val_data.drop(['prefix', 'query_prediction', 'title', 'tag'], axis = 1)
test_data_ = test_data.drop(['prefix', 'query_prediction', 'title', 'tag'], axis = 1)

In [165]:
print('train beginning')

X = np.array(train_data_.drop(['label'], axis = 1))
y = np.array(train_data_['label'])
X_test_ = np.array(test_data_.drop(['label'], axis = 1))
X_vali_label_ = np.array(val_data_.loc[:, 'label'])
X_vali_ = np.array(val_data_.drop(['label'], axis = 1))
print('================================')
print(X.shape)
print(y.shape)
print(X_vali_label_.shape)
print(X_vali_.shape)
print('================================')

train beginning
(1999998, 21)
(1999998,)
(50000,)
(50000, 21)


In [None]:
# print('================================')
# print(X_vali_label_.shape)
# print(X_vali_.shape)
# print('================================')
# skf = StratifiedKFold(n_splits=N, random_state=42, shuffle=True)
# k_fold = skf.split(X_vali_, X_vali_label_)
# k_fold = list(k_fold)

In [166]:
xx_logloss = []
xx_submit = []
N = 5
skf = StratifiedKFold(n_splits=N, random_state=42, shuffle=True)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 32,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

array([0, 0, 1, ..., 0, 1, 1], dtype=int64)

In [225]:
xx_logloss = []
xx_submit = []
for k, (train_vali, test_vali) in enumerate(skf.split(X_vali_, X_vali_label_)):
    print('train _K_ flod', k)
    X_train_combine = np.vstack([X, X_vali_[train_vali]])
    Y_train_combine = np.hstack([y, X_vali_label_[train_vali]])
    
    lgb_train = lgb.Dataset(X_train_combine, Y_train_combine)
    lgb_eval = lgb.Dataset(X_vali_[test_vali], X_vali_label_[test_vali], reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=5000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=50,
                    verbose_eval=50,
                    feval=lgb_f1_score
                    )
    
    print(f1_score(X_vali_label_[test_vali], 
                   np.where(gbm.predict(X_vali_[test_vali], num_iteration=gbm.best_iteration)>0.5, 1,0)))
    xx_logloss.append(gbm.best_score['valid_0']['f1'])
    xx_submit.append(gbm.predict(X_vali_, num_iteration=gbm.best_iteration))
print('Eventually score:', np.mean(xx_logloss))

train _K_ flod 0


  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.436646	valid_0's f1: 0.708729
Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.436639	valid_0's f1: 0.70798
0.7079803834150691
train _K_ flod 1
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.426766	valid_0's f1: 0.707993
[100]	valid_0's binary_logloss: 0.436949	valid_0's f1: 0.71263
Early stopping, best iteration is:
[53]	valid_0's binary_logloss: 0.426544	valid_0's f1: 0.708962
0.7089618456078084
train _K_ flod 2
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.434612	valid_0's f1: 0.713247
Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.434602	valid_0's f1: 0.713247
0.713247292686545
train _K_ flod 3
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.424211	valid_0's f1: 0.714094
[100]	valid_0's binary_logloss: 0.437

In [229]:
LGBM_classify = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', num_leaves=32, 
                                   learning_rate=0.05, subsample_freq=5, n_estimators=5000, silent=False)
# LGBM_classify.print_evaluation(period=50, show_stdv=True)
for k, (train_vali, test_vali) in enumerate(skf.split(X_vali_, X_vali_label_)):
    print('train _K_ flod', k)
    X_train_combine = np.vstack([X, X_vali_[train_vali]])
    Y_train_combine = np.hstack([y, X_vali_label_[train_vali]])
    
    LGBM_classify.fit(X_train_combine, Y_train_combine, 
                      eval_set=(X_vali_[test_vali], X_vali_label_[test_vali]),
                      early_stopping_rounds=50, eval_sample_weight=None,eval_metric=lgb_f1_score_sk)
#     gbm = lgb.train(params,
#                     lgb_train,
#                     num_boost_round=5000,
#                     valid_sets=lgb_eval,
#                     early_stopping_rounds=50,
#                     verbose_eval=50,
#                     )
    
    print(f1_score(X_vali_label_[test_vali],LGBM_classify.predict(X_vali_[test_vali], num_iterationgbm.best_iteration_)))
    xx_logloss.append(gbm.best_score['valid_0']['binary_logloss'])
    xx_submit.append(gbm.predict(X_vali_, num_iteration=gbm.best_iteration))

train _K_ flod 0


  'recall', 'true', average, warn_for)


[1]	valid_0's binary_logloss: 0.639613	valid_0's f1: 0
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's binary_logloss: 0.62148	valid_0's f1: 0
[3]	valid_0's binary_logloss: 0.605142	valid_0's f1: 0
[4]	valid_0's binary_logloss: 0.590486	valid_0's f1: 0
[5]	valid_0's binary_logloss: 0.57711	valid_0's f1: 0.377246
[6]	valid_0's binary_logloss: 0.565153	valid_0's f1: 0.485793
[7]	valid_0's binary_logloss: 0.554116	valid_0's f1: 0.532426
[8]	valid_0's binary_logloss: 0.544132	valid_0's f1: 0.588955
[9]	valid_0's binary_logloss: 0.534946	valid_0's f1: 0.611674
[10]	valid_0's binary_logloss: 0.526499	valid_0's f1: 0.623623
[11]	valid_0's binary_logloss: 0.518698	valid_0's f1: 0.627947
[12]	valid_0's binary_logloss: 0.511623	valid_0's f1: 0.668584
[13]	valid_0's binary_logloss: 0.505131	valid_0's f1: 0.671855
[14]	valid_0's binary_logloss: 0.499245	valid_0's f1: 0.679584
[15]	valid_0's binary_logloss: 0.493894	valid_0's f1: 0.687296
[16]	valid_0's binary_logloss: 0

NameError: name 'num_iterationgbm' is not defined

In [201]:
LGBM_classify._best_score

defaultdict(dict,
            {'valid_0': {'binary_logloss': 0.43290392776712117,
              'f1': 0.7028245192307693}})

In [196]:
# lgb_train = lgb.Dataset(X, y)
# lgb_eval = lgb.Dataset(X_vali_, X_vali_label_, reference=lgb_train)

def lgb_f1_score(y_hat, y_true):
    y_true = np.round(y_true)
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

LGBM_classify = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', num_leaves=32, 
                                   learning_rate=0.05, subsample_freq=5)
LGBM_classify.fit(X, y, eval_metric=lgb_f1_score, eval_set=(X_vali_, X_vali_label_), early_stopping_rounds=50, 
                  eval_sample_weight=None)
# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'metric': 'binary_logloss',
#     'num_leaves': 32,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.8,
#     'bagging_freq': 5,
#     'verbose': 1


# gbm = lgb.train(params,
#                 lgb_train,
#                 num_boost_round=5000,
#                 valid_sets=lgb_eval,
#                 early_stopping_rounds=50,
#                 verbose_eval=50,
#                 )
print(LGBM_classify.best_score_)
print(f1_score(X_vali_label_, LGBM_classify.predict(X_vali_)))

  'recall', 'true', average, warn_for)


[1]	valid_0's binary_logloss: 0.640763	valid_0's f1: 0
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's binary_logloss: 0.624082	valid_0's f1: 0
[3]	valid_0's binary_logloss: 0.60945	valid_0's f1: 0
[4]	valid_0's binary_logloss: 0.596602	valid_0's f1: 0
[5]	valid_0's binary_logloss: 0.585298	valid_0's f1: 0.364564
[6]	valid_0's binary_logloss: 0.575374	valid_0's f1: 0.488913
[7]	valid_0's binary_logloss: 0.566668	valid_0's f1: 0.538586
[8]	valid_0's binary_logloss: 0.559053	valid_0's f1: 0.565674
[9]	valid_0's binary_logloss: 0.552421	valid_0's f1: 0.592228
[10]	valid_0's binary_logloss: 0.546672	valid_0's f1: 0.603862
[11]	valid_0's binary_logloss: 0.541724	valid_0's f1: 0.608896
[12]	valid_0's binary_logloss: 0.537503	valid_0's f1: 0.629001
[13]	valid_0's binary_logloss: 0.533943	valid_0's f1: 0.632613
[14]	valid_0's binary_logloss: 0.530986	valid_0's f1: 0.634759
[15]	valid_0's binary_logloss: 0.528588	valid_0's f1: 0.640441
[16]	valid_0's binary_logloss: 

  if diff:


In [136]:
print('train_logloss:', np.mean(xx_logloss))
s = 0
for i in xx_submit:
    s = s + i

train_logloss: 0.3204018945711848


In [137]:
val_data_['pred_label'] = list(s / N)
val_data_['pred_label'] = val_data_['pred_label'].apply(lambda x: round(x))


In [138]:
print(f1_score(X_vali_label_, val_data_['pred_label']))

0.8269731376350041


In [None]:
print('test_logloss:', np.mean(test_data_.label))
test_data_['label'].to_csv('./submit/result.csv',index = False)