In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [3]:
def lgb_f1_score(y_hat, data):
    """Custom F1 evaluation metric for the native ``lgb.train(..., feval=...)`` API.

    Args:
        y_hat: Model outputs for the evaluation set; they are rounded to the
            nearest integer before scoring.
        data: Object exposing ``get_label()``, which supplies the true labels.

    Returns:
        The tuple ``('f1', score, True)``: metric name, F1 value, and the
        "higher is better" flag.
    """
    # f1_score needs hard class labels, not probabilities.
    predicted_labels = np.round(y_hat)
    return 'f1', f1_score(data.get_label(), predicted_labels), True

def lgb_f1_score_sk(y_hat, y_true):
    """Custom F1 evaluation metric for ``LGBMClassifier.fit(..., eval_metric=...)``.

    LightGBM's scikit-learn wrapper calls a custom metric positionally as
    ``func(y_true, y_pred)``. Despite the historical parameter names, the
    FIRST argument therefore receives the true labels and the SECOND the
    predicted probabilities. The names are kept so existing callers do not
    break; the body maps them onto their real roles.

    Args:
        y_hat: First positional argument (the labels when called by LightGBM).
        y_true: Second positional argument (the predictions when called by
            LightGBM).

    Returns:
        The tuple ``('f1', score, True)``: metric name, F1 value, and the
        "higher is better" flag.
    """
    # Both inputs are rounded, exactly as before, so the function stays safe
    # for any caller; rounding is a no-op on 0/1 labels.
    labels = np.round(y_hat)
    predictions = np.round(y_true)  # f1_score needs hard labels, not probabilities
    # Pass labels first so scikit-learn sees the correct roles (and reports
    # undefined precision/recall against the right side).
    return 'f1', f1_score(labels, predictions), True

In [4]:
# Print wide DataFrames on a single line instead of wrapping the columns.
pd.options.display.expand_frame_repr = False

In [5]:
def _read_split(path, columns):
    """Load one tab-separated, header-less data split with every column as text.

    ``dtype=str`` stops pandas from inferring column types chunk by chunk.
    Without it, numeric-looking text is parsed as a number before being cast
    back to a string (e.g. "007" becomes "7"), the same value can end up with
    different spellings in different chunks, and a DtypeWarning is emitted.
    The trailing ``astype(str)`` is kept so missing values still become the
    literal string 'nan', as before.

    Args:
        path: Location of the text file.
        columns: Column names to assign, in file order.

    Returns:
        DataFrame whose columns all hold Python strings.
    """
    return pd.read_table(path, names=columns, header=None,
                         encoding='utf-8', dtype=str).astype(str)


train_data = _read_split('../data/oppo_round1_train_20180929.txt',
                         ['prefix', 'query_prediction', 'title', 'tag', 'label'])
val_data = _read_split('../data/oppo_round1_vali_20180929.txt',
                       ['prefix', 'query_prediction', 'title', 'tag', 'label'])
# The test split ships without the label column.
test_data = _read_split('../data/oppo_round1_test_A_20180929.txt',
                        ['prefix', 'query_prediction', 'title', 'tag'])

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# train_data.describe(include='all')
# print(train_data.dtypes)
# print()
# # Focus first on null values
# print(val_data.isna().sum())
# print(val_data.head())

In [7]:
# Keep only the rows whose label is not the literal string '音乐'.
# NOTE(review): presumably these are malformed rows where a text field ended up
# in the label column - confirm against the raw file.
train_data = train_data.loc[train_data['label'].ne('音乐')]

In [8]:
# Give the unlabeled test split a placeholder label so all three frames share the same columns.
test_data = test_data.assign(label=-1)

In [9]:
# train_data = pd.concat([train_data,val_data.copy()])
train_data['label'] = train_data['label'].apply(lambda x: int(x))
val_data['label'] = val_data['label'].apply(lambda x: int(x))
test_data['label'] = test_data['label'].apply(lambda x: int(x))

In [10]:
# Key columns whose combinations are used to group the click statistics.
items = [
    'prefix',
    'title',
    'tag',
]

## 以下部分用于生成统计特征

共7组、21个特征：分别为 prefix、title、tag 各自的统计特征，以及它们任意组合（两两组合与三者组合）的统计特征。
每组包含3个统计特征：CTR（点击率，对应 `_ctr` 列）、CTC（点击次数，对应 `_click` 列）、Count（出现次数，对应 `_count` 列）。

## `groupby(items, as_index)['target'].agg({name: func, ...})` 的用法解释

以items进行分组，然后对每一组（items相同）的target对象进行多个函数操作。
func代表操作的名称，name代表操作后保存的列的名称

In [11]:
def _click_stats(frame, keys):
    """Aggregate click statistics of ``frame['label']`` per unique ``keys`` combination.

    Args:
        frame: DataFrame holding the ``keys`` columns and a numeric ``label`` column.
        keys: List of column names to group by.

    Returns:
        DataFrame with one row per key combination, the key columns, and three
        statistics named after the keys joined by '_': ``<name>_click`` (sum of
        labels), ``<name>_count`` (number of rows) and ``<name>_ctr``
        (click / count).
    """
    name = '_'.join(keys)
    # A list of aggregations followed by an explicit rename works on every
    # pandas release. Renaming through a dict passed to SeriesGroupBy.agg
    # raises SpecificationError ("nested renamer is not supported") on
    # pandas >= 1.0.
    stats = (
        frame.groupby(keys)['label']
        .agg(['sum', 'count'])
        .rename(columns={'sum': name + '_click', 'count': name + '_count'})
        .reset_index()
    )
    stats[name + '_ctr'] = stats[name + '_click'] / stats[name + '_count']
    return stats


# Same order as before, so the resulting feature columns keep their positions:
# all keys together, then each key alone, then every pair of keys.
key_groups = (
    [list(items)]
    + [[item] for item in items]
    + [[items[i], items[j]] for i in range(len(items)) for j in range(i + 1, len(items))]
)

for keys in key_groups:
    # Statistics always come from the training labels; validation/test rows
    # whose key combination never occurs in training get NaN from the left merge.
    # NOTE(review): training rows receive statistics that include their own
    # label (target leakage) - consider out-of-fold statistics.
    temp = _click_stats(train_data, keys)
    train_data = pd.merge(train_data, temp, on=keys, how='left')
    val_data = pd.merge(val_data, temp, on=keys, how='left')
    test_data = pd.merge(test_data, temp, on=keys, how='left')

In [1]:
# train_data_.head()
# train_data_.describe()
# train_data_.dtypes
# train_data_.isna().sum()


NameError: name 'val_data' is not defined

In [13]:
# Drop the raw text columns from each split; `label` and the merged click statistics remain.
train_data_, val_data_, test_data_ = (
    frame.drop(columns=['prefix', 'query_prediction', 'title', 'tag'])
    for frame in (train_data, val_data, test_data)
)

In [14]:
print('train beginning')

# Label vectors for the two labeled splits.
y = np.array(train_data_['label'])
X_vali_label_ = np.array(val_data_['label'])
# Feature matrices: every column except `label`.
X, X_vali_, X_test_ = (
    np.array(frame.drop(columns=['label']))
    for frame in (train_data_, val_data_, test_data_)
)
print('================================')
for array in (X, y, X_vali_label_, X_vali_):
    print(array.shape)
print('================================')

train beginning
(1999998, 21)
(1999998,)
(50000,)
(50000, 21)


In [21]:
# Sanity check: show the shapes of the test and validation feature matrices.
print('================================')
for matrix in (X_test_, X_vali_):
    print(matrix.shape)
print('================================')
# skf = StratifiedKFold(n_splits=N, random_state=42, shuffle=True)
# k_fold = skf.split(X_vali_, X_vali_label_)
# k_fold = list(k_fold)

(50000, 21)
(50000, 21)


In [16]:
# Accumulators that the cross-validation loops fill in.
xx_logloss, xx_submit = [], []

# Number of stratified folds and the (seeded, shuffled) splitter.
N = 5
skf = StratifiedKFold(n_splits=N, shuffle=True, random_state=42)

# Parameters handed to the native `lgb.train` API.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    num_leaves=32,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=1,
)

In [17]:
# Per-fold validation F1 (stored under its historical name) and per-fold
# predictions for the test set.
xx_logloss = []
xx_submit = []
for k, (train_vali, test_vali) in enumerate(skf.split(X_vali_, X_vali_label_)):
    print('train _K_ flod', k)
    # Train on the full training set plus this fold's share of the validation
    # set; the remaining validation rows are held out for early stopping.
    X_train_combine = np.vstack([X, X_vali_[train_vali]])
    Y_train_combine = np.hstack([y, X_vali_label_[train_vali]])
    
    lgb_train = lgb.Dataset(X_train_combine, Y_train_combine)
    lgb_eval = lgb.Dataset(X_vali_[test_vali], X_vali_label_[test_vali], reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=5000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=50,
                    verbose_eval=50,
                    feval=lgb_f1_score
                    )
    
    # F1 on the held-out rows, thresholding the predicted probability at 0.5.
    print(f1_score(X_vali_label_[test_vali], 
                   np.where(gbm.predict(X_vali_[test_vali], num_iteration=gbm.best_iteration)>0.5, 1,0)))
    xx_logloss.append(gbm.best_score['valid_0']['f1'])
    # Submission predictions must come from the test features. Predicting on
    # X_vali_ here silently produced an array of the same length (both splits
    # have the same number of rows) that was not a test prediction at all.
    xx_submit.append(gbm.predict(X_test_, num_iteration=gbm.best_iteration))
print('\n\nEventually score:', np.mean(xx_logloss))

train _K_ flod 0


  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.436646	valid_0's f1: 0.708729
Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.436639	valid_0's f1: 0.70798
0.7079803834150691
train _K_ flod 1
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.426766	valid_0's f1: 0.707993
[100]	valid_0's binary_logloss: 0.436949	valid_0's f1: 0.71263
Early stopping, best iteration is:
[53]	valid_0's binary_logloss: 0.426544	valid_0's f1: 0.708962
0.7089618456078084
train _K_ flod 2
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.434612	valid_0's f1: 0.713247
Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.434602	valid_0's f1: 0.713247
0.713247292686545
train _K_ flod 3
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.424211	valid_0's f1: 0.714094
[100]	valid_0's binary_logloss: 0.437

# Sklearn API

能够比较方便、容易地使用 LightGBM 的 scikit-learn 风格接口（`LGBMClassifier`）进行调试。

In [23]:
# Per-fold validation F1 (stored under its historical name) and per-fold class
# probabilities for the test set.
xx_logloss = []
xx_submit = []
LGBM_classify = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', num_leaves=32, 
                                   learning_rate=0.05, subsample_freq=5, n_estimators=5000, silent=False)
# LGBM_classify.print_evaluation(period=50, show_stdv=True)
for k, (train_vali, test_vali) in enumerate(skf.split(X_vali_, X_vali_label_)):
    print('train _K_ flod', k)
    # Train on the full training set plus this fold's share of the validation
    # set; the remaining validation rows are held out for early stopping.
    X_train_combine = np.vstack([X, X_vali_[train_vali]])
    Y_train_combine = np.hstack([y, X_vali_label_[train_vali]])
    
    LGBM_classify.fit(X_train_combine, Y_train_combine, 
                      eval_set=(X_vali_[test_vali], X_vali_label_[test_vali]),
                      early_stopping_rounds=50, eval_sample_weight=None,eval_metric=lgb_f1_score_sk)
    # Use THIS classifier's best iteration everywhere. The previous code took
    # it from `gbm`, a Booster left over from another cell, so the test
    # predictions depended on hidden kernel state (or failed on a fresh kernel).
    best_iteration = LGBM_classify.best_iteration_
    print(f1_score(X_vali_label_[test_vali],
                   LGBM_classify.predict(X_vali_[test_vali], num_iteration=best_iteration)))
    # Public attribute instead of the private `_best_score`.
    xx_logloss.append(LGBM_classify.best_score_['valid_0']['f1'])
    # predict_proba yields one column per class, so each entry is 2-D.
    xx_submit.append(LGBM_classify.predict_proba(X_test_, num_iteration=best_iteration))
print('\n\nEventually score:', np.mean(xx_logloss))

train _K_ flod 0


  'recall', 'true', average, warn_for)


[1]	valid_0's binary_logloss: 0.639613	valid_0's f1: 0
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's binary_logloss: 0.62148	valid_0's f1: 0
[3]	valid_0's binary_logloss: 0.605142	valid_0's f1: 0
[4]	valid_0's binary_logloss: 0.590486	valid_0's f1: 0
[5]	valid_0's binary_logloss: 0.57711	valid_0's f1: 0.377246
[6]	valid_0's binary_logloss: 0.565153	valid_0's f1: 0.485793
[7]	valid_0's binary_logloss: 0.554116	valid_0's f1: 0.532426
[8]	valid_0's binary_logloss: 0.544132	valid_0's f1: 0.588955
[9]	valid_0's binary_logloss: 0.534946	valid_0's f1: 0.611674
[10]	valid_0's binary_logloss: 0.526499	valid_0's f1: 0.623623
[11]	valid_0's binary_logloss: 0.518698	valid_0's f1: 0.627947
[12]	valid_0's binary_logloss: 0.511623	valid_0's f1: 0.668584
[13]	valid_0's binary_logloss: 0.505131	valid_0's f1: 0.671855
[14]	valid_0's binary_logloss: 0.499245	valid_0's f1: 0.679584
[15]	valid_0's binary_logloss: 0.493894	valid_0's f1: 0.687296
[16]	valid_0's binary_logloss: 0

  if diff:


0.7073859414474661
train _K_ flod 1


  'recall', 'true', average, warn_for)


[1]	valid_0's binary_logloss: 0.639008	valid_0's f1: 0
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's binary_logloss: 0.620463	valid_0's f1: 0
[3]	valid_0's binary_logloss: 0.603695	valid_0's f1: 0
[4]	valid_0's binary_logloss: 0.588572	valid_0's f1: 0
[5]	valid_0's binary_logloss: 0.574918	valid_0's f1: 0.39524
[6]	valid_0's binary_logloss: 0.562484	valid_0's f1: 0.504983
[7]	valid_0's binary_logloss: 0.551148	valid_0's f1: 0.552769
[8]	valid_0's binary_logloss: 0.540814	valid_0's f1: 0.600178
[9]	valid_0's binary_logloss: 0.53143	valid_0's f1: 0.625975
[10]	valid_0's binary_logloss: 0.522637	valid_0's f1: 0.641509
[11]	valid_0's binary_logloss: 0.514582	valid_0's f1: 0.666667
[12]	valid_0's binary_logloss: 0.507329	valid_0's f1: 0.678414
[13]	valid_0's binary_logloss: 0.500685	valid_0's f1: 0.67995
[14]	valid_0's binary_logloss: 0.494599	valid_0's f1: 0.684645
[15]	valid_0's binary_logloss: 0.488959	valid_0's f1: 0.689708
[16]	valid_0's binary_logloss: 0.

  if diff:
  'recall', 'true', average, warn_for)


[1]	valid_0's binary_logloss: 0.63942	valid_0's f1: 0
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's binary_logloss: 0.621219	valid_0's f1: 0
[3]	valid_0's binary_logloss: 0.604602	valid_0's f1: 0
[4]	valid_0's binary_logloss: 0.589731	valid_0's f1: 0
[5]	valid_0's binary_logloss: 0.576244	valid_0's f1: 0.382171
[6]	valid_0's binary_logloss: 0.564143	valid_0's f1: 0.478192
[7]	valid_0's binary_logloss: 0.553076	valid_0's f1: 0.53092
[8]	valid_0's binary_logloss: 0.542891	valid_0's f1: 0.582269
[9]	valid_0's binary_logloss: 0.533595	valid_0's f1: 0.608635
[10]	valid_0's binary_logloss: 0.525061	valid_0's f1: 0.621413
[11]	valid_0's binary_logloss: 0.517302	valid_0's f1: 0.627197
[12]	valid_0's binary_logloss: 0.510197	valid_0's f1: 0.652318
[13]	valid_0's binary_logloss: 0.503675	valid_0's f1: 0.687047
[14]	valid_0's binary_logloss: 0.497694	valid_0's f1: 0.688669
[15]	valid_0's binary_logloss: 0.492197	valid_0's f1: 0.691034
[16]	valid_0's binary_logloss: 0

  if diff:


0.7139683009924456
train _K_ flod 3


  'recall', 'true', average, warn_for)


[1]	valid_0's binary_logloss: 0.638871	valid_0's f1: 0
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's binary_logloss: 0.620182	valid_0's f1: 0
[3]	valid_0's binary_logloss: 0.603178	valid_0's f1: 0
[4]	valid_0's binary_logloss: 0.588067	valid_0's f1: 0
[5]	valid_0's binary_logloss: 0.574176	valid_0's f1: 0.396014
[6]	valid_0's binary_logloss: 0.561704	valid_0's f1: 0.497542
[7]	valid_0's binary_logloss: 0.55021	valid_0's f1: 0.54206
[8]	valid_0's binary_logloss: 0.53972	valid_0's f1: 0.596844
[9]	valid_0's binary_logloss: 0.530199	valid_0's f1: 0.623716
[10]	valid_0's binary_logloss: 0.521303	valid_0's f1: 0.634146
[11]	valid_0's binary_logloss: 0.513144	valid_0's f1: 0.639632
[12]	valid_0's binary_logloss: 0.505664	valid_0's f1: 0.679221
[13]	valid_0's binary_logloss: 0.49886	valid_0's f1: 0.685696
[14]	valid_0's binary_logloss: 0.492601	valid_0's f1: 0.687126
[15]	valid_0's binary_logloss: 0.486852	valid_0's f1: 0.69347
[16]	valid_0's binary_logloss: 0.48

  if diff:
  'recall', 'true', average, warn_for)


[1]	valid_0's binary_logloss: 0.63955	valid_0's f1: 0
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's binary_logloss: 0.621334	valid_0's f1: 0
[3]	valid_0's binary_logloss: 0.604988	valid_0's f1: 0
[4]	valid_0's binary_logloss: 0.590281	valid_0's f1: 0
[5]	valid_0's binary_logloss: 0.57699	valid_0's f1: 0.370753
[6]	valid_0's binary_logloss: 0.564764	valid_0's f1: 0.47744
[7]	valid_0's binary_logloss: 0.553883	valid_0's f1: 0.530271
[8]	valid_0's binary_logloss: 0.543752	valid_0's f1: 0.578012
[9]	valid_0's binary_logloss: 0.534496	valid_0's f1: 0.605291
[10]	valid_0's binary_logloss: 0.526083	valid_0's f1: 0.615224
[11]	valid_0's binary_logloss: 0.518275	valid_0's f1: 0.620452
[12]	valid_0's binary_logloss: 0.511099	valid_0's f1: 0.654695
[13]	valid_0's binary_logloss: 0.504515	valid_0's f1: 0.666774
[14]	valid_0's binary_logloss: 0.498395	valid_0's f1: 0.679587
[15]	valid_0's binary_logloss: 0.492888	valid_0's f1: 0.682327
[16]	valid_0's binary_logloss: 0.

  if diff:


In [28]:
# Mean of the per-fold scores collected during cross-validation.
print('train_logloss:', np.mean(xx_logloss))
# Element-wise total of the per-fold prediction arrays (stays 0 when there are none).
s = sum(xx_submit)

train_logloss: 0.7102504188701821


In [40]:
# `s` is the sum of the per-fold `predict_proba` outputs, one column per class
# in class order (0, 1). Column 1 is therefore the probability of label 1;
# column 0, used previously, is the probability of label 0 and produced
# inverted predictions. Dividing by N gives the fold average.
test_data_['pred_label'] = list(s[:, 1] / N)
# Threshold the averaged probability into a hard 0/1 label.
test_data_['pred_label'] = test_data_['pred_label'].apply(lambda x: round(x))

In [138]:
# print(f1_score(X_vali_label_, val_data_['pred_label']))

0.8269731376350041


In [42]:
# print('test_logloss:', np.mean(test_data_.label))
# Write one predicted label per line. `header=False` is explicit because newer
# pandas releases default Series.to_csv to header=True, which would prepend a
# 'pred_label' row to the submission file.
test_data_['pred_label'].to_csv('../data/result.csv', index=False, header=False)