<a href="https://colab.research.google.com/github/Mrajie88/-/blob/master/competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
from lightgbm.sklearn import LGBMClassifier
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [0]:
# List the entries of the current working directory.
os.listdir(os.curdir)

['.config', 'drive', 'sample_data']

In [0]:
# Directory holding the three competition CSV files.
path = "/content/drive/My Drive/"
# Read the hit-level train/test tables and the event-level table in one pass.
train_df_source, test_df_source, event_df_source = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'event.csv')
)

In [0]:
def f_eng(df, event_df, is_train=True):
    """Build the model feature matrix from a hit table and the event table.

    Parameters
    ----------
    df : pandas.DataFrame
        Hit-level rows. Must contain 'event_id', 'hit_id', 'z', 't',
        'terror' and 'q' (plus 'flag' when ``is_train`` is True). Any other
        columns are passed through unchanged. The input frame is not modified.
    event_df : pandas.DataFrame
        Event-level rows. Must contain 'event_id', 'energymc', 'thetamc',
        'phimc', 'xcmc' and 'ycmc'; other columns are ignored.
    is_train : bool, default True
        When True the 'flag' column is dropped from the result.

    Returns
    -------
    pandas.DataFrame
        One row per input row (left join on 'event_id'), with the event
        columns attached and three derived columns appended:

        * 't_o'    -- ``t - terror``
        * 'q_mean' -- ``q`` minus the mean ``q`` of the row's event
        * 't_mean' -- ``t`` minus the mean ``t`` of the row's event

        'hit_id', 'z' and 'event_id' are removed.
    """
    event_cols = ['event_id', 'energymc', 'thetamc', 'phimc', 'xcmc', 'ycmc']
    df = pd.merge(df, event_df.loc[:, event_cols], how='left', on='event_id')

    # Time difference.
    df['t_o'] = df['t'] - df['terror']

    # Despite the column names, these hold the deviation from the per-event
    # mean. The string alias 'mean' is used instead of the np.mean callable,
    # which pandas has deprecated inside GroupBy.transform.
    per_event = df.groupby('event_id')
    df['q_mean'] = df['q'] - per_event['q'].transform('mean')
    df['t_mean'] = df['t'] - per_event['t'].transform('mean')

    drop_cols = ['hit_id', 'z', 'event_id']
    if is_train:
        drop_cols = ['flag'] + drop_cols
    return df.drop(columns=drop_cols)

In [0]:
# Targets are taken from the raw table because f_eng drops the 'flag' column.
labels = train_df_source['flag']
train_df = f_eng(train_df_source, event_df_source)
test_df = f_eng(test_df_source, event_df_source, False)
print(list(train_df.columns))

# Small-sample training: keep only the leading rows of features and targets.
n_train_rows = 3000000
train_df = train_df.iloc[:n_train_rows, :]
labels = labels[:n_train_rows]

# Hold out 33% of the retained rows for validation (fixed seed).
train_x, val_x, train_y, val_y = train_test_split(
    train_df,
    labels,
    test_size=0.33,
    random_state=42,
)

threshold = 0.5

['x', 'y', 't', 'terror', 'q', 'energymc', 'thetamc', 'phimc', 'xcmc', 'ycmc', 't_o', 'q_mean', 't_mean']


In [0]:
# Collects one feature-importance vector per fitted model.
fea_imp_list = []

# Hyper-parameters for the validation run; n_estimators is only an upper
# bound here because the fit relies on early stopping.
# NOTE(review): LightGBM appears to apply row subsampling only when
# subsample_freq > 0, so subsample=0.9 may have no effect as configured -- confirm.
search_params = dict(
    learning_rate=0.01,
    n_estimators=20000,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019,
    metric=None,
)
clf = LGBMClassifier(**search_params)

In [0]:
# Fit on the training split while tracking AUC on the validation split.
# Early stopping and periodic logging are passed as callbacks: the fit-level
# `early_stopping_rounds` / `verbose` keyword arguments were removed in
# LightGBM 4.0 and raise TypeError there.
# `t` is the timer origin that later cells report elapsed time against.
t = time.time()
clf.fit(
    train_x, train_y,
    eval_set=[(val_x, val_y)],
    eval_metric='auc',
    callbacks=[
        # Stop once validation scores have not improved for 200 rounds.
        lgb.early_stopping(stopping_rounds=200),
        # Print the validation metric every 50 rounds.
        lgb.log_evaluation(period=50),
    ],
)
print('runtime:', time.time() - t)

Training until validation scores don't improve for 200 rounds.
[50]	valid_0's auc: 0.968412
[100]	valid_0's auc: 0.972578
[150]	valid_0's auc: 0.974786
[200]	valid_0's auc: 0.977434
[250]	valid_0's auc: 0.980925
[300]	valid_0's auc: 0.983867
[350]	valid_0's auc: 0.986435
[400]	valid_0's auc: 0.988212
[450]	valid_0's auc: 0.989675
[500]	valid_0's auc: 0.991069
[550]	valid_0's auc: 0.992041
[600]	valid_0's auc: 0.992702
[650]	valid_0's auc: 0.993218
[700]	valid_0's auc: 0.993651
[750]	valid_0's auc: 0.994002
[800]	valid_0's auc: 0.994235
[850]	valid_0's auc: 0.994441
[900]	valid_0's auc: 0.994589
[950]	valid_0's auc: 0.994735
[1000]	valid_0's auc: 0.994856
[1050]	valid_0's auc: 0.994922
[1100]	valid_0's auc: 0.994989
[1150]	valid_0's auc: 0.995045
[1200]	valid_0's auc: 0.99509
[1250]	valid_0's auc: 0.995126
[1300]	valid_0's auc: 0.995188
[1350]	valid_0's auc: 0.995231
[1400]	valid_0's auc: 0.995261
[1450]	valid_0's auc: 0.995294
[1500]	valid_0's auc: 0.995317
[1550]	valid_0's auc: 0.9953

In [0]:
print('************** validate predict **************')
# Positive-class probability for every validation row.
val_proba = clf.predict_proba(val_x)
val_pred = val_proba[:, 1]
# Remember where early stopping landed and the AUC it reached there.
best_rounds, best_auc = clf.best_iteration_, clf.best_score_['valid_0']['auc']
fea_imp_list += [clf.feature_importances_]
print('runtime:', time.time() - t)

In [0]:
# Same hyper-parameters as the validation run, but with the number of trees
# fixed to the round count found by early stopping.
final_params = dict(
    learning_rate=0.01,
    n_estimators=best_rounds,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019,
)
clf = LGBMClassifier(**final_params)

In [0]:
# Refit on all retained training rows (train + validation splits together).
# No eval_set / early stopping here: the tree count is already fixed through
# n_estimators=best_rounds, and early stopping measured on the training data
# itself carries no information while costing a full AUC pass every round.
clf.fit(train_df, labels)
print('runtime:', time.time() - t)

In [0]:
# Positive-class probability for every test row, from the refitted model.
test_scores = clf.predict_proba(test_df)[:, 1]
test_pre = pd.DataFrame({'flag_pred': test_scores})
print('test_pre runtime:', time.time() - t)

# Submission frame: hit_id, predicted probability, event_id (in that order).
submission_parts = [test_df_source['hit_id'], test_pre, test_df_source['event_id']]
sub = pd.concat(submission_parts, axis=1)
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

In [0]:
# Feature importances are worth a close look.
# Average the importance vectors collected so far, then list features from
# most to least important.
mean_importance = np.mean(fea_imp_list, axis=0)
fea_imp_dict = dict(zip(train_df.columns.values, mean_importance))
fea_imp_item = sorted(fea_imp_dict.items(), key=lambda pair: pair[1], reverse=True)
for feature_name, importance in fea_imp_item:
    print('{} = {}'.format(feature_name, importance))

In [0]:
# Grid-search the decision threshold that maximises F1 on the validation
# split: 475 candidates starting at t0 and spaced v apart.
t0 = 0.05
v = 0.002
best_t = t0
best_f1 = 0
# Work on an array view of the probabilities so each threshold is applied
# with one vectorised comparison; val_pred itself is left untouched, which
# keeps this cell safe to re-run.
val_scores = np.asarray(val_pred)
for step in range(475):
    curr_t = t0 + step * v
    curr_f1 = f1_score(val_y, (val_scores >= curr_t).astype(int))
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        print('step: {}   best threshold: {}   best score: {}'.format(step, best_t, best_f1))
print('search finish.')

# Hard labels at the selected threshold, stored under a separate name.
val_label = (val_scores >= best_t).astype(int)
print('\nbest auc:', best_auc)
print('best score:', f1_score(val_y, val_label))
# Fraction of validation rows predicted positive.
print('validate mean:', np.mean(val_label))
print('runtime:', time.time() - t)

In [0]:
# Write two submission files next to the input data (`path`), rather than to
# a machine-specific absolute location:
#   <path>/sub_prob_<auc>_<f1>_<mean>.csv      -- raw probabilities
#   <path>/output/sub_<auc>_<f1>_<mean>.csv    -- hard 0/1 labels at best_t
label_dir = os.path.join(path, 'output')
os.makedirs(label_dir, exist_ok=True)

prob_file = 'sub_prob_{}_{}_{}.csv'.format(best_auc, best_f1, sub['flag_pred'].mean())
sub.to_csv(os.path.join(path, prob_file), index=False)

# Threshold into a new frame so `sub` keeps its probabilities and re-running
# this cell writes the same two files again.
sub_label = sub.assign(flag_pred=(sub['flag_pred'] >= best_t).astype(int))
label_file = 'sub_{}_{}_{}.csv'.format(best_auc, best_f1, sub_label['flag_pred'].mean())
sub_label.to_csv(os.path.join(label_dir, label_file), index=False)

print('runtime:', time.time() - t)
print('finish.')