In [None]:
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler

In [46]:
# Load the training and evaluation CSVs (first CSV column becomes the index)
# and strip surrounding whitespace from every column label.
df = pd.read_csv('../data/train.csv', index_col=0).rename(columns=str.strip)
df_eval = pd.read_csv('../data/test.csv', index_col=0).rename(columns=str.strip)

# Show the cleaned evaluation column labels.
df_eval.columns

Index(['B_OWNPV_CHI2', 'B_IPCHI2_OWNPV', 'B_FDCHI2_OWNPV', 'B_DIRA_OWNPV',
       'B_PT', 'Kst_892_0_IP_OWNPV', 'Kst_892_0_cosThetaH', 'Kplus_IP_OWNPV',
       'Kplus_P', 'piminus_IP_OWNPV', 'piminus_P', 'gamma_PT', 'piminus_ETA',
       'Kplus_ETA', 'BUTTER'],
      dtype='object')

In [47]:
def preprocess_data(df, drop_columns=('BUTTER',)):
    """Return a copy of ``df`` with the non-feature columns removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is never modified.
    drop_columns : iterable of str, optional
        Column labels to remove; defaults to ``('BUTTER',)``. Labels that are
        not present are skipped, so the function can safely be applied to a
        frame that was already processed (e.g. when a notebook cell is re-run).

    Returns
    -------
    pandas.DataFrame
        A new frame without the listed columns; remaining columns keep their
        original order.
    """
    # Copy first so the caller's frame is never touched, then drop tolerantly:
    # a plain `del frame['BUTTER']` raises KeyError on a second pass.
    processed = df.copy().drop(columns=list(drop_columns), errors='ignore')

    # Log transforms of gamma_PT, B_DIRA_OWNPV and Kst_892_0_IP_OWNPV were
    # tried at this point and are currently disabled.

    return processed
    
# Run both frames through the shared preprocessing step.
# Note that df_eval is rebound to its processed version here.
df_train = preprocess_data(df)
df_eval = preprocess_data(df_eval)

# The last remaining training column is taken as the target;
# every column before it is a feature.
X = df_train.iloc[:, :-1]
y = df_train.iloc[:, -1]

# Sanity check: the feature count should match the evaluation frame's width.
X.shape[1], df_eval.shape[1]

(14, 14)

In [48]:
# Hold out 10% of the labelled rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=0)

# Fit the scaler on the training split only and reuse it for the hold-out split.
scaler = RobustScaler()

# scaler.*transform returns a bare ndarray. Rebuild the frames with the ORIGINAL
# row index so X_train/X_test stay aligned with y_train/y_test (which keep the
# shuffled index from the split) in any index-based pandas operation.
X_train = pd.DataFrame(data=scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(data=scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

print(X_train.shape, X_test.shape)
X_test[:3]

(191395, 14) (21267, 14)


Unnamed: 0,B_OWNPV_CHI2,B_IPCHI2_OWNPV,B_FDCHI2_OWNPV,B_DIRA_OWNPV,B_PT,Kst_892_0_IP_OWNPV,Kst_892_0_cosThetaH,Kplus_IP_OWNPV,Kplus_P,piminus_IP_OWNPV,piminus_P,gamma_PT,piminus_ETA,Kplus_ETA
0,0.155007,-0.518109,0.590314,0.064027,2.272691,0.409998,0.918624,0.115959,-0.533184,0.569714,-0.236118,2.921955,-1.16402,-1.196681
1,-0.236251,-0.034201,1.067547,0.186846,0.011378,0.494437,-0.970477,0.741977,1.119536,-0.298632,-0.066184,-0.188864,0.258893,0.333509
2,-0.479638,-0.522993,4.507572,0.276628,0.009654,2.312267,0.072016,2.419477,-0.032379,2.148556,0.268203,0.169498,-0.132132,0.046565


### XGBoost

In [None]:
# Hyper-parameters for the XGBoost classifier.
xgb_params = {
    'scale_pos_weight': 2,
    'learning_rate': 0.01,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'n_estimators': 1000,
    'reg_alpha': 0.3,
    'max_depth': 6,
    'gamma': 10
}

# The evaluation metrics are configured on the estimator rather than passed to
# fit(): the fit(eval_metric=...) argument is deprecated in XGBoost 1.6 and
# rejected by later releases, while older releases forward unknown constructor
# keywords to the booster, so this form works on both.
model = xgb.XGBClassifier(eval_metric=['auc', 'error'], **xgb_params)

# Track both metrics on the training split and the hold-out split while fitting.
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)

In [None]:
# Column 1 of predict_proba is taken as the score for each split.
y_hat_train, y_hat_test = (
    model.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

# ROC AUC on the training split and on the hold-out split.
auc_train = roc_auc_score(y_train, y_hat_train)
print("AUC Train\t", auc_train)
auc_test = roc_auc_score(y_test, y_hat_test)
print("AUC Test\t", auc_test)

### LightGBM

In [19]:
# LightGBM booster configuration (binary objective, AUC as the tracked metric).
lgb_params = dict(
    objective='binary',
    metric='auc',
    verbosity=-1,
    boosting_type="gbdt",
    feature_pre_filter=False,
    lambda_l1=0.0,
    lambda_l2=0.0,
    num_leaves=4,
    feature_fraction=0.4,
    bagging_fraction=0.86966,
    bagging_freq=1,
    min_child_samples=100,
)

# Wrap both splits as LightGBM datasets.
dtrain = lgb.Dataset(X_train, label=y_train)
dtest = lgb.Dataset(X_test, label=y_test)

# Train for 5000 rounds, logging the metric on both sets every 200 rounds.
clf = lgb.train(
    lgb_params,
    dtrain,
    num_boost_round=5000,
    valid_sets=(dtrain, dtest),
    verbose_eval=200,
)

[200]	training's auc: 0.83618	valid_1's auc: 0.829708
[400]	training's auc: 0.847083	valid_1's auc: 0.840138
[600]	training's auc: 0.853225	valid_1's auc: 0.845519
[800]	training's auc: 0.857651	valid_1's auc: 0.849242
[1000]	training's auc: 0.861334	valid_1's auc: 0.852396
[1200]	training's auc: 0.86422	valid_1's auc: 0.854534
[1400]	training's auc: 0.866949	valid_1's auc: 0.856722
[1600]	training's auc: 0.869261	valid_1's auc: 0.858498
[1800]	training's auc: 0.871347	valid_1's auc: 0.859988
[2000]	training's auc: 0.873163	valid_1's auc: 0.861401
[2200]	training's auc: 0.87483	valid_1's auc: 0.862522
[2400]	training's auc: 0.876335	valid_1's auc: 0.863566
[2600]	training's auc: 0.877751	valid_1's auc: 0.86444
[2800]	training's auc: 0.879054	valid_1's auc: 0.865131
[3000]	training's auc: 0.880186	valid_1's auc: 0.865704
[3200]	training's auc: 0.881331	valid_1's auc: 0.866399
[3400]	training's auc: 0.882472	valid_1's auc: 0.867052
[3600]	training's auc: 0.883522	valid_1's auc: 0.867562


In [49]:
# Score both splits with the trained LightGBM booster.
y_hat_train = clf.predict(X_train)
y_hat_test = clf.predict(X_test)

# Compute ROC AUC on each split, then report the two values.
auc_train = roc_auc_score(y_train, y_hat_train)
auc_test = roc_auc_score(y_test, y_hat_test)
print("AUC Train\t", auc_train)
print("AUC Test\t", auc_test)

AUC Train	 0.8896782953998287
AUC Test	 0.8700478759653306


In [50]:
# Persist the hold-out scores as a single 'lgbm' column.
# NOTE(review): rows are written with a fresh 0..n-1 index, not the original
# row labels of the hold-out split — confirm consumers match rows by position.
lgbm_test_proba = pd.DataFrame({'lgbm': y_hat_test})
lgbm_test_proba.to_csv('lgbm_proba.csv')

## Write Predictions

In [51]:
# Apply the scaler fitted on the training split to the evaluation features.
eval_scaled = scaler.transform(df_eval)
X_eval = pd.DataFrame(eval_scaled, columns=df_eval.columns)

# Score the evaluation rows with the LightGBM booster.
y_eval_hat = clf.predict(X_eval)

# Persist the scores as a single 'lgbm' column.
# NOTE(review): the file gets a fresh 0..n-1 index rather than df_eval's own
# index — confirm that downstream consumers do not need the original row ids.
eval_proba = pd.DataFrame({'lgbm': y_eval_hat})
eval_proba.to_csv('lgbm_eval_proba.csv')