In [33]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [4]:
train_df = pd.read_csv('../datasets/flight/flight_delays_train.csv')
test_df = pd.read_csv('../datasets/flight/flight_delays_test.csv')

In [19]:
X_train=train_df[['Distance','DepTime']]
y_train = train_df['dep_delayed_15min']
X_train_part, X_valid, y_train_part, y_valid = \
    train_test_split(X_train, y_train, 
                     test_size=0.3, random_state=17)
X_test = test_df[['Distance','DepTime']]

In [6]:
logit_pipe = Pipeline([('scaler', StandardScaler()),
                       ('logit', LogisticRegression(C=1, random_state=17, solver='liblinear'))])

In [7]:
logit_pipe.fit(X_train_part, y_train_part)
logit_valid_pred = logit_pipe.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, logit_valid_pred)

0.6795691465352607

In [17]:
cat_cols = [ i for i in train_df.columns if i not in ['Distance','DepTime','dep_delayed_15min']]

df_cat = pd.concat([train_df[cat_cols],test_df[cat_cols]],axis = 0)
df_num = pd.concat([train_df[['Distance','DepTime']],test_df[['Distance','DepTime']]],axis = 0)

In [20]:
from sklearn.preprocessing import OneHotEncoder
onehot=OneHotEncoder()
onehot.fit(df_cat)
X_train_onehot = onehot.transform(train_df[cat_cols])
X_test_onehot = onehot.transform(test_df[cat_cols])

scaler = StandardScaler()
scaler.fit(df_num)
X_train_num = scaler.transform(X_train)
X_test_num = scaler.transform(X_test)
X_train, y_train = np.hstack([X_train_onehot.toarray(),X_train_num]), train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values


X_test = np.hstack([X_train_onehot.toarray(),X_test_num])

X_train_part, X_valid, y_train_part, y_valid = \
    train_test_split(X_train, y_train, 
                     test_size=0.3, random_state=17)

In [34]:
lgb_train = lgb.Dataset(X_train_part,y_train_part)
lgb_eval = lgb.Dataset(X_valid,y_valid,reference=lgb_train)

In [35]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 64,
    'num_trees': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# number of leaves,will be used in feature transformation
num_leaf = 64

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_train)

print('Save model...')
# save model to file
gbm.save_model('lgb-model.txt')


Start training...
[1]	training's binary_logloss: 0.486785
[2]	training's binary_logloss: 0.485722
[3]	training's binary_logloss: 0.484719
[4]	training's binary_logloss: 0.48375
[5]	training's binary_logloss: 0.482772
[6]	training's binary_logloss: 0.481828
[7]	training's binary_logloss: 0.480922
[8]	training's binary_logloss: 0.480026
[9]	training's binary_logloss: 0.479151
[10]	training's binary_logloss: 0.47831
[11]	training's binary_logloss: 0.478085
[12]	training's binary_logloss: 0.477243
[13]	training's binary_logloss: 0.476432
[14]	training's binary_logloss: 0.475639
[15]	training's binary_logloss: 0.475412
[16]	training's binary_logloss: 0.474615
[17]	training's binary_logloss: 0.47383
[18]	training's binary_logloss: 0.473066
[19]	training's binary_logloss: 0.472305
[20]	training's binary_logloss: 0.471567
[21]	training's binary_logloss: 0.470863
[22]	training's binary_logloss: 0.470171
[23]	training's binary_logloss: 0.469492
[24]	training's binary_logloss: 0.468833
[25]	train

<lightgbm.basic.Booster at 0x110b08438>

In [36]:
y_pred = gbm.predict(X_train, pred_leaf=True)

In [40]:
y_pred[0,:]

array([28, 29, 26, 24, 62, 18, 52, 63, 52, 39, 17, 61, 30, 49, 11, 15, 12,
       37, 12, 12, 40, 40, 21, 37, 46,  6, 11, 63,  2, 49,  5, 45, 63, 42,
       24,  5,  7, 61,  7,  7, 43, 45, 44, 16, 37,  8,  9, 33, 33, 24, 23,
       27, 53, 54, 49, 61, 55, 10,  2, 46, 17, 34, 49, 62, 53, 26, 42, 14,
        2, 12, 14, 13, 14, 15, 18, 28, 35, 50, 56, 16, 31, 46,  2,  2,  8,
       56, 42, 23, 37, 31, 38, 14, 14, 16, 15, 48, 52, 18, 42, 39],
      dtype=int32)

In [41]:
print('Start predicting...')
# predict and get data on leaves, training data
y_pred = gbm.predict(X_train, pred_leaf=True)

print(np.array(y_pred).shape)
print(y_pred[:10])

print('Writing transformed training data')
transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf],
                                       dtype=np.int64)  # N * num_tress * num_leafs
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    b[i][temp] += 1

Start predicting...
(100000, 100)
[[28 29 26 24 62 18 52 63 52 39 17 61 30 49 11 15 12 37 12 12 40 40 21 37
  46  6 11 63  2 49  5 45 63 42 24  5  7 61  7  7 43 45 44 16 37  8  9 33
  33 24 23 27 53 54 49 61 55 10  2 46 17 34 49 62 53 26 42 14  2 12 14 13
  14 15 18 28 35 50 56 16 31 46  2  2  8 56 42 23 37 31 38 14 14 16 15 48
  52 18 42 39]
 [17 32 21 20 25 16 13 17 16 11 11 25 20 21  0 18 18 30 31 19 27 29 18 53
  30 57 13 13  1 12 17 20 21 21 12  1 24  1  1  1 13 19 13 36 19  8  9 31
  10 14 43 46 45  1 39 17 17 10 20  9  1  1  1 41  1 30 53 11 11 10  5 63
  29 27 48 52  1 42 58 47 22 22  1 22  8 59 11  5 19 16  1 30 51 41 57 31
  35 28 13 13]
 [17 34 21 20 26 58 13 55 62 11 22 28 57  9  2 52 56 26  1 16 10 10 10 31
  10 24 13 13 54 12 13 13 12 12 12  7  8  8  9  9 13  9 13 12 23  1  1 13
   9  1  1 14 14 14 50  7  6 27  7  0  1  1  1 12 13  1  1 11 11 10 18 19
  15  0 22  1  1 27 12 47  8  8  7  8  1  7  7  1 48 13 14 13 13  8 10 40
  40 43  1  1]
 [ 3  3  3  3  3  3  3  3  3  3 1

In [None]:
print('Start predicting...')
# predict and get data on leaves, training data
y_pred = gbm.predict(X_train, pred_leaf=True)

print(np.array(y_pred).shape)
print(y_pred[:10])

print('Writing transformed training data')
transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf],
                                       dtype=np.int64)  # N * num_tress * num_leafs
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_training_matrix[i][temp] += 1


y_pred = gbm.predict(X_test, pred_leaf=True)
print('Writing transformed testing data')
transformed_testing_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_testing_matrix[i][temp] += 1


lm = LogisticRegression(penalty='l2',C=0.05) # logestic model construction
lm.fit(transformed_training_matrix,y_train)  # fitting the data
y_pred_test = lm.predict_proba(transformed_testing_matrix)   # Give the probabilty on each label

print(y_pred_test)

NE = (-1) / len(y_pred_test) * sum(((1+y_test)/2 * np.log(y_pred_test[:,1]) +  (1-y_test)/2 * np.log(1 - y_pred_test[:,1])))
print("Normalized Cross Entropy " + str(NE))

In [21]:
xgb_clf = xgb.XGBClassifier(
                 colsample_bytree=0.2,
                 gamma=0.0,
                 learning_rate=0.01,
                 max_depth=8,
                 min_child_weight=1.5,
                 n_estimators=1000,                                                                  
                 reg_alpha=0.9,
                 reg_lambda=0.6,
                 subsample=0.2,
                 num
                 seed=42,
                 silent=1)

In [22]:
xgb_clf.fit(X_train_part,y_train_part)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.2, gamma=0.0, learning_rate=0.01,
       max_delta_step=0, max_depth=8, min_child_weight=1.5, missing=None,
       n_estimators=1000, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0.9,
       reg_lambda=0.6, scale_pos_weight=1, seed=42, silent=1,
       subsample=0.2)

In [23]:
xgb_valid_pred = xgb_clf.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, xgb_valid_pred)

0.7313949358315118