In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [2]:
train_df = pd.read_csv('../datasets/flight/flight_delays_train.csv')
test_df = pd.read_csv('../datasets/flight/flight_delays_test.csv')

In [3]:
X_train=train_df[['Distance','DepTime']]
y_train = train_df['dep_delayed_15min']
X_train_part, X_valid, y_train_part, y_valid = \
    train_test_split(X_train, y_train, 
                     test_size=0.3, random_state=17)
X_test = test_df[['Distance','DepTime']]

In [4]:
logit_pipe = Pipeline([('scaler', StandardScaler()),
                       ('logit', LogisticRegression(C=1, random_state=17, solver='liblinear'))])

In [5]:
logit_pipe.fit(X_train_part, y_train_part)
logit_valid_pred = logit_pipe.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, logit_valid_pred)

0.6795691465352607

In [6]:
cat_cols = [ i for i in train_df.columns if i not in ['Distance','DepTime','dep_delayed_15min']]

df_cat = pd.concat([train_df[cat_cols],test_df[cat_cols]],axis = 0)
df_num = pd.concat([train_df[['Distance','DepTime']],test_df[['Distance','DepTime']]],axis = 0)

In [7]:
from sklearn.preprocessing import OneHotEncoder
onehot=OneHotEncoder()
onehot.fit(df_cat)
X_train_onehot = onehot.transform(train_df[cat_cols])
X_test_onehot = onehot.transform(test_df[cat_cols])

scaler = StandardScaler()
scaler.fit(df_num)
X_train_num = scaler.transform(X_train)
X_test_num = scaler.transform(X_test)
X_train, y_train = np.hstack([X_train_onehot.toarray(),X_train_num]), train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values


X_test = np.hstack([X_train_onehot.toarray(),X_test_num])

X_train_part, X_valid, y_train_part, y_valid = \
    train_test_split(X_train, y_train, 
                     test_size=0.3, random_state=17)

In [8]:
lgb_train = lgb.Dataset(X_train_part,y_train_part)
lgb_eval = lgb.Dataset(X_valid,y_valid,reference=lgb_train)

In [9]:
X_train_part.shape

(70000, 689)

In [10]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 32,
    'num_trees': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# number of leaves,will be used in feature transformation
num_leaf = 64

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_train)

print('Save model...')
# save model to file
gbm.save_model('lgb-model.txt')


Start training...
[1]	training's binary_logloss: 0.48689
[2]	training's binary_logloss: 0.485939
[3]	training's binary_logloss: 0.485032
[4]	training's binary_logloss: 0.484151
[5]	training's binary_logloss: 0.483289
[6]	training's binary_logloss: 0.482449
[7]	training's binary_logloss: 0.481636
[8]	training's binary_logloss: 0.480828
[9]	training's binary_logloss: 0.480031
[10]	training's binary_logloss: 0.479269
[11]	training's binary_logloss: 0.479111
[12]	training's binary_logloss: 0.478356
[13]	training's binary_logloss: 0.477626
[14]	training's binary_logloss: 0.476908
[15]	training's binary_logloss: 0.476746
[16]	training's binary_logloss: 0.476027
[17]	training's binary_logloss: 0.475325
[18]	training's binary_logloss: 0.474646
[19]	training's binary_logloss: 0.473968
[20]	training's binary_logloss: 0.473312
[21]	training's binary_logloss: 0.47268
[22]	training's binary_logloss: 0.472058
[23]	training's binary_logloss: 0.471455
[24]	training's binary_logloss: 0.470849
[25]	trai

<lightgbm.basic.Booster at 0x112f28dd8>

In [11]:
print('Start predicting...')
# predict and get data on leaves, training data
y_pred = gbm.predict(X_train_part, pred_leaf=True)

print(np.array(y_pred).shape)

print('Writing transformed training data')
transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf],
                                       dtype=np.int64)  # N * num_tress * num_leafs
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_training_matrix[i][temp] += 1

Start predicting...
(70000, 100)
Writing transformed training data


In [12]:
y_pred = gbm.predict(X_valid, pred_leaf=True)
print('Writing transformed testing data')
transformed_testing_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_testing_matrix[i][temp] += 1



Writing transformed testing data


In [13]:
lm = LogisticRegression(penalty='l2',C=0.05) # logestic model construction
lm.fit(transformed_training_matrix,y_train_part)  # fitting the data
y_pred_test = lm.predict_proba(transformed_testing_matrix)   # Give the probabilty on each label


In [14]:
lgb_lr_valid_pred = y_pred_test[:,1]

roc_auc_score(y_valid, lgb_lr_valid_pred)

0.7262627806084931

In [15]:
p=np.sum(y_valid)/len(y_valid)
y_valid_eval=np.ones(y_valid.shape)
y_valid_eval[y_valid==0]=-1
NE = 1/len(y_pred_test) * sum(((1+y_valid_eval)/2 * np.log(y_pred_test[:,1]) +  (1-y_valid_eval)/2 * np.log(1 - y_pred_test[:,1])))/(p*np.log(p)+(1-p)*np.log(1-p))
print("Normalized Cross Entropy " + str(NE))

Normalized Cross Entropy 0.8849835781063187


In [27]:
lgb_full_train = lgb.Dataset(X_train,y_train)
gbm_full = lgb.train(params,
                lgb_full_train,
                num_boost_round=100,
                valid_sets=lgb_full_train)
print('Start predicting...')
# predict and get data on leaves, training data
y_pred = gbm_full.predict(X_train, pred_leaf=True)

print(np.array(y_pred).shape)

print('Writing transformed training data')
transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf],
                                       dtype=np.int64)  # N * num_tress * num_leafs
for i in range(y_pred.shape[0]):
    temp=np.arange(y_pred.shape[1])*num_leaf+y_pred[i]
    transformed_training_matrix[i,temp]+=1

[1]	training's binary_logloss: 0.485926
[2]	training's binary_logloss: 0.485034
[3]	training's binary_logloss: 0.484147
[4]	training's binary_logloss: 0.483287
[5]	training's binary_logloss: 0.48246
[6]	training's binary_logloss: 0.481621
[7]	training's binary_logloss: 0.480815
[8]	training's binary_logloss: 0.480033
[9]	training's binary_logloss: 0.479266
[10]	training's binary_logloss: 0.478507
[11]	training's binary_logloss: 0.477767
[12]	training's binary_logloss: 0.477052
[13]	training's binary_logloss: 0.476357
[14]	training's binary_logloss: 0.475678
[15]	training's binary_logloss: 0.475006
[16]	training's binary_logloss: 0.474335
[17]	training's binary_logloss: 0.473684
[18]	training's binary_logloss: 0.473057
[19]	training's binary_logloss: 0.472432
[20]	training's binary_logloss: 0.471814
[21]	training's binary_logloss: 0.471211
[22]	training's binary_logloss: 0.470621
[23]	training's binary_logloss: 0.470043
[24]	training's binary_logloss: 0.469491
[25]	training's binary_log

In [28]:
lm = LogisticRegression(penalty='l2',C=1) # logestic model construction
lm.fit(transformed_training_matrix,y_train)  # fitting the data

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [29]:
y_pred = gbm_full.predict(X_test,pred_leaf=True)
transformed_testing_matrix=np.zeros((y_pred.shape[0],num_leaf*y_pred.shape[1]))
for i in range(y_pred.shape[0]):
    temp=np.arange(y_pred.shape[1])*num_leaf+y_pred[i]
    transformed_testing_matrix[i,temp]+=1

In [30]:
lgb_lr_pred_test = lm.predict_proba(transformed_testing_matrix)[:,1]   # Give the probabilty on each label


pd.Series(lgb_lr_pred_test, 
          name='dep_delayed_15min').to_csv('lgb_lr_2feat.csv', 
                                           index_label='id', header=True)

In [21]:
xgb_clf = xgb.XGBClassifier(
                 colsample_bytree=0.2,
                 gamma=0.0,
                 learning_rate=0.01,
                 max_depth=8,
                 min_child_weight=1.5,
                 n_estimators=1000,                                                                  
                 reg_alpha=0.9,
                 reg_lambda=0.6,
                 subsample=0.2,
                 num
                 seed=42,
                 silent=1)

In [22]:
xgb_clf.fit(X_train_part,y_train_part)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.2, gamma=0.0, learning_rate=0.01,
       max_delta_step=0, max_depth=8, min_child_weight=1.5, missing=None,
       n_estimators=1000, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0.9,
       reg_lambda=0.6, scale_pos_weight=1, seed=42, silent=1,
       subsample=0.2)

In [23]:
xgb_valid_pred = xgb_clf.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, xgb_valid_pred)

0.7313949358315118