In [1]:
%store -r train2
%store -r test2

In [4]:
train_df = pd.read_csv('train_preprocessed.csv')
test_df = pd.read_csv('test_preprocessed.csv')

In [5]:
train_df.head()

Unnamed: 0,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,cancelled,undelivered_orders,lifetime_order_count,reassigned_order,session_time,accept_order_diff,total_dist,first_order,cancel_before_accept,large_dist,weekday
0,1.5666,2.65,46.0,46.0,0,0.0,621.0,0.0,175.55,33.0,4.2166,1,0,0,1
1,2.5207,2.76,8.0,8.0,0,0.0,105.0,0.0,3.266667,89.0,5.2807,1,0,0,1
2,2.2074,4.8,1.0,1.0,0,0.0,66.0,0.0,9.816667,24.0,7.0074,1,0,0,1
3,2.1894,6.38,1.0,1.0,0,0.0,127.0,0.0,17.533333,73.0,8.5694,1,0,0,1
4,2.787,4.01,34.0,34.0,0,0.0,84.0,0.0,1.35,87.0,6.797,1,0,0,1


In [6]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

import gc
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error, roc_auc_score
from statistics import mean

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [7]:
cols = train_df.drop('cancelled', axis=1).columns

In [8]:
SAMPLE_RATE = 0.4
RANDOM_SEED = 1
EARLY_STOPPING_ROUND = 100

In [40]:
import optuna

x = train_df.drop(['cancelled'],axis=1).values
y = train_df.cancelled.values
val = np.zeros(train_df.shape[0])
#pred = np.zeros(test_df.shape[0])

def objective(trial):
    pred = np.zeros(test_df.shape[0])
    param = {}
    param['learning_rate'] = trial.suggest_discrete_uniform("learning_rate", 0.001, 0.02, 0.001)
    param['depth'] = trial.suggest_int('depth', 9, 15)
    param['l2_leaf_reg'] = trial.suggest_discrete_uniform('l2_leaf_reg', 1.0, 5.5, 0.5)
    param['min_child_samples'] = trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32])
    param['grow_policy'] = 'Depthwise'
    param['iterations'] = 10000
    param['use_best_model'] = True
    param['eval_metric'] = 'AUC'
    param['task_type'] = 'GPU'
    param['od_type'] = 'iter'
    param['od_wait'] = 20
    param['random_state'] = RANDOM_SEED
    param['logging_level'] = 'Silent'

    # Simple kfold implementation
    folds = StratifiedKFold(n_splits=5,random_state=48,shuffle=True)
    auc = np.empty(5)
    
    for fold_index, (train_index,val_index) in enumerate(folds.split(x,y)):
        print('Batch {} started...'.format(fold_index))
        gc.collect()
        model = CatBoostClassifier(**param)
        bst = model.fit(x[train_index],y[train_index],
                  eval_set = [(x[val_index],y[val_index])],
                  early_stopping_rounds=EARLY_STOPPING_ROUND,
                  verbose= 0, 
                  )
        val[val_index] = model.predict_proba(x[val_index])[:,1]
        print('auc of this val set is {}'.format(roc_auc_score(y[val_index],val[val_index])))
        #pred += model.predict_proba(test_df.drop(['order_id'],axis=1).values)[:,1]/folds.n_split
        auc[fold_index] = roc_auc_score(y[val_index],val[val_index])
    return np.mean(auc)

In [41]:
import optuna
study = optuna.create_study(direction="maximize", study_name="CATBClassifier")
study.optimize(objective, n_trials=10)

[32m[I 2022-02-05 12:56:34,464][0m A new study created in memory with name: CATBClassifier[0m


Batch 0 started...
auc of this val set is 0.7356070996888183
Batch 1 started...
auc of this val set is 0.7486424301587921
Batch 2 started...
auc of this val set is 0.7499018305834269
Batch 3 started...
auc of this val set is 0.7611603831779421
Batch 4 started...


[32m[I 2022-02-05 12:57:41,234][0m Trial 0 finished with value: 0.7496260949534103 and parameters: {'learning_rate': 0.012, 'depth': 9, 'l2_leaf_reg': 1.0, 'min_child_samples': 8}. Best is trial 0 with value: 0.7496260949534103.[0m


auc of this val set is 0.7528187311580716
Batch 0 started...
auc of this val set is 0.7340739653686192
Batch 1 started...
auc of this val set is 0.7445384295951492
Batch 2 started...
auc of this val set is 0.7429068531706837
Batch 3 started...
auc of this val set is 0.7588902275333848
Batch 4 started...


[32m[I 2022-02-05 12:58:41,679][0m Trial 1 finished with value: 0.7460739409280626 and parameters: {'learning_rate': 0.010000000000000002, 'depth': 14, 'l2_leaf_reg': 5.0, 'min_child_samples': 4}. Best is trial 0 with value: 0.7496260949534103.[0m


auc of this val set is 0.7499602289724757
Batch 0 started...
auc of this val set is 0.7344310760805707
Batch 1 started...
auc of this val set is 0.7442754102564404
Batch 2 started...
auc of this val set is 0.7516972853598982
Batch 3 started...
auc of this val set is 0.7545749780036761
Batch 4 started...


[32m[I 2022-02-05 12:59:38,149][0m Trial 2 finished with value: 0.7475063129310224 and parameters: {'learning_rate': 0.013000000000000001, 'depth': 9, 'l2_leaf_reg': 3.0, 'min_child_samples': 16}. Best is trial 0 with value: 0.7496260949534103.[0m


auc of this val set is 0.7525528149545272
Batch 0 started...
auc of this val set is 0.7334259797447464
Batch 1 started...
auc of this val set is 0.7491310155118089
Batch 2 started...
auc of this val set is 0.7454766102024006
Batch 3 started...
auc of this val set is 0.7609240206833494
Batch 4 started...


[32m[I 2022-02-05 13:00:11,820][0m Trial 3 finished with value: 0.7476477761367818 and parameters: {'learning_rate': 0.004, 'depth': 12, 'l2_leaf_reg': 5.5, 'min_child_samples': 4}. Best is trial 0 with value: 0.7496260949534103.[0m


auc of this val set is 0.7492812545416034
Batch 0 started...
auc of this val set is 0.734936782183461
Batch 1 started...
auc of this val set is 0.7463027344558977
Batch 2 started...
auc of this val set is 0.744606814312091
Batch 3 started...
auc of this val set is 0.759692805853994
Batch 4 started...


[32m[I 2022-02-05 13:00:44,923][0m Trial 4 finished with value: 0.7467087547395375 and parameters: {'learning_rate': 0.004, 'depth': 12, 'l2_leaf_reg': 5.5, 'min_child_samples': 8}. Best is trial 0 with value: 0.7496260949534103.[0m


auc of this val set is 0.7480046368922432
Batch 0 started...
auc of this val set is 0.7301166740572764
Batch 1 started...
auc of this val set is 0.7410146537775353
Batch 2 started...
auc of this val set is 0.7405878604756857
Batch 3 started...
auc of this val set is 0.7555060046044546
Batch 4 started...


[32m[I 2022-02-05 13:02:21,646][0m Trial 5 finished with value: 0.7414623526249826 and parameters: {'learning_rate': 0.011, 'depth': 15, 'l2_leaf_reg': 4.0, 'min_child_samples': 1}. Best is trial 0 with value: 0.7496260949534103.[0m


auc of this val set is 0.7400865702099613
Batch 0 started...
auc of this val set is 0.7313086921211818
Batch 1 started...
auc of this val set is 0.7459247199388598
Batch 2 started...
auc of this val set is 0.7443801750798836
Batch 3 started...
auc of this val set is 0.761621791280342
Batch 4 started...


[32m[I 2022-02-05 13:02:51,427][0m Trial 6 finished with value: 0.7459943576759865 and parameters: {'learning_rate': 0.004, 'depth': 12, 'l2_leaf_reg': 1.5, 'min_child_samples': 32}. Best is trial 0 with value: 0.7496260949534103.[0m


auc of this val set is 0.7467364099596655
Batch 0 started...
auc of this val set is 0.7366718195615017
Batch 1 started...
auc of this val set is 0.7454817036953585
Batch 2 started...
auc of this val set is 0.7414818783800989
Batch 3 started...
auc of this val set is 0.7621265287380441
Batch 4 started...


[32m[I 2022-02-05 13:03:23,562][0m Trial 7 finished with value: 0.7470855168584787 and parameters: {'learning_rate': 0.002, 'depth': 11, 'l2_leaf_reg': 5.5, 'min_child_samples': 8}. Best is trial 0 with value: 0.7496260949534103.[0m


auc of this val set is 0.7496656539173902
Batch 0 started...
auc of this val set is 0.7305829436957115
Batch 1 started...
auc of this val set is 0.7429276240858007
Batch 2 started...
auc of this val set is 0.7360043544279241
Batch 3 started...
auc of this val set is 0.7542488664592715
Batch 4 started...


[32m[I 2022-02-05 13:05:13,216][0m Trial 8 finished with value: 0.7424110135933288 and parameters: {'learning_rate': 0.012, 'depth': 15, 'l2_leaf_reg': 2.5, 'min_child_samples': 1}. Best is trial 0 with value: 0.7496260949534103.[0m


auc of this val set is 0.7482912792979363
Batch 0 started...
auc of this val set is 0.7272304562113461
Batch 1 started...
auc of this val set is 0.7379292920251439
Batch 2 started...
auc of this val set is 0.7411212508693867
Batch 3 started...
auc of this val set is 0.7526030239410624
Batch 4 started...


[32m[I 2022-02-05 13:06:44,736][0m Trial 9 finished with value: 0.7419957780667861 and parameters: {'learning_rate': 0.016, 'depth': 14, 'l2_leaf_reg': 2.5, 'min_child_samples': 8}. Best is trial 0 with value: 0.7496260949534103.[0m


auc of this val set is 0.7510948672869912


In [42]:
study.best_trial.params

{'learning_rate': 0.012,
 'depth': 9,
 'l2_leaf_reg': 1.0,
 'min_child_samples': 8}

## Stacking maybe

In [28]:
def Stacker(model, model_name, fold, train_df, test_df):  
    test_preds = np.zeros(test.shape[0])
    train_preds = np.zeros(train.shape[0])

    kf = StratifiedKFold(n_splits=fold,random_state=48,shuffle=True)
    auc=[]
    n=0

    for train_index, test_index in kf.split(train[cols],train['cancelled']):

        X_train, X_valid = train[cols].iloc[train_index], train[cols].iloc[test_index]
        y_train, y_valid = train['cancelled'].iloc[train_index], train['cancelled'].iloc[test_index]

        if model_name == 'catb':
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], silent=True)
        elif model_name == 'rad':
            model.fit(X_train, y_train)
        else:
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

        test_preds += model.predict_proba(test[cols])[:,1]/kf.n_splits
        train_preds += model.predict_proba(train[cols])[:,1]/kf.n_splits

        auc.append(roc_auc_score(y_valid, model.predict_proba(X_valid)[:,1]))
        
        gc.collect()

        print(f"fold: {n+1}, auc: {auc[n]}")
        n+=1
    
    print('Average = ', np.mean(auc))
    return train_preds, test_preds

In [20]:
lgbm = LGBMClassifier()
xgb = XGBClassifier()
catb = CatBoostClassifier()
rad = RandomForestClassifier()

In [21]:
sample_submission = pd.read_csv('sample_submission.csv')

In [22]:
lgbm_train, lgbm_test = Stacker(lgbm, 'lgbm', 5, train_df, test_df)
del lgbm
gc.collect()

sample_submission['cancelled'] = lgbm_test
sample_submission.to_csv('lgbm_test.csv', index=False)

fold: 1, auc: 0.727904793886488
fold: 2, auc: 0.7411527622376419
fold: 3, auc: 0.7424162375940648
fold: 4, auc: 0.7504487075619974
fold: 5, auc: 0.7455636927724554
Average =  0.7414972388105295


In [23]:
catb_train, catb_test = Stacker(catb, 'catb',5, train_df, test_df)
del catb
gc.collect()

sample_submission['cancelled'] = catb_test
sample_submission.to_csv('catb_test.csv', index=False)

fold: 1, auc: 0.737421004606802
fold: 2, auc: 0.7471484897995091
fold: 3, auc: 0.747444332546169
fold: 4, auc: 0.7649773989603366
fold: 5, auc: 0.7559809050782192
Average =  0.7505944261982072


In [24]:
xgb_train, xgb_test = Stacker(xgb, 'xgb', 5, train_df, test_df)
del xgb
gc.collect()

sample_submission['cancelled'] = xgb_test
sample_submission.to_csv('xgb_test.csv', index=False)

fold: 1, auc: 0.7186176627873179
fold: 2, auc: 0.7257331046356104
fold: 3, auc: 0.734505868328033
fold: 4, auc: 0.7417689891976385
fold: 5, auc: 0.7298070152384902
Average =  0.730086528037418


In [29]:
rad_train, rad_test = Stacker(rad, 'rad', 5, train_df, test_df)
del rad
gc.collect()

sample_submission['cancelled'] = rad_test
sample_submission.to_csv('rad_test.csv', index=False)

fold: 1, auc: 0.6899926258445981
fold: 2, auc: 0.704478709487992
fold: 3, auc: 0.700758588903309
fold: 4, auc: 0.705158397435798
fold: 5, auc: 0.6911811184664878
Average =  0.6983138880276369


In [30]:
stack_train = np.concatenate((catb_train.reshape(-1,1), lgbm_train.reshape(-1,1), xgb_train.reshape(-1,1)), axis = 1)
stack_test = np.concatenate((catb_test.reshape(-1,1), lgbm_test.reshape(-1,1), xgb_test.reshape(-1,1)), axis = 1)

gc.collect()

stack_train = pd.DataFrame(stack_train, columns = ['catb', 'lgbm', 'xgb'])
stack_test = pd.DataFrame(stack_test, columns = ['catb', 'lgbm', 'xgb'])


In [31]:
stack_train.to_csv('stack_train.csv', index=True)
stack_test.to_csv('stack_test.csv', index=True)

In [38]:
y = train_df['cancelled'].copy()

from sklearn.linear_model import RidgeCV, LogisticRegressionCV, RidgeClassifierCV

train_preds = np.zeros(stack_train.shape[0])
test_preds = np.zeros(stack_test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
auc=[]

n=0
for train_index, test_index in kf.split(stack_train, y):
    
    X_train, X_valid = stack_train.iloc[train_index], stack_train.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    lr = LogisticRegressionCV()
    lr.fit(X_train, y_train)
    
    train_preds += lr.predict_proba(stack_train)[:,1]/kf.n_splits
    test_preds += lr.predict_proba(stack_test)[:,1]/kf.n_splits
    
    auc.append(roc_auc_score(y_valid, lr.predict_proba(X_valid)[:,1]))
    gc.collect()
        
    print(f"fold: {n+1}, auc: {auc[n]}")
    n+=1

fold: 1, auc: 0.9376369383805297
fold: 2, auc: 0.9405079539490304
fold: 3, auc: 0.9470187293076485
fold: 4, auc: 0.9451630613714233
fold: 5, auc: 0.938182738667014


In [39]:
sample_submission.cancelled = test_preds
sample_submission.to_csv('stacks2.csv', index=False)