# EDA + Data Preprocessing + TabNet PyTorch + Stacking Models ✅

## Rider-Driven Cancellation Prediction 🛵

## Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import xgboost as xgb
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold
import gc
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier
gc.enable()

# Loading Data...

In [2]:
# Both competition files are read with the same dtype override.
_csv_dtypes = {'major': str}
train_df = pd.read_csv('../input/cascade-cup-22/train.csv', dtype=_csv_dtypes)
test_df = pd.read_csv('../input/cascade-cup-22/test.csv', dtype=_csv_dtypes)

In [3]:
train_df.shape

(450000, 20)

In [4]:
test_df.shape

(144844, 16)

In [5]:
train_df.head()

Unnamed: 0,order_time,order_id,order_date,allot_time,accept_time,pickup_time,delivered_time,rider_id,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,cancelled,undelivered_orders,lifetime_order_count,reassignment_method,reassignment_reason,reassigned_order,session_time,cancelled_time
0,2021-01-26 02:21:35,556753,2021-01-26 00:00:00,2021-01-26 02:21:59,2021-01-26 02:22:08,2021-01-26 02:32:51,2021-01-26 02:49:47,11696,1.5666,2.65,46.0,46.0,0,0.0,621.0,,,,,
1,2021-01-26 02:33:16,556754,2021-01-26 00:00:00,2021-01-26 02:33:57,2021-01-26 02:34:45,2021-01-26 02:50:25,2021-01-26 03:11:15,18117,2.5207,2.76,8.0,8.0,0,0.0,105.0,,,,3.266667,
2,2021-01-26 02:39:49,556755,2021-01-26 00:00:00,2021-01-26 02:39:57,2021-01-26 02:40:13,2021-01-26 02:56:00,2021-01-26 03:12:46,18623,2.2074,4.8,1.0,1.0,0,0.0,66.0,,,,9.816667,
3,2021-01-26 02:47:53,556756,2021-01-26 00:00:00,2021-01-26 02:48:25,2021-01-26 02:49:06,2021-01-26 03:21:51,2021-01-26 03:41:05,15945,2.1894,6.38,1.0,1.0,0,0.0,127.0,,,,17.533333,
4,2021-01-26 03:06:30,556757,2021-01-26 00:00:00,2021-01-26 03:07:21,2021-01-26 03:07:57,2021-01-26 03:31:38,2021-01-26 04:00:15,17589,2.787,4.01,34.0,34.0,0,0.0,84.0,,,,1.35,


In [6]:
test_df.head()

Unnamed: 0,order_time,order_id,order_date,allot_time,accept_time,rider_id,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,undelivered_orders,lifetime_order_count,reassignment_method,reassignment_reason,reassigned_order,session_time
0,2021-02-06 10:03:24,130231,2021-02-06 00:00:00,2021-02-06 10:03:49,2021-02-06 10:04:15,12884,1.6585,4.54,216.0,215.0,1.0,747.0,,,,273.4
1,2021-02-06 10:03:26,130232,2021-02-06 00:00:00,2021-02-06 10:03:27,2021-02-06 10:03:36,3541,2.0709,5.84,52.0,52.0,0.0,75.0,,,,252.1
2,2021-02-06 10:03:27,130233,2021-02-06 00:00:00,2021-02-06 10:04:14,2021-02-06 10:05:34,603,1.3884,0.99,289.0,289.0,0.0,2214.0,,,,241.383333
3,2021-02-06 10:03:29,130234,2021-02-06 00:00:00,2021-02-06 10:03:30,2021-02-06 10:03:53,3414,1.9039,2.59,125.0,122.0,3.0,1020.0,,,,291.933333
4,2021-02-06 10:03:35,130235,2021-02-06 00:00:00,2021-02-06 10:03:43,2021-02-06 10:04:43,1426,0.8275,0.94,352.0,350.0,2.0,7284.0,,,,247.133333


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450000 entries, 0 to 449999
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_time            450000 non-null  object 
 1   order_id              450000 non-null  int64  
 2   order_date            450000 non-null  object 
 3   allot_time            450000 non-null  object 
 4   accept_time           449843 non-null  object 
 5   pickup_time           447579 non-null  object 
 6   delivered_time        444782 non-null  object 
 7   rider_id              450000 non-null  int64  
 8   first_mile_distance   450000 non-null  float64
 9   last_mile_distance    450000 non-null  float64
 10  alloted_orders        433052 non-null  float64
 11  delivered_orders      432659 non-null  float64
 12  cancelled             450000 non-null  int64  
 13  undelivered_orders    432659 non-null  float64
 14  lifetime_order_count  449947 non-null  float64
 15  

In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144844 entries, 0 to 144843
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_time            144844 non-null  object 
 1   order_id              144844 non-null  int64  
 2   order_date            144844 non-null  object 
 3   allot_time            144844 non-null  object 
 4   accept_time           144776 non-null  object 
 5   rider_id              144844 non-null  int64  
 6   first_mile_distance   144844 non-null  float64
 7   last_mile_distance    144844 non-null  float64
 8   alloted_orders        140071 non-null  float64
 9   delivered_orders      139960 non-null  float64
 10  undelivered_orders    139960 non-null  float64
 11  lifetime_order_count  144066 non-null  float64
 12  reassignment_method   4632 non-null    object 
 13  reassignment_reason   4635 non-null    object 
 14  reassigned_order      4635 non-null    float64
 15  

In [9]:
train_df.describe()

Unnamed: 0,order_id,rider_id,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,cancelled,undelivered_orders,lifetime_order_count,reassigned_order,session_time
count,450000.0,450000.0,450000.0,450000.0,433052.0,432659.0,450000.0,432659.0,449947.0,13753.0,446325.0
mean,369143.080767,7763.244016,1.229889,2.968873,104.620909,103.950448,0.011596,0.764165,853.640664,1.0,220.474779
std,131146.906408,5592.880135,0.846183,1.884124,90.135492,89.639646,0.107057,1.066473,1502.976162,0.0,176.713853
min,118350.0,0.0,0.000134,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
25%,257342.75,2805.0,0.539575,1.47,36.0,36.0,0.0,0.0,165.0,1.0,84.1
50%,369842.5,6754.0,1.1387,2.67,81.0,81.0,0.0,0.0,396.0,1.0,175.55
75%,482342.25,11965.0,1.853,4.22,147.0,146.0,0.0,1.0,948.0,1.0,316.766667
max,594842.0,21566.0,42.0381,22.41,567.0,562.0,1.0,9.0,30469.0,1.0,1298.966667


In [10]:
test_df.describe()

Unnamed: 0,order_id,rider_id,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,undelivered_orders,lifetime_order_count,reassigned_order,session_time
count,144844.0,144844.0,144844.0,144844.0,140071.0,139960.0,139960.0,144066.0,4635.0,139790.0
mean,74594.70161,5050.498019,1.317988,2.846224,105.236402,104.522542,0.79612,827.877716,1.0,236.664557
std,45018.049343,3587.392934,0.852682,1.832061,89.838188,89.347307,1.091238,1511.791891,0.0,187.018749
min,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
25%,36210.75,1915.0,0.6496,1.36,37.0,36.0,0.0,132.0,1.0,88.8
50%,72421.5,4513.0,1.2451,2.54,83.0,82.0,0.0,356.0,1.0,190.258333
75%,108632.25,7634.0,1.9499,4.12,148.0,147.0,1.0,913.0,1.0,354.008333
max,156724.0,13984.0,54.6902,20.68,558.0,553.0,10.0,30469.0,1.0,1210.316667


In [11]:
train_df.isnull().sum()

order_time                   0
order_id                     0
order_date                   0
allot_time                   0
accept_time                157
pickup_time               2421
delivered_time            5218
rider_id                     0
first_mile_distance          0
last_mile_distance           0
alloted_orders           16948
delivered_orders         17341
cancelled                    0
undelivered_orders       17341
lifetime_order_count        53
reassignment_method     436256
reassignment_reason     436247
reassigned_order        436247
session_time              3675
cancelled_time          444782
dtype: int64

**There are a lot of null values that need to be fixed.** 🤯

In [12]:
train_df.duplicated().sum()

1

In [13]:
train_df = train_df.drop_duplicates()

In [58]:
px.bar(train_df.cancelled.value_counts())

**There is a huge class imbalance so we will use Stratified K-Fold for training our models.**

## Further Steps:

1. Parse the dates and times and create columns containing time differences between accept and deliver and so on

2. Fix the problem of null values (huge problem)

3. Make a column for the total distance travelled by the rider

4. Use the rider's previous history (delivered orders as a fraction of allotted orders) to gauge the rider's commitment

5. Use the dates to classify into weekends and weekdays

In [15]:
train = train_df.copy()
test = test_df.copy()

In [16]:
# cols_to_drop = []
# for col in train.columns:
#     if col not in test.columns:
#         cols_to_drop.append(col)

In [17]:
cols_to_drop = ['pickup_time', 'delivered_time', 'cancelled_time']

> These columns are dropped because they are not present in the test dataframe.

**Parsing Dates from object datatype to DateTime format**

In [18]:
def parse_dates(train):
    """Return a copy of ``train`` with its timestamp columns parsed to datetimes.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with the string columns ``order_time``, ``order_date``,
        ``allot_time`` and ``accept_time`` written as ``YYYY-MM-DD HH:MM:SS``.
        Despite the parameter name, the notebook calls this on both the train
        and the test frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with the four columns converted by ``pd.to_datetime``;
        missing values come back as ``NaT``. The input frame is not modified.
    """
    # Spell the time part out as %H:%M:%S: %X means "the locale's time
    # representation", so it is not guaranteed to match HH:MM:SS everywhere.
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    parsed = train.copy()
    for column in ('order_time', 'order_date', 'allot_time', 'accept_time'):
        parsed[column] = pd.to_datetime(parsed[column], format=timestamp_format)
    return parsed

In [19]:
train = parse_dates(train)
test = parse_dates(test)

In [20]:
train = train.drop(cols_to_drop, axis=1)

In [21]:
train.head()

Unnamed: 0,order_time,order_id,order_date,allot_time,accept_time,rider_id,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,cancelled,undelivered_orders,lifetime_order_count,reassignment_method,reassignment_reason,reassigned_order,session_time
0,2021-01-26 02:21:35,556753,2021-01-26,2021-01-26 02:21:59,2021-01-26 02:22:08,11696,1.5666,2.65,46.0,46.0,0,0.0,621.0,,,,
1,2021-01-26 02:33:16,556754,2021-01-26,2021-01-26 02:33:57,2021-01-26 02:34:45,18117,2.5207,2.76,8.0,8.0,0,0.0,105.0,,,,3.266667
2,2021-01-26 02:39:49,556755,2021-01-26,2021-01-26 02:39:57,2021-01-26 02:40:13,18623,2.2074,4.8,1.0,1.0,0,0.0,66.0,,,,9.816667
3,2021-01-26 02:47:53,556756,2021-01-26,2021-01-26 02:48:25,2021-01-26 02:49:06,15945,2.1894,6.38,1.0,1.0,0,0.0,127.0,,,,17.533333
4,2021-01-26 03:06:30,556757,2021-01-26,2021-01-26 03:07:21,2021-01-26 03:07:57,17589,2.787,4.01,34.0,34.0,0,0.0,84.0,,,,1.35


# Some Basic Feature Engineering 👨‍🔬

In [22]:
train['accept_order_diff'] = (train['accept_time'] - train['order_time']).dt.total_seconds()

In [23]:
test['accept_order_diff'] = (test['accept_time'] - test['order_time']).dt.total_seconds()

In [24]:
train['total_dist'] = train.first_mile_distance + train.last_mile_distance
test['total_dist'] = test.first_mile_distance + test.last_mile_distance

In [25]:
train.isnull().sum()

order_time                   0
order_id                     0
order_date                   0
allot_time                   0
accept_time                157
rider_id                     0
first_mile_distance          0
last_mile_distance           0
alloted_orders           16948
delivered_orders         17341
cancelled                    0
undelivered_orders       17341
lifetime_order_count        53
reassignment_method     436256
reassignment_reason     436247
reassigned_order        436247
session_time              3675
accept_order_diff          157
total_dist                   0
dtype: int64

In [26]:
test.isnull().sum()

order_time                   0
order_id                     0
order_date                   0
allot_time                   0
accept_time                 68
rider_id                     0
first_mile_distance          0
last_mile_distance           0
alloted_orders            4773
delivered_orders          4884
undelivered_orders        4884
lifetime_order_count       778
reassignment_method     140212
reassignment_reason     140209
reassigned_order        140209
session_time              5054
accept_order_diff           68
total_dist                   0
dtype: int64

In [27]:
train.reassigned_order = train['reassigned_order'].fillna(value = 0)
test.reassigned_order = test['reassigned_order'].fillna(value = 0)

In [28]:
train.reassignment_method = train.reassignment_method.fillna(value = 'none')
test.reassignment_method = test.reassignment_method.fillna(value = 'none')

In [29]:
train.reassignment_reason = train.reassignment_reason.fillna(value = 'none')
test.reassignment_reason = test.reassignment_reason.fillna(value = 'none')

In [30]:
train.reassigned_order.value_counts()

0.0    436247
1.0     13752
Name: reassigned_order, dtype: int64

In [31]:
test.reassigned_order.value_counts()

0.0    140209
1.0      4635
Name: reassigned_order, dtype: int64

In [32]:
train.reassignment_method.value_counts()

none      436256
auto       13382
manual       361
Name: reassignment_method, dtype: int64

In [33]:
train.reassignment_reason.unique()

array(['none', 'Reassignment Request from SE portal.',
       'Auto Reassignment basis Inaction. coreengine.tasks.repush_order_to_aa_bucket',
       'Reassign'], dtype=object)

In [34]:
train.lifetime_order_count = train.lifetime_order_count.fillna(0)
test.lifetime_order_count = test.lifetime_order_count.fillna(0)

### A feature to store if this is the first order of the rider

In [35]:
def first_order_set(row):
    """Flag whether the row is the rider's first order.

    Parameters
    ----------
    row : object exposing ``lifetime_order_count``
        In the notebook this is a DataFrame row passed by ``apply(axis=1)``.

    Returns
    -------
    int
        1 when the rider has no previous orders on record
        (``lifetime_order_count`` is not greater than 0), otherwise 0.
    """
    # The flag used to be 1 for riders WITH history, which is the opposite of
    # what the name "first_order" says.
    return 0 if row.lifetime_order_count > 0 else 1

# Derive the flag row by row for both frames with the same helper.
for frame in (train, test):
    frame['first_order'] = frame.apply(first_order_set, axis=1)

In [36]:
train.delivered_orders = train.delivered_orders.fillna(0)
train.undelivered_orders = train.undelivered_orders.fillna(0)
train.alloted_orders = train.alloted_orders.fillna(0)

In [37]:
# Missing rider counters are encoded as 0 -- the same sentinel the train frame
# uses -- so a missing value means the same thing at fit time and predict time.
# (The 0/0 this produces in delivered_fraction is NaN, which is filled with 0 afterwards.)
test.delivered_orders = test.delivered_orders.fillna(0)
test.undelivered_orders = test.undelivered_orders.fillna(0)
test.alloted_orders = test.alloted_orders.fillna(0)

In [38]:
train.session_time = train.session_time.fillna(train.session_time.median())

In [39]:
test.session_time = test.session_time.fillna(test.session_time.median())

In [40]:
train.isnull().sum()

order_time                0
order_id                  0
order_date                0
allot_time                0
accept_time             157
rider_id                  0
first_mile_distance       0
last_mile_distance        0
alloted_orders            0
delivered_orders          0
cancelled                 0
undelivered_orders        0
lifetime_order_count      0
reassignment_method       0
reassignment_reason       0
reassigned_order          0
session_time              0
accept_order_diff       157
total_dist                0
first_order               0
dtype: int64

In [41]:
test.isnull().sum()

order_time               0
order_id                 0
order_date               0
allot_time               0
accept_time             68
rider_id                 0
first_mile_distance      0
last_mile_distance       0
alloted_orders           0
delivered_orders         0
undelivered_orders       0
lifetime_order_count     0
reassignment_method      0
reassignment_reason      0
reassigned_order         0
session_time             0
accept_order_diff       68
total_dist               0
first_order              0
dtype: int64

### This feature flags orders whose accept time is null, i.e. orders that were never accepted by the rider

In [42]:
def cancel_before_accept(row):
    """Return 1 when the row's ``accept_time`` is missing, otherwise 0."""
    accepted_at = row['accept_time']
    return 1 if pd.isna(accepted_at) else 0

# Vectorised equivalent of applying cancel_before_accept to every row:
# 1 where accept_time is missing, 0 otherwise -- without a Python call per row.
train['cancel_before_accept'] = train['accept_time'].isna().astype(int)
test['cancel_before_accept'] = test['accept_time'].isna().astype(int)

In [43]:
train.accept_order_diff = train.accept_order_diff.fillna(-1)
test.accept_order_diff = test.accept_order_diff.fillna(-1)

In [44]:
train.isnull().sum()

order_time                0
order_id                  0
order_date                0
allot_time                0
accept_time             157
rider_id                  0
first_mile_distance       0
last_mile_distance        0
alloted_orders            0
delivered_orders          0
cancelled                 0
undelivered_orders        0
lifetime_order_count      0
reassignment_method       0
reassignment_reason       0
reassigned_order          0
session_time              0
accept_order_diff         0
total_dist                0
first_order               0
cancel_before_accept      0
dtype: int64

In [45]:
test.isnull().sum()

order_time               0
order_id                 0
order_date               0
allot_time               0
accept_time             68
rider_id                 0
first_mile_distance      0
last_mile_distance       0
alloted_orders           0
delivered_orders         0
undelivered_orders       0
lifetime_order_count     0
reassignment_method      0
reassignment_reason      0
reassigned_order         0
session_time             0
accept_order_diff        0
total_dist               0
first_order              0
cancel_before_accept     0
dtype: int64

In [46]:
px.box(train, y="total_dist")

In [47]:
def large_dist(row):
    """Return 1 when the row's ``total_dist`` is strictly greater than 10, otherwise 0."""
    exceeds_threshold = row.total_dist > 10
    return 1 if exceeds_threshold else 0

# Vectorised equivalent of applying large_dist to every row (1 when total_dist > 10).
train['large_dist'] = (train['total_dist'] > 10).astype(int)
test['large_dist'] = (test['total_dist'] > 10).astype(int)
# Day of week taken from the parsed order date.
train['weekday'] = train.order_date.dt.weekday
test['weekday'] = test.order_date.dt.weekday

In [48]:
train['delivered_fraction'] = train.delivered_orders / train.alloted_orders
test['delivered_fraction'] = test.delivered_orders / test.alloted_orders

In [49]:
train.delivered_fraction = train.delivered_fraction.fillna(0)
test.delivered_fraction = test.delivered_fraction.fillna(0)

In [50]:
train2 = train.drop(columns=['order_time', 'rider_id','order_id', 'order_date', 'allot_time', 'accept_time', 'reassignment_method', 'reassignment_reason'])

In [51]:
test2 = test.drop(columns=['order_time', 'rider_id', 'order_date', 'allot_time', 'accept_time', 'reassignment_method', 'reassignment_reason'])

# Pytorch Tabnet 🔥

In [52]:
!pip install pytorch_tabnet

Collecting pytorch_tabnet
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [53]:
import torch

In [54]:
# Keyword arguments later unpacked into TabNetClassifier(**tabnet_params).
tabnet_params = {
    "n_steps": 1,
    "optimizer_fn": torch.optim.Adam,
    "optimizer_params": {"lr": 1e-2, "weight_decay": 5e-4},
    # learning-rate scheduler settings
    "scheduler_params": {"step_size": 1, "gamma": 0.7},
    "scheduler_fn": torch.optim.lr_scheduler.StepLR,
    "mask_type": 'entmax',
    "verbose": 5,
}

### Scaling the values with MinMax Scaler
I have not removed the outliers or fixed them. Doing that can improve the scores.

In [55]:
scaler = MinMaxScaler()
cols = train2.drop('cancelled', axis=1).columns
train2[cols] = scaler.fit_transform(train2[cols])
test2[cols] = scaler.transform(test2[cols])

In [56]:
X = train2.drop(['cancelled'],axis=1).values
y = train2.cancelled.values
X_test = test2[cols].values

### Using Stratified K-fold to train the TabNet model

In [None]:
from sklearn.model_selection import StratifiedKFold
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# Stratified 5-fold CV: a fresh TabNetClassifier is fitted per fold and the folds'
# positive-class probabilities on X_test are averaged into `preds`.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
preds = np.zeros(test2.shape[0])
for fold, (fit_rows, valid_rows) in enumerate(kf.split(X, y)):
    print(20*"*")
    print("Fold {}:".format(fold))
    X_train, y_train = X[fit_rows], y[fit_rows]
    X_valid, y_valid = X[valid_rows], y[valid_rows]

    # Everything passed to fit() besides the training arrays.
    fit_options = dict(
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=['train', 'valid'],
        eval_metric=['auc'],
        max_epochs=100,
        patience=5,
        batch_size=1024*10,
        virtual_batch_size=128*10,
        num_workers=0,
        weights=1,
        drop_last=False,
    )
    clf = TabNetClassifier(**tabnet_params)
    clf.fit(X_train=X_train, y_train=y_train, **fit_options)

    fold_positive_proba = clf.predict_proba(X_test)[:, 1]
    preds += fold_positive_proba / kf.n_splits
    print(preds.shape)

In [None]:
train2ns = train2
test2ns = test2

In [None]:
val = np.zeros(train2.shape[0])
pred_xgb = np.zeros(test2.shape[0])
pred_cb = np.zeros(test2.shape[0])
pred_rf = np.zeros(test2.shape[0])
pred_lgbm = np.zeros(test2.shape[0])
x = train2.drop(['cancelled'],axis=1).values
y = train2.cancelled.values
# folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=192)

# Creating Tree-based Models

In [None]:
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

model_lgm = LGBMClassifier()
model_rf = RandomForestClassifier()
model_cb = CatBoostClassifier()
model_xgb =  xgb.XGBClassifier()


### Stratified K-fold training function for models

In [None]:
def kfoldtraining(model):
    """Fit ``model`` with stratified 5-fold CV and return stacking-ready predictions.

    Uses the notebook-level names ``x`` / ``y`` (training matrix and labels),
    ``train2`` / ``test2`` (preprocessed frames) and ``val`` (shared buffer of
    validation predictions, updated in place).

    Parameters
    ----------
    model : estimator
        Must accept ``eval_set``, ``early_stopping_rounds`` and ``verbose`` in
        ``fit`` and expose ``predict_proba``. The same instance is refitted on
        every fold.

    Returns
    -------
    train_pred : numpy.ndarray
        Out-of-fold positive-class probability for every training row: each row
        is scored only by the fold model that did not train on it.
    test_pred : numpy.ndarray
        Positive-class probability for every test row, averaged over the folds.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=192)
    train_pred = np.zeros(train2.shape[0])
    test_pred = np.zeros(test2.shape[0])
    # The test matrix does not change between folds, so build it once.
    x_test = test2.drop(['order_id'], axis=1).values
    for fold_index, (train_index, val_index) in enumerate(folds.split(x, y)):
        print('Batch {} started...'.format(fold_index))
        gc.collect()
        model.fit(x[train_index], y[train_index],
                  eval_set=[(x[val_index], y[val_index])],
                  early_stopping_rounds=200,
                  verbose=400)
        fold_val_pred = model.predict_proba(x[val_index])[:, 1]
        val[val_index] = fold_val_pred
        print('auc of this val set is {}'.format(roc_auc_score(y[val_index], fold_val_pred)))
        # Keep only held-out predictions: scoring rows the fold model was fitted
        # on would leak their labels into the stacking features.
        train_pred[val_index] = fold_val_pred
        test_pred += model.predict_proba(x_test)[:, 1] / folds.n_splits

    return train_pred, test_pred

In [None]:
def otherkfoldtraining(model):
    """Stratified 5-fold CV for estimators whose ``fit`` takes only ``(X, y)``.

    Uses the notebook-level names ``x`` / ``y`` (training matrix and labels),
    ``train2`` / ``test2`` (preprocessed frames) and ``val`` (shared buffer of
    validation predictions, updated in place).

    Parameters
    ----------
    model : estimator
        Must expose ``fit(X, y)`` and ``predict_proba``. The same instance is
        refitted on every fold.

    Returns
    -------
    train_pred : numpy.ndarray
        Out-of-fold positive-class probability for every training row: each row
        is scored only by the fold model that did not train on it.
    test_pred : numpy.ndarray
        Positive-class probability for every test row, averaged over the folds.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=192)
    train_pred = np.zeros(train2.shape[0])
    test_pred = np.zeros(test2.shape[0])
    # The test matrix does not change between folds, so build it once.
    x_test = test2.drop(['order_id'], axis=1).values
    for fold_index, (train_index, val_index) in enumerate(folds.split(x, y)):
        print('Batch {} started...'.format(fold_index))
        gc.collect()
        model.fit(x[train_index], y[train_index])
        fold_val_pred = model.predict_proba(x[val_index])[:, 1]
        val[val_index] = fold_val_pred
        print('auc of this val set is {}'.format(roc_auc_score(y[val_index], fold_val_pred)))
        # Keep only held-out predictions: scoring rows the fold model was fitted
        # on would leak their labels into the stacking features.
        train_pred[val_index] = fold_val_pred
        test_pred += model.predict_proba(x_test)[:, 1] / folds.n_splits

    return train_pred, test_pred

In [None]:
# No rfkfoldtraining helper is defined in this notebook. RandomForestClassifier.fit
# takes no eval_set, so the plain (X, y) helper is the one that fits it.
# pred_rf keeps the fold-averaged test probabilities; rf_train the train-side predictions.
rf_train, pred_rf = otherkfoldtraining(model_rf)

In [None]:
lgbm_train, lgbm_test = kfoldtraining(model_lgm)

In [None]:
cb_train, cb_test = kfoldtraining(model_cb)

In [None]:
xgb_train, xgb_test = kfoldtraining(model_xgb)

In [None]:
submission_df = pd.read_csv('sample_submission.csv')

In [None]:
submission_df.head()

In [None]:
# Blend the fold-averaged test predictions returned by kfoldtraining.
# (pred_lgbm / pred_xgb / pred_cb are only zero-filled placeholders and are never updated.)
submission_df.cancelled = (lgbm_test + xgb_test + 5*cb_test)/7

In [None]:
submission_df.to_csv('w_avg2.csv',index=False)

In [None]:
submission_df.isnull().sum()

I got these parameters from hyperparameter tuning with Optuna.

In [None]:
model_cb2 = CatBoostClassifier(iterations=10000,learning_rate=0.01, l2_leaf_reg=3.5,
                           colsample_bylevel=    0.0962895297660657,depth= 11, boosting_type='Plain',  
                           eval_metric="AUC",use_best_model=True
                           ,random_seed=22,bootstrap_type= "Bernoulli",subsample=0.6927844340277456)

In [None]:
# Train the tuned CatBoost model before using its predictions. kfoldtraining
# passes an eval_set, which model_cb2 needs because it sets use_best_model=True.
cb2_train, pred_cb2 = kfoldtraining(model_cb2)
submission_df.cancelled = pred_cb2
submission_df.to_csv('cb2.csv', index=False)

In [None]:
# Blend with the fold-averaged test predictions returned by kfoldtraining
# (pred_cb / pred_lgbm / pred_xgb are zero-filled placeholders that are never updated).
# NOTE(review): pred_cb2 must hold the test predictions of the trained model_cb2 before this cell runs.
submission_df.cancelled = (5*pred_cb2 + 3*cb_test + lgbm_test + xgb_test)/10
submission_df.to_csv('cb2_avg.csv', index=False)

In [None]:
train2.to_csv('train_preprocessed.csv', index=False)

In [None]:
test2.to_csv('test_preprocessed.csv', index=False)

In [None]:
# NOTE(review): pred_cb3 is not assigned anywhere in the visible notebook, so this cell
# raises NameError on a fresh Restart & Run All -- confirm which model produced it.
submission_df.cancelled = pred_cb3
submission_df.to_csv('cb3.csv', index=False)

In [None]:
# Correlate numeric columns only: `train` still holds datetime and string columns,
# and DataFrame.corr() on recent pandas raises instead of silently dropping them.
px.line(train.select_dtypes(include='number').corr()['cancelled'])

In [None]:
# NOTE(review): pred_cb4 is not assigned anywhere in the visible notebook, so this cell
# raises NameError on a fresh Restart & Run All -- confirm which model produced it.
submission_df.cancelled = pred_cb4
submission_df.to_csv('cb4.csv', index=False)

In [None]:
# cb_test holds the fold-averaged CatBoost test predictions; pred_cb is only a
# zero-filled placeholder that is never updated.
# NOTE(review): pred_cb3 and pred_cb4 are not assigned in the visible notebook -- confirm their source.
submission_df.cancelled = (cb_test+3*pred_cb3+2*pred_cb4)/6

In [None]:
submission_df.to_csv('cb4_avg.csv', index=False)

**Using a diverse range of models for stacking**

In [None]:
model_ext = ExtraTreesClassifier()
model_ada = AdaBoostClassifier()

In [None]:
ext_train, ext_test = otherkfoldtraining(model_ext)

In [None]:
ada_train, ada_test = otherkfoldtraining(model_ada)

# Creating Stack

In [None]:
# One column per base model, in the order: LightGBM, CatBoost, XGBoost, AdaBoost.
_train_level1 = (lgbm_train, cb_train, xgb_train, ada_train)
_test_level1 = (lgbm_test, cb_test, xgb_test, ada_test)

stack_train = np.concatenate([p.reshape(-1, 1) for p in _train_level1], axis=1)
stack_test = np.concatenate([p.reshape(-1, 1) for p in _test_level1], axis=1)

In [None]:
stack_train = pd.DataFrame(stack_train, columns = ['lgbm','catb','xgb','ada'])
stack_test = pd.DataFrame(stack_test, columns = ['lgbm','catb','xgb','ada'])

In [None]:
stack_train.to_csv('stack_train.csv', index=True)
stack_test.to_csv('stack_test.csv', index=True)

In [None]:
# Labels for the meta-learner get their own name. Rebinding the notebook-wide `y`
# (a NumPy array indexed positionally by the k-fold helpers) to a pandas Series
# would make `y[train_index]` a label lookup if those helpers were re-run.
stack_y = train['cancelled'].copy()

from sklearn.linear_model import LogisticRegression,LogisticRegressionCV, RidgeClassifierCV

train_preds = np.zeros(stack_train.shape[0])
test_preds = np.zeros(stack_test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
auc=[]

# Fit one logistic-regression meta-model per fold on the base-model predictions
# and average the folds' probabilities for the train and test stacks.
for n, (train_index, test_index) in enumerate(kf.split(stack_train, stack_y)):

    X_train, X_valid = stack_train.iloc[train_index], stack_train.iloc[test_index]
    y_train, y_valid = stack_y.iloc[train_index], stack_y.iloc[test_index]

    lr = LogisticRegressionCV()
    lr.fit(X_train, y_train)

    train_preds += lr.predict_proba(stack_train)[:,1]/kf.n_splits
    test_preds += lr.predict_proba(stack_test)[:,1]/kf.n_splits

    auc.append(roc_auc_score(y_valid, lr.predict_proba(X_valid)[:,1]))
    gc.collect()

    print(f"fold: {n+1}, auc: {auc[n]}")

In [None]:
submission_df = pd.read_csv('sample_submission.csv')
submission_df['cancelled'] = test_preds
submission_df.to_csv('stack3_lr.csv', index=False)

In [None]:
submission_df.head()