# KHDLUD - Team 10



# kaggle competion:
https://www.kaggle.com/c/santander-customer-transaction-prediction#
    
# solution git:
https://github.com/KazukiOnodera/Santander-Customer-Transaction-Prediction/blob/master/final_solution/akiyama/py/lgb_train_and_predict.py

# golf src:
https://github.com/KazukiOnodera/santander-customer-transaction-prediction/blob/master/py/990_2nd_place_solution_golf.py

# Các bước thực hiện
    1
    2
    3
    4
    5
    6
    7

# import lib

In [1]:
import numpy as np
import pandas as pd
import gc, os

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from multiprocessing import cpu_count
from tqdm import tqdm

In [2]:
# const
test_original_data  = '../input/test.csv'
train_original_data = '../input/train.csv'
sample_submission_file = '../input/sample_submission.csv'
SUBMIT_FILE_PATH = f'../output/2nd-place-solution.csv.gz'

In [3]:
# config

drop_vars = [7,
            10,
            17,
            27,
            29,
            30,
            38,
            41,
            46,
            96,
            100,
            103,
            126,
            158,
            185]

var_len = 200 - len(drop_vars)
NFOLD = 10
NROUND = 1600

## Bước 1: remove fake sample

In [4]:
%%time

te_ = pd.read_csv(test_original_data).drop(['ID_code'], axis=1).values

unique_samples = []
unique_count = np.zeros_like(te_)
for feature in tqdm(range(te_.shape[1])):
    _, index_, count_ = np.unique(te_[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] += 1


100%|██████████| 200/200 [00:04<00:00, 41.31it/s]

Wall time: 7.78 s





In [5]:
%%time

real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]

Wall time: 53 ms


#  set param for training

In [6]:
params = {
    'bagging_freq': 5,
    'bagging_fraction': 1.0,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 1.0,
    'learning_rate': 0.005,
    'max_depth': -1,
    'metric':'binary_logloss',
    'min_data_in_leaf': 30,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 64,
    'num_threads': cpu_count(),
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': -1
    }
SEED = np.random.randint(99999)
np.random.seed(SEED)

## B2: load

In [7]:
%%time

train = pd.read_csv(train_original_data)
test  = pd.read_csv(test_original_data).drop(synthetic_samples_indexes)

X_train = train.iloc[:, 2:].values
y_train = train.target.values

X_test = test.iloc[:, 1:].values

X = np.concatenate([X_train, X_test], axis=0)
del X_train, X_test; gc.collect()

reverse_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 15, 16, 18, 19, 22, 24, 25, 26,
                27, 29, 32, 35, 37, 40, 41, 47, 48, 49, 51, 52, 53, 55, 60, 61,
                62, 65, 66, 67, 69, 70, 71, 74, 78, 79, 82, 84, 89, 90, 91, 94,
                95, 96, 97, 99, 103, 105, 106, 110, 111, 112, 118, 119, 125, 128,
                130, 133, 134, 135, 137, 138, 140, 144, 145, 147, 151, 155, 157,
                159, 161, 162, 163, 164, 167, 168, 170, 171, 173, 175, 176, 179,
                180, 181, 184, 185, 187, 189, 190, 191, 195, 196, 199,
                
                ]

for j in reverse_list:
    X[:, j] *= -1

Wall time: 6.03 s


##  remove vars that not necessary & scaling

In [8]:
%%time

# drop
X = np.delete(X, drop_vars, 1)

# scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)

# count encoding
X_cnt = np.zeros((len(X), var_len * 4))

for j in tqdm(range(var_len)):
    for i in range(1, 4):
        x = np.round(X[:, j], i+1)
        dic = pd.value_counts(x).to_dict()
        X_cnt[:, i+j*4] = pd.Series(x).map(dic)
    x = X[:, j]
    dic = pd.value_counts(x).to_dict()
    X_cnt[:, j*4] = pd.Series(x).map(dic)

# raw + count feature
X_raw = X.copy() # rename for readable
del X; gc.collect()

X = np.zeros((len(X_raw), var_len * 5))
for j in tqdm(range(var_len)):
    X[:, 5*j+1:5*j+5] = X_cnt[:, 4*j:4*j+4]
    X[:, 5*j] = X_raw[:, j]

# treat each var as same
X_train_concat = np.concatenate([
    np.concatenate([
        X[:200000, 5*cnum:5*cnum+5], 
        np.ones((len(y_train), 1)).astype("int")*cnum
    ], axis=1) for cnum in range(var_len)], axis=0)
y_train_concat = np.concatenate([y_train for cnum in range(var_len)], axis=0)

100%|██████████| 185/185 [00:41<00:00,  4.43it/s]
100%|██████████| 185/185 [00:02<00:00, 71.70it/s]


Wall time: 47 s


## stratified

In [9]:
%%time

train_group = np.arange(len(X_train_concat))%200000

id_y = pd.DataFrame(zip(train_group, y_train_concat), 
                    columns=['id', 'y'])

id_y_uq = id_y.drop_duplicates('id').reset_index(drop=True)

def stratified(nfold=5):
    
    id_y_uq0 = id_y_uq[id_y_uq.y==0].sample(frac=1)
    id_y_uq1 = id_y_uq[id_y_uq.y==1].sample(frac=1)
    
    id_y_uq0['g'] = [i%nfold for i in range(len(id_y_uq0))]
    id_y_uq1['g'] = [i%nfold for i in range(len(id_y_uq1))]
    id_y_uq_ = pd.concat([id_y_uq0, id_y_uq1])
    
    id_y_ = pd.merge(id_y[['id']], id_y_uq_, how='left', on='id')
    
    train_idx_list = []
    valid_idx_list = []
    for i in range(nfold):
        train_idx = id_y_[id_y_.g!=i].index
        train_idx_list.append(train_idx)
        valid_idx = id_y_[id_y_.g==i].index
        valid_idx_list.append(valid_idx)
    
    return train_idx_list, valid_idx_list

train_idx_list, valid_idx_list = stratified(NFOLD)

Wall time: 42.9 s


## Train

In [10]:
%%time

models = []
oof = np.zeros(len(id_y))
p_test_all = np.zeros((100000, var_len, NFOLD))
id_y['var'] = np.concatenate([np.ones(200000)*i for i in range(var_len)])

print(NFOLD)
for i in  tqdm(range(NFOLD)):
    
    print(f'building {i}...')
    
    train_idx = train_idx_list[i]
    valid_idx = valid_idx_list[i]
    
    # train
    X_train_cv = X_train_concat[train_idx]
    y_train_cv = y_train_concat[train_idx]
    
    # valid
    X_valid = X_train_concat[valid_idx]
    
    # test
    X_test = np.concatenate([
        np.concatenate([
            X[200000:, 5*cnum:5*cnum+5], 
            np.ones((100000, 1)).astype("int")*cnum
        ], axis=1) for cnum in range(var_len)], axis=0
    )
    print('gen Dataset')
    dtrain = lgb.Dataset(
        X_train_cv, y_train_cv, 
        feature_name=['value', 'count_org', 'count_2', 'count_3', 'count_4', 'varnum'], 
        categorical_feature=['varnum'], free_raw_data=False
    )
    print('Trainning....')
    model = lgb.train(params, train_set=dtrain, num_boost_round=NROUND, verbose_eval=100)
    l = valid_idx.shape[0]
    
    print('Predicting....')
    p_valid = model.predict(X_valid)
    p_test  = model.predict(X_test)
    for j in tqdm(range(var_len)):
        oof[valid_idx] = p_valid
        p_test_all[:, j, i] = p_test[j*100000:(j+1)*100000]
    
    models.append(model)

10


  0%|          | 0/10 [00:00<?, ?it/s]

building 0...
gen Dataset
Trainning....




Predicting....



  0%|          | 0/185 [00:00<?, ?it/s][A
  1%|          | 1/185 [00:00<00:22,  8.20it/s][A
  3%|▎         | 6/185 [00:00<00:06, 28.88it/s][A
  5%|▌         | 10/185 [00:00<00:05, 33.48it/s][A
  8%|▊         | 15/185 [00:00<00:04, 37.75it/s][A
 11%|█         | 20/185 [00:00<00:04, 39.68it/s][A
 14%|█▎        | 25/185 [00:00<00:03, 41.55it/s][A
 16%|█▌        | 30/185 [00:00<00:03, 42.01it/s][A
 19%|█▉        | 35/185 [00:00<00:03, 42.66it/s][A
 22%|██▏       | 40/185 [00:01<00:03, 42.62it/s][A
 24%|██▍       | 45/185 [00:01<00:03, 41.44it/s][A
 27%|██▋       | 50/185 [00:01<00:03, 42.21it/s][A
 30%|██▉       | 55/185 [00:01<00:03, 41.88it/s][A
 32%|███▏      | 60/185 [00:01<00:02, 41.76it/s][A
 35%|███▌      | 65/185 [00:01<00:02, 41.32it/s][A
 38%|███▊      | 70/185 [00:01<00:02, 41.63it/s][A
 41%|████      | 75/185 [00:01<00:02, 41.80it/s][A
 43%|████▎     | 80/185 [00:01<00:02, 42.45it/s][A
 46%|████▌     | 85/185 [00:02<00:02, 41.68it/s][A
 49%|████▊     | 90/18

building 1...
gen Dataset
Trainning....
Predicting....



  0%|          | 0/185 [00:00<?, ?it/s][A
  3%|▎         | 5/185 [00:00<00:04, 41.84it/s][A
  5%|▌         | 10/185 [00:00<00:03, 44.94it/s][A
  8%|▊         | 15/185 [00:00<00:03, 45.83it/s][A
 11%|█         | 20/185 [00:00<00:03, 46.10it/s][A
 14%|█▎        | 25/185 [00:00<00:03, 45.94it/s][A
 16%|█▌        | 30/185 [00:00<00:03, 46.42it/s][A
 19%|█▉        | 35/185 [00:00<00:03, 46.88it/s][A
 22%|██▏       | 40/185 [00:00<00:03, 46.97it/s][A
 24%|██▍       | 45/185 [00:00<00:02, 47.17it/s][A
 27%|██▋       | 50/185 [00:01<00:02, 47.17it/s][A
 30%|██▉       | 55/185 [00:01<00:02, 47.24it/s][A
 32%|███▏      | 60/185 [00:01<00:02, 46.95it/s][A
 35%|███▌      | 65/185 [00:01<00:02, 46.75it/s][A
 38%|███▊      | 70/185 [00:01<00:02, 47.08it/s][A
 41%|████      | 75/185 [00:01<00:02, 46.31it/s][A
 43%|████▎     | 80/185 [00:01<00:02, 46.77it/s][A
 46%|████▌     | 85/185 [00:01<00:02, 46.82it/s][A
 49%|████▊     | 90/185 [00:01<00:02, 47.12it/s][A
 51%|█████▏    | 95/1

building 2...
gen Dataset
Trainning....
Predicting....



  0%|          | 0/185 [00:00<?, ?it/s][A
  2%|▏         | 4/185 [00:00<00:04, 39.02it/s][A
  5%|▍         | 9/185 [00:00<00:04, 42.56it/s][A
  8%|▊         | 14/185 [00:00<00:03, 43.81it/s][A
 10%|█         | 19/185 [00:00<00:03, 44.92it/s][A
 13%|█▎        | 24/185 [00:00<00:03, 45.59it/s][A
 16%|█▌        | 29/185 [00:00<00:03, 45.83it/s][A
 18%|█▊        | 34/185 [00:00<00:03, 46.33it/s][A
 21%|██        | 39/185 [00:00<00:03, 46.39it/s][A
 24%|██▍       | 44/185 [00:00<00:03, 46.43it/s][A
 26%|██▋       | 49/185 [00:01<00:02, 46.26it/s][A
 29%|██▉       | 54/185 [00:01<00:02, 46.47it/s][A
 32%|███▏      | 59/185 [00:01<00:02, 46.74it/s][A
 35%|███▍      | 64/185 [00:01<00:02, 46.87it/s][A
 37%|███▋      | 69/185 [00:01<00:02, 47.03it/s][A
 40%|████      | 74/185 [00:01<00:02, 47.07it/s][A
 43%|████▎     | 79/185 [00:01<00:02, 47.10it/s][A
 45%|████▌     | 84/185 [00:01<00:02, 46.79it/s][A
 48%|████▊     | 89/185 [00:01<00:02, 45.56it/s][A
 51%|█████     | 94/18

building 3...
gen Dataset
Trainning....
Predicting....



  0%|          | 0/185 [00:00<?, ?it/s][A
  2%|▏         | 4/185 [00:00<00:04, 37.74it/s][A
  5%|▍         | 9/185 [00:00<00:04, 41.89it/s][A
  8%|▊         | 14/185 [00:00<00:03, 43.89it/s][A
 10%|█         | 19/185 [00:00<00:03, 45.08it/s][A
 13%|█▎        | 24/185 [00:00<00:03, 44.48it/s][A
 16%|█▌        | 29/185 [00:00<00:03, 44.24it/s][A
 18%|█▊        | 34/185 [00:00<00:03, 44.40it/s][A
 21%|██        | 39/185 [00:00<00:03, 43.30it/s][A
 24%|██▍       | 44/185 [00:01<00:03, 40.86it/s][A
 26%|██▋       | 49/185 [00:01<00:03, 38.23it/s][A
 29%|██▉       | 54/185 [00:01<00:03, 39.56it/s][A
 32%|███▏      | 59/185 [00:01<00:03, 41.39it/s][A
 35%|███▍      | 64/185 [00:01<00:02, 42.55it/s][A
 37%|███▋      | 69/185 [00:01<00:02, 43.21it/s][A
 40%|████      | 74/185 [00:01<00:02, 43.92it/s][A
 43%|████▎     | 79/185 [00:01<00:02, 44.08it/s][A
 45%|████▌     | 84/185 [00:01<00:02, 44.66it/s][A
 48%|████▊     | 89/185 [00:02<00:02, 44.36it/s][A
 51%|█████     | 94/18

building 4...
gen Dataset
Trainning....
Predicting....



  0%|          | 0/185 [00:00<?, ?it/s][A
  3%|▎         | 5/185 [00:00<00:04, 41.49it/s][A
  5%|▌         | 10/185 [00:00<00:03, 44.66it/s][A
  8%|▊         | 15/185 [00:00<00:03, 45.58it/s][A
 11%|█         | 20/185 [00:00<00:03, 46.45it/s][A
 14%|█▎        | 25/185 [00:00<00:03, 46.71it/s][A
 16%|█▌        | 30/185 [00:00<00:03, 46.94it/s][A
 19%|█▉        | 35/185 [00:00<00:03, 47.01it/s][A
 22%|██▏       | 40/185 [00:00<00:03, 46.64it/s][A
 24%|██▍       | 45/185 [00:00<00:02, 46.81it/s][A
 27%|██▋       | 50/185 [00:01<00:02, 47.12it/s][A
 30%|██▉       | 55/185 [00:01<00:02, 47.07it/s][A
 32%|███▏      | 60/185 [00:01<00:02, 47.10it/s][A
 35%|███▌      | 65/185 [00:01<00:02, 47.32it/s][A
 38%|███▊      | 70/185 [00:01<00:02, 47.34it/s][A
 41%|████      | 75/185 [00:01<00:02, 47.43it/s][A
 43%|████▎     | 80/185 [00:01<00:02, 47.42it/s][A
 46%|████▌     | 85/185 [00:01<00:02, 47.28it/s][A
 49%|████▊     | 90/185 [00:01<00:02, 46.91it/s][A
 51%|█████▏    | 95/1

building 5...
gen Dataset
Trainning....
Predicting....



  0%|          | 0/185 [00:00<?, ?it/s][A
  3%|▎         | 5/185 [00:00<00:04, 42.37it/s][A
  5%|▌         | 10/185 [00:00<00:03, 45.31it/s][A
  8%|▊         | 15/185 [00:00<00:03, 44.82it/s][A
 11%|█         | 20/185 [00:00<00:03, 44.91it/s][A
 14%|█▎        | 25/185 [00:00<00:03, 45.25it/s][A
 16%|█▌        | 30/185 [00:00<00:03, 45.46it/s][A
 19%|█▉        | 35/185 [00:00<00:03, 45.59it/s][A
 22%|██▏       | 40/185 [00:00<00:03, 45.68it/s][A
 24%|██▍       | 45/185 [00:00<00:03, 46.00it/s][A
 27%|██▋       | 50/185 [00:01<00:02, 46.36it/s][A
 30%|██▉       | 55/185 [00:01<00:02, 46.74it/s][A
 32%|███▏      | 60/185 [00:01<00:02, 47.00it/s][A
 35%|███▌      | 65/185 [00:01<00:02, 46.65it/s][A
 38%|███▊      | 70/185 [00:01<00:02, 47.05it/s][A
 41%|████      | 75/185 [00:01<00:02, 47.11it/s][A
 43%|████▎     | 80/185 [00:01<00:02, 47.26it/s][A
 46%|████▌     | 85/185 [00:01<00:02, 47.10it/s][A
 49%|████▊     | 90/185 [00:01<00:02, 47.39it/s][A
 51%|█████▏    | 95/1

building 6...
gen Dataset
Trainning....
Predicting....



  0%|          | 0/185 [00:00<?, ?it/s][A
  2%|▏         | 4/185 [00:00<00:05, 33.47it/s][A
  4%|▍         | 8/185 [00:00<00:05, 34.58it/s][A
  6%|▋         | 12/185 [00:00<00:04, 36.63it/s][A
  9%|▊         | 16/185 [00:00<00:04, 36.79it/s][A
 11%|█         | 20/185 [00:00<00:04, 36.88it/s][A
 13%|█▎        | 24/185 [00:00<00:04, 37.70it/s][A
 15%|█▌        | 28/185 [00:00<00:04, 37.20it/s][A
 17%|█▋        | 32/185 [00:00<00:04, 36.34it/s][A
 19%|█▉        | 36/185 [00:00<00:04, 37.03it/s][A
 22%|██▏       | 40/185 [00:01<00:03, 37.41it/s][A
 24%|██▍       | 44/185 [00:01<00:03, 37.94it/s][A
 26%|██▌       | 48/185 [00:01<00:03, 35.72it/s][A
 28%|██▊       | 52/185 [00:01<00:03, 35.77it/s][A
 30%|███       | 56/185 [00:01<00:03, 35.56it/s][A
 32%|███▏      | 60/185 [00:01<00:03, 35.85it/s][A
 35%|███▍      | 64/185 [00:01<00:03, 36.20it/s][A
 37%|███▋      | 68/185 [00:01<00:03, 36.30it/s][A
 39%|███▉      | 72/185 [00:01<00:03, 36.07it/s][A
 41%|████      | 76/18

building 7...
gen Dataset
Trainning....
Predicting....



  0%|          | 0/185 [00:00<?, ?it/s][A
  2%|▏         | 4/185 [00:00<00:05, 33.19it/s][A
  4%|▍         | 8/185 [00:00<00:04, 36.20it/s][A
  6%|▋         | 12/185 [00:00<00:04, 37.20it/s][A
  9%|▊         | 16/185 [00:00<00:04, 36.80it/s][A
 11%|█         | 20/185 [00:00<00:04, 37.76it/s][A
 13%|█▎        | 24/185 [00:00<00:04, 38.12it/s][A
 15%|█▌        | 28/185 [00:00<00:04, 38.72it/s][A
 18%|█▊        | 33/185 [00:00<00:03, 39.29it/s][A
 20%|██        | 37/185 [00:00<00:03, 39.33it/s][A
 22%|██▏       | 41/185 [00:01<00:03, 39.01it/s][A
 24%|██▍       | 45/185 [00:01<00:03, 38.90it/s][A
 27%|██▋       | 50/185 [00:01<00:03, 39.33it/s][A
 29%|██▉       | 54/185 [00:01<00:03, 39.13it/s][A
 31%|███▏      | 58/185 [00:01<00:03, 38.88it/s][A
 34%|███▎      | 62/185 [00:01<00:03, 38.59it/s][A
 36%|███▌      | 66/185 [00:01<00:03, 37.70it/s][A
 38%|███▊      | 70/185 [00:01<00:03, 37.71it/s][A
 40%|████      | 74/185 [00:01<00:03, 36.74it/s][A
 42%|████▏     | 78/18

building 8...
gen Dataset
Trainning....
Predicting....



  0%|          | 0/185 [00:00<?, ?it/s][A
  2%|▏         | 3/185 [00:00<00:06, 29.13it/s][A
  4%|▍         | 7/185 [00:00<00:05, 35.14it/s][A
  6%|▌         | 11/185 [00:00<00:04, 36.75it/s][A
  8%|▊         | 15/185 [00:00<00:04, 35.27it/s][A
 10%|█         | 19/185 [00:00<00:04, 36.45it/s][A
 13%|█▎        | 24/185 [00:00<00:04, 38.06it/s][A
 16%|█▌        | 29/185 [00:00<00:04, 38.85it/s][A
 18%|█▊        | 33/185 [00:00<00:03, 38.52it/s][A
 20%|██        | 37/185 [00:00<00:03, 38.56it/s][A
 22%|██▏       | 41/185 [00:01<00:03, 37.46it/s][A
 24%|██▍       | 45/185 [00:01<00:03, 36.77it/s][A
 26%|██▋       | 49/185 [00:01<00:03, 37.52it/s][A
 29%|██▊       | 53/185 [00:01<00:03, 37.06it/s][A
 31%|███       | 57/185 [00:01<00:03, 37.74it/s][A
 34%|███▎      | 62/185 [00:01<00:03, 38.91it/s][A
 36%|███▌      | 67/185 [00:01<00:02, 39.42it/s][A
 38%|███▊      | 71/185 [00:01<00:03, 37.87it/s][A
 41%|████      | 75/185 [00:02<00:02, 37.29it/s][A
 43%|████▎     | 79/18

building 9...
gen Dataset
Trainning....
Predicting....



  0%|          | 0/185 [00:00<?, ?it/s][A
  3%|▎         | 5/185 [00:00<00:04, 41.66it/s][A
  5%|▌         | 10/185 [00:00<00:04, 43.52it/s][A
  8%|▊         | 15/185 [00:00<00:03, 44.46it/s][A
 11%|█         | 20/185 [00:00<00:03, 45.31it/s][A
 14%|█▎        | 25/185 [00:00<00:03, 45.70it/s][A
 16%|█▌        | 30/185 [00:00<00:03, 45.84it/s][A
 19%|█▉        | 35/185 [00:00<00:03, 46.06it/s][A
 22%|██▏       | 40/185 [00:00<00:03, 46.54it/s][A
 24%|██▍       | 45/185 [00:00<00:02, 46.86it/s][A
 27%|██▋       | 50/185 [00:01<00:02, 46.86it/s][A
 30%|██▉       | 55/185 [00:01<00:02, 46.88it/s][A
 32%|███▏      | 60/185 [00:01<00:02, 46.73it/s][A
 35%|███▌      | 65/185 [00:01<00:02, 46.97it/s][A
 38%|███▊      | 70/185 [00:01<00:02, 47.00it/s][A
 41%|████      | 75/185 [00:01<00:02, 46.80it/s][A
 43%|████▎     | 80/185 [00:01<00:02, 46.30it/s][A
 46%|████▌     | 85/185 [00:01<00:02, 46.69it/s][A
 49%|████▊     | 90/185 [00:01<00:02, 46.96it/s][A
 51%|█████▏    | 95/1

Wall time: 2h 13min 19s





## Test

In [11]:
%%time

id_y['pred'] = oof
oof = pd.pivot_table(id_y, index='id', columns='var', values='pred').values

p_test_mean = p_test_all.mean(axis=2)

p_test_odds = np.ones(100000) * 1 / 9
for j in tqdm(range(var_len)):
    if roc_auc_score(y_train, oof[:, j]) >= 0.500:
        p_test_odds *= (9 * p_test_mean[:, j] / (1 - p_test_mean[:, j]))

p_test_odds = p_test_odds / (1 + p_test_odds)

sub1 = pd.read_csv(sample_submission_file)
sub2 = pd.DataFrame({"ID_code":test.ID_code.values , "target":p_test_odds})
sub = pd.merge(sub1[["ID_code"]], sub2, how="left").fillna(0)

100%|██████████| 185/185 [00:09<00:00, 19.33it/s]


Wall time: 30.9 s


## save result

In [12]:
%%time

sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

Wall time: 950 ms
