In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
%cd /content/gdrive/MyDrive/Kaggle\ Competition/amex

/content/gdrive/MyDrive/Kaggle Competition/amex


In [3]:
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
# import lightgbm as lgb
from itertools import combinations

In [5]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 167 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [6]:
# import cupy, cudf # GPU libraries
from catboost import CatBoostClassifier

In [7]:
class CFG:
    input_dir = 'data/'
    output_dir = '/content/gdrive/MyDrive/Kaggle\ Competition/amex/Jason_model/CatB/'
    seed = 42
    n_folds = 5
    target = 'target'

In [8]:
def read_preprocess_data():
    train = pd.read_parquet('data/train.parquet')
    features = train.drop(['customer_ID', 'S_2'], axis = 1).columns.to_list()
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    num_features = [col for col in features if col not in cat_features]
    print('Starting training feature engineer...')
    train_num_agg = train.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    train_num_agg.columns = ['_'.join(x) for x in train_num_agg.columns]
    train_num_agg.reset_index(inplace = True)
    train_cat_agg = train.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
    train_cat_agg.columns = ['_'.join(x) for x in train_cat_agg.columns]
    train_cat_agg.reset_index(inplace = True)
    train_labels = pd.read_csv('data/train_labels.csv')
    train = train_num_agg.merge(train_cat_agg, how = 'inner', on = 'customer_ID').merge(train_labels, how = 'inner', on = 'customer_ID')
    del train_num_agg, train_cat_agg
    gc.collect()
    test = pd.read_parquet('data/test.parquet')
    print('Starting test feature engineer...')
    test_num_agg = test.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
    test_num_agg.reset_index(inplace = True)
    test_cat_agg = test.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
    test_cat_agg.reset_index(inplace = True)
    test = test_num_agg.merge(test_cat_agg, how = 'inner', on = 'customer_ID')
    del test_num_agg, test_cat_agg
    gc.collect()
    # Save files to disk
    train.to_parquet(CFG.input_dir + 'train_fe.parquet')
    test.to_parquet(CFG.input_dir + 'test_fe.parquet')

In [None]:
# read_preprocess_data()

In [9]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [10]:
def read_data():
    train = pd.read_parquet(CFG.input_dir + 'train_fe.parquet')
    test = pd.read_parquet(CFG.input_dir + 'test_fe.parquet')
    return train, test

In [11]:
def amex_metric(y_true, y_pred):
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:,0]==0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
    gini = [0,0]
    for i in [1,0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] *  weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1]/gini[0] + top_four)

In [12]:
def lgb_amex_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'amex_metric', amex_metric(y_true, y_pred), True

In [13]:
def train_and_evaluate(train, test):
    # Label encode categorical features
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    cat_features = [f"{cf}_last" for cf in cat_features]
    for cat_col in cat_features:
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])
        test[cat_col] = encoder.transform(test[cat_col])
    # Round last float features to 2 decimal place
    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    num_cols = [col for col in num_cols if 'last' in col]
    for col in num_cols:
        train[col + '_round2'] = train[col].round(2)
        test[col + '_round2'] = test[col].round(2)
    # Get feature list
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]

    # Create a numpy array to store test predictions
    test_predictions = np.zeros(len(test))
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train))
    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]

        model = CatBoostClassifier(iterations=1000, random_state=22, nan_mode='Min', task_type='GPU')
        model.fit(x_train, y_train, eval_set=[(x_val, y_val)], cat_features=cat_features,  verbose=200)

        # Save best model
        %ls
        joblib.dump(model, ('Jason_models/CatB/catb_fold{}_seed{}.pkl').format(fold, CFG.seed))
        # Predict validation
        val_pred = model.predict_proba(x_val)[:, 1]
        # Add to out of folds array
        oof_predictions[val_ind] = val_pred
        # Predict the test set
        test_pred = model.predict_proba(test[features])[:, 1]
        test_predictions += test_pred / CFG.n_folds
        # Compute fold metric
        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')
        del x_train, x_val, y_train, y_val
        gc.collect()
    # Compute out of folds metric
    score = amex_metric(train[CFG.target], oof_predictions)
    print(f'Our out of folds CV score is {score}')
    # Create a dataframe to store out of folds predictions
    %ls
    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train[CFG.target], 'prediction': oof_predictions})
    oof_df.to_csv(('Jason_models/CatB/OOF/oof_catb_baseline_{}fold_seed{}.csv').format(CFG.n_folds, CFG.seed), index = False)
    # Create a dataframe to store test prediction
    test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
    %ls
    test_df.to_csv(('Jason_models/CatB/Predictions/test_catb_baseline_{}fold_seed{}.csv').format(CFG.n_folds, CFG.seed), index = False)
    
# seed_everything(CFG.seed)
# train, test = read_data()
# train_and_evaluate(train, test)

In [None]:
# oof_df = pd.DataFrame([1,1],[2,1])

In [14]:
train, test = read_data()
train_and_evaluate(train, test)

 
--------------------------------------------------
Training fold 0 with 1011 features...
Learning rate set to 0.043104
0:	learn: 0.6403072	test: 0.6401534	best: 0.6401534 (0)	total: 44.5ms	remaining: 44.4s
200:	learn: 0.2252651	test: 0.2252770	best: 0.2252770 (200)	total: 7.89s	remaining: 31.4s
400:	learn: 0.2188866	test: 0.2208592	best: 0.2208592 (400)	total: 15.6s	remaining: 23.3s
600:	learn: 0.2151017	test: 0.2191039	best: 0.2191039 (600)	total: 23.3s	remaining: 15.5s
800:	learn: 0.2122568	test: 0.2181542	best: 0.2181515 (797)	total: 30.9s	remaining: 7.68s
999:	learn: 0.2097271	test: 0.2175522	best: 0.2175522 (999)	total: 38.5s	remaining: 0us
bestTest = 0.2175522232
bestIteration = 999
 [0m[01;34mbuild[0m/             LGBM_train.ipynb                         models.py
 [01;34mcatboost_info[0m/    'Matthew LGBM Inference Notebook.ipynb'   process_data.py
 CatB_train.ipynb  'Matthew LGBM Training Notebook.ipynb'    [01;34m__pycache__[0m/
 config.py          metrics.py        

In [None]:
%cd 

/root


In [None]:
!nvidia-smi

Mon Jul 11 01:27:54 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    39W / 300W |    361MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces