In [None]:
!pip -q install iterative-stratification
!pip -q install pytorch-tabnet

In [None]:
import os
import gc
import sys
import time
import random
import operator
import typing as tp
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd,numpy as np
from copy import deepcopy as dp

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Tabnet 
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

In [None]:
# Single place to change when the data folder moves.
DATA_DIR = "/content/drive/MyDrive/AV/AmExpert2021/input"

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
ss_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

# (rows, columns) of the three frames, shown as the cell output.
train_df.shape, test_df.shape, ss_df.shape

((37748, 9), (20327, 8), (20327, 2))

In [None]:
# How many training rows repeat an earlier row in every column except the ID.
train_df.duplicated(
    subset=[c for c in train_df.columns if c != 'Customer_ID']
).sum()

2876

In [None]:
# Keep the first occurrence of each distinct row (ID ignored) and renumber
# the index from zero.
train_df = train_df.drop_duplicates(
    subset=[c for c in train_df.columns if c != 'Customer_ID'],
    keep='first',
).reset_index(drop=True)
train_df.shape

(34872, 9)

In [None]:
# Number of items in each customer's B1 holding, computed the same way for
# train and test.
# NOTE(review): `eval` executes whatever text is in the CSV cell. It is only
# acceptable while the files are trusted; consider `ast.literal_eval`.
for frame in (train_df, test_df):
    frame['PHB1_len'] = frame['Product_Holding_B1'].apply(lambda x: len(eval(x)))

In [None]:
cat_cols = ['Gender', 'City_Category', 'Customer_Category']

# Integer-encode each categorical column. The mapping is learned on train and
# reused for test, so a label seen only in test makes `transform` raise.
for col in cat_cols:
    encoder = LabelEncoder().fit(train_df[col])
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

In [None]:
# One-hot encode the customer category for both frames.
CCtrain_df = pd.get_dummies(train_df['Customer_Category'], prefix='Customer_Category')
CCtest_df = pd.get_dummies(test_df['Customer_Category'], prefix='Customer_Category')

# `get_dummies` only creates columns for values it actually sees. Align the
# test indicators to the train layout so a category absent from test becomes
# an all-zero column instead of a missing one.
CCtest_df = CCtest_df.reindex(columns=CCtrain_df.columns, fill_value=0)

train_df[CCtrain_df.columns] = CCtrain_df
test_df[CCtest_df.columns] = CCtest_df

In [None]:
num_cols = ['Age', 'Vintage', 'PHB1_len']

# Standardise each numeric column with statistics learned on train only.
for col in num_cols:
    scaler = StandardScaler()
    train_column = train_df[col].values.reshape(-1, 1)
    test_column = test_df[col].values.reshape(-1, 1)
    train_df[col] = scaler.fit_transform(train_column)
    test_df[col] = scaler.transform(test_column)

In [None]:
# Turn the stringified holdings into Python objects, one entry per row.
# NOTE(review): `eval` runs arbitrary code found in the CSV; keep only for
# trusted files (an `ast.literal_eval` swap would be the safer form).
PHB1_list = train_df['Product_Holding_B1'].map(eval).tolist()
PHB2_list = train_df['Product_Holding_B2'].map(eval).tolist()
tPHB1_list = test_df['Product_Holding_B1'].map(eval).tolist()

In [None]:
# B1 holdings (model input): vocabulary learned on train, then applied to test.
mlb1 = MultiLabelBinarizer()
PHB1_onehot = mlb1.fit_transform(PHB1_list)
tPHB1_onehot = mlb1.transform(tPHB1_list)

# B2 holdings (prediction target): only available for train.
mlb2 = MultiLabelBinarizer()
PHB2_onehot = mlb2.fit_transform(PHB2_list)

PHB1_onehot.shape, PHB2_onehot.shape, tPHB1_onehot.shape

((34872, 22), (34872, 20), (20327, 22))

In [None]:
# Add one indicator column per B1 product, named after the product label.
for frame, indicators in ((train_df, PHB1_onehot), (test_df, tPHB1_onehot)):
    frame[mlb1.classes_] = indicators

In [None]:
def pca_pre(tr, te, n_comp, feat_raw, feat_new):
    """Fit a PCA on the train features and project both train and test.

    Parameters
    ----------
    tr, te : pandas.DataFrame
        Train and test frames; both must contain the `feat_raw` columns.
    n_comp : int
        Number of principal components to keep.
    feat_raw : sequence of column labels
        Columns that are fed to the PCA.
    feat_new : sequence of str
        Column names for the component scores (one per component).

    Returns
    -------
    tuple of pandas.DataFrame
        Component scores for train and test, indexed like `tr` and `te`.
    """
    pca = PCA(n_components=n_comp, random_state=42)
    # The projection is learned on train only and reused unchanged for test.
    # Carrying over the input index keeps a later `pd.concat(..., axis=1)`
    # row-aligned even when the caller's frame has no default RangeIndex.
    tr2 = pd.DataFrame(pca.fit_transform(tr[feat_raw]), columns=feat_new, index=tr.index)
    te2 = pd.DataFrame(pca.transform(te[feat_raw]), columns=feat_new, index=te.index)
    return tr2, te2

# Summarise the B1 product indicator columns with a handful of PCA components
# and append them to both frames.
n_comp1 = 5
feat_cols = mlb1.classes_
pca_feat_g = [f'pca-{i}' for i in range(n_comp1)]

x_tr_g_pca, x_te_g_pca = pca_pre(
    tr=train_df,
    te=test_df,
    n_comp=n_comp1,
    feat_raw=feat_cols,
    feat_new=pca_feat_g,
)

# NOTE: re-running this cell appends the component columns a second time.
train_df = pd.concat([train_df, x_tr_g_pca], axis=1)
test_df = pd.concat([test_df, x_te_g_pca], axis=1)

train_df.shape, test_df.shape

((34872, 40), (20327, 39))

In [None]:
# Everything except the ID, the raw category code and the raw holding strings
# is used as a model feature.
drop_cols = ['Customer_ID', 'Customer_Category', 'Product_Holding_B1', 'Product_Holding_B2']
train_cols = train_df.columns.drop(drop_cols).values
train_cols

array(['Gender', 'Age', 'Vintage', 'Is_Active', 'City_Category',
       'PHB1_len', 'Customer_Category_0', 'Customer_Category_1',
       'Customer_Category_2', 'P00', 'P1', 'P10', 'P11', 'P12', 'P13',
       'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P2', 'P20', 'P21', 'P3',
       'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'pca-0', 'pca-1', 'pca-2',
       'pca-3', 'pca-4'], dtype=object)

In [None]:
# Test feature matrix, columns in the same order as the training features.
X_test = test_df.loc[:, train_cols].to_numpy()

## TabNet

In [None]:
def g_table(list1):
    """Tally how often each value occurs in `list1`.

    Returns a dict mapping value -> occurrence count, with keys in the order
    in which the values are first encountered.
    """
    table_dic = {}
    for item in list1:
        table_dic[item] = table_dic.get(item, 0) + 1
    return table_dic

# Use the GPU when one is visible, otherwise fall back to the CPU so the
# tensors below can still be created.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# For every target column: how often its rarer label value occurs.
# `.values()` matters here: iterating the dict itself yields the label values
# (the keys), not their counts.
tar_freq = np.array([
    min(g_table(PHB2_onehot[:, i]).values())
    for i in range(PHB2_onehot.shape[1])
])
# Log-damped inverse-frequency weights, scaled so the largest weight is 1.
tar_weight0 = np.log(tar_freq + 100)
tar_weight0_min = tar_weight0.min()
tar_weight = tar_weight0_min / tar_weight0
pos_weight = torch.tensor(tar_weight).to(DEVICE)


def wgt_bce(input, target, *args, **kwargs):
    """BCE-with-logits that applies the notebook's `pos_weight` by default.

    Implemented as a wrapper on purpose: `copy.deepcopy` returns the very same
    object for a function, so overwriting `__defaults__` on such a "copy"
    would silently change `F.binary_cross_entropy_with_logits` for every
    caller in the process.
    """
    kwargs.setdefault("pos_weight", pos_weight)
    return F.binary_cross_entropy_with_logits(input, target, *args, **kwargs)

from torch.nn.modules.loss import _WeightedLoss
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Parameters
    ----------
    weight : torch.Tensor, optional
        Rescaling weight forwarded to `F.binary_cross_entropy_with_logits`.
    reduction : {'mean', 'sum', 'none'}
        How the element-wise losses are combined.
    smoothing : float
        Smoothing strength in [0, 1); targets are pulled towards 0.5 by it.
    pos_weight : torch.Tensor, optional
        Weight of the positive class per target. When omitted, a module-level
        variable named `pos_weight` is used if one exists (this is how the
        notebook supplies it); otherwise no positive-class weighting is applied.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0, pos_weight=None):
        # The parent constructor already stores `weight` and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.pos_weight = pos_weight

    @staticmethod
    def _smooth(targets: torch.Tensor, n_labels: int, smoothing=0.0):
        """Return `targets * (1 - smoothing) + 0.5 * smoothing`.

        `n_labels` is accepted for interface compatibility and is not used.
        """
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Smooth the targets, then compute BCE-with-logits against `inputs`."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)

        positive_weight = self.pos_weight
        if positive_weight is None:
            # Backward-compatible fallback to the notebook-level variable.
            positive_weight = globals().get("pos_weight")

        # The reduction has to be handed to the functional call: reducing its
        # default (already averaged) scalar afterwards cannot produce a sum or
        # an unreduced tensor.
        return F.binary_cross_entropy_with_logits(
            inputs, targets, self.weight,
            pos_weight=positive_weight,
            reduction=self.reduction,
        )

In [None]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Average precision at k for a single ranked prediction list.

    Parameters
    ----------
    actual : list
             The relevant items (their order is irrelevant)
    predicted : list
                The predicted items, best first (their order matters)
    k : int, optional
        Only the first k predictions are scored
    Returns
    -------
    score : double
            Average precision at k; 0.0 when `actual` is empty
    """
    # Predictions beyond position k never contribute.
    if len(predicted) > k:
        predicted = predicted[:k]

    # Nothing relevant to find: the score is zero by definition.
    if not actual:
        return 0.0

    hits = 0.0
    precision_total = 0.0
    for rank, item in enumerate(predicted, start=1):
        # Skip misses, and skip repeats of an item that was already scored.
        if item not in actual or item in predicted[:rank - 1]:
            continue
        hits += 1.0
        precision_total += hits / rank

    return precision_total / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over many queries.

    Parameters
    ----------
    actual : list
             One list of relevant items per query
             (order inside each list is irrelevant)
    predicted : list
                One ranked list of predictions per query
                (order inside each list matters)
    k : int, optional
        Only the first k predictions of each query are scored
    Returns
    -------
    score : double
            Mean of the per-query average precision at k
    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)

In [None]:
class LogitsLogLoss(Metric):
    """
    Validation metric: mean average precision at 3 (MAP@3).

    Despite the class name and the registered name "logits_ll" (which the
    training call refers to, so both are kept), this does not compute a log
    loss: it ranks the targets by predicted score and scores the top three.
    """
    def __init__(self):
        self._name = "logits_ll"
        self._maximize = True  # a higher MAP@3 is better

    def __call__(self, y_true, y_pred):
        """
        Compute MAP@3 of the predicted scores against the true labels.

        Parameters
        ----------
        y_true: np.ndarray
            Indicator matrix, one row per sample and one column per target;
            non-zero marks a relevant target.
        y_pred: np.ndarray
            Score matrix of the same layout; only the ranking within a row
            matters.

        Returns
        -------
            float
            Mean average precision at 3.
        """
        # Column indices of the relevant targets in each row. Selecting the
        # non-zero entries directly works for int, bool and float indicators.
        relevant = [np.flatnonzero(row).tolist() for row in np.asarray(y_true)]
        # Three highest-scoring columns per row, best first.
        top3 = np.argsort(y_pred)[:, -3:][:, ::-1]

        return mapk(relevant, top3, k=3)

In [None]:
# Training configuration is identical for every fold, so it is built once.
MAX_EPOCH = 500
tabnet_params = dict(
    n_d = 64,
    n_a = 128,
    n_steps = 1,
    gamma = 1.3,
    # No categorical embeddings: cat_idxs / cat_dims / cat_emb_dim are left unset.
    lambda_sparse = 0,
    n_independent = 2,
    n_shared = 1,
    optimizer_fn = torch.optim.Adam,
    optimizer_params = dict(lr = 2e-2, weight_decay = 1e-5),
    mask_type = "entmax",
    # The monitored validation metric is a score that is maximised, so the
    # plateau scheduler must run in "max" mode; in "min" mode every improvement
    # counts as a bad epoch and the learning rate keeps decaying.
    # NOTE(review): relies on pytorch-tabnet stepping ReduceLROnPlateau with
    # the early-stopping metric - confirm against the installed version.
    scheduler_params = dict(
        mode = "max", patience = 5, min_lr = 1e-5, factor = 0.9),
    scheduler_fn = ReduceLROnPlateau,
    seed = 0,
    verbose = 10
)

skf = MultilabelStratifiedKFold(n_splits= 5, shuffle=True, random_state=42)

# One matrix of test probabilities per fold.
test_predictions = []

for idx, (train_idx, valid_idx) in enumerate(skf.split(train_df, PHB2_onehot)):

    X_train = train_df.iloc[train_idx][train_cols].values
    y_train = PHB2_onehot[train_idx]

    X_valid = train_df.iloc[valid_idx][train_cols].values
    y_valid = PHB2_onehot[valid_idx]

    print("Trian :", X_train.shape, y_train.shape)
    print("Valid :", X_valid.shape, y_valid.shape)

    ### Fit ###
    model = TabNetRegressor(**tabnet_params)
    model.fit(
        X_train = X_train,
        y_train = y_train,
        eval_set = [(X_valid, y_valid)],
        eval_name = ["val"],
        eval_metric = ["logits_ll"],
        max_epochs = MAX_EPOCH,
        patience = 40,
        batch_size = 1024,
        virtual_batch_size = 32,
        num_workers = 1,
        drop_last = False,
        loss_fn = SmoothBCEwLogits(smoothing=1e-4)
    )

    # The network outputs logits; a sigmoid turns them into probabilities.
    predictions = 1 / (1 + np.exp(-model.predict(X_test)))

    test_predictions.append(predictions)

Trian : (27874, 36) (27874, 20)
Valid : (6998, 36) (6998, 20)
Device used : cuda
epoch 0  | loss: 0.25313 | val_logits_ll: 0.45652 |  0:00:01s
epoch 10 | loss: 0.13462 | val_logits_ll: 0.65219 |  0:00:19s
epoch 20 | loss: 0.13139 | val_logits_ll: 0.66429 |  0:00:36s
epoch 30 | loss: 0.13027 | val_logits_ll: 0.66773 |  0:00:54s
epoch 40 | loss: 0.12905 | val_logits_ll: 0.66787 |  0:01:11s
epoch 50 | loss: 0.12835 | val_logits_ll: 0.66938 |  0:01:28s
epoch 60 | loss: 0.12836 | val_logits_ll: 0.67144 |  0:01:46s
epoch 70 | loss: 0.12748 | val_logits_ll: 0.67271 |  0:02:04s
epoch 80 | loss: 0.12678 | val_logits_ll: 0.67412 |  0:02:21s
epoch 90 | loss: 0.12673 | val_logits_ll: 0.6715  |  0:02:38s
epoch 100| loss: 0.12615 | val_logits_ll: 0.67343 |  0:02:56s
epoch 110| loss: 0.12596 | val_logits_ll: 0.67189 |  0:03:13s
epoch 120| loss: 0.12552 | val_logits_ll: 0.67439 |  0:03:31s
epoch 130| loss: 0.12493 | val_logits_ll: 0.67287 |  0:03:48s

Early stopping occurred at epoch 133 with best_epo

In [None]:
# Element-wise average of the per-fold test probabilities.
final_predictions_mean = np.mean(np.stack(test_predictions, axis=0), axis=0)
final_predictions_mean.shape

(20327, 20)

In [None]:
# Persist the averaged probabilities for later reuse.
np.save(
    file="/content/drive/MyDrive/AV/AmExpert2021/input/TabNet_D_M_pre.npy",
    arr=final_predictions_mean,
)

In [None]:
# Column indices of the three highest-scoring targets per row, best first.
final_predictions = np.fliplr(np.argsort(final_predictions_mean)[:, -3:])

In [None]:
# Translate each row of target indices back into product labels.
final_test = [
    [mlb2.classes_[int(col_idx)] for col_idx in top_cols]
    for top_cols in final_predictions
]

In [None]:
# Fill the submission frame with the predicted label lists and write it out
# without the row index.
ss_df['Product_Holding_B2'] = final_test
ss_df.to_csv(
    path_or_buf="/content/drive/MyDrive/AV/AmExpert2021/input/TabNet_D_M.csv",
    index=False,
)

In [None]:
# Display the submission frame as a final visual check of the written file.
ss_df

Unnamed: 0,Customer_ID,Product_Holding_B2
0,CC372708,"[P8, P10, P4]"
1,CC216072,"[P8, P1, P10]"
2,CC387629,"[P16, P13, P00]"
3,CC389228,"[P8, P10, P9]"
4,CC394445,"[P00, P1, P8]"
...,...,...
20322,CC303542,"[P8, P00, P6]"
20323,CC266713,"[P8, P12, P6]"
20324,CC393639,"[P8, P00, P9]"
20325,CC285013,"[P1, P6, P7]"
