In [1]:
!pip -q install iterative-stratification

In [2]:
!pip -q install catboost

[K     |████████████████████████████████| 76.3 MB 1.2 MB/s 
[?25h

In [3]:
import os
import gc
import sys
import time
import random
import operator
import typing as tp
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd,numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold
from sklearn.decomposition import PCA

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import lightgbm as lgb
import xgboost as xgb

In [4]:
# Read the train, test and sample-submission tables from the mounted Drive folder.
train_df, test_df, ss_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/AV/AmExpert2021/input/train.csv",
        "/content/drive/MyDrive/AV/AmExpert2021/input/test.csv",
        "/content/drive/MyDrive/AV/AmExpert2021/input/sample_submission.csv",
    )
)

# Show the three shapes as a quick sanity check.
train_df.shape, test_df.shape, ss_df.shape

((37748, 9), (20327, 8), (20327, 2))

In [5]:
# Inspect the column names of the training table.
train_df.columns

Index(['Customer_ID', 'Gender', 'Age', 'Vintage', 'Is_Active', 'City_Category',
       'Customer_Category', 'Product_Holding_B1', 'Product_Holding_B2'],
      dtype='object')

In [6]:
# Count rows that repeat an earlier row once Customer_ID is ignored.
train_df.drop(columns='Customer_ID').duplicated().sum()

2876

In [7]:
# Flag rows that repeat an earlier row in every column except Customer_ID,
# keep only the first occurrence of each, and renumber the index from zero.
is_repeat = train_df.drop(columns='Customer_ID').duplicated()
train_df = train_df[~is_repeat].reset_index(drop=True)
train_df.shape

(34872, 9)

In [8]:
# PHB1_len = number of items in each row's Product_Holding_B1 value, which is
# stored in the CSV as the text of a Python list literal.
# NOTE(review): eval() executes arbitrary code found in the CSV cells; if the
# file is not fully trusted, ast.literal_eval is the safe way to parse these.
train_df['PHB1_len'] = train_df['Product_Holding_B1'].apply(lambda x: len(eval(x)))
test_df['PHB1_len'] = test_df['Product_Holding_B1'].apply(lambda x: len(eval(x)))

In [9]:
# Parse the stringified product lists into real Python lists (one list per row).
# The test table only contributes Product_Holding_B1 here.
# NOTE(review): eval() executes arbitrary code found in the CSV cells; if the
# file is not fully trusted, ast.literal_eval is the safe way to parse these.
PHB1_list = train_df.Product_Holding_B1.apply(eval).values.tolist()
PHB2_list = train_df.Product_Holding_B2.apply(eval).values.tolist()
tPHB1_list = test_df.Product_Holding_B1.apply(eval).values.tolist()

In [10]:
# B1 holdings: learn the label vocabulary on train, then reuse it for test so
# both matrices share the same column layout.
mlb1 = MultiLabelBinarizer()
PHB1_onehot = mlb1.fit_transform(PHB1_list)
tPHB1_onehot = mlb1.transform(tPHB1_list)

# B2 holdings only exist for the training rows.
mlb2 = MultiLabelBinarizer()
PHB2_onehot = mlb2.fit_transform(PHB2_list)

# Shapes of the three indicator matrices.
tuple(matrix.shape for matrix in (PHB1_onehot, PHB2_onehot, tPHB1_onehot))

((34872, 22), (34872, 20), (20327, 22))

In [11]:
# Add one indicator column per B1 product label (named after mlb1.classes_)
# to both the training and the test table.
train_df[mlb1.classes_] = PHB1_onehot
test_df[mlb1.classes_] = tPHB1_onehot

In [12]:
# d = np.dot(PHB1_onehot.T, PHB2_onehot)
# d.shape

(22, 20)

In [13]:
# ddf = pd.DataFrame(d, index=mlb1.classes_, columns=mlb2.classes_)

In [14]:
#ddf = ddf.div(ddf.sum(axis=1), axis=0)

In [16]:
# PHB1_featres = []
# for li in PHB1_list:
#     dd = ddf.loc[li].sum().values.tolist()
#     PHB1_featres.append(dd)

# dd_cols = [f'{col}_dd' for col in ddf.columns]
# train_df[dd_cols] = PHB1_featres

In [17]:
# tPHB1_featres = []
# for li in tPHB1_list:
#     dd = ddf.loc[li].sum().values.tolist()
#     tPHB1_featres.append(dd)

# dd_cols = [f'{col}_dd' for col in ddf.columns]
# test_df[dd_cols] = tPHB1_featres

In [18]:
# Check the table shapes after the feature columns were added.
train_df.shape, test_df.shape

((34872, 32), (20327, 31))

In [19]:
# Integer-encode the categorical columns in place.
cat_cols = ['Gender', 'City_Category', 'Customer_Category']

for col in cat_cols:
    # A fresh encoder per column, fitted on the training values only and then
    # applied to the test values with the same mapping.
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [20]:
# Build a long-format target table: one (Customer_ID, Target) row for every
# product in a customer's Product_Holding_B2 list.
cid_list = []
target_list = []
for cid, ph_list in zip(train_df.Customer_ID.values, PHB2_list):
    # Repeat the customer id once per held product so both lists stay aligned.
    cid_list.extend([cid] * len(ph_list))
    target_list.extend(ph_list)

target_df = pd.DataFrame({"Customer_ID": cid_list, "Target": target_list})

print(len(cid_list), len(target_list))

target_df.shape

49123 49123


(49123, 2)

In [21]:
def pca_pre(tr, te, n_comp, feat_raw, feat_new):
    """Fit a PCA on the training features and project train and test with it.

    Parameters
    ----------
    tr, te : DataFrame
        Training and test tables; both must contain the ``feat_raw`` columns.
    n_comp : int
        Number of principal components to keep.
    feat_raw : sequence of column labels
        Columns fed into the PCA.
    feat_new : sequence of str
        Column names given to the resulting component columns.

    Returns
    -------
    (tr2, te2) : tuple of DataFrame
        Component scores for train and test, each with a fresh default index
        and ``feat_new`` as columns.
    """
    reducer = PCA(n_components=n_comp, random_state=42)
    # The projection is learned from the training rows only.
    train_scores = reducer.fit_transform(tr[feat_raw])
    test_scores = reducer.transform(te[feat_raw])
    tr2 = pd.DataFrame(train_scores, columns=feat_new)
    te2 = pd.DataFrame(test_scores, columns=feat_new)
    return tr2, te2

# Compress the B1 indicator columns into a handful of PCA components and
# append them to both tables as extra features.
n_comp1 = 5
feat_cols = mlb1.classes_
pca_feat_g = [f'pca-{i}' for i in range(n_comp1)]

x_tr_g_pca, x_te_g_pca = pca_pre(
    tr=train_df,
    te=test_df,
    n_comp=n_comp1,
    feat_raw=feat_cols,
    feat_new=pca_feat_g,
)

# Column-wise concat aligns on the index; the component frames carry a default
# 0..n-1 index.
train_df = pd.concat([train_df, x_tr_g_pca], axis=1)
test_df = pd.concat([test_df, x_te_g_pca], axis=1)


In [22]:
# Attach the customer features to every (Customer_ID, Target) row. No `on=` is
# given, so pandas joins on the columns the two frames share; train_df now has
# one row per target rather than one row per customer.
train_df = target_df.merge(train_df, how='left')

In [23]:
# Encode the product labels in Target as integers. The fitted encoder is kept
# because its classes_ are needed later to map predictions back to labels.
target_le = LabelEncoder()
train_df['Target'] = target_le.fit_transform(train_df['Target'])

In [24]:
# Feature columns = everything except the id, the target and the two raw
# product-list columns.
drop_cols = ['Customer_ID', 'Target', 'Product_Holding_B1', 'Product_Holding_B2']
train_cols = train_df.drop(columns=drop_cols).columns.values
train_cols

array(['Gender', 'Age', 'Vintage', 'Is_Active', 'City_Category',
       'Customer_Category', 'PHB1_len', 'P00', 'P1', 'P10', 'P11', 'P12',
       'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P2', 'P20',
       'P21', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'pca-0', 'pca-1',
       'pca-2', 'pca-3', 'pca-4'], dtype=object)

In [25]:
# Encoded targets for all training rows, and the test feature matrix with the
# same column order as the training features.
target = train_df['Target'].values
X_test = test_df[train_cols].values

In [None]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list or array-like
             Elements that are to be predicted (order doesn't matter)
    predicted : list or array-like
                Predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            ``actual`` is empty or None
    """
    # Test emptiness by length, not truthiness: a one-element numpy array
    # holding class 0 is falsy and a longer array has no truth value at all,
    # so `not actual` would misfire or raise for array inputs.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Count a hit only the first time an item appears in the ranking.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    Each entry of `actual` is paired with the entry of `predicted` at the
    same position, scored with `apk`, and the scores are averaged.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)

In [26]:
from typing import Optional, Union, Tuple
def binary_logloss_for_lgbm(label, preds):
    """Custom LightGBM eval metric: MAP@3 of the top-3 predicted classes.

    Despite its (historical) name this is not a log-loss; it ranks the
    classes of every row by predicted score and scores the top three against
    the single true label with `mapk`.

    Parameters
    ----------
    label : array-like of shape (n_rows,)
        True class index of each row.
    preds : array-like
        Either a 2-D (n_rows, n_classes) score matrix, or the flattened 1-D
        class-major layout in which row i / class j sits at
        ``preds[j * n_rows + i]``.

    Returns
    -------
    (eval_name, eval_result, is_higher_better) : tuple
        ``('my_lnloss', score, True)``.
    """
    label = np.asarray(label)
    preds = np.asarray(preds)
    n_rows = len(label)

    if preds.ndim == 1:
        # Flattened class-major scores: infer the class count from the size
        # instead of hard-coding it, then put rows on axis 0.
        preds = preds.reshape(-1, n_rows).T

    # Indices of the three highest-scoring classes, best first.
    top3 = np.argsort(preds, axis=1)[:, -3:][:, ::-1]

    # One single-element *list* per row: plain lists keep a true label of
    # class 0 from being mistaken for "no label" by truthiness checks.
    actual = label.reshape(n_rows, 1).tolist()
    loss = mapk(actual, top3, k=3)

    # eval_name, eval_result, is_higher_better
    return 'my_lnloss', loss, True

In [32]:
# 5-fold stratified cross-validation on the encoded target. Every fold trains
# a LightGBM multiclass model with early stopping on its validation split and
# predicts class probabilities for the test set; the per-fold predictions are
# collected in `test_predictions`.
# NOTE(review): train_df holds one row per (customer, product) pair, so rows of
# the same customer can fall into both the training and the validation split
# of a fold. A group-aware splitter keyed on Customer_ID would avoid that —
# TODO confirm whether this matters for the validation scores.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The hyper-parameters are the same for every fold, so build them once.
params = {'boosting_type': 'gbdt',
          'tree_learner': 'feature',  # options: 'serial', 'feature', 'data', 'voting'
          'num_leaves': 31, 'max_depth': -1,
          'learning_rate': 5e-2, 'n_estimators': 10000, 'importance_type': 'gain',
          'subsample_for_bin': 200000, 'objective': 'multiclass', 'min_split_gain': 0.0,
          'min_child_weight': 1e-3, 'min_child_samples': 20,
          'bagging_freq': 0, 'bagging_fraction': 1.0, 'feature_fraction': 1.0,
          'reg_alpha': 0.2, 'reg_lambda': 0.2,
          'random_state': 43, 'data_random_seed': 1,
          'n_jobs': -1, 'silent': False, 'num_class': 20}

test_predictions = []

for idx, (train_idx, valid_idx) in enumerate(skf.split(train_df, target)):

    X_train = train_df.iloc[train_idx][train_cols].values
    y_train = target[train_idx]

    X_valid = train_df.iloc[valid_idx][train_cols].values
    y_valid = target[valid_idx]

    print("Trian :", X_train.shape, y_train.shape)
    print("Valid :", X_valid.shape, y_valid.shape)

    model = lgb.LGBMClassifier(**params)
    # Stop once the validation multi_logloss has not improved for 100 rounds.
    # (binary_logloss_for_lgbm can be passed as eval_metric instead to
    # monitor MAP@3.)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100,
              eval_metric='multi_logloss',
              verbose=100)

    # Class probabilities of this fold's model for every test row.
    predictions = model.predict_proba(X_test)

    test_predictions.append(predictions)



Trian : (39298, 34) (39298,)
Valid : (9825, 34) (9825,)
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 1.59171	valid_0's multi_logloss: 1.59171
[200]	valid_0's multi_logloss: 1.59265	valid_0's multi_logloss: 1.59265
Early stopping, best iteration is:
[138]	valid_0's multi_logloss: 1.58513	valid_0's multi_logloss: 1.58513
Trian : (39298, 34) (39298,)
Valid : (9825, 34) (9825,)
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 1.59447	valid_0's multi_logloss: 1.59447
[200]	valid_0's multi_logloss: 1.59405	valid_0's multi_logloss: 1.59405
Early stopping, best iteration is:
[138]	valid_0's multi_logloss: 1.58741	valid_0's multi_logloss: 1.58741
Trian : (39298, 34) (39298,)
Valid : (9825, 34) (9825,)
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 1.61016	valid_0's multi_logloss: 1.61016
[200]	valid_0's multi_logloss: 1.61369	valid_0's multi_logloss: 1.613

In [33]:
# Average the per-fold probability matrices element-wise across folds.
final_predictions_mean = np.mean(test_predictions, axis=0)
final_predictions_mean.shape

(20327, 20)

In [34]:
# Persist the fold-averaged class probabilities to Drive as a .npy file.
np.save("/content/drive/MyDrive/AV/AmExpert2021/input/LGBM_D_M_C_pre.npy", final_predictions_mean)

In [35]:
# Sort the class indices of every row by ascending probability, then walk the
# last three columns backwards so column 0 holds the most likely class.
ascending_order = np.argsort(final_predictions_mean, axis=-1)
final_predictions = ascending_order[:, :-4:-1]

In [36]:
# Map every encoded class index back to its original product label, keeping
# the best-first order within each row.
final_test = [
    [target_le.classes_[int(class_idx)] for class_idx in top3_row]
    for top3_row in final_predictions
]

In [37]:
# Put the top-3 product lists into the submission frame and write it to Drive
# without the index column.
ss_df['Product_Holding_B2'] = final_test
ss_df.to_csv("/content/drive/MyDrive/AV/AmExpert2021/input/LGBM_D_M_C.csv",index=False)