In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

SEED = 42
np.random.seed(SEED)

In [2]:
N_PR = 21

In [3]:
train = pd.read_csv('./dataset/Train.csv')
train.head()

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,4WKQSBB,1/2/2019,F,M,1987,1X1H,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
1,CP5S02H,1/6/2019,F,M,1981,UAOD,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2YKDILJ,1/6/2013,M,U,1991,748L,QZYX,90QI,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2S9E81J,1/8/2019,M,M,1990,1X1H,BP09,56SI,0,0,...,0,0,0,1,0,0,0,0,0,0
4,BHDYVFT,1/8/2019,M,M,1990,748L,NO3L,T4MS,0,0,...,0,0,0,0,0,0,1,1,0,0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29132 entries, 0 to 29131
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ID                        29132 non-null  object
 1   join_date                 29130 non-null  object
 2   sex                       29132 non-null  object
 3   marital_status            29132 non-null  object
 4   birth_year                29132 non-null  int64 
 5   branch_code               29132 non-null  object
 6   occupation_code           29132 non-null  object
 7   occupation_category_code  29132 non-null  object
 8   P5DA                      29132 non-null  int64 
 9   RIBP                      29132 non-null  int64 
 10  8NN1                      29132 non-null  int64 
 11  7POT                      29132 non-null  int64 
 12  66FJ                      29132 non-null  int64 
 13  GYSR                      29132 non-null  int64 
 14  SOP4                  

In [4]:
Y = train.iloc[:, -N_PR:]

In [24]:
for col in Y_train:
    print(f'Column {col} has {Y_train[col].sum()} ones')

Column P5DA has 40 ones
Column RIBP has 1780 ones
Column 8NN1 has 157 ones
Column 7POT has 316 ones
Column 66FJ has 339 ones
Column GYSR has 4 ones
Column SOP4 has 431 ones
Column RVSZ has 25328 ones
Column PYUQ has 2173 ones
Column LJR9 has 354 ones
Column N2MW has 838 ones
Column AHXO has 539 ones
Column BSTQ has 324 ones
Column FM3X has 110 ones
Column K6QO has 21629 ones
Column QBOL has 6833 ones
Column JWFN has 311 ones
Column JZ9D has 1425 ones
Column J9JW has 1418 ones
Column GHYX has 902 ones
Column ECY3 has 1102 ones


In [22]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans

class ProductsClusterAttribute(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=8):
        self.n_clusters = n_clusters
        
    def fit(self, X, y=None):
        self.kmeans = KMeans(self.n_clusters)
        self.kmeans.fit(X)
        return self
    
    def transform(self, X):
        cluster_predictions = self.kmeans.predict(X)
        return np.hstack((cluster_predictions.reshape(-1, 1), X))

In [8]:
class YearMonthDayAttributes(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        data_timestamps = pd.to_datetime(X['join_date']).ffill()
        year = pd.DatetimeIndex(data_timestamps).year.values.reshape(-1, 1)
        month = pd.DatetimeIndex(data_timestamps).month.values.reshape(-1, 1)
        day = pd.DatetimeIndex(data_timestamps).day.values.reshape(-1, 1)

        return np.hstack((year, month, day))

In [17]:
def get_random_ones_indices(row):
    index = np.random.choice(np.where(row == 1)[0])
    new_row = np.zeros(len(row), dtype=bool)
    new_row[index] = True
    return new_row

def remove_ones(X):
    products = X[:, -21:]
    random_ones_indices = np.apply_along_axis(get_random_ones_indices, 1, products)
    products[random_ones_indices] = 0

In [13]:
X_train.columns

Index(['sex', 'marital_status', 'birth_year', 'branch_code', 'occupation_code',
       'occupation_category_code', 'P5DA', 'RIBP', '8NN1', '7POT', '66FJ',
       'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X',
       'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3'],
      dtype='object')

In [10]:
columns_to_drop = ['ID']

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

pipeline = ColumnTransformer([
    ('one_hot', OneHotEncoder(), ['sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code']),
    ('year_month_day', YearMonthDayAttributes(), ['join_date']),
    #('products_cluster_and_products', ProductsClusterAttribute(80), Y.columns)
], remainder='passthrough', sparse_threshold=0.)

In [27]:
pipeline.fit(pd.concat([train.drop(columns_to_drop, 1), test.drop(columns_to_drop, 1)]));

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(train.drop(columns_to_drop, 1), Y, test_size=0.15)

In [18]:
train_prepared = pipeline.transform(X_train)
test_prepared = pipeline.transform(X_test)

In [200]:
products_cluster = ProductsClusterAttribute()
products_cluster.fit(train_prepared[:, -21:])

train_prepared = np.hstack((train_prepared[:, :-21], products_cluster.transform(train_prepared[:, -21:])))
test_prepared = np.hstack((test_prepared[:, :-21], products_cluster.transform(test_prepared[:, -21:])))

In [19]:
remove_ones(train_prepared)
remove_ones(test_prepared)

In [33]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV


class ProductClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, internal_classifier_class=GradientBoostingClassifier, param_grid=None, params=None, verbose=True):
        self.internal_classifier_class = internal_classifier_class
        self.verbose = verbose
        self.param_grid = param_grid
        self.params = params
    
    def fit(self, X, y):
        self.n_products = y.shape[1]
        self.internal_classifiers = []
        
        for product_index in range(self.n_products):
            classifier = self.internal_classifier_class()
            
            col_to_remove = -self.n_products + product_index
            X_train_tmp = np.delete(X, col_to_remove, 1)
            
            if self.params and self.params[product_index]:
                classifier.set_params(**self.params[product_index])
            elif self.param_grid:
                best_params = self.grid_search_cv(classifier, X_train_tmp, y[:, product_index])
                classifier.set_params(**best_params)
                if self.verbose:
                    print(f'Internal classifier {product_index} best params {best_params}')
            
            classifier.fit(X_train_tmp, y[:, product_index])
            self.internal_classifiers.append(classifier)
            
            if self.verbose:
                print(f'Internal classifier {product_index} training finished')
            
        return self
    
    def grid_search_cv(self, model, X, y):
        grid_search = GridSearchCV(model, self.param_grid, scoring='neg_log_loss', n_jobs=-1)
        grid_search.fit(X, y)
        return grid_search.best_params_
    
    def get_internal_classifiers_params(self):
        return [classifier.get_params() for classifier in self.internal_classifiers]
    
    def set_internal_classifiers(self, classifiers):
        self.internal_classifiers = classifiers
        self.n_products = len(classifiers)
        
    def predict(self, X, set_ones_from_x=True):
        return self._make_predictions(X, set_ones_from_x=set_ones_from_x)
    
    def predict_proba(self, X, set_ones_from_x=True):
        return self._make_predictions(X, proba=True, set_ones_from_x=set_ones_from_x)
    
    def _make_predictions(self, X, proba=False, set_ones_from_x=True):
        predictions = []
        
        for product_index in range(self.n_products):
            col_to_remove = -self.n_products + product_index
            X_tmp = np.delete(X, col_to_remove, 1)
            
            if proba:
                prediction = self.internal_classifiers[product_index].predict_proba(X_tmp)
            else:
                prediction = self.internal_classifiers[product_index].predict(X_tmp)
            predictions.append(prediction[:, 1])
        
        predictions = np.array(predictions).T
        if set_ones_from_x:
            predictions[X[:, -self.n_products:] == 1] = 1
        
        return predictions
    
    def score(self, X, y):
        predictions = self.predict_proba(X)
        return log_loss(y.reshape(-1, 1, order='F').astype('float64'),
                        predictions.reshape(-1, 1, order='F').astype('float64'))

## GridSearch internal_classifiers

In [None]:
product_index = 0

In [106]:
param_grid = {
    'max_depth': [2, 3, 4],
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'n_estimators': [100, 150, 200, 300],
    'subsample': [0.8, 1],
    'gamma': [1, 0],
    'reg_alpha ': [0, 0.1, 0.3]
}

col_to_remove = -N_PR + product_index
X_train_tmp = np.delete(train_prepared, col_to_remove, 1)

grid_search = GridSearchCV(XGBClassifier(), param_grid, verbose=2, n_jobs=-1, scoring='neg_log_loss', refit=True)
grid_search.fit(X_train_tmp, Y_train.values[:, product_index])

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 44.3min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 68.6min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 89.5min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 119.6min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 136.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'gamma': [1, 0],
                         'learning_rate': [0.01, 0.05, 0.1, 0.3],
                         'max_depth': [2, 3, 4],
                   

In [127]:
grid_search.best_params_

{'gamma': 0,
 'learning_rate': 0.1,
 'max_depth': 2,
 'n_estimators': 150,
 'reg_alpha ': 0,
 'subsample': 1}

In [107]:
best_single_model = grid_search.best_estimator_

In [14]:
from sklearn.metrics import confusion_matrix


def grid_search_for_one_product(model_class, param_grid, X, Y, product_index):
    X_train_tmp = get_x_for_one_product(X, product_index)
    y = Y[:, product_index]
    
    grid_search = GridSearchCV(model_class(), param_grid, verbose=2, n_jobs=-1, scoring='neg_log_loss', refit=True)
    grid_search.fit(X_train_tmp, y)
    
    return grid_search.best_estimator_, grid_search.best_params_

def predict_for_one_product(model, X, product_index, proba=True):
    X_train_tmp = get_x_for_one_product(X, product_index)
    
    pred = model.predict_proba(X_train_tmp)[:, 1]
    pred = np.array(pred)
    pred[X[:, -N_PR + product_index] == 1] = 1
    
    return pred

def get_x_for_one_product(X, product_index):
    col_to_remove = -N_PR + product_index
    X_train_tmp = np.delete(X, col_to_remove, 1)
    
    return X_train_tmp

def score_for_one_product(model, X, Y, product_index):
    pred = predict_for_one_product(model, X, product_index)
    
    return log_loss(Y.values[:, product_index].astype('float64'), pred.astype('float64'))

def confusion_matrix_for_one_product(model, X, Y, product_index):
    X_tmp = get_x_for_one_product(X, product_index)
    pred = model.predict(X_tmp)
    
    print(confusion_matrix(Y[:, product_index], pred))

In [158]:
product_index = 1

param_grid = {
    'max_depth': [2, 3, 4],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 150, 200, 300],
    'subsample': [0.8, 1],
    'gamma': [0.1, 0],
    'reg_alpha ': [0, 0.1, 0.3],
    'n_jobs ': [-1],
    'random_state ': [42],
    'scale_pos_weight ': [1, (Y_train.values[:, product_index] == 1).sum() / (Y_train.values[:, product_index] == 0).sum()]
}

best_classifier, best_classifier_params = grid_search_for_one_product(XGBClassifier,
                                                                      param_grid,
                                                                      train_prepared,
                                                                      Y_train.values,
                                                                      product_index)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 27.7min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 44.8min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 70.5min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 92.5min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 121.3min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 154.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 193.0min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 235.0min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 281.8min finished


In [164]:
best_classifier_params

{'gamma': 0,
 'learning_rate': 0.2,
 'max_depth': 3,
 'n_estimators': 300,
 'n_jobs ': -1,
 'random_state ': 42,
 'reg_alpha ': 0,
 'scale_pos_weight ': 1,
 'subsample': 1}

In [189]:
score_for_one_product(best_classifier, train_prepared, Y_train, product_index)

0.04463866945650593

In [190]:
score_for_one_product(best_classifier, test_prepared, Y_test, product_index)

0.04841947222509766

In [192]:
confusion_matrix_for_one_product(best_classifier, train_prepared, Y_train.values, product_index)

[[20239   253]
 [  274  1083]]


In [193]:
confusion_matrix_for_one_product(best_classifier, test_prepared, Y_test.values, product_index)

[[6766   94]
 [  98  325]]


In [15]:
from sklearn.model_selection import cross_val_score

def scores_for_models(models, product_index):
    FTM = '{:<30}   {:<25}  {:<25}'
    print(FTM.format(*f'model mean std performance_gain'.split()))
    
    
    for model in models:
        X_tmp = get_x_for_one_product(full_data_prepared, product_index)

        score = cross_val_score(model, X_tmp, Y.values[:, product_index],
                                scoring='neg_log_loss', verbose=0, n_jobs=-1)
        mean = -score.mean()
        std = score.std()

        print(FTM.format(model.__class__.__name__, mean, std))

In [417]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

models = [
    SVC(probability=True),
    KNeighborsClassifier(),
    XGBClassifier(),
    GradientBoostingClassifier(), 
    RandomForestClassifier()
]

scores_for_models(models, product_index)

model                            mean                       std                      
SVC                              0.2676891174080283         0.15970208713752207      
KNeighborsClassifier             0.24846642658667273        0.02067590298642904      
XGBClassifier                    0.033255301131902254       0.0014557602670413746    
GradientBoostingClassifier       0.03509443247733283        0.0021850642751806595    
RandomForestClassifier           0.062193276160904996       0.011522046496813506     


In [416]:
product_index = 3

In [429]:
base_params = {
    'learning_rate': 0.3,
    'n_jobs ': -1,
    'random_state ': 42,
}

n_estimators_values = [100, 150, 200, 300, 400, 500]
scores_for_params(XGBClassifier, base_params, 'n_estimators', n_estimators_values, product_index)

n_estimators           mean                       std                        performance_gain         
100                    0.02722047704254462        0.001827477903095133       9999.972779522957        
150                    0.0259954789396318         0.002047083351987715       0.001224998102912822     
200                    0.025555109019793805       0.002336230103016028       0.00044036991983799345   
300                    0.02569496527197419        0.002450922094331987       -0.00013985625218038492  
400                    0.026361793206192707       0.0026634932905192695      -0.0006668279342185168   
500                    0.027005444251445715       0.0028577650710894346      -0.0006436510452530082   


In [430]:
base_params = {
    'n_estimators': 100,
    'learning_rate': 0.3,
    'n_jobs ': -1,
    'random_state ': 42,
}

max_depth_values = [1, 2, 3, 4, 5, 6, 7]
scores_for_params(XGBClassifier, base_params, 'max_depth', max_depth_values, product_index)

max_depth              mean                       std                        performance_gain         
1                      0.0359726379650966         0.0019947586841376013      9999.964027362035        
2                      0.03037986979950822        0.0016555536118216213      0.005592768165588381     
3                      0.02722047704254462        0.001827477903095133       0.0031593927569636       
4                      0.026126902865185642       0.002268672513537371       0.0010935741773589781    
5                      0.02557169909262625        0.0018495325228055882      0.0005552037725593921    
6                      0.02585585598241748        0.0018788013381550066      -0.00028415688979123097  
7                      0.02642923222964011        0.0019438095654037296      -0.0005733762472226282   


In [431]:
base_params = {
    'n_estimators': 100,
    'learning_rate': 0.3,
    'max_depth': 5,
    'n_jobs ': -1,
    'random_state ': 42,
}

min_child_weight_values = [1, 3, 5, 7, 9, 11]
scores_for_params(XGBClassifier, base_params, 'min_child_weight', min_child_weight_values, product_index)

min_child_weight       mean                       std                        performance_gain         
1                      0.02557169909262625        0.0018495325228055882      9999.974428300908        
3                      0.02660581476141279        0.001761990339127031       -0.0010341156687865387   
5                      0.02745815895508688        0.0014186574630358775      -0.0008523441936740921   
7                      0.029218537128766004       0.0016053093150832686      -0.0017603781736791231   
9                      0.030662990514156774       0.0014114801950817872      -0.0014444533853907697   
11                     0.03192305625064405        0.0017036709525742792      -0.0012600657364872786   


In [432]:
base_params = {
    'n_estimators': 100,
    'learning_rate': 0.3,
    'max_depth': 5,
    'min_child_weight': 1,
    'n_jobs ': -1,
    'random_state ': 42,
}

gamma_values = [0, 0.3, 0.6, 1, 2, 5]
scores_for_params(XGBClassifier, base_params, 'gamma', gamma_values, product_index)

gamma                  mean                       std                        performance_gain         
0                      0.02557169909262625        0.0018495325228055882      9999.974428300908        
0.3                    0.02531030940672003        0.00209804557454618        0.00026138968590622147   
0.6                    0.025479426526305617       0.0018095276941066996      -0.00016911711958558784  
1                      0.025795882970334694       0.001729856801156092       -0.0003164564440290771   
2                      0.02747926876839312        0.0018351261793901389      -0.0016833857980584274   
5                      0.031502820021404775       0.0016569462563722364      -0.0040235512530116535   


In [420]:
base_params = {
    'n_estimators': 300,
    'learning_rate': 0.2,
    'max_depth': 3,
    'n_jobs ': -1,
    'random_state ': 42,
}

learning_rate_values = [0.01, 0.03, 0.05, 0.07, 0.1, 0.15, 0.2, 0.3]
scores_for_params(XGBClassifier, base_params, 'learning_rate', learning_rate_values, product_index)

learning_rate          mean                       std                        performance_gain         
0.01                   0.058513440995184074       0.0005407584165690809      9999.941486559004        
0.03                   0.0339051917713909         0.0014509961819877814      0.024608249223793177     
0.05                   0.03112810843144953        0.0015226226280853695      0.0027770833399413664    
0.07                   0.029281428847920194       0.0016563384982012832      0.0018466795835293363    
0.1                    0.027525155038580336       0.0016101215008895804      0.0017562738093398582    
0.15                   0.026060636543819637       0.0017919500049894132      0.0014645184947606991    
0.2                    0.025579814583659117       0.0021064904430235093      0.00048082196016051987   
0.3                    0.02569496527197419        0.002450922094331987       -0.00011515068831507297  


In [422]:
base_params = {
    'n_estimators': 300,
    'max_depth': 3,
    'learning_rate': 0.2,
    'n_jobs ': -1,
    'random_state ': 42
}

subsample_values = [0.75, 0.8, 0.85, 0.9, 0.95, 1.0]
scores_for_params(XGBClassifier, base_params, 'subsample', subsample_values, product_index)

subsample              mean                       std                        performance_gain         
0.75                   0.025932026628175765       0.002400482955063481       9999.974067973371        
0.8                    0.02644747140532393        0.0020762001700241404      -0.0005154447771481656   
0.85                   0.026022178890539586       0.002324102443730788       0.00042529251478434435   
0.9                    0.0257752056481378         0.0022348586287827894      0.00024697324240178717   
0.95                   0.025537521331492723       0.0021156925687500646      0.00023768431664507664   
1.0                    0.025579814583659117       0.0021064904430235093      -4.229325216639421e-05   


In [390]:
base_params = {
    'n_estimators': 400,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 1,
    'n_jobs ': -1,
    'random_state ': 42
}

colsample_bytree_values = [0.43, 0.45, 0.47]
scores_for_params(XGBClassifier, base_params, 'colsample_bytree', colsample_bytree_values, product_index)

colsample_bytree       mean                       std                        performance_gain         
0.43                   0.014041724356128221       0.0014762387958161094      9999.985958275643        
0.45                   0.014012027735046279       0.0014089091350243893      2.9696621081942015e-05   
0.47                   0.014039129033591221       0.0015190153323775938      -2.71012985449421e-05    


In [391]:
base_params = {
    'n_estimators': 400,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 1,
    'colsample_bytree': 0.45,
    'n_jobs ': -1,
    'random_state ': 42
}

gamma_values = [0, 0.3, 0.6, 1, 2, 5]
scores_for_params(XGBClassifier, base_params, 'gamma', gamma_values, product_index)

gamma                  mean                       std                        performance_gain         
0                      0.014012027735046279       0.0014089091350243893      9999.985987972264        
0.3                    0.01414710333366707        0.0015585534946994694      -0.00013507559862079195  
0.6                    0.014209868955775376       0.0015537548173317028      -6.276562210830519e-05   
1                      0.014454189474159116       0.0015218171690944817      -0.0002443205183837397   
2                      0.01541322338782312        0.0015812520583829324      -0.0009590339136640036   
5                      0.017358889842922765       0.0012754263723559942      -0.0019456664550996459   


In [392]:
base_params = {
    'n_estimators': 400,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 1,
    'colsample_bytree': 0.45,
    'gamma': 0,
    'n_jobs ': -1,
    'random_state ': 42
}

min_child_weight_values = [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7]
scores_for_params(XGBClassifier, base_params, 'min_child_weight', min_child_weight_values, product_index)

min_child_weight       mean                       std                        performance_gain         
0.01                   0.013899147643690848       0.0018337749923535342      9999.986100852357        
0.05                   0.01386827174312944        0.001786319948504007       3.0875900561408084e-05   
0.1                    0.013844883498463467       0.0017776129256542742      2.3388244665972432e-05   
0.2                    0.013948261941392112       0.001602897832052705       -0.0001033784429286446   
0.3                    0.01402806527559385        0.0016039751012628313      -7.980333420173902e-05   
0.5                    0.014037658897000987       0.001598849008733256       -9.593621407136266e-06   
0.7                    0.014290238423488205       0.0014266734074982793      -0.0002525795264872177   


In [395]:
base_params = {
    'n_estimators': 400,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 1,
    'colsample_bytree': 0.45,
    'gamma': 0,
    'min_child_weight': 0.2, 
    'n_jobs ': -1,
    'random_state ': 42
}

reg_alpha_values = [0.35, 0.4, 0.45]
scores_for_params(XGBClassifier, base_params, 'reg_alpha', reg_alpha_values, product_index)

reg_alpha              mean                       std                        performance_gain         
0.35                   0.013757388584973402       0.0013940014459754323      9999.986242611414        
0.4                    0.013733622493375086       0.0013830909905114448      2.3766091598315456e-05   
0.45                   0.013849381979436675       0.0014500024847267474      -0.00011575948606158855  


In [396]:
base_params = {
    'n_estimators': 400,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 1,
    'colsample_bytree': 0.45,
    'gamma': 0,
    'min_child_weight': 0.2, 
    'reg_alpha': 0.4,
    'n_jobs ': -1,
    'random_state ': 42
}

reg_lambda_values = [0.1, 0.5, 0.8, 0.9, 1, 1.5, 3]
scores_for_params(XGBClassifier, base_params, 'reg_lambda', reg_lambda_values, product_index)

reg_lambda             mean                       std                        performance_gain         
0.1                    0.013873359506543444       0.0015062136002397927      9999.986126640493        
0.5                    0.013799119143872874       0.0014431526855419351      7.424036267057038e-05    
0.8                    0.013824560627789112       0.0014566834638338913      -2.544148391623799e-05   
0.9                    0.013798320141437007       0.0014317108455362201      2.6240486352104714e-05   
1                      0.013733622493375086       0.0013830909905114448      6.469764806192081e-05    
1.5                    0.013808054150046304       0.00144359708187996        -7.44316566712172e-05    
3                      0.014139821563303286       0.001334239528771931       -0.0003317674132569826   


In [397]:
base_params = {
    'n_estimators': 400,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 1,
    'colsample_bytree': 0.45,
    'gamma': 0,
    'min_child_weight': 0.2, 
    'reg_alpha': 0.4,
    'reg_lambda': 1,
    'n_jobs ': -1,
    'random_state ': 42
}

scale_pos_weight_values = [0.2, 0.25, 0.3, 0.4, 0.5, 0.8, 1, 1.5, 2, 5]
scores_for_params(XGBClassifier, base_params, 'scale_pos_weight', scale_pos_weight_values, product_index)

scale_pos_weight       mean                       std                        performance_gain         
0.2                    0.017826499472364622       0.0012646683123018925      9999.982173500528        
0.25                   0.016777856051140513       0.0013534114834006868      0.0010486434212241096    
0.3                    0.01613762152616101        0.001438336615986743       0.0006402345249795043    
0.4                    0.015323592412227896       0.0013670539938975077      0.000814029113933112     
0.5                    0.014916649825897483       0.001463479985403116       0.00040694258633041346   
0.8                    0.013926290398216051       0.0013768192272799525      0.0009903594276814316    
1                      0.013733622493375086       0.0013830909905114448      0.00019266790484096484   
1.5                    0.01379127391806879        0.0014820788642921842      -5.765142469370439e-05   
2                      0.013896383683025596       0.0017556175750745207  

In [12]:
def scores_for_params(model_class, base_params, param, param_values, product_index):
    FTM = '{:<20}   {:<25}  {:<25}  {:<25}'
    print(FTM.format(*f'{param} mean std performance_gain'.split()))
    
    model = model_class(**base_params)
    
    last_score = 10000
    
    for param_value in param_values:
        model.set_params(**{param: param_value})
        X_tmp = get_x_for_one_product(full_data_prepared, product_index)

        score = cross_val_score(model, X_tmp, Y.values[:, product_index],
                                scoring='neg_log_loss', verbose=0, n_jobs=-1)
        mean = -score.mean()
        std = score.std()

        print(FTM.format(param_value, mean, std, last_score - mean))
        last_score = mean

In [29]:
import pickle

model = pickle.load(open('best_model_so_far.pkl', 'rb'))

In [17]:
import pickle

best_params = pickle.load(open('best_params_so_far.pkl', 'rb'))

In [27]:
from sklearn.model_selection import  KFold
from xgboost import XGBClassifier


n_clusters_results = []
n_clusters_to_try = [80, 85] + list(range(100, 201, 10))

for n_clusters in n_clusters_to_try:
    print(f'n_clusters={n_clusters} cross validation started')
    
    product_classifier = ProductClassifier(XGBClassifier, params=best_params, verbose=False)
    k_fold = KFold(5, shuffle=True)
    
    scores = []
    
    for train_index, val_index in k_fold.split(full_data_prepared, Y.values):
        train_data_x = full_data_prepared[train_index, :]
        train_data_y = Y.values[train_index, :]
        val_data_x = full_data_prepared[val_index, :]
        val_data_y = Y.values[val_index, :]
        
        products_cluster = ProductsClusterAttribute(n_clusters)
        products_cluster.fit(train_data_y)
        
        train_data_x = np.hstack((train_data_x[:, :-N_PR], products_cluster.transform(train_data_x[:, -N_PR:])))
        val_data_x = np.hstack((val_data_x[:, :-N_PR], products_cluster.transform(val_data_x[:, -N_PR:])))
        
        product_classifier.fit(train_data_x, train_data_y)
        scores.append(product_classifier.score(val_data_x, val_data_y))
        
        print(f'n_clusters={n_clusters} training on fold finished')
    
    n_clusters_results.append((n_clusters, np.mean(scores), np.std(scores)))
    print(f'n_clusters={n_clusters} cross validation finished')


FRT = '{:<20}  {:<20}  {:<20}'
print(FRT.format(*'n_clusters log_loss_mean log_loss_std'.split()))
for result in n_clusters_results:
    print(FRT.format(*result))

n_clusters=80 cross validation started


NameError: name 'full_data_prepared' is not defined

In [27]:
from scipy import stats

print(stats.t.interval(0.95, 4, loc=0.03716961925381999, scale=0.000743562061527577/np.sqrt(5)), 
stats.t.interval(0.95, 4, loc=0.03722173330194854, scale=0.0006893754979451699/np.sqrt(5)))

(0.03624636501159102, 0.03809287349604895) (0.03636576056490623, 0.038077706038990855)


In [509]:
products_cluster = ProductsClusterAttribute(200)
products_cluster.fit(Y_train.values)

train_prepared_cluster = np.hstack((train_prepared[:, :-21], products_cluster.transform(train_prepared[:, -21:])))
test_prepared_cluster = np.hstack((test_prepared[:, :-21], products_cluster.transform(test_prepared[:, -21:])))

In [510]:
best_params_xgb = XGBClassifier(**base_params)

In [511]:
best_params_xgb.fit(get_x_for_one_product(train_prepared_cluster, product_index), Y_train.values[:, product_index])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              n_jobs =-1, nthread=None, objective='binary:logistic',
              random_state=0, random_state =42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, seed=None, silent=None, subsample=1,
              verbosity=1)

In [512]:
score_for_one_product(best_params_xgb, test_prepared_cluster, Y_test, product_index)

0.019358024466355282

In [427]:
score_for_one_product(model.internal_classifiers[product_index], test_prepared, Y_test, product_index)

0.019699571698539864

In [414]:
-cross_val_score(best_params_xgb, train_prepared, Y_train.values[:, product_index], scoring='neg_log_loss')

array([0.01320611, 0.01461401, 0.01483579, 0.01137726, 0.01621979])

In [415]:
-cross_val_score(model.internal_classifiers[product_index], train_prepared, Y_train.values[:, product_index], scoring='neg_log_loss')

array([0.01277334, 0.01426107, 0.01489099, 0.01098458, 0.01563331])

In [428]:
model.internal_classifiers[product_index].get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 8,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 150,
 'n_jobs': -1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'verbosity': 1,
 'eta': 0.2}

In [315]:
default_params = list(itertools.repeat({
    'n_estimators': 150,
    'max_depth': 3,
    'n_jobs': -1
}, N_PR))

In [76]:
best_params = model.get_internal_classifiers_params()

In [79]:
internal_classifiers = model.internal_classifiers

In [55]:
full_data_prepared = pipeline.transform(train.drop(['ID', 'join_date'], 1))

In [56]:
remove_ones(full_data_prepared)

In [57]:
products_cluster = ProductsClusterAttribute(80)
products_cluster.fit(Y.values)

full_data_prepared = np.hstack((full_data_prepared[:, :-21], products_cluster.transform(full_data_prepared[:, -21:])))

In [58]:
full_data_model = ProductClassifier(XGBClassifier, params=best_params)

In [59]:
full_data_model.fit(full_data_prepared, Y.values)

Internal classifier 0 training finished
Internal classifier 1 training finished
Internal classifier 2 training finished
Internal classifier 3 training finished
Internal classifier 4 training finished
Internal classifier 5 training finished
Internal classifier 6 training finished
Internal classifier 7 training finished
Internal classifier 8 training finished
Internal classifier 9 training finished
Internal classifier 10 training finished
Internal classifier 11 training finished
Internal classifier 12 training finished
Internal classifier 13 training finished
Internal classifier 14 training finished
Internal classifier 15 training finished
Internal classifier 16 training finished
Internal classifier 17 training finished
Internal classifier 18 training finished
Internal classifier 19 training finished
Internal classifier 20 training finished


ProductClassifier(internal_classifier_class=<class 'xgboost.sklearn.XGBClassifier'>,
                  param_grid=None,
                  params=[{'base_score': 0.5, 'booster': 'gbtree',
                           'colsample_bylevel': 1, 'colsample_bynode': 1,
                           'colsample_bytree': 1, 'eta': 0.2, 'gamma': 0,
                           'learning_rate': 0.1, 'max_delta_step': 0,
                           'max_depth': 3, 'min_child_weight': 1,
                           'missing': None, 'n_estimators': 100, 'n_jobs': -1,
                           'n...
                           'colsample_bylevel': 1, 'colsample_bynode': 1,
                           'colsample_bytree': 1, 'eta': 0.2, 'gamma': 0,
                           'learning_rate': 0.1, 'max_delta_step': 0,
                           'max_depth': 6, 'min_child_weight': 1,
                           'missing': None, 'n_estimators': 200, 'n_jobs': -1,
                           'nthread': None, 'objective

In [31]:
full_data_model.score(full_data_prepared, Y.values)

0.02917459687055274

In [528]:
import pickle

pickle.dump(full_data_model, open('best_model_so_far.pkl', 'wb'))

In [529]:
pickle.dump(best_params, open('best_params_so_far.pkl', 'wb'))

In [23]:
from xgboost import  XGBClassifier
import itertools

In [19]:
products_cluster = ProductsClusterAttribute(80)
products_cluster.fit(Y_train.values)

train_prepared_cluster = np.hstack((train_prepared[:, :-21], products_cluster.transform(train_prepared[:, -21:])))
test_prepared_cluster = np.hstack((test_prepared[:, :-21], products_cluster.transform(test_prepared[:, -21:])))

In [24]:
full_data_model = ProductClassifier(XGBClassifier, params=list(itertools.repeat({'max_depth':2, 'n_jobs': -1, 'n_estimators': 50}, N_PR)))

In [25]:
full_data_model.fit(train_prepared_cluster, Y_train.values)

Internal classifier 0 training finished
Internal classifier 1 training finished
Internal classifier 2 training finished
Internal classifier 3 training finished
Internal classifier 4 training finished
Internal classifier 5 training finished
Internal classifier 6 training finished
Internal classifier 7 training finished
Internal classifier 8 training finished
Internal classifier 9 training finished
Internal classifier 10 training finished
Internal classifier 11 training finished
Internal classifier 12 training finished
Internal classifier 13 training finished
Internal classifier 14 training finished
Internal classifier 15 training finished
Internal classifier 16 training finished
Internal classifier 17 training finished
Internal classifier 18 training finished
Internal classifier 19 training finished
Internal classifier 20 training finished


ProductClassifier(internal_classifier_class=<class 'xgboost.sklearn.XGBClassifier'>,
                  param_grid=None,
                  params=[{'max_depth': 2, 'n_estimators': 50, 'n_jobs': -1},
                          {'max_depth': 2, 'n_estimators': 50, 'n_jobs': -1},
                          {'max_depth': 2, 'n_estimators': 50, 'n_jobs': -1},
                          {'max_depth': 2, 'n_estimators': 50, 'n_jobs': -1},
                          {'max_depth': 2, 'n_estimators': 50, 'n_jobs': -1},
                          {'max_depth':...
                          {'max_depth': 2, 'n_estimators': 50, 'n_jobs': -1},
                          {'max_depth': 2, 'n_estimators': 50, 'n_jobs': -1},
                          {'max_depth': 2, 'n_estimators': 50, 'n_jobs': -1},
                          {'max_depth': 2, 'n_estimators': 50, 'n_jobs': -1},
                          {'max_depth': 2, 'n_estimators': 50, 'n_jobs': -1},
                          {'max_depth': 2, 'n_estimators'

In [31]:
full_data_model.score(test_prepared_cluster, Y_test.values)

0.05096616369788112

## Final model

In [63]:
full_data_prepared = pipeline.transform(train.drop(columns_to_drop, 1))
remove_ones(full_data_prepared)

In [64]:
products_cluster = ProductsClusterAttribute(80)
products_cluster.fit(Y.values)

full_data_prepared = np.hstack((full_data_prepared[:, :-21], products_cluster.transform(full_data_prepared[:, -21:])))

In [27]:
import pickle

final_model = pickle.load(open('best_model_so_far_2.pkl', 'rb'))

In [65]:
xgb_params = list(itertools.repeat({
    'max_depth': 2,
    'n_jobs': -1,
    'n_estimators': 100,
    'learning_rate': 0.1
}, N_PR))

simple_model = ProductClassifier(XGBClassifier, params=xgb_params, verbose=False)
simple_model.fit(full_data_prepared, Y.values)

ProductClassifier(internal_classifier_class=<class 'xgboost.sklearn.XGBClassifier'>,
                  param_grid=None,
                  params=[{'learning_rate': 0.1, 'max_depth': 2,
                           'n_estimators': 100, 'n_jobs': -1},
                          {'learning_rate': 0.1, 'max_depth': 2,
                           'n_estimators': 100, 'n_jobs': -1},
                          {'learning_rate': 0.1, 'max_depth': 2,
                           'n_estimators': 100, 'n_jobs': -1},
                          {'learning_rate': 0.1, 'max_depth': 2,
                           'n_esti...
                          {'learning_rate': 0.1, 'max_depth': 2,
                           'n_estimators': 100, 'n_jobs': -1},
                          {'learning_rate': 0.1, 'max_depth': 2,
                           'n_estimators': 100, 'n_jobs': -1},
                          {'learning_rate': 0.1, 'max_depth': 2,
                           'n_estimators': 100, 'n_jobs': -1},
         

In [66]:
full_data_prepared = np.hstack((simple_model.predict_proba(full_data_prepared), full_data_prepared))

In [32]:
final_model.fit(full_data_prepared, Y.values)

Internal classifier 0 training finished
Internal classifier 1 training finished
Internal classifier 2 training finished
Internal classifier 3 training finished
Internal classifier 4 training finished
Internal classifier 5 training finished
Internal classifier 6 training finished
Internal classifier 7 training finished
Internal classifier 8 training finished
Internal classifier 9 training finished
Internal classifier 10 training finished
Internal classifier 11 training finished
Internal classifier 12 training finished
Internal classifier 13 training finished
Internal classifier 14 training finished
Internal classifier 15 training finished
Internal classifier 16 training finished
Internal classifier 17 training finished
Internal classifier 18 training finished
Internal classifier 19 training finished
Internal classifier 20 training finished


ProductClassifier(internal_classifier_class=<class 'xgboost.sklearn.XGBClassifier'>,
                  param_grid=None,
                  params=[{'base_score': 0.5, 'booster': 'gbtree',
                           'colsample_bylevel': 1, 'colsample_bynode': 1,
                           'colsample_bytree': 1, 'eta': 0.2, 'gamma': 0,
                           'learning_rate': 0.1, 'max_delta_step': 0,
                           'max_depth': 3, 'min_child_weight': 1,
                           'missing': None, 'n_estimators': 100, 'n_jobs': -1,
                           'n...
                           'colsample_bylevel': 1, 'colsample_bynode': 1,
                           'colsample_bytree': 1, 'eta': 0.2, 'gamma': 0,
                           'learning_rate': 0.1, 'max_delta_step': 0,
                           'max_depth': 6, 'min_child_weight': 1,
                           'missing': None, 'n_estimators': 200, 'n_jobs': -1,
                           'nthread': None, 'objective

In [33]:
final_model.score(full_data_prepared, Y.values)

0.011893614510613505

In [67]:
xgb_final_params = list(itertools.repeat({
    'max_depth': 2,
    'n_jobs': -1,
    'n_estimators': 110,
    'learning_rate': 0.1
}, N_PR))

final_model = ProductClassifier(XGBClassifier, params=xgb_final_params)
final_model.fit(full_data_prepared, Y.values)

Internal classifier 0 training finished
Internal classifier 1 training finished
Internal classifier 2 training finished
Internal classifier 3 training finished
Internal classifier 4 training finished
Internal classifier 5 training finished
Internal classifier 6 training finished
Internal classifier 7 training finished
Internal classifier 8 training finished
Internal classifier 9 training finished
Internal classifier 10 training finished
Internal classifier 11 training finished
Internal classifier 12 training finished
Internal classifier 13 training finished
Internal classifier 14 training finished
Internal classifier 15 training finished
Internal classifier 16 training finished
Internal classifier 17 training finished
Internal classifier 18 training finished
Internal classifier 19 training finished
Internal classifier 20 training finished


ProductClassifier(internal_classifier_class=<class 'xgboost.sklearn.XGBClassifier'>,
                  param_grid=None,
                  params=[{'learning_rate': 0.1, 'max_depth': 2,
                           'n_estimators': 110, 'n_jobs': -1},
                          {'learning_rate': 0.1, 'max_depth': 2,
                           'n_estimators': 110, 'n_jobs': -1},
                          {'learning_rate': 0.1, 'max_depth': 2,
                           'n_estimators': 110, 'n_jobs': -1},
                          {'learning_rate': 0.1, 'max_depth': 2,
                           'n_esti...
                           'n_estimators': 110, 'n_jobs': -1},
                          {'learning_rate': 0.1, 'max_depth': 2,
                           'n_estimators': 110, 'n_jobs': -1},
                          {'learning_rate': 0.1, 'max_depth': 2,
                           'n_estimators': 110, 'n_jobs': -1},
                          {'learning_rate': 0.1, 'max_depth': 2,
         

In [68]:
final_model.score(full_data_prepared, Y.values)

0.02923096124809288

## Test set

In [11]:
def submission_format(data, ids, columns=Y.columns):
    submission = pd.DataFrame(data=data, columns=Y.columns)
    submission['ID'] = ids
    submission = pd.melt(submission, id_vars=['ID'], value_vars=Y.columns, var_name="PCODE", value_name="Label")
    submission['ID X PCODE'] = submission['ID'] + ' X ' + submission['PCODE']
    return submission[['ID X PCODE', 'Label']]

In [12]:
test = pd.read_csv('./dataset/Test.csv')

In [70]:
test_final_prepared = pipeline.transform(test.drop(columns_to_drop, 1))

In [71]:
test_final_prepared = np.hstack((test_final_prepared[:, :-21], products_cluster.transform(test_final_prepared[:, -21:])))

In [72]:
test_final_prepared = np.hstack((simple_model.predict_proba(test_final_prepared), test_final_prepared))

In [73]:
test_predictions = final_model.predict_proba(test_final_prepared)

In [74]:
submission = submission_format(test_predictions, test['ID'])

In [75]:
submission.to_csv('results_4.csv', index=False)

## Exploration

In [None]:
from sklearn.cluster import KMeans

inert = []
clusters = range(2, 30)
for n_clusters in clusters:
    kmeans = KMeans(n_clusters)
    kmeans.fit(train_prepared[:, -21:])
    inert.append(kmeans.inertia_)

plt.plot(clusters, inert)

In [40]:
def upsample(X, Y):
    new_X = []
    new_Y = []
    
    for x_row, y_row in zip(X, Y):
        ones_indices = np.where(y_row == 1)[0]
        
        for i in ones_indices:
            x_new_row = np.copy(x_row)
            x_new_row[-N_PR + i] = 0
            
            new_X.append(x_new_row)
            new_Y.append(np.copy(y_row))
    
    return np.array(new_X), np.array(new_Y)

def upsample_for_one_product(X, Y, product_index):
    new_X = []
    new_y = []
    
    y = Y[:, product_index]
    for x_row, y in zip(X, y):
        if y == 1:
            new_X.append(np.copy(x_row))
            new_y.append(1)
        else:
            ones_indices = np.where(x_row[-N_PR:] == 1)[0]
            for i in ones_indices:
                x_new_row = np.copy(x_row)
                x_new_row[-N_PR + i] = 0
                
                new_X.append(x_new_row)
                new_y.append(0)
    
    return np.array(new_X), np.array(new_y)

In [65]:
from sklearn.model_selection import cross_val_score


def test_join_data_attribute(X, Y, model):
    data = pipeline.transform(X.drop(columns_to_drop, 1))
    remove_ones(data)
    model.verbose = False
    
    data_timestamps = pd.to_datetime(X['join_date'])
    year = pd.DatetimeIndex(data_timestamps).year.values.reshape(-1, 1)
    month = pd.DatetimeIndex(data_timestamps).month.values.reshape(-1, 1)
    day = pd.DatetimeIndex(data_timestamps).day.values.reshape(-1, 1)
    
    data_with_year = np.hstack((year, data))
    data_with_month = np.hstack((month, data))
    data_with_day = np.hstack((day, data))
    data_with_ymd = np.hstack((year, month, day, data))
    
    descriptions = ['Without year, month, day', 'With year',
                    'With month', 'With day', 'With everything']
    data_types = [data, data_with_year, data_with_month, data_with_day, data_with_ymd]
    
    
    for desc, d in zip(descriptions, data_types):
        scores = cross_val_score(model, d, Y.values, cv=7, n_jobs=-1)
        print(f'{desc}: mean={scores.mean()} std={scores.std()}')
    
test_join_data_attribute(train, Y, model)

Without year, date, month: mean=-0.04553042340288534 std=-0.0008292302638647557
With year: mean=-0.04052095208559906 std=-0.0007456094676233196
With month: mean=-0.045532192298556914 std=-0.0008314816359417681
With day: mean=-0.04530110197255082 std=-0.0008091571176300878
With everything: mean=-0.03942802745108379 std=-0.0006775761791297971


In [81]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score


def test_one_hot_vs_ordinal(X, Y, model):
    data_one_hot = pipeline.transform(X.drop(columns_to_drop, 1))
    remove_ones(data_one_hot)
    
    ordinal_pipeline = ColumnTransformer([
        ('encoder', OrdinalEncoder(), ['sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code']),
        ('year_month_day', YearMonthDayAttributes(), ['join_date'])
    ], remainder='passthrough', sparse_threshold=0.)
    
    data_ordinal = ordinal_pipeline.fit_transform(X.drop(columns_to_drop, 1))
    remove_ones(data_ordinal)
    
    for desc, data in zip(['One hot encoding', 'Ordinal encoding'], [data_one_hot, data_ordinal]):
        scores = cross_val_score(model, data, Y.values, cv=5, n_jobs=-1)
        print(f'{desc}: mean={scores.mean()} std={scores.std()}')

test_one_hot_vs_ordinal(train, Y, model)

One hot encoding: mean=0.03952546360993871 std=0.0008606107670312565
Ordinal encoding: mean=0.03977522651709635 std=0.0009113774705102566


In [46]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import itertools


def test_stacking(X, Y, model_class, model_params):
    fmt = '{:<20} {:<20}'
    print(fmt.format('model', 'score'))
    
    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2)
    
    X_train = pipeline.transform(X_train.drop(columns_to_drop, 1))
    X_val = pipeline.transform(X_val.drop(columns_to_drop, 1))
    
    remove_ones(X_train)
    remove_ones(X_val)
    
    products_cluster = ProductsClusterAttribute(80)
    products_cluster.fit(Y_train)

    X_train = np.hstack((X_train[:, :-21], products_cluster.transform(X_train[:, -21:])))
    X_val = np.hstack((X_val[:, :-21], products_cluster.transform(X_val[:, -21:])))
    
    #regular_model
    regular_model = ProductClassifier(model_class, params=model_params, verbose=False)
    regular_model.fit(X_train, Y_train)
    print(fmt.format('Regular model train', regular_model.score(X_train, Y_train)))
    print(fmt.format('Regular model val', regular_model.score(X_val, Y_val)))
    
    #stacking
    X_train_stacked = np.hstack((regular_model.predict_proba(X_train, set_ones_from_x=False), X_train))
    X_val_stacked = np.hstack((regular_model.predict_proba(X_val, set_ones_from_x=False), X_val))
    
    for level in range(1, 7):
        stacking_model = ProductClassifier(model_class, params=model_params, verbose=False)
        stacking_model.fit(X_train_stacked, Y_train)
        
        train_score = stacking_model.score(X_train_stacked, Y_train)
        val_score = stacking_model.score(X_val_stacked, Y_val)
        
        print(fmt.format(f'Stack level {level} train', train_score))
        print(fmt.format(f'Stack level {level} val', val_score))
        
        train_predictions = stacking_model.predict_proba(X_train_stacked, set_ones_from_x=False)
        val_predictions = stacking_model.predict_proba(X_val_stacked, set_ones_from_x=False)
        
        X_train_stacked = np.hstack((train_predictions, X_train_stacked))
        X_val_stacked = np.hstack((val_predictions, X_val_stacked))
        

xgb_params = list(itertools.repeat({
    'max_depth': 2,
    'n_jobs': -1,
    'n_estimators': 50,
    'learning_rate': 0.1,
}, N_PR))

test_stacking(train, Y.values, XGBClassifier, xgb_params)

model                score               
Regular model train  0.047476564132147056
Regular model val    0.04974279389545674 
Stack level 1 train  0.0381584146696273  
Stack level 1 val    0.04129655125188827 
Stack level 2 train  0.035541379614598724
Stack level 2 val    0.03932487556157789 
Stack level 3 train  0.03397934411240745 
Stack level 3 val    0.03853234726125814 
Stack level 4 train  0.03300187263145432 
Stack level 4 val    0.0382805374592969  
Stack level 5 train  0.0323187153214861  
Stack level 5 val    0.038114127555309574
Stack level 6 train  0.031755163550807604
Stack level 6 val    0.03821453398455538 
