In [None]:
#main

import numpy as np
import pandas as pd
import scipy as sp
import os
import sys
import shelve 
import string
import time
import datetime
import re

In [None]:
# support

# csv to pd loader
from corelib.utilities import loader

# reduce memory usage for pd-index
from corelib.utilities import reduce_mem_usage

# searching for unique ordered values
from corelib.utilities import search_func

In [None]:
#sklearn

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [None]:
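#encoders (category_encoders)
# NOTE: OrdinalEncoder and OneHotEncoder imported in this cell rebind the
# sklearn.preprocessing classes of the same name imported in the sklearn cell.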

from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.backward_difference import BackwardDifferenceEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.hashing import HashingEncoder
from category_encoders.binary import BinaryEncoder
from category_encoders.polynomial import PolynomialEncoder

In [None]:
#double validation

from copy import deepcopy
from typing import List
class DoubleValidationEncoderNumerical:
    """
    Out-of-fold wrapper around a supervised categorical encoder.

    ``fit_transform`` fits an independent copy of ``encoder`` on the training
    part of every fold and encodes only that fold's held-out rows with it, so
    no row is encoded by an encoder that saw the row's own target value.
    ``transform`` averages the outputs of all per-fold encoders.
    """
    def __init__(self, cols: List, encoder, folds):
        """
        :param cols: Categorical columns (only stored; the encoder always
            receives the full frame and must select its own columns)
        :param encoder: Unfitted encoder instance exposing
            ``fit_transform(X, y)`` and ``transform(X)``; it serves as a
            prototype and is deep-copied once per fold
        :param folds: Splitter exposing ``split(X, y)`` that yields
            ``(train_idx, val_idx)`` pairs of positional row indices
        """
        self.cols = cols
        self.encoder = encoder
        self.encoders_dict = {}
        self.folds = folds

    @staticmethod
    def _output_columns(X: pd.DataFrame, encoded: pd.DataFrame):
        """Keep the input labels when the width is unchanged, else use the encoder's own."""
        if encoded.shape[1] == X.shape[1]:
            return X.columns
        return encoded.columns

    def fit_transform(self, X: pd.DataFrame, y: np.array) -> pd.DataFrame:
        """
        Fit one encoder per fold and return the out-of-fold encoding of ``X``.

        :param X: Feature frame
        :param y: Target values aligned with the rows of ``X`` (Series or
            any 1-D array-like)
        :return: Frame with one row per row of ``X``. Rows that never appear
            in a validation part stay at 0; rows that appear in several
            validation parts hold the sum of their encodings.
        :raises ValueError: if the splitter yields no folds
        """
        X = X.reset_index(drop=True)
        # Plain arrays have no reset_index; wrapping in a Series accepts both.
        y = pd.Series(y).reset_index(drop=True)

        # Start clean so a refit with fewer folds leaves no stale encoders.
        self.encoders_dict = {}
        cols_representation = None
        out_columns = None

        for n_fold, (train_idx, val_idx) in enumerate(self.folds.split(X, y)):
            X_train = X.iloc[train_idx].reset_index(drop=True)
            X_val = X.iloc[val_idx].reset_index(drop=True)
            # Reset y as well: X_train and y_train must share one index,
            # otherwise index-aligned encoders misalign or reject the pair.
            y_train = y.iloc[train_idx].reset_index(drop=True)

            # A fresh copy per fold; refitting one shared instance would make
            # every stored entry point at the encoder fitted on the last fold.
            fold_encoder = deepcopy(self.encoder)
            fold_encoder.fit_transform(X_train, y_train)

            # transform validation part and get all necessary cols
            val_t = pd.DataFrame(fold_encoder.transform(X_val))

            if cols_representation is None:
                cols_representation = np.zeros((X.shape[0], val_t.shape[1]))
                out_columns = self._output_columns(X, val_t)

            self.encoders_dict[n_fold] = fold_encoder

            cols_representation[val_idx, :] += np.asarray(val_t, dtype=float)

        if cols_representation is None:
            raise ValueError("folds.split(X, y) produced no folds; nothing was fitted")

        return pd.DataFrame(cols_representation, columns=out_columns)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Encode ``X`` with every per-fold encoder and average the results.

        :param X: Feature frame with the columns seen during ``fit_transform``
        :return: Frame holding the mean encoding over all fitted fold encoders
        :raises ValueError: if ``fit_transform`` has not been called yet
        """
        if not self.encoders_dict:
            raise ValueError("no fitted encoders available; call fit_transform first")

        X = X.reset_index(drop=True)

        # Divide by the number of encoders actually fitted rather than
        # folds.n_splits, which not every splitter exposes.
        n_encoders = len(self.encoders_dict)
        cols_representation = None
        out_columns = None

        for encoder in self.encoders_dict.values():
            test_tr = pd.DataFrame(encoder.transform(X))

            if cols_representation is None:
                cols_representation = np.zeros(test_tr.shape)
                out_columns = self._output_columns(X, test_tr)

            cols_representation = cols_representation + np.asarray(test_tr, dtype=float) / n_encoders

        return pd.DataFrame(cols_representation, columns=out_columns)

In [None]:
#paths

# Resolve both locations to absolute paths once, relative to the kernel's
# working directory at the time this cell runs.
#   data_path - location handed to the loader
#   dump_path - file path handed to shelve.open for the cached frames
data_path, dump_path = (
    os.path.realpath(relative_path)
    for relative_path in ('../input', '../kernels/loaded_data')
)

### dump from this point

In [None]:
# Load everything the project loader finds under data_path.
# NOTE(review): presumably one DataFrame per CSV file — confirm in corelib.utilities.
data = loader(data_path)
# Display the keys of the loaded mapping as the cell output.
data.keys()

In [None]:
# Unpack the loaded objects by position; this requires exactly four values and
# relies on data.values() iterating as (train_x, train_y, test_x, test_y).
# NOTE(review): confirm that order against the keys shown by the loader cell —
# a different order would silently swap the frames.
df_train_x, df_train_y, df_test_x, df_test_y = data.values()
# Remove the container name; the four names above still reference the data.
del data

In [None]:
# Cache the train/test feature frames in the shelve file at dump_path so a
# later session can reload them without running the loader again.
# NOTE(review): the target frames (df_train_y, df_test_y) are not stored here.
with shelve.open(dump_path) as dump_shelf:
    dump_shelf["df_train"] = df_train_x
    dump_shelf["df_test"] = df_test_x

In [None]:
# Rebind df_train_x to the output of reduce_mem_usage.
# NOTE(review): presumably this downcasts column dtypes — confirm in corelib.utilities.
# Only the training frame goes through this step; df_test_x is left as loaded.
df_train_x = reduce_mem_usage(df_train_x)

### to this point

In [None]:
# Reload the cached feature frames stored under "df_train" / "df_test" in the
# shelve file at dump_path.
# NOTE(review): the cells visible after this one read df_train_x, df_test_x and
# df_train_y, not df_train / df_test — confirm which names are intended.
with shelve.open(dump_path) as dump_shelf:
    df_train = dump_shelf["df_train"]
    df_test = dump_shelf["df_test"]

In [None]:
# Preprocessing pipeline; the steps run in this order:
#   1. SimpleImputer  - replaces NaN with the constant 0
#   2. StandardScaler - standardises each feature using statistics learned in fit
#   3. Normalizer     - rescales each sample (row) to unit norm
# Step names are the lowercase class names, so nested parameters are addressed
# as e.g. 'standardscaler__with_mean'.
pipePre = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ('standardscaler', StandardScaler()),
    ('normalizer', Normalizer()),
])

In [None]:
# Learn the preprocessing statistics from the training features only, then
# apply the already-fitted pipeline to the test features. Re-fitting on the
# test set would scale it with its own statistics and make the two sets
# inconsistent.
df_train_x = pipePre.fit_transform(df_train_x)
df_test_x = pipePre.transform(df_test_x)
# The transformed arrays are kept: the split cell below reads df_train_x, so
# deleting the names here would raise NameError there.
# NOTE: re-running this cell feeds already-transformed data back through the
# pipeline; re-run from the load cell instead.

In [None]:
# Hold out 25% of the preprocessed training data for validation; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_train_x, df_train_y, test_size=0.25, random_state=42
)
# Row counts of the two parts. Read shape[0] directly instead of unpacking into
# `_`, which would rebind the name IPython uses for the last cell output.
N_train = X_train.shape[0]
N_test = X_test.shape[0]
print(N_train, N_test)