In [None]:
# main libraries
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Global matplotlib defaults: font size of axis labels and of x/y tick labels
# for every figure created after this cell.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import os, sys, shelve, string, time, datetime

In [None]:
#sklearn
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, Normalizer

# NOTE: the OrdinalEncoder and OneHotEncoder imports below rebind the sklearn
# names imported above; after this cell both names refer to the
# category_encoders classes.
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.backward_difference import BackwardDifferenceEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.hashing import HashingEncoder
from category_encoders.binary import BinaryEncoder
from category_encoders.polynomial import PolynomialEncoder

In [None]:
#support function
from copy import deepcopy
from typing import List


class DoubleValidationEncoderNumerical:
    """
    Out-of-fold wrapper around a supervised encoder.

    ``fit_transform`` fits an independent copy of ``encoder`` on the training
    part of every fold and encodes only the matching validation part, so no row
    is encoded by an encoder that saw its own target value. ``transform``
    averages the outputs of all per-fold encoders.
    """
    def __init__(self, cols: List, encoder, folds):
        """
        :param cols: Categorical columns (stored for reference; not read by
            this class itself)
        :param encoder: Encoder *instance* exposing ``fit_transform(X, y)`` and
            ``transform(X)``; it is deep-copied per fold and never fitted itself
        :param folds: Splitter exposing ``split(X, y)`` that yields
            ``(train_idx, val_idx)`` positional index arrays
        """
        self.cols = cols
        self.encoder = encoder
        self.encoders_dict = {}
        self.folds = folds

    def fit_transform(self, X: pd.DataFrame, y: np.array) -> pd.DataFrame:
        """
        Fit one encoder per fold and return the out-of-fold encoding of ``X``.

        :param X: Feature frame
        :param y: Target as a pandas object or any array-like of the same length
        :return: Frame with one row per row of ``X``. Rows that fall in several
            validation folds are averaged; rows that fall in none stay at 0.
        :raises ValueError: if ``folds.split`` yields no split at all
        """
        X = X.reset_index(drop=True)
        # Accept plain arrays/lists as well as pandas targets.
        if hasattr(y, "reset_index"):
            y = y.reset_index(drop=True)
        else:
            y = pd.Series(np.asarray(y))

        # Drop encoders left over from a previous fit.
        self.encoders_dict = {}
        cols_representation = None
        out_columns = None
        hits = np.zeros(X.shape[0])

        for n_fold, (train_idx, val_idx) in enumerate(self.folds.split(X, y)):
            X_train = X.iloc[train_idx].reset_index(drop=True)
            X_val = X.iloc[val_idx].reset_index(drop=True)
            # Reset the target index too so X_train and y_train carry the same
            # row labels (some encoders compare the two indexes).
            y_train = y.iloc[train_idx].reset_index(drop=True)

            # A fresh copy per fold: storing the shared instance would leave
            # every dictionary entry pointing at the last-fitted encoder.
            fold_encoder = deepcopy(self.encoder)
            fold_encoder.fit_transform(X_train, y_train)

            # transform validation part and get all necessary cols
            val_t = fold_encoder.transform(X_val)
            val_values = np.asarray(val_t, dtype=float)

            if cols_representation is None:
                cols_representation = np.zeros((X.shape[0], val_values.shape[1]))
                # Prefer the encoder's own column labels; it may add or drop columns.
                out_columns = getattr(val_t, "columns", X.columns)

            self.encoders_dict[n_fold] = fold_encoder

            cols_representation[val_idx, :] += val_values
            hits[val_idx] += 1

        if cols_representation is None:
            raise ValueError("`folds.split` produced no train/validation splits")

        # Average rows that were validated more than once (repeated splitters).
        cols_representation /= np.maximum(hits, 1)[:, None]

        return pd.DataFrame(cols_representation, columns=out_columns)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Encode unseen data as the mean of all per-fold encoders' outputs.

        :param X: Feature frame with the columns the encoders were fitted on
        :return: Averaged encoding, one row per row of ``X``
        :raises RuntimeError: if called before ``fit_transform``
        """
        if not self.encoders_dict:
            raise RuntimeError("transform called before fit_transform: no fitted encoders")

        X = X.reset_index(drop=True)

        total = None
        out_columns = None

        for encoder in self.encoders_dict.values():
            test_tr = encoder.transform(X)
            values = np.asarray(test_tr, dtype=float)

            if total is None:
                total = np.zeros(values.shape)
                out_columns = getattr(test_tr, "columns", X.columns)

            total += values

        # Divide by the number of fitted encoders rather than folds.n_splits,
        # which not every splitter exposes.
        return pd.DataFrame(total / len(self.encoders_dict), columns=out_columns)

In [None]:
# data-loader
from zipfile import ZipFile
def loader(path, index_col=False, encoding=None):
    """
    Read every CSV found in a folder (plain or inside zip archives) into a dict.

    Parameters
    ----------
    path: path to the folder with the data
        String

    index_col: Column to use as the row labels of the DataFrame, either given
    as string name or column index. If a sequence of int / str is given, a
    MultiIndex is used. index_col=False forces pandas not to use the first
    column as the index, e.g. when a malformed file has delimiters at the end
    of each line.
        int, str, sequence of int / str, or False, default False

    encoding: text encoding used to decode the CSV data. None keeps the
    previous behaviour (platform default for plain files, pandas default for
    zipped members).
        str or None, default None

    Returns
    -------
    dict mapping file name without extension -> pd.DataFrame. For zipped
    members the key is the member name without extension. Folder entries are
    processed in sorted name order, so the key order is reproducible.
    """
    data_dict = {}
    # os.listdir returns entries in arbitrary order; sort them so that code
    # relying on the dict order behaves the same on every run and machine.
    for entry in sorted(os.listdir(path)):
        full_path = os.path.join(path, entry)
        extension = os.path.splitext(entry)[1]
        if extension == ".zip":
            with ZipFile(full_path, 'r') as archive:
                for member in archive.namelist():
                    if member.endswith('.csv'):
                        with archive.open(member) as handle:
                            key = os.path.splitext(member)[0]
                            data_dict[key] = pd.read_csv(handle, index_col=index_col, encoding=encoding)
        elif extension == ".csv":
            # The handle is already decoded text, so the encoding goes to open().
            with open(full_path, 'r', encoding=encoding) as handle:
                key = os.path.splitext(entry)[0]
                data_dict[key] = pd.read_csv(handle, index_col=index_col)
    return data_dict

In [None]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """
    Downcast numeric columns to the smallest dtype that holds their value range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched. Integer columns try int8 -> int16 -> int32 -> int64; float
    columns try float16 -> float32 and otherwise become float64. Bounds are
    inclusive, so a column reaching exactly a type's min or max still fits it.

    Note: only the value *range* is checked. Downcasting floats (especially to
    float16) reduces precision.

    Parameters
    ----------
    df: pandas data frame, modified in place
        pd.DataFrame object

    verbose: print the resulting size and the percentage saved
        bool, default True

    Returns
    -------
    The same ``df`` object with converted columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No break (e.g. empty column, min/max are NaN): leave column as is.
        else:
            for candidate in (np.float16, np.float32):
                bounds = np.finfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN column.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio for a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# search for unique ordered values
def searchFunc(data, *cols):
    """
    Build value -> rank mappings for ordered feature encoding.

    For every requested column the distinct non-missing values are sorted in
    ascending order and numbered from 0. Missing values (NaN/None) are left out
    of the mapping because they have no defined position in the ordering.

    Parameters
    ----------
    data: pandas data frame
        pd.DataFrame object

    cols: names of the columns to build a mapping for
        str, passed as separate positional arguments

    Returns
    -------
    list of dicts, one ``{value: index}`` dict per column in ``cols``, in the
    same order. Empty list when no column is given.
    """
    full_map = []
    for col in cols:
        # De-duplicate first, then sort only the distinct values.
        ordered_values = sorted(data[col].dropna().unique())
        full_map.append({value: rank for rank, value in enumerate(ordered_values)})
    return full_map

### dump from this point

In [None]:
# Read every CSV (plain or zipped) found in ../input into a dict of DataFrames.
data = loader(os.path.realpath('../input'))
# Show which datasets were found (keys are file names without extension).
data.keys()

In [None]:
# NOTE(review): positional unpacking requires loader() to return exactly four
# frames, in this order. Confirm against data.keys() above, or select the
# frames by key instead.
df_train_x, df_train_y, df_test_x, df_test_y = data.values()
# Drop the dict reference; the four frames stay bound to the names above.
del data

In [None]:
# Dump loaded and prepared data
# Persists the train/test feature frames in a shelve file under ../kernels.
# NOTE(review): this cell runs before reduce_mem_usage is applied, so the
# frames are stored as loaded; df_train_y / df_test_y are not stored.
with shelve.open(os.path.realpath('../kernels/loaded_data')) as s:
    s["df_train"] = df_train_x
    s["df_test"] = df_test_x

In [None]:
# Downcast the numeric columns of the training features to smaller dtypes
# (reduce_mem_usage also modifies the frame in place and prints the saving).
df_train_x = reduce_mem_usage(df_train_x)

### to this point

In [None]:
# Dump open (prepared dataset)
# Restores the feature frames written to the shelve file by the dump cell.
# NOTE(review): the frames are bound to df_train / df_test, while the cells
# below read df_train_x, df_test_x and df_train_y. Confirm which names the
# rest of the notebook expects when the dump section above is skipped.
with shelve.open(os.path.realpath('../kernels/loaded_data')) as o:
    df_train = o["df_train"]
    df_test = o["df_test"]

In [None]:
# preprocessing pipeline
# Three steps applied in order: replace NaN with the constant 0, standardise
# with StandardScaler, then apply Normalizer.
# Requires SimpleImputer (sklearn.impute) and Normalizer (sklearn.preprocessing)
# to be imported before this cell runs.
pipePre = Pipeline([
    ('simpleimputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ('standardscaler', StandardScaler()),
    ('normalizer', Normalizer())
     ])

In [None]:
# Fit the preprocessing steps on the training features only, then reuse the
# fitted steps for the test features. Calling fit_transform on the test set
# would refit the pipeline on test data, so train and test would be scaled
# with different parameters.
df_train_x = pipePre.fit_transform(df_train_x)
df_test_x = pipePre.transform(df_test_x)
# The transformed arrays are deliberately kept: the train/test split below
# reads df_train_x, so deleting it here would raise NameError.

In [None]:
# Hold out 25% of the training data for evaluation; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df_train_x, df_train_y, test_size = 0.25, random_state=42)
# Read the row counts via shape[0] instead of unpacking into `_`: assigning to
# `_` at notebook top level stops IPython from updating its last-output
# variables, and tuple unpacking would also fail for non 2-D inputs.
N_train = X_train.shape[0]
N_test = X_test.shape[0]
print(N_train, N_test)