In [None]:
# import packages (write a macro to import all of these at once later)
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sb
import h5py
import time

from itertools import product
from sklearn.model_selection import KFold
from sklearn.cross_validation import StratifiedKFold
import sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import catboost as cat

# import undersampling methods
from imblearn.under_sampling import (ClusterCentroids, RandomUnderSampler,
                                     NearMiss, InstanceHardnessThreshold,
                                     CondensedNearestNeighbour,
                                     EditedNearestNeighbours,
                                     RepeatedEditedNearestNeighbours,
                                     AllKNN, OneSidedSelection,
                                     NeighbourhoodCleaningRule)

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
import sklearn.preprocessing


"""
this is an sklearn wrapper for XGBoost. 
This allows us to use sklearn's
Grid Search with parallel processing 
in the same way we did for GBM
"""
from xgboost.sklearn import XGBClassifier as xgb_sk
from xgboost import XGBClassifier as xgb_xgb
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

<h1><center> Mean Encoding </center></h1>

Mean encoding may be referred to as likelihood encoding or target encoding.

Mean encoding gives the model more information to work with and usually helps it reach a better loss.
If you have a large data set, the K-fold scheme below should work fine on its own. If your data set is small, you may want to add another regularization method on top of it, such as smoothing.

You can use stratified K-fold if you want.

Finally, only two encodings are implemented here, ```mean``` and ```count```. You can add other aggregations as well.

<font color='red'>**Warning**</font> `regularize_cv` applies its changes in place (`apply_train_encode_on_val` works on a copy of the frame it encodes). If you do not want your original DataFrame to change, make a copy of it!

In [None]:
def regularize_cv(df_tr, cols, target, method, no_folds=5, rand_state=100):
    """Add out-of-fold target encodings for ``cols`` to ``df_tr`` in place.

    The rows are split into ``no_folds`` consecutive (unshuffled) folds. For
    each fold, the per-category statistic of ``target`` is computed on the
    remaining folds and mapped onto the held-out rows, so that no row is
    encoded with its own target value (this prevents target leakage).

    Parameters
    ----------
    df_tr : pandas.DataFrame
        Training data containing ``cols`` and ``target``. Modified in place.
    cols : iterable of str
        Names of the columns to encode.
    target : str
        Name of the target column.
    method : {'mean', 'count'}
        'mean' stores the per-category target mean in ``<col>_mean_target``;
        'count' stores the per-category target sum in ``<col>_count_target``.
    no_folds : int, default 5
        Number of folds.
    rand_state : int, default 100
        Kept for backward compatibility only. It has no effect because the
        folds are not shuffled.

    Returns
    -------
    pandas.DataFrame
        ``df_tr`` itself, with one new float column per entry of ``cols``.

    Raises
    ------
    ValueError
        If ``method`` is neither 'mean' nor 'count'.
    """
    if method == 'mean':
        aggregate = 'mean'
        # fallback for categories unseen in the training folds: global mean
        prior = df_tr[target].mean()
    elif method == 'count':
        aggregate = 'sum'
        # NOTE(review): the fallback is the number of non-null target rows,
        # as in the original implementation; 0 may be a more natural value
        # for a category absent from the training folds -- confirm.
        prior = df_tr[target].count()
    else:
        raise ValueError(
            "method must be 'mean' or 'count', got %r" % (method,))

    # random_state is deliberately not passed: it is meaningless without
    # shuffling and recent scikit-learn releases reject the combination.
    kf = KFold(n_splits=no_folds, shuffle=False)

    # Collect the encodings in plain float arrays instead of writing whole
    # rows back into the frame, so the dtypes of existing columns are kept.
    encoded = {col: np.full(len(df_tr), np.nan) for col in cols}
    for tr_ind, val_ind in kf.split(df_tr):
        fold_tr = df_tr.iloc[tr_ind]
        fold_val = df_tr.iloc[val_ind]
        for col in cols:
            stats = fold_tr.groupby(col)[target].agg(aggregate)
            encoded[col][val_ind] = np.asarray(fold_val[col].map(stats),
                                               dtype=float)

    for col, values in encoded.items():
        # Fill the prior into the new column only; NaNs in the caller's
        # other columns must not be overwritten.
        values[np.isnan(values)] = prior
        df_tr[col + '_' + method + '_target'] = values
    return df_tr

In [None]:
def apply_train_encode_on_val(train_df, val_df, cols, method, target):
    """Encode ``cols`` of ``val_df`` with target statistics from ``train_df``.

    Every statistic, including the fallback used for categories that do not
    occur in ``train_df``, is computed from ``train_df`` only. ``val_df``
    therefore does not need to contain the ``target`` column.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame containing ``cols`` and ``target``; it is not modified.
    val_df : pandas.DataFrame
        Frame to encode; it must contain ``cols``. It is not modified.
    cols : iterable of str
        Names of the columns to encode.
    method : {'mean', 'count'}
        'mean' stores the per-category target mean in ``<col>_mean_target``;
        'count' stores the per-category target sum in ``<col>_count_target``.
    target : str
        Name of the target column in ``train_df``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``val_df`` with one new float column per entry of ``cols``.

    Raises
    ------
    ValueError
        If ``method`` is neither 'mean' nor 'count'.
    """
    if method == 'mean':
        aggregate = 'mean'
        # fallback for categories unseen in train_df: global training mean
        prior = train_df[target].mean()
    elif method == 'count':
        aggregate = 'sum'
        # fallback for categories unseen in train_df: number of non-null
        # target rows in the training frame
        prior = train_df[target].count()
    else:
        raise ValueError(
            "method must be 'mean' or 'count', got %r" % (method,))

    val_df_new = val_df.copy()
    for col in cols:
        stats = train_df.groupby(col)[target].agg(aggregate)
        mapped = val_df_new[col].map(stats).astype(float)
        # Fill the prior into the new column only; NaNs in the other columns
        # of the frame must not be overwritten.
        val_df_new[col + '_' + method + '_target'] = mapped.fillna(prior)
    return val_df_new