# ENCODERS

In [None]:
## importing the relevant packages:

# clear the workspace
%reset -f

# print list of files in directory
import os
print(os.listdir())

# print/display all plots inline
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# the base packages
import collections # for the Counter function
import csv # for reading/writing csv files
import pandas as pd, numpy as np, time, gc, bisect, re

# the encoders
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder

# modules to handle 'if-else' and 'for' loop breaks
import sys
from io import StringIO
from IPython import get_ipython

# notebook-wide settings
randomseed = 1 # seed value intended for the random state used at various points in the pipeline
pd.options.display.max_rows = 1000 # show up to 1000 rows in cell output instead of pandas' truncated view
pd.options.display.max_columns = 200 # show up to 200 columns in cell output before pandas truncates

## MULTI COLUMN LABEL ENCODER

In [None]:
class MultiColumnLabelEncoder(LabelEncoder):
    """
    Wraps sklearn LabelEncoder functionality for use on multiple columns of a pandas dataframe
    Adapted from the discussion page = [https://code.i-harness.com/en/q/1753595]

    One independent `LabelEncoder` is fitted per column, on the column's
    values cast to `str`. A placeholder category (`unseen_label`) is reserved
    in every column's classes at fit time, so that values first met in
    `transform` can be encoded without changing the codes that were handed
    out during fitting.

    Parameters
    ----------
    columns : sequence of column labels, single column label (str) or None
        Columns to encode. When None, every column of the dataframe passed
        to `fit` / `fit_transform` is encoded.

    Attributes (set by `fit` / `fit_transform`)
    -------------------------------------------
    all_classes_ : ndarray of object
        One `(column, classes)` tuple per encoded column.
    all_encoders_ : ndarray of object
        The fitted `LabelEncoder` of each encoded column, in column order.
    """

    # placeholder category standing in for values that were not seen while fitting
    unseen_label = 'NEW'

    def __init__(self, columns=None):
        self.columns = columns

    def _prepare_containers(self, dframe):
        """Resolve the columns to encode and allocate the per-column containers."""
        if self.columns is None:
            # no columns specified; assume all are to be encoded
            self.columns = dframe.columns
        elif isinstance(self.columns, str):
            # a bare string would otherwise be iterated character by character
            self.columns = [self.columns]
        # len() instead of .shape so that plain lists/tuples are accepted too
        n_columns = len(self.columns)
        self.all_classes_ = np.empty(n_columns, dtype=object)
        self.all_encoders_ = np.empty(n_columns, dtype=object)

    def _fit_column(self, idx, column, values):
        """Fit one column's encoder on string `values` and record it at `idx`."""
        le = LabelEncoder()
        le.fit(values)
        classes = le.classes_.tolist()
        # Reserve the placeholder now, in sorted position, so the encoder never
        # has to be mutated later and codes stay stable across calls.
        if self.unseen_label not in classes:
            bisect.insort_left(classes, self.unseen_label)
        le.classes_ = np.array(classes, dtype=object)
        self.all_classes_[idx] = (column, np.array(classes, dtype=object))
        self.all_encoders_[idx] = le
        return le

    def _require_fitted(self):
        """Raise a descriptive AttributeError when no encoder has been fitted."""
        if self.columns is None or not hasattr(self, 'all_encoders_'):
            raise AttributeError(
                'MultiColumnLabelEncoder is not fitted yet; '
                'call fit or fit_transform first'
            )

    def fit(self, dframe):
        """
        - Fit label encoder to pandas columns
        - Access individual column classes via indexing `self.all_classes_`
        - Access individual column encoders via indexing `self.all_encoders_`

        Returns `self`; `dframe` is not modified.
        """
        self._prepare_containers(dframe)
        for idx, column in enumerate(self.columns):
            self._fit_column(idx, column, dframe.loc[:, column].astype('str'))
        return self

    def fit_transform(self, dframe):
        """
        - Fit label encoder and return encoded labels.
        - Access individual column classes via indexing `self.all_classes_`
        - Access individual column encoders via indexing `self.all_encoders_`

        The encoded columns are written back into `dframe`, which is modified
        in place and returned.
        """
        self._prepare_containers(dframe)
        for idx, column in enumerate(self.columns):
            values = dframe.loc[:, column].astype('str')
            le = self._fit_column(idx, column, values)
            dframe[column] = le.transform(values)
        return dframe

    def transform(self, dframe):
        """
        Transform labels to normalized encoding.

        Values are cast to `str` (as during fitting); any value that was not
        seen while fitting is encoded as the `unseen_label` placeholder.
        `dframe` is modified in place and returned.

        Raises AttributeError if the encoder has not been fitted.
        """
        self._require_fitted()
        for idx, column in enumerate(self.columns):
            le = self.all_encoders_[idx]
            # compare on the string form, which is what the encoder was fitted on
            values = dframe.loc[:, column].astype('str')
            known = set(le.classes_)
            values = values.where(values.isin(known), self.unseen_label)
            dframe[column] = le.transform(values)
        return dframe

    def inverse_transform(self, dframe):
        """
        Transform labels back to original encoding.

        The encoded columns are cast to `int` before decoding, because
        `LabelEncoder.inverse_transform` expects integer codes. `dframe` is
        modified in place and returned.

        Raises AttributeError if the encoder has not been fitted.
        """
        self._require_fitted()
        for idx, column in enumerate(self.columns):
            codes = dframe.loc[:, column].astype(int)
            dframe[column] = self.all_encoders_[idx].inverse_transform(codes)
        return dframe

## ENCODING CLASS
- Supported encodings:
    - label encoder (_values not seen in the training data are mapped to a placeholder category when other dataframes are transformed_)
    - binary encoder
    - base-N encoder
    - hashing encoder
    - ordinal encoder (similar to label, different implementation)
    - one-hot encoder

In [None]:
class categ_encoders():
    """
    Namespace of helpers that encode the categorical columns of a train and a
    validation dataframe with the same fitted encoder.

    All helpers are static; call them on the class, e.g.
    `categ_encoders.encoding(train, valid, y_train, y_valid, which='be')`.
    """

    @staticmethod
    def encoding(train, valid, y_train, y_valid, which='le'):
        """
        Encode `train` and `valid` with the encoder selected by `which`.

        Parameters
        ----------
        train, valid : pandas dataframes to encode
        y_train, y_valid : targets; forwarded to `ce_encodings`, which does
            not currently use them
        which : one of 'le' (label), 'be' (binary), 'bne' (base-N),
            'ohe' (one-hot), 'he' (hashing), 'oe' (ordinal)

        Returns
        -------
        (train, valid, categorical_names)

        Raises
        ------
        ValueError
            If `which` is not one of the supported codes.
        """
        if which == 'le':
            return categ_encoders.labelEncoder(train, valid)
        if which in ('be', 'bne', 'ohe', 'he', 'oe'):
            return categ_encoders.ce_encodings(train, valid, y_train, y_valid, which)
        # raise instead of exit(): exit() would terminate the whole kernel/process
        raise ValueError(
            'Not supported: ' + repr(which)
            + '. Use one of [le, be, bne, he, oe, ohe]'
        )

    @staticmethod
    def labelEncoder(train_df, valid_df):
        """
        Label-encode every object-dtype column of `train_df`, in both frames.

        The encoder of each column is fitted on the training values cast to
        `str`. A 'No Data' category is reserved before either frame is
        encoded, so the same category gets the same code in both frames;
        validation values that do not occur in training are encoded as
        'No Data'. Both dataframes are modified in place.

        Returns
        -------
        (train_df, valid_df, categorical_names), where `categorical_names`
        maps each encoded column to its list of classes (including 'No Data').
        """
        # progress bar is optional: fall back to plain iteration without tqdm
        try:
            from tqdm import tqdm
        except ImportError:
            def tqdm(iterable):
                return iterable

        print('label encoding is happening ...', '\n')
        cat_columns = train_df.select_dtypes(include=['object']).columns.values
        categorical_names = {}
        for feature in tqdm(cat_columns):
            train_values = train_df[feature].astype(str)
            le = LabelEncoder()
            le.fit(train_values)

            # reserve the placeholder BEFORE encoding anything, in sorted position
            classes = le.classes_.tolist()
            if 'No Data' not in classes:
                bisect.insort_left(classes, 'No Data')
            le.classes_ = np.array(classes, dtype=object)

            # compare on the string form, which is what the encoder was fitted on
            valid_values = valid_df[feature].astype(str)
            valid_values = valid_values.where(valid_values.isin(set(classes)), 'No Data')

            train_df[feature] = le.transform(train_values)
            valid_df[feature] = le.transform(valid_values)
            categorical_names[feature] = classes
        print('label encoding completed', '\n')
        return train_df, valid_df, categorical_names

    @staticmethod
    def ce_encodings(train_df, valid_df, y_train, y_valid, encoding):
        """
        Encode both frames with a `category_encoders` encoder fitted on `train_df`.

        Parameters
        ----------
        train_df, valid_df : pandas dataframes to encode
        y_train, y_valid : accepted for interface symmetry; not used here
        encoding : 'bne' (BaseN, base 4), 'be' (Binary), 'he' (Hashing),
            'oe' (Ordinal) or 'ohe' (BaseN with base 1)

        Returns
        -------
        (train_enc, valid_enc, categorical_names); `categorical_names` is
        always an empty dict for these encoders.

        Raises
        ------
        ValueError
            If `encoding` is not one of the supported codes.
        """
        # factories are lazy so only the requested encoder is constructed
        encoder_factories = {
            'bne': lambda: ce.BaseNEncoder(base=4),
            'be': lambda: ce.BinaryEncoder(),
            'he': lambda: ce.HashingEncoder(drop_invariant=True),
            'oe': lambda: ce.OrdinalEncoder(),
            'ohe': lambda: ce.BaseNEncoder(base=1),
        }
        factory = encoder_factories.get(encoding) if isinstance(encoding, str) else None
        if factory is None:
            raise ValueError(
                'Not supported: ' + repr(encoding)
                + '. Use one of [be, bne, he, oe, ohe]'
            )

        print(str(encoding) + ' encoding is happening ...', '\n')
        enc = factory()
        enc.fit(train_df)
        train_enc = enc.transform(train_df)
        valid_enc = enc.transform(valid_df)
        print('category encoding completed', '\n')
        categorical_names = {}
        return train_enc, valid_enc, categorical_names