In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [13]:
# Author: Marco Mungai Coppolino
# License: BSD 3 clause

import numpy as np
import pandas as pd
from operator import itemgetter

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin


class EncoderSwitcher(BaseEstimator, TransformerMixin):
    """
    Encode categorical features, allowing to decide on which features to apply
    OneHotEncoding or OrdinalEncoding from scikit-learn. The transformer  
    will combine the output of the two encodings and sort them keeping the 
    original order of the columns of the input.

    The input should be a numpy array or a pandas dataframe 

    The output can be a dataframe or a numpy array making the transformer
    compatible and usable in Pipelines or Column Transformers.

    There are two ways for the transformer to decide which encoder to perform
    on which features:
        - The first way: deciding the max number of unique variables a feature
           should have to be OneHot encoded.
        - The second way: deciding the max total number of NEW columns that 
           should be created. The transformer will keep one-hot encoding 
           features, starting from the one with the least amount of variables, 
           and proceeding in ascending order until the max total number is 
           reached.

    Parameters
    ----------
    max_num_columns : is the number that sets the limit of unique variables. 
        It must be a natural number. If it is set to 0 the transformer will 
        perform only ordinal encoding on the input dataframe.

    total : a boolean keyword used to specify if max_num_columns is used
        in the first way as a limit on how many unique variables a feature
        can have to be one-hot encoded (total=False) or in the second way as
        a limit on the total number of new columns the encoder will create
        (total=True).

    check : a boolean keyword, if True the transformer will perform a check
        on the input dataframe/array and will encode only the categorical
        features (the ones with an "object" dtype) if False all the columns
        will be encoded.

    array : a boolean keyword, it is used to indicate if the output should
        be a pandas DataFrame (array=False), or a numpy array (array=True).
        Keep it True if you are using the EncoderSwitcher in a PipeLine
        or in a Column Transformer

    Attributes
    ----------
    object_cols : a list containing the names of all the columns that will
        be encoded. If the object in input is an array the names of those
        columns will be their location. 
        In this case, is important that both the fitted array and the 
        transformed one share the same column order or the encoder won't be 
        able to perform the right encoding. This is not a problem if the 
        input is a dataframe because the name of the columns will be used as
        a reference.

    OHE_cols : a list containing the names of the columns that will be 
        one-hot encoded.

    OE_cols : a list containing the names of the columns that  will be
        ordinal encoded

    fit_X_is_df : a boolean variable used in the functions of the transformer
        to keep track of the nature of the object in the input

    num_cols_list : it's a particular list of lists of 2 elements that is used
        in the _switcher() function if total == True. (More detail on its 
        purpose can be founded in that function)


    Example 
    -------

    """
    def __init__(
        self, max_num_columns=0, total=False, check=True, array=True
    ):

        self.max_num_columns = max_num_columns
        self.total = total
        self.check = check
        self.array = array
        self.object_cols = None
        self.OHE_cols = None
        self.OE_cols = None
        self.fit_X_is_df = None
        self.num_cols_list = None

    def _check_keywords(self):
        """
        Perform a check on the input keywords.

        """
        if not isinstance(self.max_num_columns, int):
            raise ValueError("max_num_columns is not a positive integer")
        else:
            if self.max_num_columns < 0:
                raise ValueError("max_num_columns is not a positive integer")

        if not isinstance(self.total, bool):
            raise ValueError("total is not boolean")

        if not isinstance(self.check, bool):
            raise ValueError("check is not boolean")

        if not isinstance(self.array, bool):
            raise ValueError("array is not boolean")

    def _check_input_X(self, X):
        """
        Perform a check on the input object to see if it is a dataframe or 
        an arry.

        """
        if isinstance(X, (pd.DataFrame, np.ndarray)):
            return
        else: 
            msg = (
                "Object in input is not a pandas DataFrame or a numpy array"            
            )
            raise ValueError(msg)

    def _switcher(self, X):
        """
        It's the function that fills OHE_cols and OE_cols, making a selection
        of which columns of the input dataframe/array, (named X), should be 
        one-hot encoded and which ones should be ordinal encoded.

        """

        # a conditional statement will define object_cols making a difference
        # based on the value of the keyword check.
        if self.check:
            self.object_cols = [
                col for col in X.columns if X[col].dtype == "object"
            ]
        else:
            self.object_cols = X.columns

        # OHE_cols and OE_cols are resetted here to prevent problems in case 
        # the encoder is fitted mistakenly multiple times.
        self.OHE_cols = []
        self.OE_cols = []

        # the two decision method are now separated with a check on the
        # total keyword
        # First Method: total == False
        if not self.total:
            # the features with an equal or less amount of unique values than 
            # max_num_columns are one-hot encoded the others are ordinal 
            # encoded
            for col in self.object_cols:
                if X[col].nunique() <= self.max_num_columns:
                    self.OHE_cols.append(col)
                else:
                    self.OE_cols.append(col)
        # Second Method: total == True            
        else:
            # filling num_cols_list, a list in which every i-th element is  
            # a list of two things:
            #   1. object_cols[i]
            #   2. the number of unique values of the column associated with 
            #       object_cols[i]
            self.num_cols_list = [[
                self.object_cols[0],
                X[self.object_cols[0]].nunique()
            ]]
            for i in range(1, len(self.object_cols)):
                self.num_cols_list.append([
                    self.object_cols[i],
                    X[self.object_cols[i]].nunique()
                ])

            # we will now sort num_cols_list in ascending order based on
            # the number of unique elements
            self.num_cols_list = sorted(self.num_cols_list, key=itemgetter(1))
            # we will now select which column are OneHotEncoded
            # and which one are OrdinalEncoded
            num_columns = 0
            for i in range(len(self.object_cols)):
                if self.max_num_columns >= (num_columns +
                                            self.num_cols_list[i][1] - 1):
                    self.OHE_cols.append(self.num_cols_list[i][0])
                    num_columns += self.num_cols_list[i][1] - 1 
                    # the -1 in the if condition and in the num_columns is 
                    # related to the fact that total should be the maximum
                    # number of columns added to the original dataframe.
                    # So, for this reason, we have to count the fact that 
                    # the columns made with the OHE take the place
                    # of the columns of the orginal df. This means that, 
                    # for every new encoded feature i, we will have only:
                    # self.num_cols_list[i][1] - 1 new columns  
                else:
                    self.OE_cols.append(self.num_cols_list[i][0])

        return self

    def _sorter(self, X, OHE_X, OE_X):
        """
        It's the function responsible for the union of the two encoded
        dataframe, OHE_X, and OE_X, with the not encoded columns of the
        object in input.

        Moreover, while concatenating the different parts this function
        sorts the columns of the final dataframe (Encoded_X), making sure
        that the original order of the features is preserved

        """
        # the function stars creating a list containing the names
        # of the original columns of the dataframe. 
        X_cols_list = list(X.columns)

        # we want to fill X_cols_list with the names of the new columns
        # obtained with the OneHot Encoding.
        # To understand how we are doing that it's important
        # to keep in mind some things:
        #   1. self.OHE_cols contains the names of the columns
        #       of the dataset that are OneHot Encoded
        #   2. self.OH_encoder.get_feature_names(self.OHE_cols) contains
         
        
        k = 0
        for i in range(len(self.OHE_cols)):
            index = X_cols_list.index(self.OHE_cols[i])+1

            for j in range(k,k + X[self.OHE_cols[i]].drop_duplicates().shape[0]):
                X_cols_list.insert(index, self.OH_encoder.get_feature_names(self.OHE_cols)[j])
                print(X_cols_list[index])
                index += 1
            
            k += X[self.OHE_cols[i]].drop_duplicates().shape[0]
            X_cols_list.remove(self.OHE_cols[i])
        
        
            
        # for keeping everything clear we will use a list 
        # char_to_drop, containing the number of characters  
        # to delete after that OHE_cols is properly created
        """
        char_to_drop = []
        
        # in the next for, OHE_cols and char_to_drop are properly filled
        for i in range(len(self.OHE_cols)):
            OHE_cols.append(
                '_' + str(X.columns.get_loc(self.OHE_cols[i])) +
                '_' + self.OHE_cols[i]
            )
            char_to_drop.append(
                1 + len(str(X.columns.get_loc(self.OHE_cols[i]))) + 1
            )
        
        # we now get the names of the new generated features using 
        # the new OHE_cols. 
        print(names)
        # this is the part where OHE_cols_names is created
        for i in range(len(self.OHE_cols)):
            OHE_cols_names.append([
                col_name[char_to_drop[i]:] 
                for col_name in names 
                if (OHE_cols[i]) in col_name
            ])

            # after being added to OHE_cols_names the list
            # of generated features from the i-th encoded
            # feature is inserted in X_cols_list at the right
            # place
            index = X_cols_list.index(self.OHE_cols[i])+1
            X_cols_list[index:index] = OHE_cols_names[i]
            X_cols_list.remove(self.OHE_cols[i])
        """
        
        # after X_cols_list is completed we concatenate
        # the two encoded dataframes with the features 
        # not encoded in a final dataframe, called Encoded_X
        Encoded_X = pd.concat([
            X[[col for col in X.columns if col not in self.object_cols]],
            OHE_X,
            OE_X
        ], axis=1)

        # using X_cols_list we can easily sort Encoded_X in the right way
        Encoded_X = Encoded_X[X_cols_list]
        """
        if not self.array:
            return Encoded_X
        else:
            return Encoded_X.values
        """
        return Encoded_X, X, X_cols_list 

    def fit(self, X, y=None):
        """
        Fit the EncoderSwitcher to X

        """
        # some checks on the input X and on the keywords of the encoder
        self._check_input_X(X)
        self._check_keywords()

        # the switcher and the sorter are made to work with a dataframe
        # thos is crucial especially to make the sorter work
        # so we assure here that X is converted in a dataframe
        if isinstance(X, np.ndarray):
            self.fit_X_is_df = False
            if self.check:
                X = pd.DataFrame(X)
                X = X.convert_dtypes(convert_string=False)
            else:
                X = pd.DataFrame(X)   
            X.columns = [str(col) for col in X.columns]
        else:
            self.fit_X_is_df = True

        # the siwtcher fills OHE_cols and OE_cols
        self._switcher(X)

        # the two different encoders are then properly fitted 
        self.OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        self.ordinal_encoder = OrdinalEncoder(
                                    handle_unknown='use_encoded_value',
                                    unknown_value=-999
                                )
        self.OH_encoder.fit(X[self.OHE_cols])
        self.ordinal_encoder.fit(X[self.OE_cols])
        print(self.OHE_cols)
        return self

    def transform(self, X):
        """
        Transform X using the EncoderSwitcher

        """
        self._check_input_X(X)

        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
            X.columns = [str(col) for col in X.columns]
            X = X.convert_dtypes(convert_string=False)

        OHE_X = pd.DataFrame(self.OH_encoder.transform(X[self.OHE_cols]))
        OHE_X.index = X.index

        OE_X = pd.DataFrame(self.ordinal_encoder.transform(X[self.OE_cols]))
        OE_X.index = X.index

        OHE_X.columns = self.OH_encoder.get_feature_names(self.OHE_cols)
        OE_X.columns = self.OE_cols

        # Encoded_X = self._sorter(X, OHE_X, OE_X)
        Encoded_X, X_class, X_cols_list = self._sorter(X, OHE_X, OE_X)
        # return Encoded_X 
        return OHE_X, OE_X, self.OHE_cols, self.OE_cols, Encoded_X, X_class, X_cols_list, self.num_cols_list


In [78]:
# Author: Marco Mungai Coppolino      
# License: BSD 3 clause      
      
import numpy as np      
import pandas as pd      
from operator import itemgetter      
      
from sklearn.preprocessing import OrdinalEncoder      
from sklearn.preprocessing import OneHotEncoder      
from sklearn.base import BaseEstimator, TransformerMixin      
      
      
class EncoderSwitcher(BaseEstimator, TransformerMixin):      
    """      
    Encode categorical features, allowing to decide on which features to      
    apply OneHotEncoding or OrdinalEncoding from scikit-learn.    
    The transformer will combine the output of the two encodings and sort   
    them keeping the original order of the columns of the input.      
      
    The input should be a numpy array or a pandas dataframe 
    The output can be a dataframe or a numpy array making the transformer     
    compatible and usable in Pipelines or Column Transformers.      
      
    There are two ways for the transformer to decide which encoder to perform
    on which features:      
        - The first way: deciding the max number of unique variables a      
           feature should have to be OneHot encoded.      
        - The second way: deciding the max total number of NEW columns that  
           should be created. The transformer will keep one-hot encoding
           features, starting from the one with the least amount of      
           variables, and proceeding in ascending order until the max total 
           number is reached.      
    The parameter check is used to decide what to do to the column that are
    not OneHot Encoded
    
    Parameters      
    ----------      
    max_num_columns : is the number that sets the limit of unique variables.  
        It must be a natural number. If it is set to 0 the transformer will  
        perform only ordinal encoding on the input dataframe.      
      
    total : a boolean keyword used to specify if max_num_columns is used      
        in the first way as a limit on how many unique variables a feature 
        can have to be one-hot encoded (total=False) or in the second way as  
        a limit on the total number of new columns the encoder will create 
        (total=True).      
      
    check : a boolean keyword, if True the transformer will perform a check  
        on the input dataframe/array and will encode only the categorical
        features (the ones with an "object" dtype) if False all the columns  
        will be encoded.      
      
    array : a boolean keyword, it is used to indicate if the output should
        be a pandas DataFrame (array=False), or a numpy array (array=True).  
        Keep it True if you are using the EncoderSwitcher in a PipeLine      
        or in a Column Transformer      
      
    Attributes      
    ----------      
    object_cols : a list containing the names of all the columns that will 
        be encoded. If the object in input is an array the names of those
        columns will be their location.       
        In this case, is important that both the fitted array and the       
        transformed one share the same column order or the encoder won't be  
        able to perform the right encoding. This is not a problem if the
        input is a dataframe because the name of the columns will be used as  
        a reference.      
      
    OHE_cols : a list containing the names of the columns that will be       
        one-hot encoded.      
      
    OE_cols : a list containing the names of the columns that  will be      
        ordinal encoded      
      
    fit_X_is_df : a boolean variable used in the functions of the transformer
        to keep track of the nature of the object in the input      
      
    num_cols_list : it's a particular list of lists of 2 elements that is
        used in the _switcher() function if total == True. (More detail on
        its purpose can be founded in that function)            
      
    """      
    def __init__(      
        self, max_num_columns=0, total=False, check=True, array=True      
    ):      
      
        self.max_num_columns = max_num_columns      
        self.total = total      
        self.check = check      
        self.array = array      
        self.object_cols = None      
        self.OHE_cols = None      
        self.OE_cols = None      
        self.fit_X_is_df = None      
        self.num_cols_list = None      
      
    def _check_keywords(self):      
        """      
        Perform a check on the input keywords.      
      
        """      
        if not isinstance(self.max_num_columns, int):      
            raise ValueError("max_num_columns is not a positive integer")     
 
        else:      
            if self.max_num_columns < 0:      
                raise ValueError("max_num_columns is not a positive integer") 

        if not isinstance(self.total, bool):      
            raise ValueError("total is not boolean")      
      
        if not isinstance(self.check, bool):      
            raise ValueError("check is not boolean")      
      
        if not isinstance(self.array, bool):      
            raise ValueError("array is not boolean")      
      
    def _check_input_X(self, X):      
        """      
        Perform a check on the input object to see if it is a dataframe or  
        an arry.      
      
        """      
        if isinstance(X, (pd.DataFrame, np.ndarray)):      
            return      
        else:       
            msg = (      
                "Object in input is not a pandas DataFrame or a numpy array"
            )      
            raise ValueError(msg)      
      
    def _switcher(self, X):      
        """      
        It's the function that fills OHE_cols and OE_cols, making a selection
        of which columns of the input dataframe/array, (named X), should be  
        one-hot encoded and which ones should be ordinal encoded.      
      
        """      
      
        # a conditional statement will define object_cols making a difference
        # based on the value of the keyword check.      
        if self.check:      
            self.object_cols = [      
                col for col in X.columns if X[col].dtype == "object"      
            ]      
        else:      
            self.object_cols = X.columns      
      
        # OHE_cols and OE_cols are resetted here to prevent problems in case
        # the encoder is fitted mistakenly multiple times.      
        self.OHE_cols = []      
        self.OE_cols = []      
      
        # the two decision method are now separated with a check on the      
        # total keyword      
        # First Method: total == False      
        if not self.total:      
            # the features with an equal or less amount of unique values than
            # max_num_columns are one-hot encoded the others are ordinal
            # encoded      
            for col in self.object_cols:      
                if X[col].nunique() <= self.max_num_columns:      
                    self.OHE_cols.append(col)      
                else:      
                    self.OE_cols.append(col)      
        # Second Method: total == True                  
        else:      
            # filling num_cols_list, a list in which every i-th element is   
            # a list of two things:      
            #   1. object_cols[i]      
            #   2. the number of unique values of the column associated with
            #       object_cols[i]      
            self.num_cols_list = [[      
                self.object_cols[0],      
                X[self.object_cols[0]].nunique()      
            ]]      
            for i in range(1, len(self.object_cols)):      
                self.num_cols_list.append([      
                    self.object_cols[i],      
                    X[self.object_cols[i]].nunique()      
                ])      
      
            # we will now sort num_cols_list in ascending order based on      
            # the number of unique elements      
            self.num_cols_list = sorted(     
                                     self.num_cols_list,      
                                     key=itemgetter(1)     
                                 )      
            # we will now select which column are OneHotEncoded      
            # and which one are OrdinalEncoded      
            num_columns = 0      
            for i in range(len(self.object_cols)):      
                if self.max_num_columns >= (num_columns +      
                                            self.num_cols_list[i][1] - 1):    

                    self.OHE_cols.append(self.num_cols_list[i][0])      
                    num_columns += self.num_cols_list[i][1] - 1       
                    # the -1 in the if condition and in the num_columns is 
                    # related to the fact that total should be the maximum 
                    # number of columns added to the original dataframe. 
                    # So, for this reason, we have to count the fact that 
  
                    # the columns made with the OHE take the place 
                    # of the columns of the orginal df. This means that, 
                    # for every new encoded feature i, we will have only: 
                    # self.num_cols_list[i][1] - 1 new columns        
                else:      
                    self.OE_cols.append(self.num_cols_list[i][0])      
      
        return self      
      
    def _sorter(self, X, OHE_X, OE_X):      
        """      
        It's the function responsible for the union of the two encoded 
        dataframe, OHE_X, and OE_X, with the not encoded columns of the 
        object in input. 
      
        Moreover, while concatenating the different parts this function 
        sorts the columns of the final dataframe (Encoded_X), making sure 
        that the original order of the features is preserved 
      
        """      
        # the function stars creating a list containing the names 
        # of the original columns of the dataframe.       
        X_cols_list = list(X.columns)      
      
        # we want to fill X_cols_list with the names of the new columns  
        # obtained after the OneHot Encoding.  
        # To understand how we are doing that it's important  
        # to keep in mind some things:  
        #   1. self.OHE_cols contains the names of the columns  
        #       of the dataset that are OneHot Encoded  
        #   2. self.OH_encoder.get_feature_names(self.OHE_cols) contains 
        #       the names of the columns after the OneHot Encoding 
        #   3. X[self.OHE_cols[i]].drop_duplicates().shape[0] will give us 
        #       the number of unique variables each OneHot Encoded feature 
        #       contains 
        # So using the names in self.OHE_cols we can find, in the database,  
        # the index of the encoded features and add the new columns in the 
        # rigth place. 
      
        k = 0      
        for i in range(len(self.OHE_cols)):      
            index = X_cols_list.index(self.OHE_cols[i])+1      
      
            for j in range(     
                         k,     
                         k + X[self.OHE_cols[i]].drop_duplicates().shape[0]
                     ):      
                X_cols_list.insert(     
                    index,      
                    self.OH_encoder.get_feature_names(self.OHE_cols)[j]     
                )      
                index += 1
                  
            k += X[self.OHE_cols[i]].drop_duplicates().shape[0]      
            X_cols_list.remove(self.OHE_cols[i])      
              
        # after X_cols_list is completed we concatenate, in a final      
        # dataframe, the two encoded datasets with the features       
        # not encoded, called Encoded_X      
        Encoded_X = pd.concat([      
            X[[col for col in X.columns if col not in self.object_cols]],
            OHE_X,      
            OE_X      
        ], axis=1)      
      
        # using X_cols_list we can easily sort Encoded_X in the right way     
 
        Encoded_X = Encoded_X[X_cols_list] 
        print(self.OH_encoder.get_feature_names(self.OHE_cols))
             
        if not self.array:      
            return Encoded_X      
        else:      
            return Encoded_X.values                   
      
    
    def fit(self, X, y=None):      
        """      
        Fit the EncoderSwitcher to X      
      
        """      
        # some checks on the input X and on the keywords of the encoder      
        self._check_input_X(X)      
        self._check_keywords()      
      
        # the switcher and the sorter are made to work with a dataframe      
        # thos is crucial especially to make the sorter work      
        # so we assure here that X is converted in a dataframe      
        if isinstance(X, np.ndarray):      
            self.fit_X_is_df = False      
            if self.check:      
                X = pd.DataFrame(X)      
                X = X.convert_dtypes(convert_string=False)      
            else:      
                X = pd.DataFrame(X)         
            X.columns = [str(col) for col in X.columns]      
        else:      
            self.fit_X_is_df = True      
      
        # the siwtcher fills OHE_cols and OE_cols      
        self._switcher(X)      
      
        # the two different encoders are then properly fitted       
        self.OH_encoder = OneHotEncoder(     
                              handle_unknown='ignore',      
                              sparse=False     
                          )      
        self.ordinal_encoder = OrdinalEncoder(      
                                   handle_unknown='use_encoded_value',      
                                   unknown_value=-999      
                               )      
        self.OH_encoder.fit(X[self.OHE_cols])      
        self.ordinal_encoder.fit(X[self.OE_cols])      
        return self      
      
    def transform(self, X):      
        """      
        Transform X using the EncoderSwitcher      
      
        """      
        self._check_input_X(X)      
      
        if isinstance(X, np.ndarray):      
            X = pd.DataFrame(X)      
            X.columns = [str(col) for col in X.columns]      
            X = X.convert_dtypes(convert_string=False)      
      
        OHE_X = pd.DataFrame(self.OH_encoder.transform(X[self.OHE_cols]))     
 
        OHE_X.index = X.index      
      
        OE_X = pd.DataFrame(self.ordinal_encoder.transform(X[self.OE_cols]))    
    
        OE_X.index = X.index      
      
        OHE_X.columns = self.OH_encoder.get_feature_names(self.OHE_cols)      
        OE_X.columns = self.OE_cols      
      
        Encoded_X = self._sorter(X, OHE_X, OE_X)      
        return Encoded_X       

In [79]:
X = pd.read_csv('Data_Sets/train.csv', index_col='Id') 
X_test = pd.read_csv('Data_Sets/test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()] 
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [80]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

In [81]:
# Categorical columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

In [82]:
X_train

Unnamed: 0_level_0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,RL,11694,Pave,Reg,Lvl,AllPub,Inside,Gtl,NridgHt,...,108,0,0,260,0,0,7,2007,New,Partial
871,20,RL,6600,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,0,0,0,0,0,0,8,2009,WD,Normal
93,30,RL,13360,Pave,IR1,HLS,AllPub,Inside,Gtl,Crawfor,...,0,44,0,0,0,0,8,2009,WD,Normal
818,20,RL,13265,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Mitchel,...,59,0,0,0,0,0,7,2008,WD,Normal
303,20,RL,13704,Pave,IR1,Lvl,AllPub,Corner,Gtl,CollgCr,...,81,0,0,0,0,0,1,2006,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,60,RL,9430,Pave,Reg,Lvl,AllPub,Inside,Gtl,NoRidge,...,128,0,0,180,0,0,7,2009,WD,Normal
836,20,RL,9600,Pave,Reg,Lvl,AllPub,Inside,Gtl,Sawyer,...,0,0,0,0,0,0,2,2010,WD,Normal
1217,90,RM,8930,Pave,Reg,Lvl,AllPub,Inside,Gtl,Sawyer,...,0,0,0,0,0,0,4,2010,WD,Normal
560,120,RL,3196,Pave,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,...,20,0,0,0,0,0,10,2006,WD,Normal


In [83]:
X_valid

Unnamed: 0_level_0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
530,20,RL,32668,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Crawfor,...,0,200,0,0,0,0,3,2007,WD,Alloca
492,50,RL,9490,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,0,32,0,0,0,0,8,2006,WD,Normal
460,50,RL,7015,Pave,IR1,Bnk,AllPub,Corner,Gtl,BrkSide,...,0,248,0,0,0,0,7,2009,WD,Normal
280,60,RL,10005,Pave,Reg,Lvl,AllPub,Inside,Gtl,ClearCr,...,117,0,0,0,0,0,3,2008,WD,Normal
656,160,RM,1680,Pave,Reg,Lvl,AllPub,Inside,Gtl,BrDale,...,0,0,0,0,0,0,3,2010,WD,Family
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,120,RL,10846,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Veenker,...,30,0,0,0,0,0,5,2008,Con,Normal
441,20,RL,15431,Pave,Reg,Lvl,AllPub,Inside,Gtl,NridgHt,...,72,0,0,170,0,0,4,2009,WD,Normal
1388,50,RM,8520,Pave,Reg,Lvl,AllPub,Inside,Gtl,OldTown,...,15,0,0,0,0,0,8,2007,CWD,Family
1324,30,RL,5330,Pave,Reg,HLS,AllPub,Inside,Gtl,BrkSide,...,0,0,0,0,0,0,12,2009,WD,Normal


In [86]:
Encoder = EncoderSwitcher(max_num_columns=2, total=False, check=True, array=False)

Encoder.fit(X_train)
Encoded_X = Encoder.transform(X_train)
#OHE_X, OE_X, OHE_cols, OE_cols, Encoded_X, X_class, X_cols_list, num_cols_list = Encoder.transform(X_train)

['Street_Grvl' 'Street_Pave' 'Utilities_AllPub' 'Utilities_NoSeWa'
 'CentralAir_N' 'CentralAir_Y']


In [88]:
X_valid['Utilities'].drop_duplicates().shape[0]

1

In [57]:
X_valid 

Unnamed: 0_level_0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
530,20,RL,32668,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Crawfor,...,0,200,0,0,0,0,3,2007,WD,Alloca
492,50,RL,9490,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,0,32,0,0,0,0,8,2006,WD,Normal
460,50,RL,7015,Pave,IR1,Bnk,AllPub,Corner,Gtl,BrkSide,...,0,248,0,0,0,0,7,2009,WD,Normal
280,60,RL,10005,Pave,Reg,Lvl,AllPub,Inside,Gtl,ClearCr,...,117,0,0,0,0,0,3,2008,WD,Normal
656,160,RM,1680,Pave,Reg,Lvl,AllPub,Inside,Gtl,BrDale,...,0,0,0,0,0,0,3,2010,WD,Family
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,120,RL,10846,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Veenker,...,30,0,0,0,0,0,5,2008,Con,Normal
441,20,RL,15431,Pave,Reg,Lvl,AllPub,Inside,Gtl,NridgHt,...,72,0,0,170,0,0,4,2009,WD,Normal
1388,50,RM,8520,Pave,Reg,Lvl,AllPub,Inside,Gtl,OldTown,...,15,0,0,0,0,0,8,2007,CWD,Family
1324,30,RL,5330,Pave,Reg,HLS,AllPub,Inside,Gtl,BrkSide,...,0,0,0,0,0,0,12,2009,WD,Normal


In [11]:
OHE_X

Unnamed: 0_level_0,Street_Grvl,Street_Pave,Utilities_AllPub,Utilities_NoSeWa,CentralAir_N,CentralAir_Y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
619,0.0,1.0,1.0,0.0,0.0,1.0
871,0.0,1.0,1.0,0.0,1.0,0.0
93,0.0,1.0,1.0,0.0,0.0,1.0
818,0.0,1.0,1.0,0.0,0.0,1.0
303,0.0,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...
764,0.0,1.0,1.0,0.0,0.0,1.0
836,0.0,1.0,1.0,0.0,0.0,1.0
1217,0.0,1.0,1.0,0.0,0.0,1.0
560,0.0,1.0,1.0,0.0,0.0,1.0


In [12]:
X_cols_list

['MSSubClass',
 'MSZoning',
 'LotArea',
 'Street_Grvl',
 'Street_Pave',
 'LotShape',
 'LandContour',
 'Utilities_AllPub',
 'Utilities_NoSeWa',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir_N',
 'CentralAir_Y',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SaleType',
 'SaleCondition']

In [1]:
a = [1, 2, 3, 4]

In [2]:
a[-2
]

3