In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from operator import itemgetter


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Read the data
X = pd.read_csv('Data_Sets/train.csv', index_col='Id') 
X_test = pd.read_csv('Data_Sets/test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()] 
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [4]:
X_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,RL,11694,Pave,Reg,Lvl,AllPub,Inside,Gtl,NridgHt,...,108,0,0,260,0,0,7,2007,New,Partial
871,20,RL,6600,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,0,0,0,0,0,0,8,2009,WD,Normal
93,30,RL,13360,Pave,IR1,HLS,AllPub,Inside,Gtl,Crawfor,...,0,44,0,0,0,0,8,2009,WD,Normal
818,20,RL,13265,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Mitchel,...,59,0,0,0,0,0,7,2008,WD,Normal
303,20,RL,13704,Pave,IR1,Lvl,AllPub,Corner,Gtl,CollgCr,...,81,0,0,0,0,0,1,2006,WD,Normal


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [6]:
categorical_cols = [cname for cname in X_train.columns if
                    X_train[cname].nunique() < 10 and 
                    X_train[cname].dtype == "object"]

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ])

clf = Pipeline(steps=[('preprocessor', preprocessor)])

clf.fit(X_train, y_train)

pd.reset_option('max_columns')

pd.reset_option('max_rows')

NameError: name 'Pipeline' is not defined

In [7]:
X_train.columns

Index(['MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageCars', 'GarageArea', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition'],
      dtype='object')

In [8]:
# Categorical columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded
good_label_cols = [col for col in object_cols if 
                   set(X_valid[col]).issubset(set(X_train[col]))]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))

In [9]:
object_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [10]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_encoder.fit_transform(X_train)

array([[1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [11]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[good_label_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[good_label_cols]))    
                                                         
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

OH_cols_train.columns = OH_encoder.get_feature_names(good_label_cols)
OH_cols_valid.columns = OH_encoder.get_feature_names(good_label_cols)

num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

OH_X_train.sort_index(inplace=True)
OH_X_valid.sort_index(inplace=True)

In [12]:
X_train_columns = list(X_train.columns)

In [13]:
X_train.columns

Index(['MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageCars', 'GarageArea', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition'],
      dtype='object')

In [14]:
X_train_columns

['MSSubClass',
 'MSZoning',
 'LotArea',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SaleType',
 'SaleCondition']

In [15]:

matching = [s for s in OH_X_train if good_label_cols[0] in s]
matching

['MSZoning_C (all)',
 'MSZoning_FV',
 'MSZoning_RH',
 'MSZoning_RL',
 'MSZoning_RM']

In [16]:
"""
L'obiettivo è riordinare le colonne modificate e farle mettere al giusto posto nel dataframe
Per fare questa dovrai creare una lista con i nomi delle colonne nel corretto ordine e poi riordinare il pd completo basandoti sulla lista

Inizi creando una lista vuota OH_column_names che sarà una lista di liste
Ogni lista in OH_column_names è formata dai titoli delle nuove features modificate

Sfruttando l'index delle feature da modificare nella lista iniziale puoi inserire le giuste colonne figlie subito dopo quelle padri

Poi finirai il tutto eliminando i nomi delle feature genitori e avrai la lista finale che poi userai come rierimento

"""

"""
OH_column_names : list of list
    Every element of OH_colum_names is a list containing the names of the
    columns generated from one encoded column. 
"""

# you start creating a list of list 
OH_column_names = []
for i in range(len(good_label_cols)):
    
    # The argument of the append is a list comprehension that for each feature to
    OH_column_names.append([col_name for col_name in OH_X_train if good_label_cols[i] in col_name])
    index = X_train_columns.index(good_label_cols[i])+1
    X_train_columns[index:index] = OH_column_names[i]

In [17]:
OH_X_train.columns

Index(['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       ...
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=192)

In [18]:
L = ['a', 'b', 'c']
L2 = [3, 1, 2]
new_L = [[L[0], L2[0]]]
         
for i in range(1, len(L)):
    new_L.append([L[i], L2[i]]) 



In [19]:
new_L

[['a', 3], ['b', 1], ['c', 2]]

In [20]:
from operator import itemgetter
sorted(new_L, key=itemgetter(1))

[['b', 1], ['c', 2], ['a', 3]]

In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

class EncoderSwitcher(BaseEstimator, TransformerMixin):
    def __init__(self, max_num_columns=1, total=False, copy=True):
        self.max_num_columns = max_num_columns
        self.total = total
        self.copy = True
        self.object_cols = None
        self.OHE_cols = None
        self.OE_cols = None
        
    def _switcher(self, X):
        # object_cols : list of the names of the categorical colummns
        self.object_cols = [col for col in X.columns if X[col].dtype == "object"]
        self.OHE_cols = []
        self.OE_cols = []
        # Check if total False or true
        if self.total == False:
            # Creating OHE_cols and OE_cols
            for col in self.object_cols:
                if X[col].nunique() <= self.max_num_columns:
                    self.OHE_cols.append(col)
                else:
                    self.OE_cols.append(col)
        else:
            # creating a list with every i-element is a list of two
            #   * self.object_cols[i]
            #   * the num of unique element of that categorical column
            num_cols_list = [[self.object_cols[0], X[self.object_cols[0]].nunique()]]
            for i in range(1, len(self.object_cols)):
                num_cols_list.append([self.object_cols[i], X[self.object_cols[i]].nunique()])

            # we will now sort this list in an ascending order based on
            # the number of unique elements
            sorted(num_cols_list, key=itemgetter(1))

            # we will now select which column are OneHotEncoded
            # and which one are OrdinalEncoded
            num_columns = 0
            index = 0
            for i in range(len(self.object_cols)):
                if num_columns + num_cols_list[i][1] <= self.max_num_columns:
                    self.OHE_cols.append(num_cols_list[i][0])
                    num_columns += num_cols_list[i][1]
                else:
                    self.OE_cols.append(num_cols_list[i][0])
            
        return self
    
    def _sorter(self,X):
        
        return self
    
    def fit(self, X, y=None):
        
        self._switcher(X)
        
        self.OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        self.ordinal_encoder = OrdinalEncoder()
        
        self.OH_encoder.fit(X[self.OHE_cols])
        self.ordinal_encoder.fit(X[self.OE_cols])
        
        return self
      
    def transform(self, X):
        OHE_X = pd.DataFrame(self.OH_encoder.transform(X[self.OHE_cols]))
        OE_X = pd.DataFrame(self.ordinal_encoder.transform(X[self.OE_cols]))
        return OHE_X, OE_X, self.OHE_cols, self.OE_cols


In [50]:
d = {'col1': ['a', 'c', 'b', 'b', 'c'], 'col2': ['a', 'a', 'a', 'a', 'a'] , 'col3': [12 , 'c', 'b', 'd', 'e'], 'col4': ['b', 'b', 'b', 'b', 'b'], 'col5': [12.01, 10, 13, 14, 12]}
df = pd.DataFrame(data=d)


In [51]:
DF = pd.DataFrame(df.values)

In [54]:
DF_N = DF.convert_dtypes(convert_string = False)

In [55]:
DF_N.dtypes

0     object
1     object
2     object
3     object
4    Float64
dtype: object

In [None]:
Enc = EncoderSwitcher(max_num_columns=3, total=False)

In [None]:
Enc.fit(df)


In [None]:
OHE_X, OE_X, OHE_cols, OE_cols = Enc.transform(df)

In [None]:
OHE_X

In [None]:
OE_X

In [None]:
df[OHE_cols]

In [None]:
df[OE_cols]

In [None]:
categorical_transformer = Pipeline(steps=[
    ('onehot', EncoderSwitcher())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, df.columns)
    ])

model = RandomForestRegressor(n_estimators=100, random_state=0)


clf = Pipeline(steps=[('preprocessor', preprocessor),
                     ])

clf.fit(df)

display(df)

In [None]:
df2 = pipe.transform(df)

In [None]:
df2