# **One Hot Encoding with Sklearn**

In [0]:
import pandas as pd 
from sklearn.model_selection import train_test_split

In [0]:
import numpy as np
 
# load dataset from openML
data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
 
# replace ? with np.nan
data = data.replace('?', np.nan)
 
# capture only first cabin when more than one is available
def get_first_cabin(row):
    try:
        return row.split()[0]
    except:
        return np.nan
 
data['cabin'] = data['cabin'].apply(get_first_cabin)
 
# save to csv
data.to_csv('titanic.csv', index=False)


In [3]:
data = data [['sex', 'embarked', 'cabin', 'survived']]
data['cabin'] = data.cabin.str[0]

data.head()

Unnamed: 0,sex,embarked,cabin,survived
0,female,S,B,1
1,male,S,C,1
2,female,S,C,0
3,male,S,C,0
4,female,S,C,0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(data [['sex', 'embarked', 'cabin']], data['survived'], test_size=0.3, random_state = 33)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((916, 3), (393, 3), (916,), (393,))

In [5]:
# Sex has 2 labels
print(X_train.sex.unique())

# Embarked has 3 labels
print(X_train.embarked.unique())

# Cabin has 3 labels
print(X_train.cabin.unique())


['male' 'female']
['S' 'C' 'Q' nan]
[nan 'B' 'E' 'C' 'D' 'F' 'A' 'T' 'G']


In [6]:
# OneHotEncoding with sklearn
X_train = X_train.fillna('Missing')
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(categories = 'auto', 
                        drop = 'first',
                        sparse = False,
                        handle_unknown = 'error') # Helps deal with rare labels

encoder.fit(X_train.fillna('missing'))

OneHotEncoder(categories='auto', drop='first', dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=False)

In [7]:
# The learned categories
encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['C', 'Missing', 'Q', 'S'], dtype=object),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'Missing', 'T'], dtype=object)]

In [8]:
# Transform the train set
trns = encoder.transform(X_train)
column_name = encoder.get_feature_names(['sex', 'embarked', 'cabin'])

pd.DataFrame(trns, columns = column_name).head()


Unnamed: 0,sex_male,embarked_Missing,embarked_Q,embarked_S,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_Missing,cabin_T
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Transform test set
X_test = X_test.fillna('Missing')
trns_test = encoder.transform(X_test)
pd.DataFrame(trns_test, columns=column_name).head()

Unnamed: 0,sex_male,embarked_Missing,embarked_Q,embarked_S,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_Missing,cabin_T
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# **One Hot Encoding-Frequent Categories**


In [10]:
# Import data
house = pd.read_csv('/content/train.csv', usecols = ['Neighborhood', 'Exterior1st', 'Exterior2nd', 'SalePrice'])
house.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,SalePrice
0,CollgCr,VinylSd,VinylSd,208500
1,Veenker,MetalSd,MetalSd,181500
2,CollgCr,VinylSd,VinylSd,223500
3,Crawfor,Wd Sdng,Wd Shng,140000
4,NoRidge,VinylSd,VinylSd,250000


In [11]:
# Take a look on the number of labels of each features
for col in house.columns:
  print(col, ': ', house[col].nunique(), ' labels')

Neighborhood :  25  labels
Exterior1st :  15  labels
Exterior2nd :  16  labels
SalePrice :  663  labels


In [12]:
X_train, X_test, y_train, y_test = train_test_split(house[['Neighborhood', 'Exterior1st', 'Exterior2nd']], house['SalePrice'], test_size=0.3, random_state = 33)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1022, 3), (438, 3), (1022,), (438,))

In [13]:
# Top 10 frequent categories of Neighborhood.
X_train['Neighborhood'].value_counts().sort_values(ascending = False).head(10)

NAmes      157
CollgCr    103
OldTown     78
Edwards     76
Somerst     66
Gilbert     61
Sawyer      55
NridgHt     50
NWAmes      49
BrkSide     41
Name: Neighborhood, dtype: int64

In [15]:
from feature_engine.categorical_encoders import OneHotCategoricalEncoder
encod = OneHotCategoricalEncoder(top_categories = 10, 
                                 variables = ['Neighborhood', 'Exterior1st', 'Exterior2nd'], # Variables we want to encode
                                 drop_last = False)
encod.fit(X_train)

OneHotCategoricalEncoder(drop_last=False, top_categories=10,
                         variables=['Neighborhood', 'Exterior1st',
                                    'Exterior2nd'])

In [16]:
# Take a look on the selected top categories
encod.encoder_dict_

{'Exterior1st': ['VinylSd',
  'MetalSd',
  'HdBoard',
  'Wd Sdng',
  'Plywood',
  'CemntBd',
  'BrkFace',
  'Stucco',
  'WdShing',
  'AsbShng'],
 'Exterior2nd': ['VinylSd',
  'MetalSd',
  'HdBoard',
  'Wd Sdng',
  'Plywood',
  'CmentBd',
  'Wd Shng',
  'BrkFace',
  'Stucco',
  'AsbShng'],
 'Neighborhood': ['NAmes',
  'CollgCr',
  'OldTown',
  'Edwards',
  'Somerst',
  'Gilbert',
  'Sawyer',
  'NridgHt',
  'NWAmes',
  'BrkSide']}

In [17]:
# Trasform Train and Test sets
X_train = encod.transform(X_train)
X_test = encod.transform(X_test)

# Train set after transform
X_train.head()

Unnamed: 0,Neighborhood_NAmes,Neighborhood_CollgCr,Neighborhood_OldTown,Neighborhood_Edwards,Neighborhood_Somerst,Neighborhood_Gilbert,Neighborhood_Sawyer,Neighborhood_NridgHt,Neighborhood_NWAmes,Neighborhood_BrkSide,Exterior1st_VinylSd,Exterior1st_MetalSd,Exterior1st_HdBoard,Exterior1st_Wd Sdng,Exterior1st_Plywood,Exterior1st_CemntBd,Exterior1st_BrkFace,Exterior1st_Stucco,Exterior1st_WdShing,Exterior1st_AsbShng,Exterior2nd_VinylSd,Exterior2nd_MetalSd,Exterior2nd_HdBoard,Exterior2nd_Wd Sdng,Exterior2nd_Plywood,Exterior2nd_CmentBd,Exterior2nd_Wd Shng,Exterior2nd_BrkFace,Exterior2nd_Stucco,Exterior2nd_AsbShng
355,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
115,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
862,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1409,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
999,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


# **Integer Encoder**
We will use HousePrice data set.

In [33]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(house[['Neighborhood', 'Exterior1st', 'Exterior2nd']], house['SalePrice'], test_size=0.3, random_state = 33)
X_train1.shape, X_test1.shape, y_train1.shape, y_test1.shape

((1022, 3), (438, 3), (1022,), (438,))

In [19]:
# Lets create an encoder with sklearn.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(X_train1.Neighborhood)

LabelEncoder()

In [20]:
# Unique Classes
le.classes_

array(['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr',
       'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel',
       'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown',
       'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber',
       'Veenker'], dtype=object)

In [27]:
from collections import defaultdict
d = defaultdict(LabelEncoder)
train_transformed = X_train1.apply(lambda x: d[x.name].fit_transform(x))
test_transformed = X_test1.apply(lambda x: d[x.name].fit_transform(x))
print(train_transformed.head())
print(test_transformed.head())

      Neighborhood  Exterior1st  Exterior2nd
355              5           10           13
115             21            7            8
862             20            6           10
1409            14            8           10
999              5           10           13
      Neighborhood  Exterior1st  Exterior2nd
753             16           10           10
445              7           11           11
1149            17           10           10
1300             8           10           10
1369             5           10           10


Let's do the same thing with Feature_Engine:

In [34]:
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
ordinal_enc = OrdinalCategoricalEncoder(encoding_method = 'arbitrary',
                                        variables = ['Neighborhood', 'Exterior1st', 'Exterior2nd'])
ordinal_enc.fit(X_train1)

OrdinalCategoricalEncoder(encoding_method='arbitrary',
                          variables=['Neighborhood', 'Exterior1st',
                                     'Exterior2nd'])

In [37]:
ordinal_enc.encoder_dict_

{'Exterior1st': {'AsbShng': 9,
  'AsphShn': 10,
  'BrkComm': 12,
  'BrkFace': 5,
  'CBlock': 11,
  'CemntBd': 7,
  'HdBoard': 2,
  'MetalSd': 1,
  'Plywood': 3,
  'Stucco': 8,
  'VinylSd': 0,
  'Wd Sdng': 4,
  'WdShing': 6},
 'Exterior2nd': {'AsbShng': 10,
  'AsphShn': 12,
  'Brk Cmn': 11,
  'BrkFace': 5,
  'CBlock': 14,
  'CmentBd': 7,
  'HdBoard': 4,
  'ImStucc': 9,
  'MetalSd': 1,
  'Other': 15,
  'Plywood': 2,
  'Stone': 13,
  'Stucco': 8,
  'VinylSd': 0,
  'Wd Sdng': 3,
  'Wd Shng': 6},
 'Neighborhood': {'Blmngtn': 18,
  'Blueste': 24,
  'BrDale': 23,
  'BrkSide': 8,
  'ClearCr': 19,
  'CollgCr': 0,
  'Crawfor': 14,
  'Edwards': 16,
  'Gilbert': 10,
  'IDOTRR': 6,
  'MeadowV': 11,
  'Mitchel': 17,
  'NAmes': 5,
  'NPkVill': 21,
  'NWAmes': 3,
  'NoRidge': 12,
  'NridgHt': 7,
  'OldTown': 4,
  'SWISU': 22,
  'Sawyer': 9,
  'SawyerW': 2,
  'Somerst': 1,
  'StoneBr': 15,
  'Timber': 13,
  'Veenker': 20}}

In [38]:
ordinal_enc.variables

['Neighborhood', 'Exterior1st', 'Exterior2nd']

In [39]:
X_train1 = ordinal_enc.transform(X_train1)
X_test1 = ordinal_enc.transform(X_test1)
X_train1.head()



Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
355,0,0,0
115,1,1,1
862,2,2,2
1409,3,3,2
999,0,0,0


# **Count or Frequency Encoding**

In [0]:
# Count or frequency encoding 
X_train2, X_test2, y_train2, y_test2 = train_test_split(house[['Neighborhood', 'Exterior1st', 'Exterior2nd']], house['SalePrice'], test_size=0.3, random_state = 33)


In [41]:
# Count of the attributes
count_map = X_train2['Neighborhood'].value_counts().to_dict()
count_map

{'Blmngtn': 11,
 'Blueste': 1,
 'BrDale': 10,
 'BrkSide': 41,
 'ClearCr': 17,
 'CollgCr': 103,
 'Crawfor': 37,
 'Edwards': 76,
 'Gilbert': 61,
 'IDOTRR': 23,
 'MeadowV': 11,
 'Mitchel': 30,
 'NAmes': 157,
 'NPkVill': 5,
 'NWAmes': 49,
 'NoRidge': 27,
 'NridgHt': 50,
 'OldTown': 78,
 'SWISU': 19,
 'Sawyer': 55,
 'SawyerW': 40,
 'Somerst': 66,
 'StoneBr': 19,
 'Timber': 26,
 'Veenker': 10}

In [42]:
# Replace the lables with the counts
X_train2['Neighborhood'] = X_train2['Neighborhood'].map(count_map)
X_test2['Neighborhood'] = X_test2['Neighborhood'].map(count_map)
X_train2['Neighborhood'].head()

355     103
115      66
862      40
1409     49
999     103
Name: Neighborhood, dtype: int64

Count/Frequency Encoding with Feature Engine

In [0]:
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder
X_train3, X_test3, y_train3, y_test3 = train_test_split(house[['Neighborhood', 'Exterior1st', 'Exterior2nd']], house['SalePrice'], test_size=0.3, random_state = 33)


In [49]:
encode_cf = CountFrequencyCategoricalEncoder(
    encoding_method = 'count', # to do frequency => enoding_method = 'frequency'
    variables = ['Neighborhood', 'Exterior1st', 'Exterior2nd'])

encode_cf.fit(X_train3)

CountFrequencyCategoricalEncoder(encoding_method='count',
                                 variables=['Neighborhood', 'Exterior1st',
                                            'Exterior2nd'])

In [50]:
X_train3 = encode_cf.transform(X_train3)
X_test3 = encode_cf.transform(X_test3)
X_train3.head()



Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
355,103,356,348
115,66,158,152
862,40,153,103
1409,49,78,103
999,103,356,348
