## Import Packages & Modules

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn import preprocessing

## Ordinal Encoder

To convert categorical features to integer codes, we can use the `OrdinalEncoder`. This estimator transforms each categorical feature into one new feature of integers (0 to n_categories - 1).

Such an integer representation cannot, however, be used directly with all scikit-learn estimators: they expect continuous input and would interpret the categories as being ordered, which is often not desired (for example, the set of browsers here was ordered arbitrarily).

In [6]:
# Two training samples; the columns are (gender, location, browser).
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]
print(X)

# Build the encoder, then fit it on the training rows so it learns the
# distinct values of each column.
enc = preprocessing.OrdinalEncoder()
enc.fit(X)

# Encode two new rows that recombine values seen during fit. Each value is
# replaced by an integer code; the learned value-to-code order per column can
# be inspected through enc.categories_.
rows_to_encode = [
    ['female', 'from US', 'uses Safari'],
    ['male', 'from Europe', 'uses Firefox'],
]
enc.transform(rows_to_encode)

[['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]


array([[0., 1., 1.],
       [1., 0., 0.]])

In [None]:
# NOTE(review): unexecuted scratch cell (In [None]). It only resets X to an
# empty list; the later cells shown in this notebook reassign X before using
# it, so this cell looks like a leftover and could be removed.
X = list()

## OneHot Encoder (pre-defined categorical lists)

In [119]:
# Full vocabulary for each column, declared up front so that values missing
# from the training rows (e.g. 'from Asia', 'uses Chrome') still get a column.
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
category_lists = [genders, locations, browsers]

# Training rows: (gender, location, browser).
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]

# One-hot encoder driven by the explicit per-column category lists.
enc = preprocessing.OneHotEncoder(categories=category_lists)
enc.fit(X)

# Encode a row whose location and browser never occur in X; toarray() turns
# the transform result into a dense array for display.
# Layout: 2 gender + 4 location + 4 browser = 10 indicator columns.
sample = [['female', 'from Asia', 'uses Chrome']]
encoded_sample = enc.transform(sample)
encoded_sample.toarray()


array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])

## OneHot Encoder (ignoring unknown categories)

In [123]:
# Train / test rows: (gender, location, browser). The test row contains
# 'from Asia' and 'uses Chrome', neither of which appears in X_train.
X_train = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]
X_test = [['female', 'from Asia', 'uses Chrome']]

# handle_unknown='ignore': a value that was not seen during fit is encoded as
# all zeros for that feature instead of raising an error.
enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train)

# Print the dense encoding of the training rows first, then of the test row.
for rows in (X_train, X_test):
    print(enc.transform(rows).toarray())

[[0. 1. 0. 1. 0. 1.]
 [1. 0. 1. 0. 1. 0.]]
[[1. 0. 0. 0. 0. 0.]]


## OneHot Encoder - Avoiding dummy variable trap

In [137]:
# Four samples: (gender, location, browser).
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
    ['female', 'from Asia', 'uses Firefox'],
    ['female', 'from Aus', 'uses Firefox'],
]

# drop='first' removes the first category of every feature, so a feature with
# k categories produces k - 1 columns and the dropped category is represented
# by zeros in that feature's columns. This avoids perfectly collinear columns.
drop_enc = preprocessing.OneHotEncoder(drop='first')
drop_enc.fit(X)

# Show the learned categories per column, then the dense encoding of X.
print(drop_enc.categories_)
dense_encoding = drop_enc.transform(X).toarray()
print(dense_encoding)



[array(['female', 'male'], dtype=object), array(['from Asia', 'from Aus', 'from Europe', 'from US'], dtype=object), array(['uses Firefox', 'uses Safari'], dtype=object)]
[[1. 0. 0. 1. 1.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]]
