### One-hot encoding

For example, imagine we have a “color” variable with three categories (“red”, “green”, and “blue”). In this case, three binary variables are needed, one per category. Each row gets a “1” in the binary variable for its color and a “0” in the variables for the other colors.

| red | green | blue |
|-----|-------|------|
| 1   | 0     | 0    |
| 0   | 1     | 0    |
| 0   | 0     | 1    |

In [1]:
# one hot encode the breast cancer dataset
from pandas import read_csv
from sklearn.preprocessing import OneHotEncoder

In [2]:
# remote CSV file holding the breast cancer dataset
url = (
    "https://raw.githubusercontent.com/jbrownlee/Datasets/"
    "master/breast-cancer.csv"
)

In [3]:
# load the dataset from the URL into a DataFrame;
# header=None keeps the first line of the file as data instead of column names
dataset = read_csv(url, header=None)

In [4]:
# pull the underlying NumPy array out of the DataFrame
data = dataset.to_numpy()

In [5]:
# check the dimensions of the loaded array as (rows, columns)
data.shape

(286, 10)

In [6]:
# the last column is the output; every column before it is an input.
# both parts are cast to strings
X, y = data[:, :-1].astype(str), data[:, -1].astype(str)

In [7]:
# preview the first three rows of the raw input data
print(X[:3])

[["'40-49'" "'premeno'" "'15-19'" "'0-2'" "'yes'" "'3'" "'right'"
  "'left_up'" "'no'"]
 ["'50-59'" "'ge40'" "'15-19'" "'0-2'" "'no'" "'1'" "'right'" "'central'"
  "'no'"]
 ["'50-59'" "'ge40'" "'35-39'" "'0-2'" "'no'" "'2'" "'left'" "'left_low'"
  "'no'"]]


In [8]:
# check the input dimensions: same rows as `data`, minus the output column
X.shape

(286, 9)

In [9]:
# define the one hot encoding transform so that it returns a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the current keyword first and fall back to the
# legacy one on older releases (an unknown keyword raises TypeError).
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

In [10]:
# display the encoder to inspect how it is configured
encoder

OneHotEncoder(sparse=False)

In [11]:
# fit the encoder on the input columns and apply the transform in one step
X_oe = encoder.fit_transform(X)

In [12]:
# check the encoded dimensions: the rows are unchanged, while the columns
# expand to one binary column per category value
X_oe.shape

(286, 43)

In [13]:
# summarize the transformed data (first three encoded rows)
print(X_oe[:3])

[[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0.]]


In [14]:
# re-create the encoder with its default settings, which return a sparse
# matrix (the default was spelled `sparse=True` on older scikit-learn and is
# `sparse_output=True` from version 1.2 onward)
encoder = OneHotEncoder()

In [15]:
# fit and apply the transform again with the default encoder;
# this rebinds X_oe, replacing the earlier dense result with a sparse one
X_oe = encoder.fit_transform(X)

In [20]:
# the sparse result has the same (rows, columns) shape as the dense one
X_oe.shape

(286, 43)

In [22]:
# summarize the transformed data: printing the sparse result for the first
# row lists only its non-zero entries as (row, column) and value
print(X_oe[:1, :])

  (0, 2)	1.0
  (0, 8)	1.0
  (0, 11)	1.0
  (0, 20)	1.0
  (0, 28)	1.0
  (0, 32)	1.0
  (0, 34)	1.0
  (0, 37)	1.0
  (0, 41)	1.0


In [24]:
import pandas as pd

In [25]:
# small toy table: one row per fruit, given as (name, color) records
fruit = pd.DataFrame(
    [
        ('apple', 'red'),
        ('banana', 'yellow'),
        ('cherry', 'red'),
        ('durian', 'green'),
    ],
    columns=['name', 'color'],
)
fruit

Unnamed: 0,name,color
0,apple,red
1,banana,yellow
2,cherry,red
3,durian,green


In [26]:
from sklearn.preprocessing import LabelEncoder

# replace the text in the 'name' column with integer codes, one per distinct
# name; `le` stays fitted afterwards so the mapping remains available.
# NOTE(review): scikit-learn documents LabelEncoder as meant for target labels;
# OrdinalEncoder is the feature-oriented alternative — confirm which is intended.
le = LabelEncoder()
fruit['name'] = le.fit_transform(fruit['name'])
fruit

Unnamed: 0,name,color
0,0,red
1,1,yellow
2,2,red
3,3,green
