# DataSet Iris

## Import libraries

In [12]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Import dataset

In [30]:
df = pd.read_csv('Iris.csv')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [31]:
df['class'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

# LabelEncoder 

* **Use LabelEncoder when there are only two possible values of a categorical features.**

## Example

In [37]:
# Import label encoder
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv('Iris.csv')
# label_encoder object knows how to understand word labels.
label_encoder = LabelEncoder()

# Encode labels in column 'class'.
label_encoder = label_encoder.fit(df['class'])
df['class_LabelEncoder']= label_encoder.transform(df['class'])

df['class'].unique(),df['class_LabelEncoder'].unique()

(array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object),
 array([0, 1, 2]))

## transform

In [38]:
from random import sample
L = sample(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']*10, 5)
print (L)

['Iris-setosa', 'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica']


In [39]:
label_encoder.transform(L)

array([0, 1, 0, 1, 2])

## inverse_transform

In [40]:
from random import sample
L = sample([0, 1, 2]*10, 5)
print (L)

[1, 0, 1, 0, 1]


In [41]:
label_encoder.inverse_transform(L)

array(['Iris-versicolor', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor'], dtype=object)

# One-Hot-Encoder

## Normal way

In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# Import dataset
df = pd.read_csv('Iris.csv')
# creating instance of one-hot-encoder
enc = OneHotEncoder(handle_unknown='ignore')
# passing bridge-types-cat column (label encoded values of bridge_types)
enc_df = pd.DataFrame(enc.fit_transform(df[['class']]).toarray())
# merge with main df bridge_df on key values
df = df.join(enc_df)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class,0,1,2
0,5.1,3.5,1.4,0.2,Iris-setosa,1.0,0.0,0.0
1,4.9,3.0,1.4,0.2,Iris-setosa,1.0,0.0,0.0
2,4.7,3.2,1.3,0.2,Iris-setosa,1.0,0.0,0.0
3,4.6,3.1,1.5,0.2,Iris-setosa,1.0,0.0,0.0
4,5.0,3.6,1.4,0.2,Iris-setosa,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,0.0,0.0,1.0
146,6.3,2.5,5.0,1.9,Iris-virginica,0.0,0.0,1.0
147,6.5,3.0,5.2,2.0,Iris-virginica,0.0,0.0,1.0
148,6.2,3.4,5.4,2.3,Iris-virginica,0.0,0.0,1.0


## Using dummies values approach

In [36]:
import pandas as pd
import numpy as np
# Import dataset
df = pd.read_csv('Iris.csv')
# generate binary values using get_dummies
new_df = pd.get_dummies(df, columns=["class"], prefix=["Type_is"] )
# merge with main df bridge_df on key values
new_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,Type_is_Iris-setosa,Type_is_Iris-versicolor,Type_is_Iris-virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0,0,1
146,6.3,2.5,5.0,1.9,0,0,1
147,6.5,3.0,5.2,2.0,0,0,1
148,6.2,3.4,5.4,2.3,0,0,1


# When to use LabelEncoder & OneHotEncoder

## LabelEncoder

**We apply Label Encoding when:**
  - The categorial feature is ordinal (Like, Size feature [XS, S, M, L ,XL])
  - The Number of values in the categorial feature is quite large, as One-hot encoding can can lead to high memory consumption.

## OneHotEncoder

**We apply One-Hot Encoding when:**
  - The categorial feature is not ordinal (Like, City feature)
  - The Number of values in the categorial feature is less, so One-hot encoding can be effectively applied.