# Encoding
---
- **One-Hot Encoding / Dummy Variables**
- **Ordinal Encoding**
- **Label Encoding**

---
---
# One Hot Encoding
---
---  
### Use for categorical data with no hierarchy

In [1]:
import pandas as pd

In [2]:
# Build a small demo data set: 12 people, four categorical features
# and the target column "Qualify".

people_dict = {
    "Marital": [
        "married", "married", "married", "divorced", "married", "single",
        "single", "single", "divorced", "single", "divorced", "single",
    ],
    "Gender": [
        "male", "male", "female", "female", "male", "female",
        "female", "male", "female", "female", "male", "female",
    ],
    "Education": [
        "Bachelors", "Bachelors", "Master", "Bachelors", "Master", "Phd",
        "Master", "Secondary", "Master", "Phd", "Phd", "Bachelors",
    ],
    "Country": [
        "England", "England", "Thailand", "Canada", "USA", "Thailand",
        "USA", "USA", "Canada", "England", "Canada", "USA",
    ],
    "Qualify": [
        "yes", "yes", "no", "yes", "no", "yes",
        "yes", "no", "yes", "no", "yes", "yes",
    ],
}

# Each dict key becomes a column; rows get the default 0..11 index
people = pd.DataFrame(data=people_dict)

In [3]:
# Display the frame built above to check its five columns
people

Unnamed: 0,Marital,Gender,Education,Country,Qualify
0,married,male,Bachelors,England,yes
1,married,male,Bachelors,England,yes
2,married,female,Master,Thailand,no
3,divorced,female,Bachelors,Canada,yes
4,married,male,Master,USA,no
5,single,female,Phd,Thailand,yes
6,single,female,Master,USA,yes
7,single,male,Secondary,USA,no
8,divorced,female,Master,Canada,yes
9,single,female,Phd,England,no


### The categorical columns with no hierarchy are "Marital", "Gender", and "Country"

In [4]:
from sklearn import preprocessing

In [5]:
# Build the encoder: sparse_output=False asks for a dense result, and
# set_output(transform="pandas") makes transform return a DataFrame
ohe = preprocessing.OneHotEncoder(sparse_output=False)
ohe.set_output(transform="pandas")

# Learn the categories of "Marital" and encode the column in one step
ohe_marital = ohe.fit_transform(people[["Marital"]])

ohe_marital

Unnamed: 0,Marital_divorced,Marital_married,Marital_single
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
5,0.0,0.0,1.0
6,0.0,0.0,1.0
7,0.0,0.0,1.0
8,1.0,0.0,0.0
9,0.0,0.0,1.0


In [6]:
# Categories learned during fit: one array per encoded input column
ohe.categories_

[array(['divorced', 'married', 'single'], dtype=object)]

### One Hot Encoding with more than one feature

In [7]:
# A second encoder, configured the same way: dense result returned as a DataFrame
ohe2 = preprocessing.OneHotEncoder(sparse_output=False)
ohe2.set_output(transform="pandas")

# Encode two columns at once; each input column gets its own group of output columns
ohe_gen_coun = ohe2.fit_transform(people[["Gender", "Country"]])

ohe_gen_coun

Unnamed: 0,Gender_female,Gender_male,Country_Canada,Country_England,Country_Thailand,Country_USA
0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0
5,1.0,0.0,0.0,0.0,1.0,0.0
6,1.0,0.0,0.0,0.0,0.0,1.0
7,0.0,1.0,0.0,0.0,0.0,1.0
8,1.0,0.0,1.0,0.0,0.0,0.0
9,1.0,0.0,0.0,1.0,0.0,0.0


In [8]:
# Categories learned during fit: one array per encoded input column (Gender, then Country)
ohe2.categories_

[array(['female', 'male'], dtype=object),
 array(['Canada', 'England', 'Thailand', 'USA'], dtype=object)]

### Concat Data with OneHotEncoder Data

In [9]:
# Place the encoded columns next to the original ones (rows are aligned on the index)
pd.concat(
    [people, ohe_marital, ohe_gen_coun],
    axis="columns",
)

Unnamed: 0,Marital,Gender,Education,Country,Qualify,Marital_divorced,Marital_married,Marital_single,Gender_female,Gender_male,Country_Canada,Country_England,Country_Thailand,Country_USA
0,married,male,Bachelors,England,yes,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,married,male,Bachelors,England,yes,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,married,female,Master,Thailand,no,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,divorced,female,Bachelors,Canada,yes,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,married,male,Master,USA,no,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5,single,female,Phd,Thailand,yes,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
6,single,female,Master,USA,yes,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
7,single,male,Secondary,USA,no,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
8,divorced,female,Master,Canada,yes,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
9,single,female,Phd,England,no,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


---
---
# Dummy Variables
---
---
### Use for categorical data with no hierarchy

In [10]:
# Show the original frame again before building dummy variables from it
people

Unnamed: 0,Marital,Gender,Education,Country,Qualify
0,married,male,Bachelors,England,yes
1,married,male,Bachelors,England,yes
2,married,female,Master,Thailand,no
3,divorced,female,Bachelors,Canada,yes
4,married,male,Master,USA,no
5,single,female,Phd,Thailand,yes
6,single,female,Master,USA,yes
7,single,male,Secondary,USA,no
8,divorced,female,Master,Canada,yes
9,single,female,Phd,England,no


In [11]:
# Use pd.get_dummies on the three columns without a hierarchy;
# dtype=int produces 0/1 integer columns directly

pd.get_dummies(people[["Marital", "Gender", "Country"]], dtype=int)

Unnamed: 0,Marital_divorced,Marital_married,Marital_single,Gender_female,Gender_male,Country_Canada,Country_England,Country_Thailand,Country_USA
0,0,1,0,0,1,0,1,0,0
1,0,1,0,0,1,0,1,0,0
2,0,1,0,1,0,0,0,1,0
3,1,0,0,1,0,1,0,0,0
4,0,1,0,0,1,0,0,0,1
5,0,0,1,1,0,0,0,1,0
6,0,0,1,1,0,0,0,0,1
7,0,0,1,0,1,0,0,0,1
8,1,0,0,1,0,1,0,0,0
9,0,0,1,1,0,0,1,0,0


In [12]:
# Create the dummies DataFrame (0/1 integer columns) and keep it for the concat below

dummies_var = pd.get_dummies(
    people[["Marital", "Gender", "Country"]],
    dtype=int,
)

### Concat Data with dummies Data

In [13]:
# Place the dummy columns next to the original ones (rows are aligned on the index)
pd.concat(
    [people, dummies_var],
    axis="columns",
)

Unnamed: 0,Marital,Gender,Education,Country,Qualify,Marital_divorced,Marital_married,Marital_single,Gender_female,Gender_male,Country_Canada,Country_England,Country_Thailand,Country_USA
0,married,male,Bachelors,England,yes,0,1,0,0,1,0,1,0,0
1,married,male,Bachelors,England,yes,0,1,0,0,1,0,1,0,0
2,married,female,Master,Thailand,no,0,1,0,1,0,0,0,1,0
3,divorced,female,Bachelors,Canada,yes,1,0,0,1,0,1,0,0,0
4,married,male,Master,USA,no,0,1,0,0,1,0,0,0,1
5,single,female,Phd,Thailand,yes,0,0,1,1,0,0,0,1,0
6,single,female,Master,USA,yes,0,0,1,1,0,0,0,0,1
7,single,male,Secondary,USA,no,0,0,1,0,1,0,0,0,1
8,divorced,female,Master,Canada,yes,1,0,0,1,0,1,0,0,0
9,single,female,Phd,England,no,0,0,1,1,0,0,1,0,0


---
---
# Ordinal Encoding
---
---  
- ### Use for categorical data with a hierarchy
- ### Use for independent variable (Features)

In [14]:
# Show the original frame again before ordinal-encoding "Education"
people

Unnamed: 0,Marital,Gender,Education,Country,Qualify
0,married,male,Bachelors,England,yes
1,married,male,Bachelors,England,yes
2,married,female,Master,Thailand,no
3,divorced,female,Bachelors,Canada,yes
4,married,male,Master,USA,no
5,single,female,Phd,Thailand,yes
6,single,female,Master,USA,yes
7,single,male,Secondary,USA,no
8,divorced,female,Master,Canada,yes
9,single,female,Phd,England,no


### In the data set above, the features are "Marital", "Gender", "Education", and "Country"  
### The only feature with a hierarchy is "Education"  
- **Secondary = lowest**
- **Bachelors = second place**
- **Master = third place**
- **Phd = highest**

In [15]:
# List the distinct values in Education; each of them needs a position
# in the rank list defined in the next cell

people["Education"].unique()

array(['Bachelors', 'Master', 'Phd', 'Secondary'], dtype=object)

In [16]:
# Education levels ordered from lowest to highest (the hierarchy);
# this list is passed to the OrdinalEncoder as the category order

edu_rank = ["Secondary", "Bachelors", "Master", "Phd"]

In [25]:
# Create the OrdinalEncoder; categories takes one ordered list per input column,
# so edu_rank is wrapped in a list for the single "Education" column
education = preprocessing.OrdinalEncoder(categories = [edu_rank])

# Fit and transform in one step; the displayed result is a 2-D float array
# with one row per person (Secondary -> 0, Bachelors -> 1, Master -> 2, Phd -> 3)
education.fit_transform(people[["Education"]])

array([[1.],
       [1.],
       [2.],
       [1.],
       [2.],
       [3.],
       [2.],
       [0.],
       [2.],
       [3.],
       [3.],
       [1.]])

In [26]:
# Store the ordinal codes as a new column. This modifies `people` in place;
# the (n, 1) encoder output is flattened to 1-D before assignment.
people["Education_Ordinal"] = education.fit_transform(people[["Education"]]).ravel()

people

Unnamed: 0,Marital,Gender,Education,Country,Qualify,Education_Ordinal
0,married,male,Bachelors,England,yes,1.0
1,married,male,Bachelors,England,yes,1.0
2,married,female,Master,Thailand,no,2.0
3,divorced,female,Bachelors,Canada,yes,1.0
4,married,male,Master,USA,no,2.0
5,single,female,Phd,Thailand,yes,3.0
6,single,female,Master,USA,yes,2.0
7,single,male,Secondary,USA,no,0.0
8,divorced,female,Master,Canada,yes,2.0
9,single,female,Phd,England,no,3.0


---
---
# Label Encoding
---
---  
- ### Use for categorical Data 
- ### Use for dependent variable (Result)

In [28]:
# Remove the helper column added in the Ordinal Encoding section so this
# section starts from the original five columns.
# errors="ignore" keeps the cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been dropped.
people = people.drop(columns=["Education_Ordinal"], errors="ignore")

people

Unnamed: 0,Marital,Gender,Education,Country,Qualify
0,married,male,Bachelors,England,yes
1,married,male,Bachelors,England,yes
2,married,female,Master,Thailand,no
3,divorced,female,Bachelors,Canada,yes
4,married,male,Master,USA,no
5,single,female,Phd,Thailand,yes
6,single,female,Master,USA,yes
7,single,male,Secondary,USA,no
8,divorced,female,Master,Canada,yes
9,single,female,Phd,England,no


### In the data set above, the dependent variable (result) is "Qualify"

In [31]:
# Create the LabelEncoder for the target column
la_qualify = preprocessing.LabelEncoder()

# LabelEncoder works on a 1-D input, so select the single column as a
# Series and hand over its values as a flat array
la_qualify.fit_transform(people["Qualify"].to_numpy())

array([1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1])

In [32]:
# Store the encoded target as a new column. This modifies `people` in place.
qualify_values = people["Qualify"].to_numpy()
people["Le_Qualify"] = la_qualify.fit_transform(qualify_values)

people

Unnamed: 0,Marital,Gender,Education,Country,Qualify,Le_Qualify
0,married,male,Bachelors,England,yes,1
1,married,male,Bachelors,England,yes,1
2,married,female,Master,Thailand,no,0
3,divorced,female,Bachelors,Canada,yes,1
4,married,male,Master,USA,no,0
5,single,female,Phd,Thailand,yes,1
6,single,female,Master,USA,yes,1
7,single,male,Secondary,USA,no,0
8,divorced,female,Master,Canada,yes,1
9,single,female,Phd,England,no,0
