In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd



df = pd.DataFrame({'Salary': [25000, 48000, 71000, 85000, 90000, 55000], 
                   'City' : ['Bengaluru','Delhi','Hyderabad','Bengaluru','Hyderabad','Bengaluru'],
                  'Gender': ['Male','Female','Female','Female','Male','Male'],
                  'Exp':[1,3,5,6,9,None]})
df

Unnamed: 0,Salary,City,Gender,Exp
0,25000,Bengaluru,Male,1.0
1,48000,Delhi,Female,3.0
2,71000,Hyderabad,Female,5.0
3,85000,Bengaluru,Female,6.0
4,90000,Hyderabad,Male,9.0
5,55000,Bengaluru,Male,


## Encoder and Imputers

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
lab_enc = LabelEncoder()

In [5]:
df2 = lab_enc.fit_transform(df['City'])

pd.Series(df2)

0    0
1    1
2    2
3    0
4    2
5    0
dtype: int32

In [6]:
df['City'] = df2
df

Unnamed: 0,Salary,City,Gender,Exp
0,25000,0,Male,1.0
1,48000,1,Female,3.0
2,71000,2,Female,5.0
3,85000,0,Female,6.0
4,90000,2,Male,9.0
5,55000,0,Male,


In [7]:
# Practice

df2 = lab_enc.fit_transform(df['Gender'])
pd.Series(df2)

0    1
1    0
2    0
3    0
4    1
5    1
dtype: int32

In [8]:
df['Gender'] = df2
df

Unnamed: 0,Salary,City,Gender,Exp
0,25000,0,1,1.0
1,48000,1,0,3.0
2,71000,2,0,5.0
3,85000,0,0,6.0
4,90000,2,1,9.0
5,55000,0,1,


In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

In [10]:
ohe = OneHotEncoder()
si = SimpleImputer()

In [11]:
import pandas as pd

df = pd.DataFrame({'Salary': [25000, 48000, 71000, 85000, 90000, 55000], 
                   'City' : ['Bengaluru','Delhi','Hyderabad','Bengaluru','Hyderabad','Bengaluru'],
                  'Gender': ['Male','Female','Female','Female','Male','Male'],
                  'Exp':[1,3,5,6,9,None]})
df

Unnamed: 0,Salary,City,Gender,Exp
0,25000,Bengaluru,Male,1.0
1,48000,Delhi,Female,3.0
2,71000,Hyderabad,Female,5.0
3,85000,Bengaluru,Female,6.0
4,90000,Hyderabad,Male,9.0
5,55000,Bengaluru,Male,


In [12]:
ct = make_column_transformer((ohe,['City','Gender']),
                           (si,['Exp']),
                            remainder = 'passthrough')  # 'passthrough' to keep all other columns

In [13]:
encoded = pd.DataFrame(ct.fit_transform(df))
encoded

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,0.0,0.0,0.0,1.0,1.0,25000.0
1,0.0,1.0,0.0,1.0,0.0,3.0,48000.0
2,0.0,0.0,1.0,1.0,0.0,5.0,71000.0
3,1.0,0.0,0.0,1.0,0.0,6.0,85000.0
4,0.0,0.0,1.0,0.0,1.0,9.0,90000.0
5,1.0,0.0,0.0,0.0,1.0,4.8,55000.0


In [14]:
# Rename the columns as per your choice.
encoded = pd.DataFrame(ct.fit_transform(df),columns=['City_Bengaluru','City_Delhi','City_Hyd','Gender_Male','Gender_Female','Exp','Salary'])

In [15]:
encoded

Unnamed: 0,City_Bengaluru,City_Delhi,City_Hyd,Gender_Male,Gender_Female,Exp,Salary
0,1.0,0.0,0.0,0.0,1.0,1.0,25000.0
1,0.0,1.0,0.0,1.0,0.0,3.0,48000.0
2,0.0,0.0,1.0,1.0,0.0,5.0,71000.0
3,1.0,0.0,0.0,1.0,0.0,6.0,85000.0
4,0.0,0.0,1.0,0.0,1.0,9.0,90000.0
5,1.0,0.0,0.0,0.0,1.0,4.8,55000.0


In [16]:
# Original Data Set
df

Unnamed: 0,Salary,City,Gender,Exp
0,25000,Bengaluru,Male,1.0
1,48000,Delhi,Female,3.0
2,71000,Hyderabad,Female,5.0
3,85000,Bengaluru,Female,6.0
4,90000,Hyderabad,Male,9.0
5,55000,Bengaluru,Male,


# get_dummies

* One hot Encoding and get_dummies almost equal. Major difference is if you want to reduce (drop_first = True) the column size of the dataset you can use get_dummies
* OHE does not add variable names to your dataframe, but get_dummies and variable names.
* Sometimes having more columns might overfit the model.

In [18]:
df1 = pd.get_dummies(df[['City','Gender']])
df1

Unnamed: 0,City_Bengaluru,City_Delhi,City_Hyderabad,Gender_Female,Gender_Male
0,1,0,0,0,1
1,0,1,0,1,0
2,0,0,1,1,0
3,1,0,0,1,0
4,0,0,1,0,1
5,1,0,0,0,1


In [19]:
df1 = pd.get_dummies(df[['City','Gender']], drop_first=True)
df1

Unnamed: 0,City_Delhi,City_Hyderabad,Gender_Male
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,0
4,0,1,1
5,0,0,1


# Ordinal Encoder

In [20]:
from sklearn.preprocessing import OrdinalEncoder

In [21]:
import pandas as pd

Employee = pd.DataFrame({'Position':['SE','Manager','Team Lead','SSE'],
                        'Project':['A','B','C','D'],
                        'Salary':[25000,85000,71000,48000]})
Employee

Unnamed: 0,Position,Project,Salary
0,SE,A,25000
1,Manager,B,85000
2,Team Lead,C,71000
3,SSE,D,48000


In [22]:
ord_enc = OrdinalEncoder(categories=[['SE','SSE','Team Lead','Manager'],['A','B','C','D']])
Encoded_df = ord_enc.fit_transform(Employee[['Position','Project']])

In [23]:
Encoded_df

array([[0., 0.],
       [3., 1.],
       [2., 2.],
       [1., 3.]])

# Binary Encoder

In [29]:
import pandas as pd

df = pd.DataFrame({'Cat_data': ['A','B','C','D','E','F','G','H','I','J','K','B','D','Z','F','C','Z']})
df

Unnamed: 0,Cat_data
0,A
1,B
2,C
3,D
4,E
5,F
6,G
7,H
8,I
9,J


In [25]:
!pip install category_encoders



In [30]:
from category_encoders import BinaryEncoder
from sklearn.preprocessing import OneHotEncoder

In [31]:
bi_enc = BinaryEncoder()

### 1. Count the number of categories (except duplicate)

### 2. A-1, B-2, C-3 and so on.

### 3. Open calculater in programmer mode and click number 1 and check BINary, 2 for B and check the binary

In [32]:
df_bi = bi_enc.fit_transform(df)
df_bi

Unnamed: 0,Cat_data_0,Cat_data_1,Cat_data_2,Cat_data_3
0,0,0,0,1
1,0,0,1,0
2,0,0,1,1
3,0,1,0,0
4,0,1,0,1
5,0,1,1,0
6,0,1,1,1
7,1,0,0,0
8,1,0,0,1
9,1,0,1,0


# Comparing with OneHotEncoder

In [33]:
ohe = OneHotEncoder(sparse=False)
ohe.fit_transform(df[['Cat_data']])

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

# Knn Imputer

In [34]:
df = pd.DataFrame({'Salary': [25000, 48000, 71000, 85000, 90000, 55000], 
                   'City' : ['Bengaluru','Delhi','Hyderabad','Bengaluru','Hyderabad','Bengaluru'],
                  'Gender': ['Male','Female','Female','Female','Male','Male'],
                  'Exp':[1,3,5,6,9,None]})
df

Unnamed: 0,Salary,City,Gender,Exp
0,25000,Bengaluru,Male,1.0
1,48000,Delhi,Female,3.0
2,71000,Hyderabad,Female,5.0
3,85000,Bengaluru,Female,6.0
4,90000,Hyderabad,Male,9.0
5,55000,Bengaluru,Male,


In [35]:
# Knn imputer will try to find the relatio with other columns and impute the data according the relation with other columns.
# In this case Age Nan is depending on the similarity with Fare columns
from sklearn.impute import KNNImputer                

In [36]:
knnimp = KNNImputer(n_neighbors=4)
knn_imp = pd.DataFrame(knnimp.fit_transform(df[['Salary','Exp']]))

knn_imp

Unnamed: 0,0,1
0,25000.0,1.0
1,48000.0,3.0
2,71000.0,5.0
3,85000.0,6.0
4,90000.0,9.0
5,55000.0,3.75


# Iterative Imputer

### This method treat other columns (which doesnot have nulls as feature and train on them and treat Null column as label. Finally it will predict the NaN data and impute. Its just like regression problem. Here Null column is label.


In [38]:

# Before using Iterative Imputer, we need to enable it using below code
from sklearn.experimental import enable_iterative_imputer

# import Iterative Imputer
from sklearn.impute import IterativeImputer

In [39]:
df = pd.DataFrame({'Salary': [25000, 48000, 71000, 85000, 90000, 55000], 
                   'City' : ['Bengaluru','Delhi','Hyderabad','Bengaluru','Hyderabad','Bengaluru'],
                  'Gender': ['Male','Female','Female','Female','Male','Male'],
                  'Exp':[1,3,5,6,9,None]})
df

Unnamed: 0,Salary,City,Gender,Exp
0,25000,Bengaluru,Male,1.0
1,48000,Delhi,Female,3.0
2,71000,Hyderabad,Female,5.0
3,85000,Bengaluru,Female,6.0
4,90000,Hyderabad,Male,9.0
5,55000,Bengaluru,Male,


In [40]:
iter_impute = IterativeImputer()
ite_imp = pd.DataFrame(iter_impute.fit_transform(df[['Salary','Exp']]), columns=['Salary','Exp'])
ite_imp

Unnamed: 0,Salary,Exp
0,25000.0,1.0
1,48000.0,3.0
2,71000.0,5.0
3,85000.0,6.0
4,90000.0,9.0
5,55000.0,3.864759
