In [1]:
# import library
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Install a global "ignore" filter so no warnings are shown from here on.
# NOTE(review): this presumably also hides deprecation notices from pandas /
# scikit-learn — consider narrowing the filter to a specific category.
warnings.filterwarnings('ignore')

### Encoding a Simple Dataset

In [29]:
# Toy frame: a single categorical column of size labels (11 rows) used to
# demonstrate ordinal encoding.
df = pd.Series(
    ['S', 'M', 'L', 'XL', 'S', 'M', 'L', 'S', 'S', 'L', 'M'],
    name='Size',
).to_frame()
df

Unnamed: 0,Size
0,S
1,M
2,L
3,XL
4,S
5,M
6,L
7,S
8,S
9,L


### Scikit-learn Ordinal Encoding

In [30]:
# Explicit category order for the encoder: a label's position in the inner
# list is the number it is encoded to (S -> 0, M -> 1, L -> 2, XL -> 3).
# The outer list holds one such ordering per encoded column.
size_order = ['S', 'M', 'L', 'XL']
ord_data = [size_order]

In [31]:
from sklearn.preprocessing import OrdinalEncoder

In [34]:
# Build the encoder with the explicit size order and fit it on the Size column.
# fit() returns the fitted encoder itself, so chaining keeps `oe` bound to it;
# the trailing expression shows the estimator summary as the cell output.
oe = OrdinalEncoder(categories=ord_data).fit(df[["Size"]])
oe

0,1,2
,categories,"[['S', 'M', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [35]:
# Preview the encoded values: one float per input row, returned as a
# single-column 2-D array.
oe.transform(df.loc[:, ['Size']])

array([[0.],
       [1.],
       [2.],
       [3.],
       [0.],
       [1.],
       [2.],
       [0.],
       [0.],
       [2.],
       [1.]])

In [36]:
# Store the ordinal codes next to the original labels. The encoder returns an
# (n, 1) array; ravel() flattens it to one value per row before assignment.
size_codes = oe.transform(df[["Size"]])
df["Size_en"] = size_codes.ravel()

In [37]:
# Display the frame to check the new Size_en column against the original labels.
df

Unnamed: 0,Size,Size_en
0,S,0.0
1,M,1.0
2,L,2.0
3,XL,3.0
4,S,0.0
5,M,1.0
6,L,2.0
7,S,0.0
8,S,0.0
9,L,2.0


### Map Function Encoding 

In [38]:
# Label -> integer code lookup used for dictionary-based encoding; the codes
# follow the position of each label in the ordered list below.
ord_data1 = {label: code for code, label in enumerate(["S", "M", "L", "XL"])}

In [41]:
# Same encoding without scikit-learn: look each label up in the ord_data1 dict.
# Series.map yields NaN for any label that is missing from the dict.
size_labels = df["Size"]
df['Size_en_map'] = size_labels.map(ord_data1)

In [42]:
# Display the frame again: Size_en_map (integers from the dict) sits beside
# Size_en (floats from the encoder).
df

Unnamed: 0,Size,Size_en,Size_en_map
0,S,0.0,0
1,M,1.0,1
2,L,2.0,2
3,XL,3.0,3
4,S,0.0,0
5,M,1.0,1
6,L,2.0,2
7,S,0.0,0
8,S,0.0,0
9,L,2.0,2


In [43]:
# Second toy frame for dictionary-based (map) encoding: one column of class labels.
class_names = ['seven', 'Eight', 'Nine', 'Ten']
df1 = pd.DataFrame(class_names, columns=['Class'])
df1

Unnamed: 0,Class
0,seven
1,Eight
2,Nine
3,Ten


In [46]:
# Label -> code lookup for the Class column. Despite the `df` prefix this is a
# plain dict, not a DataFrame.
df2 = dict(zip(['seven', "Eight", "Nine", "Ten"], range(4)))

In [48]:
# Encode the Class labels by looking them up in the df2 dict and store the
# result in a new column.
class_labels = df1['Class']
df1['Clss_en'] = class_labels.map(df2)

In [49]:
# Display df1 with the mapped Clss_en column next to the original labels.
df1

Unnamed: 0,Class,Clss_en
0,seven,0
1,Eight,1
2,Nine,2
3,Ten,3


### Data Set

In [108]:
# Read the loan dataset from loan.csv (path is relative to the working
# directory) and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer='loan.csv')
dataset.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [109]:
# Count the missing values in each column.
dataset.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [110]:
# List the distinct Property_Area labels to decide on an encoding order.
dataset.loc[:, 'Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [111]:
# Category order for Property_Area; a label's position in the inner list
# becomes its encoded value (Rural -> 0, Semiurban -> 1, Urban -> 2).
# NOTE(review): this rebinds `df1`, which an earlier cell used for a DataFrame;
# a dedicated name would avoid clobbering it (the encoder cell reads `df1` too).
area_order = ['Rural', 'Semiurban', 'Urban']
df1 = [area_order]

In [112]:
from sklearn.preprocessing import OrdinalEncoder

In [113]:
# Encode Property_Area with scikit-learn, using the category order held in df1.
# This rebinds `oe`, replacing the encoder fitted on the Size column earlier.
# fit() returns the fitted encoder, so the trailing expression shows its summary.
oe = OrdinalEncoder(categories=df1).fit(dataset[['Property_Area']])
oe

0,1,2
,categories,"[['Rural', 'Semiurban', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [114]:
# Replace the text labels in Property_Area with their ordinal codes.
# The encoder was fitted on the text labels and is configured with
# handle_unknown='error', so running this cell a second time would feed it the
# already-encoded floats and raise. Guarding on the column dtype makes a re-run
# a no-op while the first run behaves exactly as before.
if not pd.api.types.is_numeric_dtype(dataset['Property_Area']):
    dataset['Property_Area'] = oe.transform(dataset[['Property_Area']])

In [115]:
# Display the frame; Property_Area now holds the numeric codes instead of the
# text labels.
dataset

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,2.0,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,0.0,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,2.0,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,2.0,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,2.0,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,0.0,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,0.0,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,2.0,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,2.0,Y
