In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
pd.options.mode.chained_assignment = None 

In [2]:
data = pd.read_csv("titanic.csv")
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


# Nominal Encoding

In [3]:
new_data = data[['Survived', 'Cabin']]
new_data.head(3)

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,


In [4]:
new_data.loc[:,'Cabin'].fillna('Missing', inplace=True)

In [5]:
new_data.head(3)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing


### 1. One Hot Encoding

In [6]:
len(new_data['Cabin'].unique())

148

#### One hot encoder from sklearn

In [13]:
ohe = OneHotEncoder(sparse=False)
one_hot_encoded = ohe.fit_transform(new_data[['Cabin']])
one_hot_encoded

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

#### get dummy variable from pandas

In [37]:
one_hot_encoded_data = pd.get_dummies(new_data['Cabin'])

In [40]:
data = new_data.join(one_hot_encoded_data, how='left').drop(columns=['Cabin'])

In [41]:
data.head()

Unnamed: 0,Survived,A10,A14,A16,A19,A20,A23,A24,A26,A31,...,F E69,F G63,F G73,F2,F33,F38,F4,G6,Missing,T
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### 2.One Hot Encoding with many variables

In [184]:
car_data = pd.read_csv("Mercedes-Benz-train.csv")
car_data.head(4)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0


In [185]:
new_car_data = car_data[['X0', 'X1', 'X2', 'X3', 'X4']]
new_car_data.shape

(4209, 5)

In [186]:
top_10 = new_car_data['X0'].value_counts().sort_values(ascending=False).head(10).index

In [187]:
top_10 = list(top_10)
top_10

['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']

In [188]:
for cat in top_10:
    new_car_data[cat] = np.where(new_car_data.loc[:,'X0']==cat, 1, 0)

In [189]:
top_10.append('X0')
new_car_data[top_10].head()

Unnamed: 0,z,ak,y,ay,t,x,o,f,n,w,X0
0,0,0,0,0,0,0,0,0,0,0,k
1,0,0,0,0,0,0,0,0,0,0,k
2,0,0,0,0,0,0,0,0,0,0,az
3,0,0,0,0,0,0,0,0,0,0,az
4,0,0,0,0,0,0,0,0,0,0,az


### 3.Mean Encoding

In [190]:
new_data.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [191]:
cabin_map_dict = new_data.groupby(['Cabin'])['Survived'].mean().to_dict()

In [192]:
new_data['Cabin'] = new_data['Cabin'].map(cabin_map_dict)

In [193]:
new_data.head()

Unnamed: 0,Survived,Cabin
0,0,0.299854
1,1,1.0
2,1,0.299854
3,1,0.5
4,0,0.299854


### 4.Count / Frequency Encoding

In [194]:
new_data = data[['Survived', 'Cabin']]
new_data['Cabin'].fillna('Missing', inplace=True)
new_data.head(3)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing


In [195]:
freq_dict = new_data['Cabin'].value_counts().to_dict()

In [196]:
new_data['Cabin'] = new_data['Cabin'].map(freq_dict)

In [197]:
new_data.head()

Unnamed: 0,Survived,Cabin
0,0,687
1,1,1
2,1,687
3,1,2
4,0,687


##### Advantages
 
    Easy To Use
    Not increasing feature space
##### Disadvantages
    It will provide same weight if the frequencies are same


# Ordinal Encoding

### 1.Ordinal Encoding

In [208]:
data = pd.read_csv("titanic.csv")
data = data[['Survived', 'Sex', 'Cabin']]

data['Cabin'].fillna('Missing', inplace=True)
data['Cabin'] = data['Cabin'].astype(str).str[0]

data.head()

Unnamed: 0,Survived,Sex,Cabin
0,0,male,M
1,1,female,C
2,1,female,M
3,1,female,C
4,0,male,M


In [209]:
le = LabelEncoder()
data['Sex'] = le.fit_transform(data['Sex'])

In [210]:
data.head()

Unnamed: 0,Survived,Sex,Cabin
0,0,1,M
1,1,0,C
2,1,0,M
3,1,0,C
4,0,1,M


### 2. Target Guided Ordinal Encoding

In [215]:
grp_labels = data.groupby(['Cabin'])['Survived'].mean().sort_values().index

In [216]:
guided_dict = {k:i for i, k in enumerate(grp_labels,0)}

In [217]:
guided_dict

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [218]:
data['Cabin'] = data['Cabin'].map(guided_dict)

In [220]:
data.head()

Unnamed: 0,Survived,Sex,Cabin
0,0,1,1
1,1,0,4
2,1,0,1
3,1,0,4
4,0,1,1


### 3. Ordinal Encoding

In [47]:
data = pd.DataFrame(['low', 'medium', 'high', 'medium', 'low', 'high'],columns=['status'])

In [224]:
data

Unnamed: 0,status
0,low
1,medium
2,high
3,medium
4,low
5,high


In [226]:
od = OrdinalEncoder()
data['n_status'] = od.fit_transform(data)

In [227]:
data

Unnamed: 0,status,n_status
0,low,1.0
1,medium,2.0
2,high,0.0
3,medium,2.0
4,low,1.0
5,high,0.0


In [48]:
data

Unnamed: 0,status
0,low
1,medium
2,high
3,medium
4,low
5,high


In [52]:
len(data['status'].unique())

3