### Encoding (categorical variables)

In [1]:
# Core data-handling and plotting libraries used by the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by pandas / scikit-learn; consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the travel dataset; the path is resolved relative to the notebook's
# working directory.
data = pd.read_csv(filepath_or_buffer='Travel.csv')


In [3]:
# Names of all object-dtype columns; these are the features treated as
# categorical in the rest of the notebook.
cat_col = [name for name, dtype in data.dtypes.items() if dtype == 'O']
cat_col


['TypeofContact',
 'Occupation',
 'Gender',
 'ProductPitched',
 'MaritalStatus',
 'Designation']

In [4]:
# Keep only the categorical columns as the working feature frame.
X = data.loc[:, cat_col]
X


Unnamed: 0,TypeofContact,Occupation,Gender,ProductPitched,MaritalStatus,Designation
0,Self Enquiry,Salaried,Female,Deluxe,Single,Manager
1,Company Invited,Salaried,Male,Deluxe,Divorced,Manager
2,Self Enquiry,Free Lancer,Male,Basic,Single,Executive
3,Company Invited,Salaried,Female,Basic,Divorced,Executive
4,Self Enquiry,Small Business,Male,Basic,Divorced,Executive
...,...,...,...,...,...,...
4883,Self Enquiry,Small Business,Male,Deluxe,Unmarried,Manager
4884,Company Invited,Salaried,Male,Basic,Single,Executive
4885,Self Enquiry,Salaried,Female,Standard,Married,Senior Manager
4886,Self Enquiry,Small Business,Male,Basic,Single,Executive


In [5]:
# Number of missing values in each categorical column.
X.isna().sum()

TypeofContact     25
Occupation         0
Gender             0
ProductPitched     0
MaritalStatus      0
Designation        0
dtype: int64

In [6]:
# Impute missing entries with the mode (most frequent value) of their own
# column. `mode()` returns a frame, so its first row supplies one fill value
# per column.
X = X.fillna(value=X.mode().iloc[0])


In [7]:
# Re-check the missing-value counts after imputation.
X.isna().sum()


TypeofContact     0
Occupation        0
Gender            0
ProductPitched    0
MaritalStatus     0
Designation       0
dtype: int64

#### Method 1: Creating Binary variables through One Hot Encoding

In [8]:
# One-hot encoding with pandas.
# drop_first=True drops the first dummy column of every feature, so a feature
# with k categories is represented by k-1 binary columns.
X_encoded = pd.get_dummies(data=X, drop_first=True)

In [9]:
# One-hot encoding with scikit-learn.
from sklearn.preprocessing import OneHotEncoder

# drop='first' removes the first category of each feature. A dense array is
# requested instead of a sparse matrix.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the current keyword first and fall back to the
# legacy one on older releases (which reject the new keyword with TypeError).
try:
    encoder = OneHotEncoder(categories='auto', drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(categories='auto', drop='first', sparse=False)

In [10]:
# Learn the categories of every column. `fit` returns the encoder itself, so
# the result is deliberately NOT assigned to `X_encoded`: that name holds the
# one-hot DataFrame produced by `pd.get_dummies` and must not be replaced by
# an encoder object. As the last expression, the call still displays the
# fitted encoder.
encoder.fit(X)


OneHotEncoder(drop='first', sparse=False)

In [11]:
# Apply the fitted encoder to obtain the one-hot indicator columns as an array.
X_transformed = encoder.transform(X=X)
X_transformed


array([[1., 0., 1., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.]])

#### Method 2: Replacing Categories with Ordinal Numbers

In [12]:
 # Using sklearn
from sklearn.preprocessing import OrdinalEncoder

# Fit one ordinal mapping per column (`fit` returns the encoder), then replace
# each category by its numeric code.
enc = OrdinalEncoder().fit(X)
X_trans = enc.transform(X)
X_trans

array([[1., 2., 1., 1., 2., 2.],
       [0., 2., 2., 1., 0., 2.],
       [1., 0., 2., 0., 2., 1.],
       ...,
       [1., 2., 1., 3., 1., 3.],
       [1., 3., 2., 0., 2., 1.],
       [1., 2., 2., 0., 3., 1.]])

#### Method 3: Label Encoding

In [13]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder handles a single 1-D sequence of labels, so it is fitted on one
# column only.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# confirm it is the intended tool for an input feature.
le = LabelEncoder()
le.fit(y=X['TypeofContact'])


LabelEncoder()

In [14]:
# Categories learned by the encoder; a label's position in this list is the
# integer code it is mapped to.
[label for label in le.classes_]


['Company Invited', 'Self Enquiry']

In [15]:
# Encode the column as integer codes using the fitted label mapping.
le.transform(y=X['TypeofContact'])


array([1, 0, 1, ..., 1, 1, 1])