# Categorical Variable Encoding

## 1. Dummy Variables and One-Hot Encoding

### 1.1 Dummy Variables using Pandas

In [1]:
import pandas as pd
import seaborn as sns #for loading dataset

In [2]:
tips_df = sns.load_dataset(name='tips')
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# creating k dummy variables w.r.t the no. of categories present in a variable.
dummy_df = pd.get_dummies(tips_df)
dummy_df

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,2,0,1,0,1,0,0,0,1,0,1
1,10.34,1.66,3,1,0,0,1,0,0,0,1,0,1
2,21.01,3.50,3,1,0,0,1,0,0,0,1,0,1
3,23.68,3.31,2,1,0,0,1,0,0,0,1,0,1
4,24.59,3.61,4,0,1,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,1,0,0,1,0,0,1,0,0,1
240,27.18,2.00,2,0,1,1,0,0,0,1,0,0,1
241,22.67,2.00,2,1,0,1,0,0,0,1,0,0,1
242,17.82,1.75,2,1,0,0,1,0,0,1,0,0,1


Here we can see that sex_Female is sufficient enough to recognize the sex of the individual, as 1 means Female and 0 means Male.
Hence, there is no requirement to create another variable as sex_Male. Inorder to avoid this we set the parameter 'drop_first'= True. Which leads to k-1 dummy variables instead of k.

In [4]:
dummy_df = pd.get_dummies(tips_df, drop_first=True)
dummy_df

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,1,1,0,0,1,1
1,10.34,1.66,3,0,1,0,0,1,1
2,21.01,3.50,3,0,1,0,0,1,1
3,23.68,3.31,2,0,1,0,0,1,1
4,24.59,3.61,4,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,0,1,0,1,0,1
240,27.18,2.00,2,1,0,0,1,0,1
241,22.67,2.00,2,0,0,0,1,0,1
242,17.82,1.75,2,0,1,0,1,0,1


### 1.2 One Hot Encoding using Scikit-Learn

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [6]:
oh_enc = OneHotEncoder(sparse=False,drop='first') 
# to get numpy 2D array and avoid sparse matrix set 'sparse'=False
# to get k-1 dummy variables set 'drop', else gives K dummy variables

In [7]:
oh_enc_arr = oh_enc.fit_transform(tips_df[['sex','smoker','day','time']])
oh_enc_arr

array([[0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       ...,
       [1., 1., 1., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.]])

In [8]:
oh_enc_df=pd.DataFrame(oh_enc_arr, columns =['sex_Male','smoker_Yes',
                                             'day_Sat','day_Sun',
                                    'day_Thur','time_Lunch'])
oh_enc_df

Unnamed: 0,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...
239,1.0,0.0,1.0,0.0,0.0,0.0
240,0.0,1.0,1.0,0.0,0.0,0.0
241,1.0,1.0,1.0,0.0,0.0,0.0
242,1.0,0.0,1.0,0.0,0.0,0.0


## 2. Label Encoding and Ordinal Encoding

1. Label Encoding can be applied on ordinal and nominal categorical variables.
2. Ordinal Encoding can be applied only on ordinal categorical variables.

In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [10]:
df = pd.read_csv(r'Datasets\Kaggle\train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [11]:
df2 = df[['KitchenQual','BldgType']]
df2

Unnamed: 0,KitchenQual,BldgType
0,Gd,1Fam
1,TA,1Fam
2,Gd,1Fam
3,Gd,1Fam
4,Gd,1Fam
...,...,...
1455,TA,1Fam
1456,TA,1Fam
1457,Gd,1Fam
1458,Gd,1Fam


### 2.1. Label Encoding

In [12]:
le = LabelEncoder()

In [13]:
le.fit_transform(df2['BldgType'])

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
df2['BldgType_LEnc'] = le.fit_transform(df2['BldgType'])
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['BldgType_LEnc'] = le.fit_transform(df2['BldgType'])


Unnamed: 0,KitchenQual,BldgType,BldgType_LEnc
0,Gd,1Fam,0
1,TA,1Fam,0
2,Gd,1Fam,0
3,Gd,1Fam,0
4,Gd,1Fam,0
...,...,...,...
1455,TA,1Fam,0
1456,TA,1Fam,0
1457,Gd,1Fam,0
1458,Gd,1Fam,0


In [15]:
df2['BldgType_LEnc'].value_counts()

0    1220
4     114
2      52
3      43
1      31
Name: BldgType_LEnc, dtype: int64

### 2.2. Ordinal Encoding

In [16]:
df2['KitchenQual'].value_counts()

TA    735
Gd    586
Ex    100
Fa     39
Name: KitchenQual, dtype: int64

In [17]:
order_labels= {'Ex':4,'Gd':3,'Fa':2,'TA':1}

In [18]:
df2['KitchenQual_OEnc'] = df2['KitchenQual'].map(order_labels) 
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['KitchenQual_OEnc'] = df2['KitchenQual'].map(order_labels)


Unnamed: 0,KitchenQual,BldgType,BldgType_LEnc,KitchenQual_OEnc
0,Gd,1Fam,0,3
1,TA,1Fam,0,1
2,Gd,1Fam,0,3
3,Gd,1Fam,0,3
4,Gd,1Fam,0,3
...,...,...,...,...
1455,TA,1Fam,0,1
1456,TA,1Fam,0,1
1457,Gd,1Fam,0,3
1458,Gd,1Fam,0,3


In [19]:
df2['KitchenQual_OEnc'].value_counts()

1    735
3    586
4    100
2     39
Name: KitchenQual_OEnc, dtype: int64