## One-hot Encoding when we have multiple categorical variables

In [55]:
#importing the necessary libraries
import pandas as pd
import numpy as np

## Loading the house price prediction dataset, which has many categorical variables

In [56]:
# Load the house-price CSV into a DataFrame
csv_path = 'house_price_Categorical.csv'
data = pd.read_csv(csv_path)

In [57]:
# Keep only the columns whose dtype is object or category
categorical_dtypes = ['object', 'category']
data = data.select_dtypes(include=categorical_dtypes)


In [58]:
# Preview the first rows of the categorical-only frame
data.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [59]:
# Print how many distinct labels each categorical column holds
# (unique() keeps NaN, so a missing value counts as one label)
for col, column_values in data.items():
  print(col, ':', len(column_values.unique()), 'lables')

MSZoning : 5 lables
Street : 2 lables
Alley : 3 lables
LotShape : 4 lables
LandContour : 4 lables
Utilities : 2 lables
LotConfig : 5 lables
LandSlope : 3 lables
Neighborhood : 25 lables
Condition1 : 9 lables
Condition2 : 8 lables
BldgType : 5 lables
HouseStyle : 8 lables
RoofStyle : 6 lables
RoofMatl : 8 lables
Exterior1st : 15 lables
Exterior2nd : 16 lables
MasVnrType : 4 lables
ExterQual : 4 lables
ExterCond : 5 lables
Foundation : 6 lables
BsmtQual : 5 lables
BsmtCond : 5 lables
BsmtExposure : 5 lables
BsmtFinType1 : 7 lables
BsmtFinType2 : 7 lables
Heating : 6 lables
HeatingQC : 5 lables
CentralAir : 2 lables
Electrical : 6 lables
KitchenQual : 4 lables
Functional : 7 lables
FireplaceQu : 6 lables
GarageType : 7 lables
GarageFinish : 4 lables
GarageQual : 6 lables
GarageCond : 6 lables
PavedDrive : 3 lables
PoolQC : 4 lables
Fence : 5 lables
MiscFeature : 5 lables
SaleType : 9 lables
SaleCondition : 6 lables


In [60]:
# One-hot encode every column (dropping the first level of each) and show the resulting shape
encoded = pd.get_dummies(data, drop_first=True)
encoded.shape

(1460, 208)

These 208 generated columns may lead to the curse of dimensionality. To avoid this, we keep only the 10 most frequent categories of a variable, create dummy columns for them, and set every other category to 0.

In [61]:
# Ten most frequent Neighborhood categories, highest count first
neighborhood_counts = data['Neighborhood'].value_counts()
neighborhood_counts.sort_values(ascending=False).head(10)

Unnamed: 0_level_0,count
Neighborhood,Unnamed: 1_level_1
NAmes,225
CollgCr,150
OldTown,113
Edwards,100
Somerst,86
Gilbert,79
NridgHt,77
Sawyer,74
NWAmes,73
SawyerW,59


In [62]:
# Names of the ten most frequent Neighborhood categories, as a plain list
top_10 = list(
    data['Neighborhood']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
)
top_10

['NAmes',
 'CollgCr',
 'OldTown',
 'Edwards',
 'Somerst',
 'Gilbert',
 'NridgHt',
 'Sawyer',
 'NWAmes',
 'SawyerW']

Now, one-hot encoding is performed only for these top 10 categories; all other categories are set to 0.

In [63]:
# Add one 0/1 indicator column per frequent neighbourhood;
# rows from any other neighbourhood get 0 in all of them.
neighborhood = data['Neighborhood']
for label in top_10:
  data[label] = np.where(neighborhood == label, 1, 0)

data[['Neighborhood', *top_10]].head(20)

Unnamed: 0,Neighborhood,NAmes,CollgCr,OldTown,Edwards,Somerst,Gilbert,NridgHt,Sawyer,NWAmes,SawyerW
0,CollgCr,0,1,0,0,0,0,0,0,0,0
1,Veenker,0,0,0,0,0,0,0,0,0,0
2,CollgCr,0,1,0,0,0,0,0,0,0,0
3,Crawfor,0,0,0,0,0,0,0,0,0,0
4,NoRidge,0,0,0,0,0,0,0,0,0,0
5,Mitchel,0,0,0,0,0,0,0,0,0,0
6,Somerst,0,0,0,0,1,0,0,0,0,0
7,NWAmes,0,0,0,0,0,0,0,0,1,0
8,OldTown,0,0,1,0,0,0,0,0,0,0
9,BrkSide,0,0,0,0,0,0,0,0,0,0


## A generic function that one-hot encodes only a given list of labels (e.g. the 10 most frequent) of a categorical variable

In [64]:
def one_hot_encoding(df, variable, top_x_labels):
  """Add one 0/1 indicator column to ``df`` for each label in ``top_x_labels``.

  For every label a new column named ``<variable>_<label>`` is written into
  ``df`` in place: 1 where ``df[variable]`` equals the label, 0 otherwise.
  Rows whose value is not among the given labels get 0 in every new column.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to modify in place; must contain the column ``variable``.
  variable : str
      Name of the categorical column to encode.
  top_x_labels : iterable
      Labels to create indicator columns for.

  Returns
  -------
  None
      The new columns are added to ``df`` itself.
  """
  # Read from the frame that was passed in rather than a notebook-level
  # global, so the function works on any DataFrame.
  source_column = df[variable]
  for label in top_x_labels:
    # f-string formatting also copes with non-string labels (e.g. numbers).
    df[f'{variable}_{label}'] = np.where(source_column == label, 1, 0)

# Reload only the Neighborhood column so the frame starts without indicator columns
data = pd.read_csv('house_price_Categorical.csv', usecols=['Neighborhood'])

# Try the helper on Neighborhood with the top-10 labels computed earlier
one_hot_encoding(df=data, variable='Neighborhood', top_x_labels=top_10)
data.head()


Unnamed: 0,Neighborhood,Neighborhood_NAmes,Neighborhood_CollgCr,Neighborhood_OldTown,Neighborhood_Edwards,Neighborhood_Somerst,Neighborhood_Gilbert,Neighborhood_NridgHt,Neighborhood_Sawyer,Neighborhood_NWAmes,Neighborhood_SawyerW
0,CollgCr,0,1,0,0,0,0,0,0,0,0
1,Veenker,0,0,0,0,0,0,0,0,0,0
2,CollgCr,0,1,0,0,0,0,0,0,0,0
3,Crawfor,0,0,0,0,0,0,0,0,0,0
4,NoRidge,0,0,0,0,0,0,0,0,0,0


In [65]:
# Repeat the experiment for the Exterior2nd variable
data = pd.read_csv('house_price_Categorical.csv', usecols=['Exterior2nd'])

exterior_counts = data['Exterior2nd'].value_counts().sort_values(ascending=False)
top_10 = list(exterior_counts.head(10).index)
one_hot_encoding(data, 'Exterior2nd', top_10)
data.head(20)

Unnamed: 0,Exterior2nd,Exterior2nd_VinylSd,Exterior2nd_MetalSd,Exterior2nd_HdBoard,Exterior2nd_Wd Sdng,Exterior2nd_Plywood,Exterior2nd_CmentBd,Exterior2nd_Wd Shng,Exterior2nd_Stucco,Exterior2nd_BrkFace,Exterior2nd_AsbShng
0,VinylSd,1,0,0,0,0,0,0,0,0,0
1,MetalSd,0,1,0,0,0,0,0,0,0,0
2,VinylSd,1,0,0,0,0,0,0,0,0,0
3,Wd Shng,0,0,0,0,0,0,1,0,0,0
4,VinylSd,1,0,0,0,0,0,0,0,0,0
5,VinylSd,1,0,0,0,0,0,0,0,0,0
6,VinylSd,1,0,0,0,0,0,0,0,0,0
7,HdBoard,0,0,1,0,0,0,0,0,0,0
8,Wd Shng,0,0,0,0,0,0,1,0,0,0
9,MetalSd,0,1,0,0,0,0,0,0,0,0
