# **Encoding**

**Convert** categorical data **into numerical data** using **one-hot or label encoding techniques**.

## **One-hot Encoding**
> It is used when **limited amount of data** is present

In [1]:
import pandas as pd

In [11]:
Dataset = pd.read_csv("AmesHousing.csv")
Dataset

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2926,923275080,80,RL,37.0,7937,Pave,,IR1,Lvl,...,0,,GdPrv,,0,3,2006,WD,Normal,142500
2926,2927,923276100,20,RL,,8885,Pave,,IR1,Low,...,0,,MnPrv,,0,6,2006,WD,Normal,131000
2927,2928,923400125,85,RL,62.0,10441,Pave,,Reg,Lvl,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,132000
2928,2929,924100070,20,RL,77.0,10010,Pave,,Reg,Lvl,...,0,,,,0,4,2006,WD,Normal,170000


### 📝 Steps  
1. **Check & remove null values** → drop missing rows/cols if any.  
2. **Encode categorical data** → use Label/One-Hot/Frequency encoding as needed.  

In [3]:
Dataset.select_dtypes(include='object')

Unnamed: 0,MS Zoning,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,...,Garage Type,Garage Finish,Garage Qual,Garage Cond,Paved Drive,Pool QC,Fence,Misc Feature,Sale Type,Sale Condition
0,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,...,Attchd,Fin,TA,TA,P,,,,WD,Normal
1,RH,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
3,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
4,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Mitchel,Norm,...,Detchd,Unf,TA,TA,Y,,GdPrv,,WD,Normal
2926,RL,Pave,,IR1,Low,AllPub,Inside,Mod,Mitchel,Norm,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
2927,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,...,,,,,Y,,MnPrv,Shed,WD,Normal
2928,RL,Pave,,Reg,Lvl,AllPub,Inside,Mod,Mitchel,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [4]:
# check null values
Dataset.isnull().sum()

Order               0
PID                 0
MS SubClass         0
MS Zoning           0
Lot Frontage      490
                 ... 
Mo Sold             0
Yr Sold             0
Sale Type           0
Sale Condition      0
SalePrice           0
Length: 82, dtype: int64

In [5]:
Dataset['Fence'].isnull().sum()

np.int64(2358)

In [12]:
Dataset['Fence'].fillna(Dataset['Fence'].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Dataset['Fence'].fillna(Dataset['Fence'].mode()[0],inplace=True)


In [13]:
Dataset['Garage Type'].fillna(Dataset['Garage Type'].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Dataset['Garage Type'].fillna(Dataset['Garage Type'].mode()[0],inplace=True)


## **1. Using Pandas get_dummies()**

In [14]:
en_data = Dataset[['Garage Type','Fence']]
en_data

Unnamed: 0,Garage Type,Fence
0,Attchd,MnPrv
1,Attchd,MnPrv
2,Attchd,MnPrv
3,Attchd,MnPrv
4,Attchd,MnPrv
...,...,...
2925,Detchd,GdPrv
2926,Attchd,MnPrv
2927,Attchd,MnPrv
2928,Attchd,MnPrv


In [12]:
pd.get_dummies?

[1;31mSignature:[0m
[0mpd[0m[1;33m.[0m[0mget_dummies[0m[1;33m([0m[1;33m
[0m    [0mdata[0m[1;33m,[0m[1;33m
[0m    [0mprefix[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mprefix_sep[0m[1;33m:[0m [1;34m'str | Iterable[str] | dict[str, str]'[0m [1;33m=[0m [1;34m'_'[0m[1;33m,[0m[1;33m
[0m    [0mdummy_na[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mcolumns[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msparse[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mdrop_first[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mdtype[0m[1;33m:[0m [1;34m'NpDtype | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m [1;33m->[0m [1;34m'DataFrame'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Convert categorical variable into dummy/indicator variables.

Each variable is

In [15]:
pd.get_dummies(en_data)

Unnamed: 0,Garage Type_2Types,Garage Type_Attchd,Garage Type_Basment,Garage Type_BuiltIn,Garage Type_CarPort,Garage Type_Detchd,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw
0,False,True,False,False,False,False,False,False,True,False
1,False,True,False,False,False,False,False,False,True,False
2,False,True,False,False,False,False,False,False,True,False
3,False,True,False,False,False,False,False,False,True,False
4,False,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
2925,False,False,False,False,False,True,True,False,False,False
2926,False,True,False,False,False,False,False,False,True,False
2927,False,True,False,False,False,False,False,False,True,False
2928,False,True,False,False,False,False,False,False,True,False


In [16]:
pd.get_dummies(en_data, dtype = int)

Unnamed: 0,Garage Type_2Types,Garage Type_Attchd,Garage Type_Basment,Garage Type_BuiltIn,Garage Type_CarPort,Garage Type_Detchd,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw
0,0,1,0,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
2925,0,0,0,0,0,1,1,0,0,0
2926,0,1,0,0,0,0,0,0,1,0
2927,0,1,0,0,0,0,0,0,1,0
2928,0,1,0,0,0,0,0,0,1,0


In [24]:
pd.get_dummies(en_data, dtype = int).columns

Index(['Garage Type_2Types', 'Garage Type_Attchd', 'Garage Type_Basment',
       'Garage Type_BuiltIn', 'Garage Type_CarPort', 'Garage Type_Detchd',
       'Fence_GdPrv', 'Fence_GdWo', 'Fence_MnPrv', 'Fence_MnWw'],
      dtype='object')

## **2. Using Scikit-learn's OneHotEncoder**

In [17]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
ohe = OneHotEncoder()
ohe.fit_transform(en_data)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5860 stored elements and shape (2930, 10)>

#### **sparse matrix**
A sparse matrix is a type of matrix in which **the majority of its elements are zero**.

In [23]:
#convert into array
ar = ohe.fit_transform(en_data).toarray()
ar

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.]])

In [26]:
# convert into dataframe
pd.DataFrame(ar, columns = ['Garage Type_2Types', 'Garage Type_Attchd', 'Garage Type_Basment',
       'Garage Type_BuiltIn', 'Garage Type_CarPort', 'Garage Type_Detchd',
       'Fence_GdPrv', 'Fence_GdWo', 'Fence_MnPrv', 'Fence_MnWw'])

Unnamed: 0,Garage Type_2Types,Garage Type_Attchd,Garage Type_Basment,Garage Type_BuiltIn,Garage Type_CarPort,Garage Type_Detchd,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2925,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2926,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2927,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2928,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


**`drop='first'`** → removes one redundant column per feature to avoid multicollinearity and save space.

In [30]:
en_data2 = Dataset[['Lot Shape']]
en_data2

Unnamed: 0,Lot Shape
0,IR1
1,Reg
2,IR1
3,Reg
4,IR1
...,...
2925,IR1
2926,IR1
2927,Reg
2928,Reg


In [36]:
ohe2 = OneHotEncoder(drop='first')
ar2 = ohe2.fit_transform(en_data2).toarray()
ar2

array([[0., 0., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [37]:
pd.DataFrame(ar2, columns = [ 'Lot Shape_IR2', 'Lot Shape_IR3', 'Lot Shape_Reg'])

Unnamed: 0,Lot Shape_IR2,Lot Shape_IR3,Lot Shape_Reg
0,0.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,0.0,0.0
3,0.0,0.0,1.0
4,0.0,0.0,0.0
...,...,...,...
2925,0.0,0.0,0.0
2926,0.0,0.0,0.0
2927,0.0,0.0,1.0
2928,0.0,0.0,1.0
