# Types of categorical data

## a. Nominal:
### When categories have no inherent order, e.g. State, Engineering branch

## b. Ordinal:
### When categories have an inherent order (ranking), e.g. your grade in school

# Types of categorical encoding:

## a. Ordinal Encoding - for Ordinal data

## b. One Hot Encoding - for Nominal data

## c. Label Encoding - for Output variables only
### If the output/predicted/Y variable of the dataset is a category, we use label encoding on it (not on the input features)

## Ordinal Encoding

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('Datasets/Customer/customer.csv')

In [3]:
df.sample(5)

Unnamed: 0,age,gender,review,education,purchased
18,19,Male,Good,School,No
45,61,Male,Poor,PG,Yes
27,69,Female,Poor,PG,No
22,18,Female,Poor,PG,Yes
3,72,Female,Good,PG,No


In [4]:
df = df.iloc[:, 2:]

In [5]:
df.sample(5)

Unnamed: 0,review,education,purchased
3,Good,PG,No
32,Average,UG,Yes
47,Good,PG,Yes
31,Poor,School,Yes
5,Average,School,Yes


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Features: the first two columns (review, education); target: the last column (purchased).
# NOTE: no random_state is passed, so the 80/20 split (and every output below) changes on each run.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 0:2], df.iloc[:, -1], test_size = 0.2)

In [18]:
X_train.shape, X_test.shape

((40, 2), (10, 2))

In [19]:
X_train

Unnamed: 0,review,education
37,Average,PG
32,Average,UG
9,Good,UG
48,Good,UG
29,Average,UG
26,Poor,PG
42,Good,PG
43,Poor,PG
25,Good,School
47,Good,PG


In [22]:
from sklearn.preprocessing import OrdinalEncoder

In [23]:
# One ordered list per input column (review first, then education), lowest rank first.
oe = OrdinalEncoder(categories = [['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']])
# Order is important here: each category is encoded as its position in its list,
# so the last category gets the max value (e.g. 'Poor' -> 0, 'Average' -> 1, 'Good' -> 2)

In [24]:
oe.fit(X_train)

In [25]:
# Encode both splits with the encoder that was fitted on X_train.
# The DataFrames are overwritten with NumPy float arrays here.
X_train = oe.transform(X_train)
X_test = oe.transform(X_test)

In [26]:
X_train

array([[1., 2.],
       [1., 1.],
       [2., 1.],
       [2., 1.],
       [1., 1.],
       [0., 2.],
       [2., 2.],
       [0., 2.],
       [2., 0.],
       [2., 2.],
       [0., 2.],
       [1., 2.],
       [1., 1.],
       [0., 0.],
       [2., 0.],
       [0., 0.],
       [0., 0.],
       [2., 2.],
       [0., 2.],
       [1., 1.],
       [1., 1.],
       [2., 1.],
       [1., 0.],
       [1., 0.],
       [1., 1.],
       [0., 2.],
       [0., 2.],
       [1., 0.],
       [2., 1.],
       [2., 0.],
       [0., 2.],
       [0., 1.],
       [0., 1.],
       [0., 0.],
       [2., 2.],
       [0., 0.],
       [2., 0.],
       [1., 0.],
       [1., 2.],
       [1., 0.]])

In [27]:
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [28]:
from sklearn.preprocessing import LabelEncoder

In [29]:
le = LabelEncoder()

In [31]:
le.fit(y_train)

In [32]:
le.classes_

array(['No', 'Yes'], dtype=object)

In [33]:
# Each label becomes its index in le.classes_, i.e. 'No' -> 0 and 'Yes' -> 1.
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [35]:
y_train

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1])

## One Hot Encoding

##### 1. Dummy variable trap
##### 2. Multicollinearity
##### 3. Using most frequent variables

In [37]:
df = pd.read_csv('Datasets/Cars/cars.csv')

In [38]:
df.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
1802,Chevrolet,139000,Diesel,Second Owner,190000
2379,Renault,67303,Diesel,First Owner,600000
456,Honda,40000,Diesel,First Owner,700000
5103,Maruti,20000,Petrol,First Owner,400000
472,Tata,110000,Diesel,Second Owner,200000


In [39]:
df['brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [40]:
df['brand'].nunique()

32

In [41]:
df['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [42]:
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

### 1. OHE using Pandas

In [45]:
pd.get_dummies(df, columns = ['fuel', 'owner'], dtype = int)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


### 2. K-1 OHE

In [46]:
pd.get_dummies(df, columns = ['fuel', 'owner'], dtype = int, drop_first = True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


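##### Only use pandas for OHE in case of data analysis. Don't use it for models. This is because `pd.get_dummies` does not remember which categories (and therefore which columns, in which order) it produced, so new or test data can end up with different columns. For models we use the OneHotEncoder class in sklearn, which stores the categories it was fitted on.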

### 3. OHE using Sklearn

In [47]:
# Helper that splits features/target into train and test subsets (used in the next cell).
from sklearn.model_selection import train_test_split

In [48]:
# Features: the first four columns (brand, km_driven, fuel, owner); target: the last column (selling_price).
# random_state = 0 fixes the shuffle so the 80/20 split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 0:4], df.iloc[:, -1], test_size = 0.2, random_state = 0)

In [49]:
X_train

Unnamed: 0,brand,km_driven,fuel,owner
3042,Hyundai,60000,LPG,First Owner
1520,Tata,150000,Diesel,Third Owner
2611,Hyundai,110000,Diesel,Second Owner
3544,Mahindra,28000,Diesel,Second Owner
4138,Maruti,15000,Petrol,First Owner
...,...,...,...,...
4931,Tata,70000,Diesel,Third Owner
3264,Ford,100000,Diesel,Second Owner
1653,Hyundai,90000,Petrol,Second Owner
2607,Volkswagen,90000,Diesel,First Owner


In [53]:
from sklearn.preprocessing import OneHotEncoder

In [106]:
ohe = OneHotEncoder(drop = 'first', sparse_output = False, dtype = np.int32)
# drop = 'first': Drops the first category of each encoded feature (K-1 columns per feature), which avoids the dummy variable trap
# sparse_output = False: OHE returns a sparse matrix by default; turning it off returns a dense numpy array
# dtype = np.int32: Encoded values are integers (0/1) instead of the default floats (0.0/1.0)

##### Here, you are not going to apply OHE on all columns in X_train, but only on fuel and owner. In order to do that you need to take those columns out, apply OHE, and then merge those columns with the other columns, which is very hectic. In order to work around that we use something called ColumnTransformer.

In [107]:
# Here, using the longer method

ohe.fit_transform(X_train[['fuel', 'owner']])

array([[0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [108]:
X_train_new = ohe.fit_transform(X_train[['fuel', 'owner']])

In [109]:
# Only transform (no re-fit) the test split, so it reuses the categories learned from X_train
# and produces the same columns in the same order.
X_test_new = ohe.transform(X_test[['fuel', 'owner']])

In [110]:
X_train_new

array([[0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [111]:
# Picking brand and km_driven columns from X_train and merging them with the above columns

X_train[['brand', 'km_driven']].values

array([['Hyundai', 60000],
       ['Tata', 150000],
       ['Hyundai', 110000],
       ...,
       ['Hyundai', 90000],
       ['Volkswagen', 90000],
       ['Hyundai', 110000]], dtype=object)

In [112]:
# Horizontally stacking above numpy arrays
# The result has dtype=object because the string 'brand' column is mixed with numeric columns.

np.hstack((X_train[['brand', 'km_driven']].values, X_train_new))

array([['Hyundai', 60000, 0, ..., 0, 0, 0],
       ['Tata', 150000, 1, ..., 0, 0, 1],
       ['Hyundai', 110000, 1, ..., 1, 0, 0],
       ...,
       ['Hyundai', 90000, 0, ..., 1, 0, 0],
       ['Volkswagen', 90000, 1, ..., 0, 0, 0],
       ['Hyundai', 110000, 0, ..., 0, 0, 0]], dtype=object)

### 4. OHE with top categories

In [120]:
counts = df['brand'].value_counts()

In [121]:
counts

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

##### Brands with counts <= 100 (the threshold) will be grouped into a single 'uncommon' category

In [115]:
df['brand'].nunique()

32

In [116]:
threshold = 100

In [119]:
counts[counts <= threshold].index

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [122]:
repl = counts[counts <= threshold].index

In [124]:
# Replace every brand listed in repl with the single label 'uncommon', then one-hot encode
# the result: one column per remaining brand plus one 'uncommon' column.
pd.get_dummies(df['brand'].replace(repl, 'uncommon'), dtype = int)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0
