In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')



### Ordinal Data
- **Definition**: Categories with a **meaningful order**, but **no fixed numeric difference** between them.
- **Examples**:
  - Education: `High School` < `Bachelor` < `Master` < `PhD`
  - Size: `Small`, `Medium`, `Large`
  - Rating: `Poor`, `Average`, `Good`, `Excellent`
- **Encoding methods**: Ordinal Encoding, Integer Encoding

In [3]:
customers = pd.read_csv('Data/customers.csv')
customers.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [10]:
df = customers[['review','education','purchased']]
df.head() 
# these two are ordinal categorical features
#let's have hand-on with OrdinalEncoder from sklearn

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [11]:
df['review'].unique()
df['education'].unique()

array(['School', 'UG', 'PG'], dtype=object)

In [24]:
from sklearn.preprocessing import OrdinalEncoder
oe =  OrdinalEncoder(categories=[['Poor','Average', 'Good'],['School','UG','PG']])

In [17]:
from sklearn.model_selection import train_test_split

x = df.drop('purchased',axis=1)
y = df.iloc[:,-1]
x_train_,x_test_,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [25]:
x_train = oe.fit_transform(x_train_)
x_test = oe.transform(x_test_)


In [26]:
x_train[:4,:],x_train_.head(4)

(array([[2., 2.],
        [0., 0.],
        [0., 2.],
        [1., 0.]]),
      review education
 33     Good        PG
 35     Poor    School
 26     Poor        PG
 34  Average    School)

In [None]:
oe.categories_ # to know the categories

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [None]:
oe.feature_names_in_ # to know the features on which OE is applied

array(['review', 'education'], dtype=object)

In [None]:
oe.get_feature_names_out() #does same as above attribute

array(['review', 'education'], dtype=object)

In [None]:
oe.n_features_in_ # to get the no. of features on which OE is applied

2

In [None]:
oe.inverse_transform(np.array([1,2]).reshape(1,2)) #inverse transformation

array([['Average', 'PG']], dtype=object)

##### to handle unknow categorical variable while testing

In [None]:

x_train_0,x_test_0,y_train_0,y_test_0 = train_test_split(x,y,test_size=0.2,random_state=0)

oe0 = OrdinalEncoder(categories=[['Poor','Average', 'Good'],['School','UG','PG']],
                     handle_unknown='use_encoded_value',
                     unknown_value=-1)

In [51]:
xtrain0 = oe0.fit_transform(x_train_0)
xtest0 = oe0.transform(x_test_0)

In [52]:
oe0.transform(np.array(['poor','12th']).reshape(1,2))

array([[-1., -1.]])

##### rare categorical variables

In [53]:
data = pd.DataFrame({'animals': ['cat']*5 + ['dog']*6 + ['hen']*3+['Snake']*3})
data.head()

Unnamed: 0,animals
0,cat
1,cat
2,cat
3,cat
4,cat


In [None]:
oe1 = OrdinalEncoder(max_categories=3) #based on number of maximum categories we want
oe1.fit_transform(data)

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.]])

In [58]:
oe1.infrequent_categories_

[array(['Snake', 'hen'], dtype=object)]

In [59]:
oe1.transform(np.array(['Snake','hen']).reshape(2,1))

array([[2.],
       [2.]])

In [61]:
oe3 = OrdinalEncoder(min_frequency=4)
oe3.fit_transform(data)
oe3.transform(np.array(['Snake','hen','cat','dog']).reshape(4,1))  #Based on the frequency count of the categorical variable

array([[2.],
       [2.],
       [0.],
       [1.]])

## Label encoding:
- Applied on target column

In [62]:
df.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [64]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

x_train_1,x_test_1,y_train_1,y_test_1 = train_test_split(x,y,test_size=0.2,random_state=0)

y_train1 = le.fit_transform(y_train_1)
y_test1 = le.transform(y_test_1)

In [67]:
le.transform(['Yes','No'])

array([1, 0])


### Nominal Data
- **Definition**: Categories with **no inherent order** or ranking.
- **Examples**:
  - Gender: `Male`, `Female`
  - Color: `Red`, `Green`, `Blue`
  - Country: `India`, `USA`, `Canada`
- **Encoding methods**: One-Hot Encoding, Binary Encoding



In [69]:
cars = pd.read_csv('Data/cars.csv',usecols=['brand','fuel','selling_price'])
cars.head()

Unnamed: 0,brand,fuel,selling_price
0,Maruti,Diesel,450000
1,Skoda,Diesel,370000
2,Honda,Petrol,158000
3,Hyundai,Diesel,225000
4,Maruti,Petrol,130000


In [70]:
x= cars.drop('selling_price',axis=1)
y = cars.iloc[:,-1]

In [76]:
from sklearn.model_selection import train_test_split
x_train2,x_test2,y_train2,y_test2 = train_test_split(x,y,test_size=0.2,random_state=0)


In [94]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False,drop='first',min_frequency=100)

In [95]:
xtrain2 = ohe.fit_transform(x_train2)
xtrain2

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [96]:
ohe.n_features_in_

2

In [97]:
ohe.categories_

[array(['Ambassador', 'Ashok', 'Audi', 'BMW', 'Chevrolet', 'Daewoo',
        'Datsun', 'Fiat', 'Force', 'Ford', 'Honda', 'Hyundai', 'Isuzu',
        'Jaguar', 'Jeep', 'Kia', 'Land', 'Lexus', 'MG', 'Mahindra',
        'Maruti', 'Mercedes-Benz', 'Mitsubishi', 'Nissan', 'Peugeot',
        'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],
       dtype=object),
 array(['CNG', 'Diesel', 'LPG', 'Petrol'], dtype=object)]

In [98]:
ohe.get_feature_names_out()

array(['brand_Ford', 'brand_Honda', 'brand_Hyundai', 'brand_Mahindra',
       'brand_Maruti', 'brand_Renault', 'brand_Tata', 'brand_Toyota',
       'brand_Volkswagen', 'brand_infrequent_sklearn', 'fuel_Petrol',
       'fuel_infrequent_sklearn'], dtype=object)

#### Label binarizer
- used for data with multiclass classification output

In [117]:
from sklearn.preprocessing import LabelBinarizer


In [116]:
data = pd.DataFrame({'animals': ['cat']*5 + ['dog']*6 + ['hen']*3+['Snake']*3})
data.head()

Unnamed: 0,animals
0,cat
1,cat
2,cat
3,cat
4,cat


In [118]:
le = LabelBinarizer()
le.fit_transform(data)

array([[0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0]])

## Column Transformers and Pipelines

In [120]:
cars = pd.read_csv('Data/cars.csv')
cars.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [127]:
from sklearn.compose import ColumnTransformer

trasnformer = ColumnTransformer(transformers=[
    ('ohe_brand',OneHotEncoder(sparse_output=False,drop='first',min_frequency=100),['brand']),
    ('ohe',OneHotEncoder(sparse_output=False),['fuel','owner'])

],remainder='passthrough')

trasnformer.set_output(transform='pandas')

In [128]:
trasnformer.fit_transform(cars)

Unnamed: 0,ohe_brand__brand_Chevrolet,ohe_brand__brand_Ford,ohe_brand__brand_Honda,ohe_brand__brand_Hyundai,ohe_brand__brand_Mahindra,ohe_brand__brand_Maruti,ohe_brand__brand_Renault,ohe_brand__brand_Skoda,ohe_brand__brand_Tata,ohe_brand__brand_Toyota,...,ohe__fuel_Diesel,ohe__fuel_LPG,ohe__fuel_Petrol,ohe__owner_First Owner,ohe__owner_Fourth & Above Owner,ohe__owner_Second Owner,ohe__owner_Test Drive Car,ohe__owner_Third Owner,remainder__km_driven,remainder__selling_price
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,145500,450000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,120000,370000
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,140000,158000
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,127000,225000
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,120000,130000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,110000,320000
8124,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,119000,135000
8125,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,120000,382000
8126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,25000,290000


In [129]:
trasnformer.transformers_

[('ohe_brand',
  OneHotEncoder(drop='first', min_frequency=100, sparse_output=False),
  ['brand']),
 ('ohe', OneHotEncoder(sparse_output=False), ['fuel', 'owner']),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [1, 4])]

In [131]:
trasnformer['ohe_brand'].categories_

[array(['Ambassador', 'Ashok', 'Audi', 'BMW', 'Chevrolet', 'Daewoo',
        'Datsun', 'Fiat', 'Force', 'Ford', 'Honda', 'Hyundai', 'Isuzu',
        'Jaguar', 'Jeep', 'Kia', 'Land', 'Lexus', 'MG', 'Mahindra',
        'Maruti', 'Mercedes-Benz', 'Mitsubishi', 'Nissan', 'Opel',
        'Peugeot', 'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen',
        'Volvo'], dtype=object)]