# Ordinal Encoding Tutorial
### Learn how to convert categorical data with ORDER/RANKING into numerical values
### Use this when categories have a meaningful order (small < medium < large)

In [1]:
import pandas as pd

#### Create a DataFrame with "Size" column containing clothing sizes
#### Sizes have an ORDER: s < m < l < xl < xll

In [3]:
# Toy dataset: a single "Size" column holding 13 clothing-size labels.
df = pd.DataFrame(
    data=["s", "m", "l", "m", "m", "xl", "xll", "s", "s", "l", "l", "xl", "m"],
    columns=["Size"],
)
# Preview the first ten rows.
df.head(10)

Unnamed: 0,Size
0,s
1,m
2,l
3,m
4,m
5,xl
6,xll
7,s
8,s
9,l


## Define the ORDER of Categories
#### Create a list with sizes in the correct order
#### This is IMPORTANT: Order matters in ordinal encoding!
#### s=1st, m=2nd, l=3rd, xl=4th, xll=5th

In [4]:
# Outer list holds one ranking per column to encode; the inner list is the
# size ranking, written from lowest to highest.
ord_data = [
    ["s", "m", "l", "xl", "xll"],
]

## Import OrdinalEncoder
#### OrdinalEncoder converts categories with ORDER into numbers
#### Different from LabelEncoder - it respects the order you define

In [7]:
from sklearn.preprocessing import OrdinalEncoder

## Create OrdinalEncoder with Custom Order
#### Create an encoder object and tell it the order of categories
#### categories=ord_data: Use the order we defined earlier

In [41]:
# Build the encoder with the explicit ranking held in ord_data, so each label's
# code is its position in that list.
oe = OrdinalEncoder(categories=ord_data)

## Learn the Categories (fit)
#### fit(): Tell the encoder to recognize the sizes in our dataset
#### Note: Use double brackets [["Size"]] to keep it as a 2-D table (a DataFrame, not a Series); the encoder expects 2-D input

In [42]:
# df[["Size"]] (double brackets) selects a one-column DataFrame rather than a Series.
oe.fit(df[["Size"]])

## Convert Text to Numbers (transform)
#### transform(): Convert size names to ordered numbers
#### s→0, m→1, l→2, xl→3, xll→4 (codes start at 0, following the position in the order list)
#### Note: This doesn't save the result yet

In [18]:
# Returns the codes as an array with one row per input row; nothing is written
# back to df in this cell.
oe.transform(df[["Size"]])

array([[0.],
       [1.],
       [2.],
       [1.],
       [1.],
       [3.],
       [4.],
       [0.],
       [0.],
       [2.],
       [2.],
       [3.],
       [1.]])

In [19]:
# Store the codes next to the original labels. transform() yields a
# single-column 2-D array, so take column 0 to get a flat vector for the
# new DataFrame column.
df["Size_encoded"] = oe.transform(df[["Size"]])[:, 0]
# Show the frame with the new column.
df

Unnamed: 0,Size,Size_encode,Size_encoded
0,s,0.0,0.0
1,m,1.0,1.0
2,l,2.0,2.0
3,m,1.0,1.0
4,m,1.0,1.0
5,xl,3.0,3.0
6,xll,4.0,4.0
7,s,0.0,0.0
8,s,0.0,0.0
9,l,2.0,2.0


In [24]:
# Same assignment as the earlier cell: recomputes the codes from "Size" and
# overwrites the "Size_encoded" column with them.
df["Size_encoded"] = oe.transform(df[["Size"]])

In [25]:
# Hand-written ranking: consecutive integers 6..10 assigned in size order.
ord_data1 = dict(zip(("s", "m", "l", "xl", "xll"), range(6, 11)))

In [26]:
# Manual alternative to OrdinalEncoder: look each size label up in ord_data1.
# Series.map() silently yields NaN for any label that is not a key of the dict,
# so check coverage first and fail with a clear message instead.
unmapped_sizes = set(df["Size"].dropna()) - set(ord_data1)
if unmapped_sizes:
    raise ValueError(f"ord_data1 has no code for: {sorted(unmapped_sizes)}")
df["Size_encoded_map"] = df["Size"].map(ord_data1)

In [27]:
# Display the frame to compare the encoder output with the manual mapping.
df

Unnamed: 0,Size,Size_encode,Size_encoded,Size_encoded_map
0,s,0.0,0.0,6
1,m,1.0,1.0,7
2,l,2.0,2.0,8
3,m,1.0,1.0,7
4,m,1.0,1.0,7
5,xl,3.0,3.0,9
6,xll,4.0,4.0,10
7,s,0.0,0.0,6
8,s,0.0,0.0,6
9,l,2.0,2.0,8


In [29]:
# Load the Hitters data; the relative path means the CSV must sit in the
# notebook's working directory.
# NOTE(review): the preview shows player names under "Unnamed: 0", which suggests
# the file's first column has no header — consider index_col=0 if that is intended.
dataset = pd.read_csv("Hitters.csv")
# Preview the first five rows.
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,...,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,-Andy Allanson,293,66,1,30,29,14,1,293,66,...,30,29,14,A,E,446,33,20,,A
1,-Alan Ashby,315,81,7,24,38,39,14,3449,835,...,321,414,375,N,W,632,43,10,475.0,N
2,-Alvin Davis,479,130,18,66,72,76,3,1624,457,...,224,266,263,A,W,880,82,14,480.0,A
3,-Andre Dawson,496,141,20,65,78,37,11,5628,1575,...,828,838,354,N,E,200,11,3,500.0,N
4,-Andres Galarraga,321,87,10,39,42,30,2,396,101,...,48,46,33,N,E,805,40,4,91.5,N


In [34]:
# List the distinct labels in "Division" before choosing an encoding order.
dataset["Division"].unique()

array(['E', 'W'], dtype=object)

In [35]:
# Category order for the encoder: position in the list becomes the code
# (E -> 0, W -> 1).
# NOTE(review): confirm that ranking E below W is meaningful; if the two labels
# are unordered, this order is arbitrary.
enc_data_ord = [["E", "W"]]

In [36]:
from sklearn.preprocessing import OrdinalEncoder

In [37]:
# Separate encoder instance for the "Division" column, configured with the
# category order in enc_data_ord.
oen = OrdinalEncoder(categories=enc_data_ord)

In [40]:
# Replace the "Division" labels with their ordinal codes in place.
# The column is overwritten, so a second run of this cell would hand already
# numeric values to an encoder configured for "E"/"W" and fail. Guard on the
# dtype so the cell can be re-run safely without re-loading the CSV.
if not pd.api.types.is_numeric_dtype(dataset["Division"]):
    dataset["Division"] = oen.fit_transform(dataset[["Division"]])
# Preview the first ten rows with the encoded column.
dataset.head(10)

Unnamed: 0.1,Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,...,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,-Andy Allanson,293,66,1,30,29,14,1,293,66,...,30,29,14,A,0.0,446,33,20,,A
1,-Alan Ashby,315,81,7,24,38,39,14,3449,835,...,321,414,375,N,1.0,632,43,10,475.0,N
2,-Alvin Davis,479,130,18,66,72,76,3,1624,457,...,224,266,263,A,1.0,880,82,14,480.0,A
3,-Andre Dawson,496,141,20,65,78,37,11,5628,1575,...,828,838,354,N,0.0,200,11,3,500.0,N
4,-Andres Galarraga,321,87,10,39,42,30,2,396,101,...,48,46,33,N,0.0,805,40,4,91.5,N
5,-Alfredo Griffin,594,169,4,74,51,35,11,4408,1133,...,501,336,194,A,1.0,282,421,25,750.0,A
6,-Al Newman,185,37,1,23,8,21,2,214,42,...,30,9,24,N,0.0,76,127,7,70.0,A
7,-Argenis Salazar,298,73,0,24,24,7,3,509,108,...,41,37,12,A,1.0,121,283,9,100.0,A
8,-Andres Thomas,323,81,6,26,32,8,2,341,86,...,32,34,8,N,1.0,143,290,19,75.0,N
9,-Andre Thornton,401,92,17,49,66,65,13,5206,1332,...,784,890,866,A,0.0,0,0,0,1100.0,A


## KEY DIFFERENCES (Ordinal vs Label Encoding):
### ORDINAL ENCODING: Use when categories have ORDER or RANKING
#####   - Example: S < M < L (sizes)
#####   - Example: Low < Medium < High (satisfaction levels)
#####   - The numbers have meaning (higher = larger/better)

### LABEL ENCODING: Use when categories have NO natural order
#####   - Example: Colors (Red, Blue, Green)
#####   - Example: Names (John, Sarah, Mike)
#####   - Numbers are just labels, not ranked
#####   - Note: scikit-learn's LabelEncoder is meant for the target column (y); for unordered input features, one-hot encoding is usually the safer choice