# Encoding Categorical Features

Some algorithms can work with categorical data directly. For example, a decision tree can be learned directly from categorical data with no data transform required (this depends on the specific implementation).

However, most ML algorithm implementations require all input and output variables to be **numeric**.

# `OrdinalEncoder`

In [11]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

# Toy input: two samples (rows), each described by two ordinal features (columns).
data = np.array([
    ['low', '1st'],
    ['medium', '3rd'],
])

# One explicitly ordered category list per feature column. A label's position
# in its list is the integer code it is mapped to, so labels that never occur
# in `data` ('high', '2nd') still reserve their slot.
categories = [
    ['low', 'medium', 'high'],
    ['1st', '2nd', '3rd'],
]

encoder = OrdinalEncoder(categories=categories)
encoder.fit(data)
encoded_data = encoder.transform(data)

print(encoded_data)

[[0. 0.]
 [1. 2.]]


In [12]:
import pandas as pd

# Load the traffic observations: the first CSV column becomes the index and the
# 'Time' column is handed to the datetime parser.
# NOTE(review): the saved output echoes a warning for this call and shows every
# timestamp on one calendar date -- presumably 'Time' holds clock times only and
# the parser filled in a default date. Confirm against the CSV and consider
# passing an explicit format.
df = pd.read_csv(
    '../datasets/Traffic/Traffic_Jams.csv',
    index_col=0,
    parse_dates=['Time'],
)
df.head()

  df = pd.read_csv('../datasets/Traffic/Traffic_Jams.csv', parse_dates=['Time'], index_col=0)


Unnamed: 0_level_0,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-08-20 00:00:00,10,Tuesday,13,2,2,24,41,normal
2024-08-20 00:15:00,10,Tuesday,14,1,1,36,52,normal
2024-08-20 00:30:00,10,Tuesday,10,2,2,32,46,normal
2024-08-20 00:45:00,10,Tuesday,10,2,2,36,50,normal
2024-08-20 01:00:00,10,Tuesday,11,2,1,34,48,normal


In [13]:
# Re-encode the traffic label as an *ordered* categorical
# (low < normal < high < heavy) under the shorter column name 'Situation'.
# Any label that is not in this list becomes NaN in the new column.
situation_dtype = pd.CategoricalDtype(
    categories=['low', 'normal', 'high', 'heavy'], ordered=True
)

# Guarded so the cell can be executed more than once: after the first run the
# source column is gone, and a second run is a no-op instead of a KeyError.
if 'Traffic Situation' in df.columns:
    df = (
        df.assign(Situation=df['Traffic Situation'].astype(situation_dtype))
          .drop(columns='Traffic Situation')
    )

In [14]:
# Inspect the converted column; the last line of the repr lists the categories in their declared order.
df['Situation']

Time
2024-08-20 00:00:00    normal
2024-08-20 00:15:00    normal
2024-08-20 00:30:00    normal
2024-08-20 00:45:00    normal
2024-08-20 01:00:00    normal
                        ...  
2024-08-20 10:30:00    normal
2024-08-20 20:00:00      high
2024-08-20 21:00:00      high
2024-08-20 21:30:00      high
2024-08-20 23:45:00    normal
Name: Situation, Length: 6324, dtype: category
Categories (4, object): ['low' < 'normal' < 'high' < 'heavy']

# OneHot Encoding: `pd.get_dummies`

In [8]:
import pandas as pd

# Sample data: one categorical column containing seven distinct fruit labels.
data = {
    'category': ['apple', 'banana', 'apple', 'orange', 'banana', 'apple', 
                 'pear', 'banana', 'pear', 'kiwi', 'grape', 'kiwi', 
                 'apple', 'pear', 'grape', 'orange', 'kiwi', 'pear', 
                 'mango', 'grape', 'banana', 'grape', 'kiwi', 'apple']
}

df = pd.DataFrame(data)

# Keep only the N most frequent labels as their own indicator columns.
# Caveat: several labels tie on frequency in this sample, so which of the tied
# labels make the cut depends on the order value_counts() returns them in.
N = 3  # Number of top categories to keep
top_n = df['category'].value_counts().nlargest(N).index

# Collapse every remaining label into a single 'Other' bucket. Series.where
# keeps a value where the mask is True and substitutes 'Other' elsewhere; this
# is the vectorised equivalent of applying a per-element lambda.
df['category'] = df['category'].where(df['category'].isin(top_n), 'Other')

# One indicator column per surviving label (including 'Other').
df_encoded = pd.get_dummies(df, columns=['category'])

df_encoded

Unnamed: 0,category_Other,category_apple,category_banana,category_pear
0,False,True,False,False
1,False,False,True,False
2,False,True,False,False
3,True,False,False,False
4,False,False,True,False
5,False,True,False,False
6,False,False,False,True
7,False,False,True,False
8,False,False,False,True
9,True,False,False,False
