# How to Handle Categorical Variable

# 1) One Hot Encoding

Importing THe lIbraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In one Hot Encoding method, each category value is converted into a new column and assigned a value as 1 or 0 to the column.

This will be done using the pandas get_dummies() function and then we will drop the first column in order to avoid dummy variable trap.

In [3]:
df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [4]:
Sex_conv_variable = pd.get_dummies(df['Sex'],drop_first=True).astype(int).head()

In [5]:
Sex_conv_variable

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [6]:
df=pd.concat([df,Sex_conv_variable],axis=1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1.0


Advantages :

· Simple to use and fits well for data with few categories.

Disadvantages:

· A high cardinality of higher categories will increase the feature space, resulting in the curse of dimensionality.

In [7]:
df.drop('Sex',axis=1,inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,0.0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,1.0


#2) One Hot Encoding with Multiple Categories

This is one of the ensemble selection techniques pick up from the KDD Orange Cup competition. In this technique, the author made a slight modification to the One hot encoding technique that is instead of creating the new column for every category, they limit creating the new column for 10 most frequent categories. Sounds like a Jargon !!!!

In [8]:
df2 = pd.read_csv('mercendez.csv',usecols=['X0','X1','X2','X3','X4','X5','X6'])
df2.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [9]:
## Check for unique labels in each column
for col in df2.columns:
    print(len(df2[col].unique()))

47
27
44
7
4
29
12


In [10]:
list_top_10=df2.X1.value_counts().sort_values(ascending=False).head(10).index
list_top_10

Index(['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'], dtype='object', name='X1')

Advantages:

· Easy to implement

· Does not expand massively the feature space

Disadvantages :

· Does not keep track of category values that are overlooked.

In [11]:
for category in list_top_10:
    df2[category]=np.where(df2['X1']==category,1,0)
df2.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0


In [12]:
df2.drop('X1',axis=1,inplace=True)
df2.head()

Unnamed: 0,X0,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0


# 3) Ordinal Number encoding

Domain information can be used to determine the integer value order. For example, we people love Saturday and Sundays, and most hates Monday. In this scenario the mapping for weekdays goes ‘Monday’ is 1, ‘Tuesday’ is 2, ‘Wednesday’ is 3, ‘Thursday’ is 4, ‘Friday’ is 5,’Saturday’ is 6,’Sunday’ is 7.

In [13]:
data={'Temperature':['Hot','Cold','Very Hot','Warm','Hot','Warm','Warm','Hot','Hot','Cold']}
dataset=pd.DataFrame(data,columns=['Temperature'])
dataset.head()

Unnamed: 0,Temperature
0,Hot
1,Cold
2,Very Hot
3,Warm
4,Hot


Advantages :

· Easy and straightforward to implement

· Widely used in survey and research data encoding.

Disadvantages:

· Do not have a standardized interval scale.

In [14]:
mapping_disctonary_values={'Cold':1,'Warm':2,'Hot':3,'Very Hot':4}
dataset['Temperature_Ordinal']=dataset.Temperature.map(mapping_disctonary_values)
dataset

Unnamed: 0,Temperature,Temperature_Ordinal
0,Hot,3
1,Cold,1
2,Very Hot,4
3,Warm,2
4,Hot,3
5,Warm,2
6,Warm,2
7,Hot,3
8,Hot,3
9,Cold,1


# 4) Count or Frequency Encoding

As an example. If India appears 56 times in the country column and America appears 49 times, we replace India with 56 and America with 49 in the country column.

In [15]:
df3 = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None)
df3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [16]:
columns=[1,3,5,6,7,8,9,13]
df3[columns].head()

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [17]:
df3=df3[columns]
df3.columns=['Employment','Degree','Status','Designation','Family_job','Race','Sex','Country']
df3.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [18]:
for col in df3.columns[:]:
    print(col,':',len(df3[col].unique()),'labels')

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
Family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


Advantages:

· Easy to implement

· There will be no increase in feature space.

· Work well with the tree-based algorithms.

Disadvantages:

It will not provide the same weight if the frequencies are the same.

In [19]:
country_map=df3['Country'].value_counts().to_dict()
country_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' France': 29,
 ' Greece': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Trinadad&Tobago': 19,
 ' Cambodia': 19,
 ' Thailand': 18,
 ' Laos': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [20]:
df3['Country']=df3['Country'].map(country_map)
df3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['Country']=df3['Country'].map(country_map)


Unnamed: 0,Employment,Degree,Status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95


# 5) Target Guided Ordinal Encoding

Steps:

1) Choose a categorical variable.

2) Take the aggregated mean of the categorical variable and apply it to the target variable.

3) Assign higher integer values or a higher rank to the category with the highest mean.

In [21]:
df4=pd.read_csv('titanic.csv',usecols=['Cabin','Survived'])
df4.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [22]:
df4['Cabin'] = df4['Cabin'].fillna('Missing')
df4.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [23]:
df4['Cabin']=df4['Cabin'].astype(str).str[0]
df4.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [24]:
ordinal_index=df4.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [25]:
ordinal_label={k:i for i,k in enumerate(ordinal_index,0)}
ordinal_label

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

This line creates a dictionary that assigns numerical values to ordered categorical labels. It is useful when you want to convert text categories (like "Poor", "Fair", etc.) into numeric form for machine learning or statistical analysis.

Explanation:

ordinal_index: A list containing the ordered categories.

Example: ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']

enumerate(ordinal_index, 0): Loops through the list and gives each category a number starting from 0.

{k: i for i, k in ...}: Creates a dictionary where:

k is the category name (e.g., 'Good')

i is the numeric label (e.g., 2)

Advantages:

· Establish a monotonic relationship between the variable and the target.

· Helps in faster learning

Disadvantages:

· Because of the close relationship to the target variable, it often leads to overfitting.

In [26]:
df4['Cabin_ordinal_label']=df4['Cabin'].map(ordinal_label)
df4.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_label
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


# 6) Mean Ordinal Encoding

It’s a sight variant of target-guided ordinal encoding and is viral among data scientists. We replace the category with the obtained mean value instead of assigning integer values to it.

In [27]:
mean_ordinal=df4.groupby(['Cabin'])['Survived'].mean().to_dict()
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

Advantages:

· Improves classification model efficiency.

· Fast acquisition of information

Disadvantages:

· Leads to overfitting

· May lead to possible loss of value if two categories have the same mean

In [28]:
df4['Cabin_mean_ordinal']=df4['Cabin'].map(mean_ordinal)
df4.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_label,Cabin_mean_ordinal
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


# 7) Probability Ratio Encoding

Steps :

1) Using the categorical variable, evaluate the probability of the Target variable (where the output is True or 1).

2) Calculate the probability of the Target variable having a False or 0 output.

3) Calculate the probability ratio i.e. P(True or 1) / P(False or 0).

4) Replace the category with a probability ratio.

In [29]:
prob=df4.groupby(['Cabin'])['Survived'].mean()
prob_df=pd.DataFrame(prob)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [30]:
prob_df['Died']=1-prob_df['Survived']
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [31]:
prob_df['Probability Ratio']=prob_df['Survived']/prob_df['Died']
prob_df

Unnamed: 0_level_0,Survived,Died,Probability Ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0
F,0.615385,0.384615,1.6
G,0.5,0.5,1.0
M,0.299854,0.700146,0.428274
T,0.0,1.0,0.0


In [32]:
prob_encod_dictionary=prob_df['Probability Ratio'].to_dict()
prob_encod_dictionary

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [33]:
df4['Cabin_probabilty_ratio']=df4['Cabin'].map(prob_encod_dictionary)
df4.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_label,Cabin_mean_ordinal,Cabin_probabilty_ratio
0,0,M,1,0.299854,0.428274
1,1,C,4,0.59322,1.458333
2,1,M,1,0.299854,0.428274
3,1,C,4,0.59322,1.458333
4,0,M,1,0.299854,0.428274
