# Encoding Categorical Variables:

Categorical variables need to be converted into numerical representations for machine learning algorithms to process them. One-hot encoding, label encoding, and target encoding are common techniques for this purpose.

## 1. One Hot Encoding
Disadvantage : creates more features


In [130]:
import pandas as pd
import numpy as np

In [113]:
df=pd.read_csv(r'C:\Users\Gaurav\Downloads\titanic.csv',usecols=['Sex'])
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


##### Deleting one column with help of dummy variable trap

In [114]:
pd.get_dummies(df).head()

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [115]:
pd.get_dummies(df,drop_first=True).head()
# male == 1 /  female == 0

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [131]:
df=pd.read_csv(r'C:\Users\Gaurav\Downloads\titanic.csv',usecols=['Embarked'])

In [132]:
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [133]:
# droping nan value
df.dropna(inplace=True)

In [134]:
pd.get_dummies(df,drop_first=True).head()
# two features will represent third features

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


### 2. One Hot Encoding with many categories in a feature

In [158]:
df=pd.read_csv(r'C:\Users\Gaurav\Downloads\mercedes.csv',usecols=["X0","X1","X2","X3","X4","X5","X6"])

In [159]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [160]:
# let's have a look at how many labels each variable had
for i in df.columns:
    print(i, ':', len(df[i].unique()), 'labels')

X0 : 47 labels
X1 : 27 labels
X2 : 44 labels
X3 : 7 labels
X4 : 4 labels
X5 : 29 labels
X6 : 12 labels


In [161]:
df.X1.value_counts()

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
y      23
f      23
j      22
n      19
k      17
p       9
g       6
d       3
q       3
ab      3
Name: X1, dtype: int64

In [162]:
# considering only top 10 & dropping rest ---> (KDD cup competition)
df.X1.value_counts().sort_values(ascending=False).head(10)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

In [163]:
# in terms of list
lst_10=df.X1.value_counts().sort_values(ascending=False).head(10).index
lst_10=list(lst_10)
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [164]:
for categories in lst_10:
    df[categories]=np.where(df['X1']==categories,1,0)
lst_10.append('X1')
df[lst_10]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r


In [165]:

# get whole set of dummy variables, for all the categorical variables

def one_hot_encoding_top_x(df, variable, top_x_labels):
    # function to create the dummy variables for the most frequent labels
    # we can vary the number of most frequent labels that we encode
    
    for label in top_x_labels:
        df[variable+'_'+label] = np.where(df[variable]==label, 1, 0)

In [166]:

# read the data again
df = pd.read_csv(r'C:\Users\Gaurav\Downloads\mercedes.csv', usecols=['X1', 'X2'])

# encode X2 into the 10 most frequent categories
one_hot_encoding_top_x(df, 'X2', lst_10)
df.head()

Unnamed: 0,X1,X2,X2_aa,X2_s,X2_b,X2_l,X2_v,X2_r,X2_i,X2_a,X2_c,X2_o,X2_X1
0,v,at,0,0,0,0,0,0,0,0,0,0,0
1,t,av,0,0,0,0,0,0,0,0,0,0,0
2,w,n,0,0,0,0,0,0,0,0,0,0,0
3,t,n,0,0,0,0,0,0,0,0,0,0,0
4,v,n,0,0,0,0,0,0,0,0,0,0,0


## 3. Ordinal Number Encoding

Ordinal data is a categorical, statistical data type where the variables have natural, ordered categories and the distances between the categories is not known.

For example:

- Student's grade in an exam (A, B, C or Fail).
- Educational level, with the categories: Elementary school, High school, College graduate, PhD ranked from 1 to 4.

When the categorical variables are ordinal, the most straightforward best approach is to replace the labels by some ordinal number based on the ranks.

In [167]:
import datetime

In [168]:
today_date=datetime.datetime.today()

In [169]:
today_date

datetime.datetime(2024, 2, 18, 18, 6, 47, 841279)

In [170]:
# differnce btw dates
today_date-datetime.timedelta(1)

datetime.datetime(2024, 2, 17, 18, 6, 47, 841279)

In [171]:
#### last 15 days dates
days=[today_date-datetime.timedelta(x) for x in range(0,15)]
days

[datetime.datetime(2024, 2, 18, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 17, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 16, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 15, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 14, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 13, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 12, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 11, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 10, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 9, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 8, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 7, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 6, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 5, 18, 6, 47, 841279),
 datetime.datetime(2024, 2, 4, 18, 6, 47, 841279)]

In [172]:
import pandas as pd
data=pd.DataFrame(days)
data.columns=["Day"]

In [173]:
data.head()

Unnamed: 0,Day
0,2024-02-18 18:06:47.841279
1,2024-02-17 18:06:47.841279
2,2024-02-16 18:06:47.841279
3,2024-02-15 18:06:47.841279
4,2024-02-14 18:06:47.841279


In [174]:
data['weekday']=data['Day'].dt.day_name()
data.head()

Unnamed: 0,Day,weekday
0,2024-02-18 18:06:47.841279,Sunday
1,2024-02-17 18:06:47.841279,Saturday
2,2024-02-16 18:06:47.841279,Friday
3,2024-02-15 18:06:47.841279,Thursday
4,2024-02-14 18:06:47.841279,Wednesday


In [175]:
dictionary={'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}

In [24]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [176]:
data['weekday_ordinal']=data['weekday'].map(dictionary)

In [26]:
data

Unnamed: 0,Day,weekday,weekday_ordinal
0,2024-02-15 20:48:31.566936,Thursday,4
1,2024-02-14 20:48:31.566936,Wednesday,3
2,2024-02-13 20:48:31.566936,Tuesday,2
3,2024-02-12 20:48:31.566936,Monday,1
4,2024-02-11 20:48:31.566936,Sunday,7
5,2024-02-10 20:48:31.566936,Saturday,6
6,2024-02-09 20:48:31.566936,Friday,5
7,2024-02-08 20:48:31.566936,Thursday,4
8,2024-02-07 20:48:31.566936,Wednesday,3
9,2024-02-06 20:48:31.566936,Tuesday,2


## 4. Count Or Frequency Encoding

Another way to refer to variables that have a multitude of categories, is to call them variables with high cardinality.

If we have categorical variables containing many multiple labels or high cardinality,then by using one hot encoding, we will expand the feature space dramatically.

One approach that is heavily used in Kaggle competitions, is to replace each label of the categorical variable by the count, this is the amount of times each label appears in the dataset. Or the frequency, this is the percentage of observations within that category. The 2 are equivalent.

##### Advantages
1. Easy To Use
2. Not increasing feature space

##### Disadvantages
1. If some of the labels have the same count, then they will be replaced with the same count and they will loose some valuable information.
2. Adds somewhat arbitrary numbers, and therefore weights to the different labels, that may not be related to their predictive power

In [189]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None)
train_set.head()                                                         

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [190]:
# cateorgy columns
columns=[1,3,5,6,7,8,9,13]

In [191]:
train_set=train_set[columns]

In [192]:
# assigning column name
train_set.columns=['Employment','Degree','Status','Designation','family_job','Race','Sex','Country']

In [193]:
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [194]:
for feature in train_set.columns[:]:
    print(feature,":",len(train_set[feature].unique()),'labels')

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


##### country-name will replace by Frequency

In [195]:
train_set['Country'].value_counts().to_dict()

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' France': 29,
 ' Greece': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [196]:
country_map=train_set['Country'].value_counts().to_dict()

In [197]:
train_set['Country']=train_set['Country'].map(country_map)
train_set.head(20)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170


## 5. Target Guided Ordinal Encoding
1. Ordering the labels according to the target
2. Replace the labels by the joint probability of being 1 or 0

In [198]:
import pandas as pd
df=pd.read_csv(r'C:\Users\Gaurav\Downloads\titanic.csv', usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [199]:
df['Cabin'].fillna('Missing',inplace=True)

In [200]:
df['Cabin']=df['Cabin'].astype(str).str[0]
# considering first letter only

In [201]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [202]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [203]:
df.groupby(['Cabin'])['Survived'].mean()
# in each column how many people survive

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [204]:
df.groupby(['Cabin'])['Survived'].mean().sort_values()

Cabin
T    0.000000
M    0.299854
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: Survived, dtype: float64

In [205]:
ordinal_labels=df.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [206]:
enumerate(ordinal_labels,0)
# assign 0,1,2,3 as per rank

<enumerate at 0x1a19e6b4480>

In [207]:
ordinal_labels2={k:i for i,k in enumerate(ordinal_labels,0)}
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [208]:
df['Cabin_ordinal_labels']=df['Cabin'].map(ordinal_labels2)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


### 5.1. Mean Encoding 

replace by mean value


In [83]:
mean_ordinal=df.groupby(['Cabin'])['Survived'].mean().to_dict()

In [84]:
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [86]:
df['mean_ordinal_encode']=df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


It leads to overfitting

### 5.2. Probability Ratio Encoding

Steps:

1. Probability of Survived based on Cabin--- Categorical Feature
2. Probability of Not Survived---1-pr(Survived)
3. pr(Survived)/pr(Not Survived)
4. Dictonary to map cabin with probability
5. replace with the categorical feature

In [89]:
df=pd.read_csv(r'C:\Users\Gaurav\Downloads\titanic.csv', usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [90]:
### Replacing
df['Cabin'].fillna('Missing',inplace=True)
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [91]:
df['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [92]:
# first letter only
df['Cabin']=df['Cabin'].astype(str).str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [93]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [94]:
prob_df=df.groupby(['Cabin'])['Survived'].mean()
prob_df=pd.DataFrame(prob_df)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [95]:
prob_df['Died']=1-prob_df['Survived']
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [96]:
prob_df['Probability_ratio']=prob_df['Survived']/prob_df['Died']
prob_df.head()

Unnamed: 0_level_0,Survived,Died,Probability_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [97]:
probability_encoded=prob_df['Probability_ratio'].to_dict()
df['Cabin_encoded']=df['Cabin'].map(probability_encoded)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274


In [98]:
df.head(20)

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
5,0,M,0.428274
6,0,E,3.0
7,0,M,0.428274
8,1,M,0.428274
9,1,M,0.428274
