### Ordinal Number Encoding 

In [1]:
import datetime

In [2]:
# Capture the current local date and time once; the following cells derive other dates from it.
today_date = datetime.datetime.now()

In [3]:
# Display the timestamp stored in today_date.
today_date

datetime.datetime(2022, 6, 30, 15, 6, 9, 367239)

In [5]:
# A timedelta represents a duration, i.e. the difference between two datetime values.
datetime.timedelta()   # with no arguments the duration is zero

datetime.timedelta(0)

In [7]:
# Subtracting a 2-day timedelta moves the timestamp two days back.
today_date - datetime.timedelta(days=2)

datetime.datetime(2022, 6, 28, 15, 6, 9, 367239)

In [14]:
# List comprehension: the last 15 days, newest first (today minus 0, 1, ..., 14 days).
days = [today_date - datetime.timedelta(days=offset) for offset in range(15)]
# End the cell with the expression so the list displayed below is reproduced on a fresh run.
days


[datetime.datetime(2022, 6, 30, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 29, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 28, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 27, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 26, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 25, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 24, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 23, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 22, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 21, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 20, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 19, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 18, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 17, 15, 6, 9, 367239),
 datetime.datetime(2022, 6, 16, 15, 6, 9, 367239)]

In [16]:
import pandas as pd

In [22]:
# One-column frame holding the generated timestamps under the name 'Day'.
data = pd.DataFrame({'Day': days})


In [23]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,Day
0,2022-06-30 15:06:09.367239
1,2022-06-29 15:06:09.367239
2,2022-06-28 15:06:09.367239
3,2022-06-27 15:06:09.367239
4,2022-06-26 15:06:09.367239


In [34]:
# Format every timestamp with '%A' to obtain its full weekday name, then store it as a column.
weekday_names = data['Day'].dt.strftime('%A')
data['weekday'] = weekday_names
data.head()

Unnamed: 0,Day,weekday
0,2022-06-30 15:06:09.367239,Thursday
1,2022-06-29 15:06:09.367239,Wednesday
2,2022-06-28 15:06:09.367239,Tuesday
3,2022-06-27 15:06:09.367239,Monday
4,2022-06-26 15:06:09.367239,Sunday


%A - Full weekday name, e.g. Monday, Tuesday

%w - Weekday as a decimal number, where 0 is Sunday and 6 is Saturday

%a - Abbreviated weekday name, e.g. Sun, Mon

%Y -year

%m -month

%d -day

%H -hours

%M -minutes

%S -seconds

In [38]:
dict = {'Monday':1, 'Tuesday':2, 'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7    
}
dict

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [42]:
data['weekday_ordinal'] = data['weekday'].map(dict)
data

Unnamed: 0,Day,weekday,weekday_ordinal
0,2022-06-30 15:06:09.367239,Thursday,4
1,2022-06-29 15:06:09.367239,Wednesday,3
2,2022-06-28 15:06:09.367239,Tuesday,2
3,2022-06-27 15:06:09.367239,Monday,1
4,2022-06-26 15:06:09.367239,Sunday,7
5,2022-06-25 15:06:09.367239,Saturday,6
6,2022-06-24 15:06:09.367239,Friday,5
7,2022-06-23 15:06:09.367239,Thursday,4
8,2022-06-22 15:06:09.367239,Wednesday,3
9,2022-06-21 15:06:09.367239,Tuesday,2


#### Count Frequency Encoding 

#### Advantages:

Easy to use.

It does not increase the feature space (no additional columns are created).

#### Disadvantages:

If two categories occur with the same frequency, they receive the same encoded value (the same weight), so the model can no longer tell them apart.




### Target Guided Ordinal Encoding

1. Ordering the labels according to the target

2. Replace each label by its rank in that ordering (or, for mean encoding, by the probability of the target being 1 for that label)

In [43]:
import pandas as pd 

In [54]:
# Load only the two columns used in this section: the categorical feature and the target.
titanic_columns = ['cabin', 'survived']
df = pd.read_csv('titanic_train.csv', usecols=titanic_columns)
df.head()

Unnamed: 0,cabin,survived
0,,1
1,,0
2,,0
3,,1
4,,0


In [56]:
# Replace missing cabin values with the sentinel string 'Missing'.
# The result is assigned back rather than using fillna(..., inplace=True) on the selected
# column: the in-place form mutates an intermediate object (chained assignment), which is
# deprecated and leaves `df` unchanged when pandas Copy-on-Write is active.
df['cabin'] = df['cabin'].fillna('Missing')
df.head()

Unnamed: 0,cabin,survived
0,Missing,1
1,Missing,0
2,Missing,0
3,Missing,1
4,Missing,0


In [64]:
# Keep only the first character of each cabin value, so 'Missing' collapses to 'M'.
cabin_as_text = df['cabin'].astype(str)
df['cabin'] = cabin_as_text.str.get(0)
df.head()

Unnamed: 0,cabin,survived
0,M,1
1,M,0
2,M,0
3,M,1
4,M,0


In [67]:
# Distinct cabin letters present after the reduction to a single character.
df['cabin'].unique()

array(['M', 'C', 'D', 'E', 'B', 'F', 'A', 'G', 'T'], dtype=object)

In [69]:
# Mean of `survived` within each cabin letter, i.e. the survival rate per category.
# Cabin E has the highest rate.
survival_rate_by_cabin = df.groupby('cabin')['survived'].mean()
survival_rate_by_cabin

cabin
A    0.583333
B    0.733333
C    0.580645
D    0.709677
E    0.791667
F    0.666667
G    0.500000
M    0.282246
T    0.000000
Name: survived, dtype: float64

This is called target-guided encoding because the ordering is derived from the output feature (`survived`).

We compute the mean of the target for each label and then assign ranks based on those means.

The label with the highest survival rate receives the highest rank.

In [71]:
# Cabin letters ordered from the lowest to the highest survival rate.
(
    df.groupby('cabin')['survived']
    .mean()
    .sort_values()
    .index
)

Index(['T', 'M', 'G', 'C', 'A', 'F', 'D', 'B', 'E'], dtype='object', name='cabin')

In [72]:
# Store the cabin letters sorted by ascending survival rate; position in this index is the rank.
cabin_survival_sorted = df.groupby('cabin')['survived'].mean().sort_values()
ordinal_labels = cabin_survival_sorted.index
ordinal_labels

Index(['T', 'M', 'G', 'C', 'A', 'F', 'D', 'B', 'E'], dtype='object', name='cabin')

In [75]:
# Map each cabin letter to its zero-based position in the sorted index,
# so the cabin with the highest survival rate (E) gets the largest code.
ordinal_labels2 = {label: rank for rank, label in enumerate(ordinal_labels)}
ordinal_labels2

{'T': 0, 'M': 1, 'G': 2, 'C': 3, 'A': 4, 'F': 5, 'D': 6, 'B': 7, 'E': 8}

#### Note: always start the labels at zero; the model infers each label's importance from its rank

In [78]:
# Translate every cabin letter into its target-guided ordinal code.
cabin_rank = df['cabin'].map(ordinal_labels2)
df['cabin_ordinal_labels'] = cabin_rank
df.head()

Unnamed: 0,cabin,survived,cabin_ordinal_labels
0,M,1,1
1,M,0,1
2,M,0,1
3,M,1,1
4,M,0,1


#### Mean Encoding 

In [83]:
# Dictionary from cabin letter to the mean of `survived` for that letter.
cabin_target_mean = df.groupby('cabin')['survived'].mean()
mean_ordinal = cabin_target_mean.to_dict()

In [84]:
# Show the cabin -> mean(survived) dictionary.
mean_ordinal

{'A': 0.5833333333333334,
 'B': 0.7333333333333333,
 'C': 0.5806451612903226,
 'D': 0.7096774193548387,
 'E': 0.7916666666666666,
 'F': 0.6666666666666666,
 'G': 0.5,
 'M': 0.2822458270106222,
 'T': 0.0}

In [85]:
# Mean encoding: replace every cabin letter by the target mean computed for it.
cabin_mean_encoded = df['cabin'].map(mean_ordinal)
df['mean_ordinal_encode'] = cabin_mean_encoded
df.head()

Unnamed: 0,cabin,survived,cabin_ordinal_labels,mean_ordinal_encode
0,M,1,1,0.282246
1,M,0,1,0.282246
2,M,0,1,0.282246
3,M,1,1,0.282246
4,M,0,1,0.282246


#### Advantages

it captures information within the label therefore rendering more predictive features

it creates a monotonic relationship between the variable and target

#### Disadvantages

It is prone to overfitting, because the encoding is computed from the target itself.
