## Handling Categorical Features

### One Hot Encoding

In [1]:
import pandas as pd

In [2]:
df_t = pd.read_csv('titanic.csv',usecols = ['Sex'])

In [3]:
df_t.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


#### pd.get_dummies() for one hot encoding

In [4]:
# get_dummies creates one indicator column per category, in sorted category order;
# drop_first=True drops the first of them (here 'female'), leaving only Sex_male.
pd.get_dummies(df_t,drop_first = True).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [5]:
df_t = pd.read_csv('titanic.csv',usecols = ['Embarked'])

In [6]:
df_t.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [7]:
# Discard rows with a missing value, rebinding the name rather than mutating in place.
df_t = df_t.dropna()

In [8]:
pd.get_dummies(df_t,drop_first = True).head()
# Embarked_C is the dropped baseline: a row with Embarked_Q == 0 and Embarked_S == 0 is Embarked_C.
# In general, a feature with N categories needs only N-1 dummy columns.

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


#### Disadvantage
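Curse of Dimensionality: every category becomes its own column, so a feature with many categories greatly increases the number of columns.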

## One Hot Encoding with many categories in a feature

In [9]:
merc = pd.read_csv('mercedes.csv',usecols = ['X0','X1','X2','X3','X4','X5','X6'])
merc.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [10]:
merc.shape

(4209, 7)

In [11]:
for i in merc.columns:
    print(merc[i].value_counts())

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
j     181
az    175
aj    151
s     106
ap    103
h      75
d      73
al     67
v      36
af     35
m      34
ai     34
e      32
ba     27
at     25
a      21
ax     19
i      18
am     18
aq     18
u      17
aw     16
l      16
ad     14
k      11
au     11
b      11
as     10
r      10
bc      6
ao      4
c       3
aa      2
q       2
ab      1
g       1
ac      1
Name: X0, dtype: int64
aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
y      23
f      23
j      22
n      19
k      17
p       9
g       6
q       3
ab      3
d       3
Name: X1, dtype: int64
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
z       19
ag   

In [12]:
# Report the number of distinct values held by every column.
for col_name in merc.columns:
    distinct_values = merc[col_name].unique()
    print(col_name, len(distinct_values))

X0 47
X1 27
X2 44
X3 7
X4 4
X5 29
X6 12


In [13]:
merc.X1.value_counts().head(10)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

In [14]:
lst_10_X1 = list(merc.X1.value_counts().head(10).index)

In [15]:
lst_10_X1

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [16]:
import numpy as np

# Add one 0/1 indicator column per frequent X1 label; rows holding any other label stay all-zero.
for label in lst_10_X1:
    is_label = merc['X1'] == label
    merc[label] = np.where(is_label, 1, 0)

In [17]:
# Put the source column next to the indicator columns for a side-by-side display.
# Guarded so that re-running this cell does not append 'X1' more than once
# (a duplicate entry would select the X1 column repeatedly in the next cell).
if 'X1' not in lst_10_X1:
    lst_10_X1.append('X1')

In [18]:
merc[lst_10_X1]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r


### Ordinal Number Encoding
Example
* Grading System --- A,B,C,D,F<br>
A-1<br>
B-2<br>
C-3<br>
D-4<br>
F-5<br>

* Experience of a batsman<br>
A-10yrs--1<br>
A-5yrs--2<br>
A-3yrs--3<br>

In [19]:
import datetime

In [20]:
today_date = datetime.datetime.today()

In [21]:
today_date

datetime.datetime(2020, 8, 9, 18, 53, 59, 753511)

In [22]:
today_date-datetime.timedelta(3)

datetime.datetime(2020, 8, 6, 18, 53, 59, 753511)

In [23]:
# The 15 most recent timestamps, newest first: today_date shifted back 0..14 whole days.
days = [
    today_date - datetime.timedelta(days=days_back)
    for days_back in range(15)
]

In [24]:
# Wrap the list of timestamps in a single-column frame named 'Day'.
data = pd.DataFrame(days, columns=['Day'])

In [25]:
data

Unnamed: 0,Day
0,2020-08-09 18:53:59.753511
1,2020-08-08 18:53:59.753511
2,2020-08-07 18:53:59.753511
3,2020-08-06 18:53:59.753511
4,2020-08-05 18:53:59.753511
5,2020-08-04 18:53:59.753511
6,2020-08-03 18:53:59.753511
7,2020-08-02 18:53:59.753511
8,2020-08-01 18:53:59.753511
9,2020-07-31 18:53:59.753511


In [26]:
data['weekday'] = data['Day'].dt.day_name()

In [27]:
data.head()

Unnamed: 0,Day,weekday
0,2020-08-09 18:53:59.753511,Sunday
1,2020-08-08 18:53:59.753511,Saturday
2,2020-08-07 18:53:59.753511,Friday
3,2020-08-06 18:53:59.753511,Thursday
4,2020-08-05 18:53:59.753511,Wednesday


In [28]:
# Ordinal codes for the days of the week: Monday=1 through Sunday=7.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dictionary = {name: position for position, name in enumerate(weekday_names, start=1)}

In [29]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [30]:
data['weekday_ordinal'] = data['weekday'].map(dictionary)

In [31]:
data.head()

Unnamed: 0,Day,weekday,weekday_ordinal
0,2020-08-09 18:53:59.753511,Sunday,7
1,2020-08-08 18:53:59.753511,Saturday,6
2,2020-08-07 18:53:59.753511,Friday,5
3,2020-08-06 18:53:59.753511,Thursday,4
4,2020-08-05 18:53:59.753511,Wednesday,3


## Count Or Frequency Encoding 

In [3]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',header=None,index_col=None)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
len(train_set[1].unique())

9

In [5]:
# Keep only the selected columns (addressed by position because the file has no header row).
# .copy() makes the selection an independent frame, so the column assignments made
# later do not raise SettingWithCopyWarning or depend on the frame it was sliced from.
columns = [1,3,5,6,7,8,9,13]
train_set = train_set[columns].copy()

In [6]:
train_set.columns = ['Employment','Degree','Status','Designation','Family_Job','Race','Sex','Country']

In [7]:
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_Job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [8]:
# Print how many distinct labels each column holds.
for col in train_set.columns:
    n_labels = len(train_set[col].unique())
    print(col, ":", n_labels, "labels")
  

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
Family_Job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [9]:
train_set['Country'].value_counts()

 United-States                 29170
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 

In [10]:
country_map = train_set['Country'].value_counts().to_dict()

In [12]:
# Count/frequency encoding: replace each country label with the number of rows it appears in.
# NOTE: this overwrites 'Country' in place, so running the cell a second time looks the
# counts themselves up in country_map and yields NaN; re-create train_set before re-running.
train_set['Country'] = train_set['Country'].map(country_map)
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_Job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95


#### Advantages
>* Easy to use <br>
>* Does not increase the feature space

#### Disadvantages
>* Labels that occur equally often get the same encoded value, so the model cannot tell them apart

### Target Guided Ordinal Encoding
* Order the labels according to the mean of the target for each label (the probability of the target being 1)
* Replace each label by its position (rank) in that ordering

In [43]:
dft = pd.read_csv('titanic.csv',usecols = ['Cabin','Survived']) 

In [44]:
dft.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [45]:
# Label missing cabins explicitly. The result is assigned back instead of calling
# fillna(inplace=True) on the selected column: that form acts on an intermediate
# object, emits a FutureWarning on recent pandas 2.x releases and leaves dft
# unchanged once Copy-on-Write is enabled.
dft['Cabin'] = dft['Cabin'].fillna("Missing")
dft.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [46]:
dft['Cabin'] = dft['Cabin'].astype(str).str[0]

In [47]:
dft.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [48]:
dft.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [49]:
dft.groupby('Cabin')['Survived'].mean()
#dft.groupby('Cabin')['Survived'].sum()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [50]:
ordinal_labels = dft.groupby('Cabin')['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [51]:
# Map every label in ordinal_labels to its position in that index, starting at 0.
ordinal_labels2 = dict(zip(ordinal_labels, range(len(ordinal_labels))))

In [52]:
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [53]:
dft['Cabin_ordinal_labels'] = dft['Cabin'].map(ordinal_labels2)

In [54]:
dft.head(10)

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1
5,0,M,1
6,0,E,7
7,0,M,1
8,1,M,1
9,1,M,1


### Mean Encoding

In [57]:
 mean_ordinal = dft.groupby('Cabin')['Survived'].mean().to_dict()


In [58]:
dft['Mean_ordinal_encode'] = dft['Cabin'].map(mean_ordinal)

In [60]:
dft.head(10)

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,Mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854
5,0,M,1,0.299854
6,0,E,7,0.75
7,0,M,1,0.299854
8,1,M,1,0.299854
9,1,M,1,0.299854


#### Probability Ratio Encoding
1. Probability of Survived for each Cabin category (the categorical feature)
2. Probability of Not Survived: 1 - pr(Survived)
3. Ratio: pr(Survived) / pr(Not Survived)
4. Dictionary that maps each cabin category to its probability ratio
5. Replace the categorical feature with the mapped ratio

In [2]:
df=pd.read_csv('titanic.csv',usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [3]:
### Replacing missing cabins with an explicit 'Missing' label.
# The result is assigned back instead of calling fillna(inplace=True) on the selected
# column: that form acts on an intermediate object, emits a FutureWarning on recent
# pandas 2.x releases and leaves df unchanged once Copy-on-Write is enabled.
df['Cabin'] = df['Cabin'].fillna('Missing')
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [None]:
df['Cabin'].unique()

In [4]:
df['Cabin']=df['Cabin'].astype(str).str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [5]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [7]:
prob_df=df.groupby(['Cabin'])['Survived'].mean()

In [11]:
prob_df = pd.DataFrame(prob_df)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [13]:
prob_df['Died'] = 1-prob_df['Survived']
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [14]:
# Odds of survival per cabin category: P(Survived) / P(Died).
# A category in which every passenger survived has Died == 0, which would make the
# ratio inf; a tiny denominator is substituted so the encoded value stays finite.
# Categories with Died > 0 are unaffected.
prob_df['Prob_ratio'] = prob_df['Survived']/prob_df['Died'].replace(0, 1e-5)

In [15]:
prob_df

Unnamed: 0_level_0,Survived,Died,Prob_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0
F,0.615385,0.384615,1.6
G,0.5,0.5,1.0
M,0.299854,0.700146,0.428274
T,0.0,1.0,0.0


In [18]:
prob_encoded = prob_df['Prob_ratio'].to_dict()

In [19]:
df['Cabin_encode'] = df['Cabin'].map(prob_encoded)

In [20]:
df.head(10)

Unnamed: 0,Survived,Cabin,Cabin_encode
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
5,0,M,0.428274
6,0,E,3.0
7,0,M,0.428274
8,1,M,0.428274
9,1,M,0.428274
