In [1]:
import pandas as pd
import numpy as np

## Handling categorical features

### 1. One Hot encoding
### 2. Ordinal Number encoding
### 3. Count or Frequency Encoding
### 4. Target Guided Ordinal Encoding
### 5. Mean Encoding

## 1. One Hot encoding

#### Creating dummies

In [2]:
# Read only the two categorical Titanic columns needed for the one-hot examples.
# NOTE(review): the path is absolute and machine-specific.
df = pd.read_csv(
    'D:/pandas data analysis/csv data/pandas data analysis/titanic_data.csv',
    usecols=['Sex', 'Embarked'],
)
df.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [3]:
# drop_first=True drops the first dummy level so the remaining column(s) are not redundant.
pd.get_dummies(df['Sex'], drop_first=True).head()

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [4]:
# Distinct values of the Embarked column (NaN is reported as its own entry).
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [5]:
# Number of missing Embarked entries.
df['Embarked'].isna().sum()

2

In [6]:
# Frequency of each embarkation port, most common first.
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [7]:
# Replace NaN with the most frequently occurring value ('S')

In [8]:
# Fill the missing ports with the most frequent one (value_counts() sorts by
# descending frequency, so index[0] is the most common value).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df.Embarked`: that form is a chained in-place update,
# which emits a FutureWarning on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is active (the default from pandas 3.0).
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].value_counts().index[0])

In [9]:
# One-hot encode the port column; the first level is dropped by drop_first=True.
pd.get_dummies(
    df['Embarked'],
    drop_first=True,
).head()

Unnamed: 0,Q,S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


#### Disadvantage:
It creates many additional features (curse of dimensionality).

#### Creating one hot encoding with many categories in a feature (KDD Orange Cup)
#### Here we take the 10 most frequently recurring values, create a 0/1 indicator column for each of them, and leave every remaining category as 0 in all indicators. It is a technique used in Kaggle competitions.

In [10]:
# Load the six categorical columns X0..X5 of the Mercedes dataset.
df = pd.read_csv(
    "mercedes.csv",
    usecols=['X0', 'X1', 'X2', 'X3', 'X4', 'X5'],
)
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5
0,k,v,at,a,d,u
1,k,t,av,e,d,y
2,az,w,n,c,d,x
3,az,t,n,f,d,x
4,az,v,n,f,d,h


In [11]:
# Checking the unique values of the X0 feature

In [12]:
# Every distinct category present in X0.
df['X0'].unique()

array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
       'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
       'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
       'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab'],
      dtype=object)

In [13]:
# value_counts() orders categories by descending frequency, so the first ten
# index labels are the ten most common X0 values.
top_10 = df['X0'].value_counts().index[:10].tolist()
top_10

['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']

In [14]:
# Add one 0/1 indicator column per frequent category; a row whose X0 value is
# not in top_10 ends up with 0 in every indicator column.
for category in top_10:
    is_category = df['X0'] == category
    df[category] = np.where(is_category, 1, 0)

In [15]:
df.head(10)

Unnamed: 0,X0,X1,X2,X3,X4,X5,z,ak,y,ay,t,x,o,f,n,w
0,k,v,at,a,d,u,0,0,0,0,0,0,0,0,0,0
1,k,t,av,e,d,y,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,0,0,0,0,0,0,0,0,0,0
5,t,b,e,c,d,g,0,0,0,0,1,0,0,0,0,0
6,al,r,e,f,d,f,0,0,0,0,0,0,0,0,0,0
7,o,l,as,f,d,f,0,0,0,0,0,0,1,0,0,0
8,w,s,as,e,d,f,0,0,0,0,0,0,0,0,0,1
9,j,b,aq,c,d,f,0,0,0,0,0,0,0,0,0,0


## 2. Ordinal Number encoding

#### Ranking categories in the feature

In [100]:
import datetime

In [101]:
# Current local date and time; used as the anchor for the example date range.
today_date = datetime.datetime.today()

In [102]:
today_date

datetime.datetime(2021, 7, 14, 12, 23, 28, 907081)

In [103]:
# A timedelta represents a duration; subtracting one day gives the same time yesterday.
today_date - datetime.timedelta(days=1)

datetime.datetime(2021, 7, 13, 12, 23, 28, 907081)

In [104]:
# List comprehension: today and the 14 preceding days, newest first.
days = [today_date - datetime.timedelta(days=offset) for offset in range(15)]

In [105]:
days

[datetime.datetime(2021, 7, 14, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 13, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 12, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 11, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 10, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 9, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 8, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 7, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 6, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 5, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 4, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 3, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 2, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 1, 12, 23, 28, 907081),
 datetime.datetime(2021, 6, 30, 12, 23, 28, 907081)]

In [106]:
# Turn the list of timestamps into a one-column DataFrame named 'Day'.
df = pd.DataFrame(data=days, columns=['Day'])
df.head()

Unnamed: 0,Day
0,2021-07-14 12:23:28.907081
1,2021-07-13 12:23:28.907081
2,2021-07-12 12:23:28.907081
3,2021-07-11 12:23:28.907081
4,2021-07-10 12:23:28.907081


In [107]:
# Derive the weekday name of each timestamp through the .dt accessor.
df['Day_name'] = df['Day'].dt.day_name()

In [108]:
df

Unnamed: 0,Day,Day_name
0,2021-07-14 12:23:28.907081,Wednesday
1,2021-07-13 12:23:28.907081,Tuesday
2,2021-07-12 12:23:28.907081,Monday
3,2021-07-11 12:23:28.907081,Sunday
4,2021-07-10 12:23:28.907081,Saturday
5,2021-07-09 12:23:28.907081,Friday
6,2021-07-08 12:23:28.907081,Thursday
7,2021-07-07 12:23:28.907081,Wednesday
8,2021-07-06 12:23:28.907081,Tuesday
9,2021-07-05 12:23:28.907081,Monday


In [109]:
# Ordinal code for each weekday name: Monday -> 1 through Sunday -> 7.
dic = {
    day_name: rank
    for rank, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}

In [110]:
dic

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [111]:
# Ordinal encoding: look up each weekday name in the `dic` mapping.
df['Day_val'] = df['Day_name'].map(dic)

In [112]:
df

Unnamed: 0,Day,Day_name,Day_val
0,2021-07-14 12:23:28.907081,Wednesday,3
1,2021-07-13 12:23:28.907081,Tuesday,2
2,2021-07-12 12:23:28.907081,Monday,1
3,2021-07-11 12:23:28.907081,Sunday,7
4,2021-07-10 12:23:28.907081,Saturday,6
5,2021-07-09 12:23:28.907081,Friday,5
6,2021-07-08 12:23:28.907081,Thursday,4
7,2021-07-07 12:23:28.907081,Wednesday,3
8,2021-07-06 12:23:28.907081,Tuesday,2
9,2021-07-05 12:23:28.907081,Monday,1


## 3. Count or Frequency Encoding

In [1]:
# The file is read with header=None, so columns get integer labels 0..N-1.
df = pd.read_csv('adult.data', header=None)

<IPython.core.display.Javascript object>

In [2]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Give the 15 positional columns descriptive names.
df.columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'labels',
]

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,labels
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Keep only the object-dtype (string) feature columns and discard the target column.
df = df.select_dtypes(include='object').drop(columns=['labels'])
df.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [6]:
# Print the number of distinct labels in every feature (a NaN, if present, counts as one label).
for column_name in df.columns:
    label_count = df[column_name].nunique(dropna=False)
    print(column_name, ':', label_count, 'labels')

workclass : 9 labels
education : 16 labels
marital-status : 7 labels
occupation : 15 labels
relationship : 6 labels
race : 5 labels
sex : 2 labels
native-country : 42 labels


In [7]:
# Relabel the placeholder ' ?' (note the leading space) as ' Others'; all other values are kept.
df['native-country'] = df['native-country'].mask(df['native-country'] == ' ?', ' Others')

<IPython.core.display.Javascript object>

In [8]:
# Map each country label to the number of rows it appears in.
# Note: this rebinds `dic`, replacing the weekday mapping defined earlier in the notebook.
country_counts = df['native-country'].value_counts()
dic = country_counts.to_dict()

In [9]:
# Count encoding: substitute every country label with its row count from `dic`.
df['native-country'] = df['native-country'].map(dic)

In [10]:
df.head(20) 

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170


##### Advantages
1. Easy to use
2. Does not increase the feature space
##### Disadvantages
1. Categories that occur equally often receive the same encoded value, so they can no longer be told apart

## 4. Target Guided Ordinal Encoding

In [11]:
# Read the categorical feature (Cabin) together with the target (Survived).
# NOTE(review): the path is absolute and machine-specific.
df = pd.read_csv(
    'D:/pandas data analysis/csv data/pandas data analysis/titanic_data.csv',
    usecols=['Cabin', 'Survived'],
)
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [12]:
# Number of passengers without a recorded cabin.
df['Cabin'].isna().sum()

687

In [13]:
# Treat a missing cabin as its own category, labelled 'Missing'.
df['Cabin'] = df['Cabin'].fillna('Missing')

In [14]:
# Distinct cabin labels after filling the missing values.
df['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [15]:
# There are many cabin categories in this feature, so we keep only the first letter of each category.

In [16]:
# Reduce every cabin label to its first character ('Missing' becomes 'M').
cabin_text = df['Cabin'].astype(str)
df['Cabin'] = cabin_text.str.get(0)

In [17]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [18]:
# Distinct cabin letters remaining after truncation.
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [19]:
# Cabin letters ordered from lowest to highest mean of Survived.
survival_by_cabin = df.groupby('Cabin')['Survived'].mean()
ordinal_val = survival_by_cabin.sort_values().index

In [20]:
# Assign each cabin letter its position (0, 1, 2, ...) in the ordering held by
# ordinal_val; this dictionary is the mapping applied to the Cabin column.
ordinal_val2 = dict(zip(ordinal_val, range(len(ordinal_val))))

In [21]:
ordinal_val2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [22]:
# Target-guided ordinal encoding: replace each cabin letter by its rank from ordinal_val2.
df['cabin_targ'] = df['Cabin'].map(ordinal_val2)

In [23]:
df.head()

Unnamed: 0,Survived,Cabin,cabin_targ
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


## 5. Mean encoding

In [24]:
# Mean of Survived for every cabin letter.
mean_ordinal = df.groupby('Cabin')['Survived'].mean()

In [25]:
# Same idea as target-guided ordinal encoding, except that the per-category mean
# itself is used as the encoded value instead of a rank derived from it.
mean_ordinal = mean_ordinal.to_dict()

In [26]:
# Mean encoding: replace each cabin letter by the value stored for it in mean_ordinal.
df['Cabin_mean'] = df['Cabin'].map(mean_ordinal)

In [27]:
df.head()

Unnamed: 0,Survived,Cabin,cabin_targ,Cabin_mean
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854
