In [1]:
import pandas as pd
import numpy as np

## Handling categorical features

### 1. One Hot encoding
### 2. Ordinal Number encoding
### 3. Count or Frequency encoding
### 4. Target Guided Ordinal Encoding

## 1. One Hot encoding

#### Creating dummies

In [2]:
# Load only the two categorical columns needed for the one-hot encoding demo.
# NOTE(review): the path below is an absolute, machine-specific location; the
# notebook will only run where this exact file exists.
df = pd.read_csv(
    'D:/pandas data analysis/csv data/pandas data analysis/titanic_data.csv',
    usecols=['Sex', 'Embarked'],
)
df.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [3]:
# One-hot encode Sex; drop_first=True removes the first level's column so a
# two-level feature collapses to a single indicator column.
sex_dummies = pd.get_dummies(df['Sex'], drop_first=True)
sex_dummies.head()

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [4]:
# List the distinct Embarked values (missing values show up as nan).
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [5]:
# Count how many Embarked entries are missing.
df['Embarked'].isna().sum()

2

In [6]:
# Frequency of each Embarked category, most common first (NaN is excluded by default).
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [7]:
# Replace NaN with the most frequently used value ('S')

In [8]:
# Impute missing Embarked values with the most frequent category.
# value_counts() sorts by descending count, so index[0] is the most common label.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the attribute-accessed Series: under pandas
# Copy-on-Write that in-place call operates on a temporary object and leaves
# df unchanged (pandas 2.x emits a FutureWarning about exactly this pattern).
most_frequent_port = df['Embarked'].value_counts().index[0]
df['Embarked'] = df['Embarked'].fillna(most_frequent_port)

In [9]:
# One-hot encode Embarked; with drop_first=True the first level's column is
# dropped, so k levels produce k-1 indicator columns.
embarked_dummies = pd.get_dummies(df['Embarked'], drop_first=True)
embarked_dummies.head()

Unnamed: 0,Q,S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


#### Disadvantage:
It creates many additional features (the curse of dimensionality).

#### Creating one-hot encoding for a feature with many categories (KDD Cup Orange challenge)
#### Here we take the 10 most frequently occurring categories and create a 0/1 indicator column for each one; rows belonging to any of the remaining categories get 0 in all of these columns. This technique is used in Kaggle competitions.

In [10]:
# Load columns X0 through X5 of the Mercedes dataset.
df = pd.read_csv(
    "mercedes.csv",
    usecols=[f'X{n}' for n in range(6)],
)
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5
0,k,v,at,a,d,u
1,k,t,av,e,d,y
2,az,w,n,c,d,x
3,az,t,n,f,d,x
4,az,v,n,f,d,h


In [11]:
# Checking the unique values of the X0 feature

In [12]:
# List the distinct categories present in X0.
df['X0'].unique()

array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
       'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
       'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
       'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab'],
      dtype=object)

In [13]:
# value_counts() orders categories by descending frequency, so the first ten
# index labels are the ten most common X0 values.
top_10 = df['X0'].value_counts().index[:10].tolist()
top_10

['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']

In [14]:
# Add one 0/1 indicator column per frequent category. Rows whose X0 value is
# not in top_10 end up with 0 in every indicator column.
for category in top_10:
    df[category] = (df['X0'] == category).astype(int)

In [15]:
# Preview the first ten rows, including the new indicator columns.
df.iloc[:10]

Unnamed: 0,X0,X1,X2,X3,X4,X5,z,ak,y,ay,t,x,o,f,n,w
0,k,v,at,a,d,u,0,0,0,0,0,0,0,0,0,0
1,k,t,av,e,d,y,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,0,0,0,0,0,0,0,0,0,0
5,t,b,e,c,d,g,0,0,0,0,1,0,0,0,0,0
6,al,r,e,f,d,f,0,0,0,0,0,0,0,0,0,0
7,o,l,as,f,d,f,0,0,0,0,0,0,1,0,0,0
8,w,s,as,e,d,f,0,0,0,0,0,0,0,0,0,1
9,j,b,aq,c,d,f,0,0,0,0,0,0,0,0,0,0


## 2. Ordinal Number encoding

#### Ranking categories in the feature

In [100]:
import datetime

In [101]:
# Capture the current local date and time; the cells below derive a run of
# consecutive days from it.
# NOTE(review): this value changes on every run, so the outputs shown in this
# section are not reproducible.
today_date=datetime.datetime.today()

In [102]:
# Display the captured timestamp.
today_date

datetime.datetime(2021, 7, 14, 12, 23, 28, 907081)

In [103]:
# A timedelta represents a duration; subtracting one day from the timestamp
# yields the same time on the previous day.
today_date - datetime.timedelta(days=1)

datetime.datetime(2021, 7, 13, 12, 23, 28, 907081)

In [104]:
# List comprehension: the captured timestamp followed by the 14 preceding days,
# newest first.
days = [today_date - datetime.timedelta(days=offset) for offset in range(15)]

In [105]:
# Display the generated list of timestamps.
days

[datetime.datetime(2021, 7, 14, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 13, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 12, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 11, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 10, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 9, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 8, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 7, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 6, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 5, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 4, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 3, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 2, 12, 23, 28, 907081),
 datetime.datetime(2021, 7, 1, 12, 23, 28, 907081),
 datetime.datetime(2021, 6, 30, 12, 23, 28, 907081)]

In [106]:
# Wrap the timestamps in a single-column DataFrame whose column is named 'Day'.
df = pd.DataFrame({'Day': days})
df.head()

Unnamed: 0,Day
0,2021-07-14 12:23:28.907081
1,2021-07-13 12:23:28.907081
2,2021-07-12 12:23:28.907081
3,2021-07-11 12:23:28.907081
4,2021-07-10 12:23:28.907081


In [107]:
# Derive the weekday name of every timestamp and store it as a new column.
weekday_names = df['Day'].dt.day_name()
df['Day_name'] = weekday_names

In [108]:
# Display the frame, now including the Day_name column.
df

Unnamed: 0,Day,Day_name
0,2021-07-14 12:23:28.907081,Wednesday
1,2021-07-13 12:23:28.907081,Tuesday
2,2021-07-12 12:23:28.907081,Monday
3,2021-07-11 12:23:28.907081,Sunday
4,2021-07-10 12:23:28.907081,Saturday
5,2021-07-09 12:23:28.907081,Friday
6,2021-07-08 12:23:28.907081,Thursday
7,2021-07-07 12:23:28.907081,Wednesday
8,2021-07-06 12:23:28.907081,Tuesday
9,2021-07-05 12:23:28.907081,Monday


In [109]:
# Ordinal rank for each weekday name: Monday=1 through Sunday=7.
dic = dict(zip(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    range(1, 8),
))

In [110]:
# Display the weekday-name-to-rank mapping.
dic

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [111]:
# Translate each weekday name into its ordinal rank using the lookup table.
day_ranks = df['Day_name'].map(dic)
df['Day_val'] = day_ranks

In [112]:
# Display the frame, now including the ordinal Day_val column.
df

Unnamed: 0,Day,Day_name,Day_val
0,2021-07-14 12:23:28.907081,Wednesday,3
1,2021-07-13 12:23:28.907081,Tuesday,2
2,2021-07-12 12:23:28.907081,Monday,1
3,2021-07-11 12:23:28.907081,Sunday,7
4,2021-07-10 12:23:28.907081,Saturday,6
5,2021-07-09 12:23:28.907081,Friday,5
6,2021-07-08 12:23:28.907081,Thursday,4
7,2021-07-07 12:23:28.907081,Wednesday,3
8,2021-07-06 12:23:28.907081,Tuesday,2
9,2021-07-05 12:23:28.907081,Monday,1
