## Encoding Categorical Data

In [8]:
import numpy as np
import pandas as pd

In [9]:
# Load the disease dataset from a path relative to the notebook, then preview
# five rows; the fixed random_state makes the sample repeatable.
df = pd.read_csv("datasets/disease.csv")
df.sample(5, random_state=54)


Unnamed: 0,gender,city,age,bp,cough,disease
36,Female,Shaikhupura,38.0,normal,Mild,No
70,Female,Islamabad,68.0,normal,Strong,No
48,Male,Shaikhupura,66.0,low,Moderate,No
94,Male,Lahore,79.0,normal,Strong,Yes
81,Male,Islamabad,65.0,normal,Mild,No


In [10]:
# Shape of the frame as (n_rows, n_columns).
df.shape

(100, 6)

In [11]:
# Count the distinct values in each column (missing values are excluded by default).
df.nunique()

gender      2
city        4
age        54
bp          3
cough       3
disease     2
dtype: int64

In [12]:
# Print each column's dtype and non-null count; a non-null count below the
# number of rows reveals a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   gender   100 non-null    object 
 1   city     100 non-null    object 
 2   age      87 non-null     float64
 3   bp       100 non-null    object 
 4   cough    100 non-null    object 
 5   disease  100 non-null    object 
dtypes: float64(1), object(5)
memory usage: 4.8+ KB


In [13]:
# Frequency of each city, most common first.
df["city"].value_counts()

Lahore         32
Shaikhupura    30
Islamabad      22
Karachi        16
Name: city, dtype: int64

In [14]:
# Frequency of each blood-pressure level, most common first.
df["bp"].value_counts()

normal    47
low       28
high      25
Name: bp, dtype: int64

In [15]:
# Frequency of each cough severity, most common first.
df["cough"].value_counts()

Mild        47
Strong      30
Moderate    23
Name: cough, dtype: int64

## Sklearn's LabelEncoder Vs OrdinalEncoder
### i) Label Encoder (target column only)

In [18]:
# Select the target by name rather than by position so the cell keeps working
# if columns are added or reordered ('disease' is the last column of the file).
y = df['disease']
y

0     Yes
1      No
2      No
3     Yes
4      No
     ... 
95     No
96    Yes
97     No
98     No
99    Yes
Name: disease, Length: 100, dtype: object

In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
# Fit the encoder on the target labels (fit returns the encoder itself), then
# map every label to its integer code.
lb = LabelEncoder().fit(y)
transformed_target = lb.transform(y)

In [21]:
# Labels learned by the encoder; each label is encoded as its index in this array.
lb.classes_

array(['No', 'Yes'], dtype=object)

In [22]:
# Integer-encoded target produced by the fitted LabelEncoder.
transformed_target

array([1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1])

In [23]:
# Shape of the encoded target array.
transformed_target.shape

(100,)

### ii) Ordinal Encoder (Input Features)

In [24]:

# Select the ordinal features by name, in the order bp then cough. The category
# lists passed to OrdinalEncoder are matched to columns by position, so a
# positional slice would silently break if the frame's column order changed.
df_bp_cough = df[['bp', 'cough']]

In [25]:
from sklearn.preprocessing import OrdinalEncoder

In [26]:
# Spell out the category order for each column. Without it the encoder sorts
# the categories itself, which would not follow the low < normal < high and
# Mild < Moderate < Strong ranking intended here.
oe = OrdinalEncoder(
    categories=[
        ['low', 'normal', 'high'],        # first column: bp
        ['Mild', 'Moderate', 'Strong'],   # second column: cough
    ],
    dtype=np.int8,
).fit(df_bp_cough)
new_df = oe.transform(df_bp_cough)

In [27]:
# Category order used for each encoded column; a value's position in its array
# is the ordinal code it is mapped to.
oe.categories_

[array(['low', 'normal', 'high'], dtype=object),
 array(['Mild', 'Moderate', 'Strong'], dtype=object)]

In [28]:
# Encoded result. Despite its name, `new_df` is a NumPy array, not a DataFrame:
# column 0 holds the bp codes and column 1 the cough codes.
new_df

array([[0, 1],
       [0, 0],
       [1, 2],
       [2, 1],
       [2, 0],
       [2, 0],
       [1, 2],
       [1, 1],
       [2, 2],
       [0, 0],
       [0, 0],
       [2, 1],
       [1, 2],
       [0, 0],
       [0, 0],
       [1, 2],
       [1, 0],
       [0, 1],
       [2, 0],
       [2, 2],
       [0, 2],
       [2, 1],
       [1, 2],
       [1, 0],
       [1, 1],
       [1, 0],
       [2, 1],
       [1, 2],
       [1, 0],
       [2, 1],
       [0, 0],
       [1, 0],
       [2, 2],
       [1, 1],
       [0, 0],
       [1, 2],
       [1, 0],
       [1, 1],
       [2, 0],
       [1, 1],
       [1, 0],
       [0, 0],
       [0, 0],
       [0, 1],
       [1, 2],
       [1, 0],
       [1, 1],
       [1, 0],
       [0, 1],
       [0, 0],
       [1, 0],
       [2, 1],
       [2, 2],
       [2, 0],
       [1, 0],
       [1, 0],
       [2, 2],
       [0, 2],
       [0, 2],
       [2, 0],
       [1, 2],
       [0, 1],
       [0, 2],
       [1, 0],
       [1, 0],
       [0, 1],
       [2,

## Nominal Encoding
### i) Pandas (`get_dummies`)

In [29]:
# Select the column by name instead of by position. The double brackets keep
# the result a one-column DataFrame rather than a Series.
X_Gender = df[['gender']]
X_Gender

Unnamed: 0,gender
0,Male
1,Male
2,Male
3,Female
4,Female
...,...
95,Female
96,Female
97,Female
98,Female


In [30]:
# One indicator column per gender value. dtype is pinned to uint8 (the default
# before pandas 2.0) so the result is 0/1 instead of True/False on newer pandas.
pd.get_dummies(data=X_Gender, dtype=np.uint8)

Unnamed: 0,gender_Female,gender_Male
0,0,1
1,0,1
2,0,1
3,1,0
4,1,0
...,...,...
95,1,0
96,1,0
97,1,0
98,1,0


In [31]:
# Drop the first indicator column to avoid multicollinearity (the dummy-variable
# trap): the dropped category is implied when all remaining indicators are 0.
# dtype is pinned to uint8 (the default before pandas 2.0) so the result is 0/1
# instead of True/False on newer pandas.
pd.get_dummies(data=X_Gender, drop_first=True, dtype=np.uint8)

Unnamed: 0,gender_Male
0,1
1,1
2,1
3,0
4,0
...,...
95,0
96,0
97,0
98,0


In [32]:
# Encode only the nominal columns; every other column is passed through as is.
# dtype is pinned to uint8 (the default before pandas 2.0) so the indicator
# columns are 0/1 instead of True/False on newer pandas.
pd.get_dummies(data=df, columns=['gender', 'city'], drop_first=True, dtype=np.uint8)

Unnamed: 0,age,bp,cough,disease,gender_Male,city_Karachi,city_Lahore,city_Shaikhupura
0,60.0,low,Moderate,Yes,1,0,1,0
1,27.0,low,Mild,No,1,0,0,0
2,,normal,Strong,No,1,0,0,0
3,31.0,high,Moderate,Yes,0,0,1,0
4,65.0,high,Mild,No,0,1,0,0
...,...,...,...,...,...,...,...,...
95,,normal,Mild,No,0,0,0,1
96,51.0,high,Strong,Yes,0,0,1,0
97,20.0,normal,Mild,No,0,0,0,1
98,5.0,low,Moderate,No,0,1,0,0


### ii) One-Hot Encoding (sklearn `OneHotEncoder`)

In [33]:
# Select the nominal features by name rather than by position so the cell keeps
# working if columns are added or reordered.
X_gender_city = df[['gender', 'city']]
X_gender_city

Unnamed: 0,gender,city
0,Male,Lahore
1,Male,Islamabad
2,Male,Islamabad
3,Female,Lahore
4,Female,Karachi
...,...,...
95,Female,Shaikhupura
96,Female,Lahore
97,Female,Shaikhupura
98,Female,Karachi


In [34]:
from sklearn.preprocessing import OneHotEncoder

In [35]:
# Dense integer one-hot encoding with the first category of each feature dropped.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older installations
# (an unknown keyword raises TypeError).
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False, dtype=np.int32)
ohe.fit(X_gender_city)



In [36]:
# Categories learned for each input column; with drop='first' the first entry
# of each array gets no output column.
ohe.categories_

[array(['Female', 'Male'], dtype=object),
 array(['Islamabad', 'Karachi', 'Lahore', 'Shaikhupura'], dtype=object)]

In [37]:
# With drop='first', gender (2 categories) yields 1 column and city
# (4 categories) yields 3, for 4 output columns in total.
ohe.transform(X_gender_city)

array([[1, 0, 1, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 1],
       [1, 0, 0, 1],
       [1, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [1, 0, 0, 1],
       [1, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 0, 0],
       [0, 0, 1, 0],
       [1, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 0],
       [1, 0, 1, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 1, 0],
       [0, 0, 0, 0],
       [0, 0, 1, 0],
       [1, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [1, 0, 1, 0],
       [0, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 0],
       [1, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [1, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0,

## Train Test Split and Transformers

In [39]:
# Re-read the CSV so this section starts from the file's raw contents, then
# preview the same five reproducibly sampled rows.
df = pd.read_csv("datasets/disease.csv")
df.sample(5, random_state=54)

Unnamed: 0,gender,city,age,bp,cough,disease
36,Female,Shaikhupura,38.0,normal,Mild,No
70,Female,Islamabad,68.0,normal,Strong,No
48,Male,Shaikhupura,66.0,low,Moderate,No
94,Male,Lahore,79.0,normal,Strong,Yes
81,Male,Islamabad,65.0,normal,Mild,No


In [40]:
from sklearn.model_selection import train_test_split

In [41]:
# Features are every column except the target; the target is the 'disease' column.
X = df.drop(columns='disease')
y = df['disease']
X, y

(    gender         city   age      bp     cough
 0     Male       Lahore  60.0     low  Moderate
 1     Male    Islamabad  27.0     low      Mild
 2     Male    Islamabad   NaN  normal    Strong
 3   Female       Lahore  31.0    high  Moderate
 4   Female      Karachi  65.0    high      Mild
 ..     ...          ...   ...     ...       ...
 95  Female  Shaikhupura   NaN  normal      Mild
 96  Female       Lahore  51.0    high    Strong
 97  Female  Shaikhupura  20.0  normal      Mild
 98  Female      Karachi   5.0     low  Moderate
 99  Female       Lahore  10.0     low    Strong
 
 [100 rows x 5 columns],
 0     Yes
 1      No
 2      No
 3     Yes
 4      No
      ... 
 95     No
 96    Yes
 97     No
 98     No
 99    Yes
 Name: disease, Length: 100, dtype: object)

In [42]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=54,
)
X_train.head(), y_train.head()

(    gender         city   age      bp     cough
 90  Female    Islamabad  59.0     low  Moderate
 31    Male       Lahore   NaN  normal      Mild
 58    Male      Karachi  23.0     low    Strong
 74  Female    Islamabad  34.0    high    Strong
 89    Male  Shaikhupura  46.0     low    Strong,
 90     No
 31     No
 58    Yes
 74     No
 89     No
 Name: disease, dtype: object)

In [43]:
# Fit the one-hot encoder on the training split only, then apply the same
# mapping to both splits so the test data does not influence the encoding.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older installations
# (an unknown keyword raises TypeError).
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False, dtype=np.int32)
ohe.fit(X_train[['gender', 'city']])
X_train_trans = ohe.transform(X_train[['gender', 'city']])
X_test_trans = ohe.transform(X_test[['gender', 'city']])
X_train_trans.shape, X_test_trans.shape



((80, 4), (20, 4))

In [44]:
# Fit the ordinal encoder on the training split only, then apply the same
# mapping to both splits. Category lists are given in rank order, one per column.
oe = OrdinalEncoder(
    categories=[
        ['low', 'normal', 'high'],        # bp
        ['Mild', 'Moderate', 'Strong'],   # cough
    ],
    dtype=np.int8,
).fit(X_train[['bp', 'cough']])
bp_cough_train = oe.transform(X_train[['bp', 'cough']])
bp_cough_test = oe.transform(X_test[['bp', 'cough']])
bp_cough_train.shape, bp_cough_test.shape

((80, 2), (20, 2))

In [45]:
# Learn the label mapping from the training targets and reuse it for the test targets.
# NOTE: y_train / y_test are overwritten with their encoded versions, so re-running
# this cell on its own would refit the encoder on already-encoded integers.
le = LabelEncoder().fit(y_train)
y_train, y_test = le.transform(y_train), le.transform(y_test)
y_train.shape, y_test.shape

((80,), (20,))

In [46]:
from sklearn.impute import SimpleImputer

# Replace missing ages with the mean age. The mean is learned from the training
# split only and then reused for the test split.
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train[['age']])
X_train_age = imp.transform(X_train[['age']])
X_test_age = imp.transform(X_test[['age']])
X_train_age.shape, X_test_age.shape

((80, 1), (20, 1))