# Feature Engineering 
# ENCODING
# COLUMN TRANSFORMER



In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the toy COVID dataset (path is relative to the working directory).
df = pd.read_csv('covid_toy.csv')

In [8]:
# Show five randomly sampled rows to get a feel for the columns.
df.sample(5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
4,65,Female,101.0,Mild,Mumbai,No
50,19,Male,101.0,Mild,Delhi,Yes
14,51,Male,104.0,Mild,Bangalore,No
0,60,Male,103.0,Mild,Kolkata,No
23,80,Female,98.0,Mild,Delhi,Yes


In [9]:
# Count the missing values in each column (isna is the canonical name for isnull).
df.isna().sum()


age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [18]:
# Features are the first five columns; the label is the last column.
features = df.iloc[:, :5]
target = df.iloc[:, -1]

# Hold out 20% of the rows for testing. No random_state is set, so the
# split differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

# Manual way ("Aam Zindagi" - the ordinary life): transform each column by hand

In [19]:
# Fill the missing values in fever.
# SimpleImputer() with its default settings replaces NaN with the column mean.
si = SimpleImputer()

# Learn the fill value from the training rows only ...
si.fit(x_train[['fever']])

# ... then apply that same value to both splits.
x_train_fever = si.transform(x_train[['fever']])
x_test_fever = si.transform(x_test[['fever']])

In [20]:
# Ordinal encoding for cough, with an explicit order: Mild -> 0, Strong -> 1.

oe = OrdinalEncoder(categories=[["Mild", "Strong"]])

# Fit on the training split only.
x_train_cough = oe.fit_transform(x_train[['cough']])

# The test split is only transformed: an encoder is fitted once on the
# training data and then reused, never refitted on the rows it is
# evaluated on.
x_test_cough = oe.transform(x_test[['cough']])

x_train

Unnamed: 0,age,gender,fever,cough,city
89,46,Male,103.0,Strong,Bangalore
59,6,Female,104.0,Mild,Kolkata
70,68,Female,101.0,Strong,Delhi
20,12,Male,98.0,Strong,Bangalore
92,82,Female,102.0,Strong,Kolkata
...,...,...,...,...,...
6,14,Male,101.0,Strong,Bangalore
85,16,Female,103.0,Mild,Bangalore
51,11,Female,100.0,Strong,Kolkata
16,69,Female,103.0,Mild,Kolkata


In [21]:
# One-hot encoding for gender and city.
# drop="first" removes the first category of each feature so the dummy
# columns are not redundant; sparse_output=False returns a dense NumPy
# array that can be concatenated with the other columns later.

ohe = OneHotEncoder(drop="first", sparse_output=False)

# Learn the categories from the training split only.
x_train_gender_city = ohe.fit_transform(x_train[['gender', 'city']])

# Reuse the categories learned above. Refitting on the test split could
# produce a different number or order of dummy columns whenever a category
# is absent from the test rows, so the test matrix would no longer line up
# with the training matrix.
x_test_gender_city = ohe.transform(x_test[['gender', 'city']])


In [25]:
# Extracting age: it needs no transformation, so just pull it out as a
# 2-D (n_rows, 1) array ready for concatenation.
x_train_age = x_train[['age']].to_numpy()
x_test_age = x_test[['age']].to_numpy()


In [29]:
# Now combine all the per-column arrays side by side.
# Resulting column order: age, fever, gender/city dummies, cough.

train_parts = [x_train_age, x_train_fever, x_train_gender_city, x_train_cough]
x_train_transformed = np.hstack(train_parts)

test_parts = [x_test_age, x_test_fever, x_test_gender_city, x_test_cough]
x_test_transformed = np.hstack(test_parts)


# Better way ("Mentos Zindagi" - the smart life): let one ColumnTransformer do all of it

In [32]:
# Using ColumnTransformer: declare which transformer applies to which
# columns, and let a single object handle fitting and transforming.

from sklearn.compose import ColumnTransformer

# Each step is (name, transformer, columns).
column_steps = [
    ('t1', SimpleImputer(), ['fever']),
    ('t2', OneHotEncoder(drop='first', sparse_output=False), ['gender', 'city']),
    ('t3', OrdinalEncoder(categories=[["Mild", "Strong"]]), ['cough']),
]

# remainder="passthrough" keeps the columns not listed above (here: age)
# untouched and appends them after the transformed columns, so the output
# order is fever, gender/city dummies, cough, age.
transformer = ColumnTransformer(transformers=column_steps, remainder="passthrough")


In [36]:
# Fit every transformer on the training split, transform it, and check the shape.
transformer.fit_transform(x_train).shape


(80, 7)

In [37]:
# Apply the already-fitted transformers to the test split (no refitting).
transformer.transform(x_test).shape

(20, 7)