In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("./covid_toy.csv")
data.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [2]:
x = data.iloc[:,:-1]
y = data.iloc[:,-1]


from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2)

## Missing values

In [3]:
data.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

## Fill missing value with Simple Imputer

In [4]:
from sklearn.impute import SimpleImputer
sm = SimpleImputer()
sm.fit(xtrain[["fever"]])

xtrain_fever = sm.transform(xtrain[["fever"]])
xtest_fever = sm.transform(xtest[["fever"]])

print(xtrain_fever.shape,xtest_fever.shape)

(80, 1) (20, 1)


In [5]:
pd.DataFrame(xtrain_fever).isnull().sum()

0    0
dtype: int64

In [6]:


xt_fever = pd.DataFrame(xtrain_fever,dtype=np.int32,columns=["fever"])
xt_fever

Unnamed: 0,fever
0,100
1,101
2,99
3,98
4,102
...,...
75,100
76,103
77,98
78,99


## Applying Ordinal Encoding on cough

In [7]:
from sklearn.preprocessing import OrdinalEncoder
od = OrdinalEncoder(categories=[["Mild","Strong"]],dtype=np.int32)

od.fit(xtrain[["cough"]])

xtrain_cough = od.transform(xtrain[["cough"]])
xtest_cough = od.transform(xtest[["cough"]])

print(xtrain_cough.shape,xtest_cough.shape)

(80, 1) (20, 1)


In [8]:
xt_cough = pd.DataFrame(xtrain_cough,columns=["cough"])
xt_cough

Unnamed: 0,cough
0,0
1,0
2,1
3,1
4,1
...,...
75,0
76,0
77,0
78,1


## Applying One Hot Encoding on gender and city

In [9]:
xtrain.gender.value_counts()

Female    49
Male      31
Name: gender, dtype: int64

In [10]:
xtrain.city.value_counts()

Kolkata      25
Bangalore    25
Delhi        16
Mumbai       14
Name: city, dtype: int64

In [11]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False,dtype=np.int32)

xtrain_gender_city = ohe.fit_transform(xtrain[["gender","city"]])
xtest_gender_city = ohe.fit_transform(xtest[["gender","city"]])

print(xtrain_gender_city.shape,xtest_gender_city.shape)

(80, 6) (20, 6)


In [12]:
xt_gen_city = pd.DataFrame(xtrain_gender_city,columns=["Female","Male","Bangalore","Delhi","Kolkata","Mumbai"])
xt_gen_city

Unnamed: 0,Female,Male,Bangalore,Delhi,Kolkata,Mumbai
0,0,1,0,0,1,0
1,1,0,0,0,0,1
2,0,1,1,0,0,0
3,1,0,0,0,1,0
4,1,0,1,0,0,0
...,...,...,...,...,...,...
75,0,1,0,1,0,0
76,1,0,1,0,0,0
77,1,0,0,0,1,0
78,1,0,0,0,1,0


## Extract age

In [13]:
xtrain_age = xtrain.drop(columns=["gender","fever","cough","city"]).values
xtest_age = xtest.drop(columns=["gender","fever","cough","city"]).values

print(xtrain_age.shape,xtest_age.shape)


(80, 1) (20, 1)


In [14]:
xt_age = pd.DataFrame(xtrain_age,columns=["age"])
xt_age

Unnamed: 0,age
0,27
1,81
2,66
3,10
4,82
...,...
75,38
76,16
77,31
78,25


## concatenation all column

In [15]:
import numpy as np

xtrain_cn = np.concatenate((xtrain_age,xtrain_gender_city,xtrain_fever,xtrain_cough),axis=1)
xtest_cn = np.concatenate((xtest_age,xtest_gender_city,xtest_fever,xtest_cough),axis=1)

print(xtrain.shape,xtest.shape)

(80, 5) (20, 5)


In [16]:
pd.DataFrame(xtrain_cn)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,27.0,0.0,1.0,0.0,0.0,1.0,0.0,100.000000,0.0
1,81.0,1.0,0.0,0.0,0.0,0.0,1.0,101.000000,0.0
2,66.0,0.0,1.0,1.0,0.0,0.0,0.0,99.000000,1.0
3,10.0,1.0,0.0,0.0,0.0,1.0,0.0,98.000000,1.0
4,82.0,1.0,0.0,1.0,0.0,0.0,0.0,102.000000,1.0
...,...,...,...,...,...,...,...,...,...
75,38.0,0.0,1.0,0.0,1.0,0.0,0.0,100.833333,0.0
76,16.0,1.0,0.0,1.0,0.0,0.0,0.0,103.000000,0.0
77,31.0,1.0,0.0,0.0,0.0,1.0,0.0,98.000000,0.0
78,25.0,1.0,0.0,0.0,0.0,1.0,0.0,99.000000,1.0


## new column for Better Understand (only xtrain)

In [17]:
new = pd.DataFrame(xt_fever)
new[xt_cough.columns] = xt_cough
new[xt_gen_city.columns] = xt_gen_city
new[xt_age.columns] = xt_age
new


Unnamed: 0,fever,cough,Female,Male,Bangalore,Delhi,Kolkata,Mumbai,age
0,100,0,0,1,0,0,1,0,27
1,101,0,1,0,0,0,0,1,81
2,99,1,0,1,1,0,0,0,66
3,98,1,1,0,0,0,1,0,10
4,102,1,1,0,1,0,0,0,82
...,...,...,...,...,...,...,...,...,...
75,100,0,0,1,0,1,0,0,38
76,103,0,1,0,1,0,0,0,16
77,98,0,1,0,0,0,1,0,31
78,99,1,1,0,0,0,1,0,25


## Easy way with colum transformer

* ### transformer = columns
* ### transformer are pass in tuple
* ### remainder = drop , passthrough

### example

### transformer = ColumnTransformer(

### transformers=[

### (" transformer name ",
### Encoder function etc,
### [ colums ] )

### ]

### ,remainder="passthrough")


In [18]:
from sklearn.compose import ColumnTransformer



transformer = ColumnTransformer(transformers=[
    
    ("t1",SimpleImputer(),["fever"]),
    ("t2",OrdinalEncoder(categories=[["Mild","Strong"]]),["cough"]),
    ("t3",OneHotEncoder(sparse=False),["gender","city"])

],remainder="passthrough")

In [19]:
n = transformer.fit_transform(xtrain)


In [27]:
n = pd.DataFrame(n,columns=["fever","cough","Female","Male","Bangalore","Delhi","Kolkata","Mumbai","age"],dtype=np.int32)


## Previous xtrain

In [26]:
new

Unnamed: 0,fever,cough,Female,Male,Bangalore,Delhi,Kolkata,Mumbai,age
0,100,0,0,1,0,0,1,0,27
1,101,0,1,0,0,0,0,1,81
2,99,1,0,1,1,0,0,0,66
3,98,1,1,0,0,0,1,0,10
4,102,1,1,0,1,0,0,0,82
...,...,...,...,...,...,...,...,...,...
75,100,0,0,1,0,1,0,0,38
76,103,0,1,0,1,0,0,0,16
77,98,0,1,0,0,0,1,0,31
78,99,1,1,0,0,0,1,0,25


## Transformer Colum (xtrain)

In [28]:
n

Unnamed: 0,fever,cough,Female,Male,Bangalore,Delhi,Kolkata,Mumbai,age
0,100,0,0,1,0,0,1,0,27
1,101,0,1,0,0,0,0,1,81
2,99,1,0,1,1,0,0,0,66
3,98,1,1,0,0,0,1,0,10
4,102,1,1,0,1,0,0,0,82
...,...,...,...,...,...,...,...,...,...
75,100,0,0,1,0,1,0,0,38
76,103,0,1,0,1,0,0,0,16
77,98,0,1,0,0,0,1,0,31
78,99,1,1,0,0,0,1,0,25
