In [14]:
import pandas as pd
import numpy as np

data = pd.read_csv("./covid_toy.csv")
data.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [29]:
x = data.iloc[:,:-1]
y = data.iloc[:,-1]

from sklearn.model_selection import train_test_split

xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2)

## Missing values

In [30]:
data.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

## Fill missing value with Simple Imputer

In [10]:
from sklearn.impute import SimpleImputer
sm = SimpleImputer()


sm.fit(xtrain[["fever"]])

xtrain_fever = sm.transform(xtrain[["fever"]])
xtest_fever = sm.transform(xtest[["fever"]])

print(xtrain_fever.shape,xtest_fever.shape)

(80, 1) (20, 1)


In [12]:
pd.DataFrame(xtrain_fever).isnull().sum()

0    0
dtype: int64

In [26]:
pd.DataFrame(xtest_fever).isnull().sum()

0    0
dtype: int64

In [28]:
xt_fever = pd.DataFrame(xtrain_fever,dtype=np.int32,columns=["fever"])
xt_fever.isnull().sum()

fever    0
dtype: int64

## Applying Ordinal Encoding on cough

In [36]:
data.cough.value_counts()

Mild      62
Strong    38
Name: cough, dtype: int64

In [40]:
from sklearn.preprocessing import OrdinalEncoder

od = OrdinalEncoder(categories=[["Mild","Strong"]],dtype=np.int32)

od.fit(xtrain[["cough"]])

xtrain_cough = od.transform(xtrain[["cough"]])
xtest_cough = od.transform(xtest[["cough"]])

print(xtrain_cough.shape,xtest_cough.shape)

(80, 1) (20, 1)


In [41]:
xt_cough = pd.DataFrame(xtrain_cough,columns=["cough"])
xt_cough

Unnamed: 0,cough
0,0
1,1
2,0
3,0
4,1
...,...
75,1
76,0
77,0
78,0


## Applying One Hot Encoding on gender and city

In [50]:
xtrain.gender.value_counts()

Female    46
Male      34
Name: gender, dtype: int64

In [51]:
xtrain.city.value_counts()

Kolkata      26
Bangalore    26
Mumbai       14
Delhi        14
Name: city, dtype: int64

In [52]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False,dtype=np.int32)

xtrain_gender_city = ohe.fit_transform(xtrain[["gender","city"]])
xtest_gender_city = ohe.fit_transform(xtest[["gender","city"]])

print(xtrain_gender_city.shape,xtest_gender_city.shape)

(80, 6) (20, 6)


In [53]:
xt_gen_city = pd.DataFrame(xtrain_gender_city,columns=["Female","Male","Bangalore","Delhi","Kolkata","Mumbai"])
xt_gen_city

Unnamed: 0,Female,Male,Bangalore,Delhi,Kolkata,Mumbai
0,1,0,0,0,1,0
1,1,0,0,1,0,0
2,0,1,1,0,0,0
3,0,1,0,0,1,0
4,1,0,1,0,0,0
...,...,...,...,...,...,...
75,1,0,0,1,0,0
76,0,1,0,0,1,0
77,0,1,0,0,1,0
78,1,0,0,0,0,1


## Extract age

In [54]:
xtrain_age = xtrain.drop(columns=["gender","fever","cough","city"]).values
xtest_age = xtest.drop(columns=["gender","fever","cough","city"]).values

print(xtrain_age.shape,xtest_age.shape)


(80, 1) (20, 1)


In [55]:
xt_age = pd.DataFrame(xtrain_age,columns=["age"])
xt_age

Unnamed: 0,age
0,69
1,59
2,73
3,16
4,47
...,...
75,34
76,24
77,27
78,65


## concatenation all column

In [56]:
import numpy as np

xtrain_cn = np.concatenate((xtrain_age,xtrain_gender_city,xtrain_fever,xtrain_cough),axis=1)
xtest_cn = np.concatenate((xtest_age,xtest_gender_city,xtest_fever,xtest_cough),axis=1)

print(xtrain.shape,xtest.shape)

(80, 5) (20, 5)


In [57]:
pd.DataFrame(xtrain_cn)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,69.0,1.0,0.0,0.0,0.0,1.0,0.0,104.000000,0.0
1,59.0,1.0,0.0,0.0,1.0,0.0,0.0,100.000000,1.0
2,73.0,0.0,1.0,1.0,0.0,0.0,0.0,100.916667,0.0
3,16.0,0.0,1.0,0.0,0.0,1.0,0.0,101.000000,0.0
4,47.0,1.0,0.0,1.0,0.0,0.0,0.0,101.000000,1.0
...,...,...,...,...,...,...,...,...,...
75,34.0,1.0,0.0,0.0,1.0,0.0,0.0,101.000000,1.0
76,24.0,0.0,1.0,0.0,0.0,1.0,0.0,100.000000,0.0
77,27.0,0.0,1.0,0.0,0.0,1.0,0.0,104.000000,0.0
78,65.0,1.0,0.0,0.0,0.0,0.0,1.0,101.000000,0.0


## new column for Better Understand (only xtrain)

In [58]:
new = pd.DataFrame(xt_fever)
new[xt_cough.columns] = xt_cough
new[xt_gen_city.columns] = xt_gen_city
new[xt_age.columns] = xt_age
new


Unnamed: 0,fever,cough,Female,Male,Bangalore,Delhi,Kolkata,Mumbai,age
0,104,0,1,0,0,0,1,0,69
1,100,1,1,0,0,1,0,0,59
2,100,0,0,1,1,0,0,0,73
3,101,0,0,1,0,0,1,0,16
4,101,1,1,0,1,0,0,0,47
...,...,...,...,...,...,...,...,...,...
75,101,1,1,0,0,1,0,0,34
76,100,0,0,1,0,0,1,0,24
77,104,0,0,1,0,0,1,0,27
78,101,0,1,0,0,0,0,1,65


## Easy way with colum transformer

* ### transformer = columns
* ### transformer are pass in tuple
* ### remainder = drop , passthrough

### example

### transformer = ColumnTransformer(

### transformers=[

### (" transformer name ",
### Encoder function etc,
### [ colums ] )

### ]

### ,remainder="passthrough")


In [59]:
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(transformers=[
    
    ("t1",SimpleImputer(),["fever"]),
    ("t2",OrdinalEncoder(categories=[["Mild","Strong"]]),["cough"]),
    ("t3",OneHotEncoder(sparse=False),["gender","city"])

],remainder="passthrough")

In [61]:
n = transformer.fit_transform(xtrain)


In [27]:
n = pd.DataFrame(n,columns=["fever","cough","Female","Male","Bangalore","Delhi","Kolkata","Mumbai","age"],dtype=np.int32)


## Previous xtrain

In [26]:
new

Unnamed: 0,fever,cough,Female,Male,Bangalore,Delhi,Kolkata,Mumbai,age
0,100,0,0,1,0,0,1,0,27
1,101,0,1,0,0,0,0,1,81
2,99,1,0,1,1,0,0,0,66
3,98,1,1,0,0,0,1,0,10
4,102,1,1,0,1,0,0,0,82
...,...,...,...,...,...,...,...,...,...
75,100,0,0,1,0,1,0,0,38
76,103,0,1,0,1,0,0,0,16
77,98,0,1,0,0,0,1,0,31
78,99,1,1,0,0,0,1,0,25


## Transformer Colum (xtrain)

In [28]:
n

Unnamed: 0,fever,cough,Female,Male,Bangalore,Delhi,Kolkata,Mumbai,age
0,100,0,0,1,0,0,1,0,27
1,101,0,1,0,0,0,0,1,81
2,99,1,0,1,1,0,0,0,66
3,98,1,1,0,0,0,1,0,10
4,102,1,1,0,1,0,0,0,82
...,...,...,...,...,...,...,...,...,...
75,100,0,0,1,0,1,0,0,38
76,103,0,1,0,1,0,0,0,16
77,98,0,1,0,0,0,1,0,31
78,99,1,1,0,0,0,1,0,25
