In [99]:
import numpy as np
import pandas as pd

In [100]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [101]:

# Show estimators as HTML diagrams when they are displayed in a cell.
from sklearn import set_config
set_config(display="diagram")

In [102]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="covid_toy.csv")

In [103]:
# Peek at 10 random rows (no random_state is passed, so the rows differ on every run).
df.sample(n=10)

Unnamed: 0,age,gender,fever,cough,city,has_covid
67,65,Male,99.0,Mild,Bangalore,No
55,81,Female,101.0,Mild,Mumbai,Yes
86,25,Male,104.0,Mild,Bangalore,Yes
31,83,Male,103.0,Mild,Kolkata,No
2,42,Male,101.0,Mild,Delhi,No
41,82,Male,,Mild,Kolkata,Yes
94,79,Male,,Strong,Kolkata,Yes
48,66,Male,99.0,Strong,Bangalore,No
71,75,Female,104.0,Strong,Delhi,No
96,51,Female,101.0,Strong,Kolkata,Yes


In [104]:
# Count missing values per column.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [105]:
# Hold out 20% of the rows for testing. `has_covid` is the target and every
# other column is a feature; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [106]:
# First five training rows; the feature order is age, gender, fever, cough, city.
X_train.head(n=5)

Unnamed: 0,age,gender,fever,cough,city
55,81,Female,101.0,Mild,Mumbai
88,5,Female,100.0,Mild,Kolkata
26,19,Female,100.0,Mild,Kolkata
42,27,Male,100.0,Mild,Delhi
69,73,Female,103.0,Mild,Delhi


In [107]:
# Step 1: impute missing `fever` values (input column 2) with SimpleImputer's
# default strategy. ColumnTransformer emits the transformed column first and
# appends the passthrough columns on the right, so the output order is:
# fever, age, gender, cough, city.
trf1 = ColumnTransformer(
    transformers=[
        ("impute_fever", SimpleImputer(), [2]),
    ],
    remainder="passthrough",
)

In [108]:
# Step 2: ordinal-encode `cough` (Mild -> 0, Strong -> 1).
# Input order here (output of trf1) is fever, age, gender, cough, city, so
# cough sits at index 3. The step was previously labelled "oe_fever", which
# mislabelled it in the pipeline diagram; it encodes cough, not fever.
# Output order: cough, fever, age, gender, city.
trf2 = ColumnTransformer(
    transformers=[
        ("oe_cough", OrdinalEncoder(categories=[["Mild", "Strong"]]), [3]),
    ],
    remainder="passthrough",
)

In [109]:
trf3 = ColumnTransformer([
    ("ohe_gender_city",OneHotEncoder(sparse_output=False, handle_unknown="ignore"),[1,4])
], remainder="passthrough")

In [110]:
trf4 = ColumnTransformer([
    ('scale', StandardScaler(),[0,2])
], remainder="passthrough")


In [111]:
# Final estimator (not part of the preprocessing-only `pipe` defined below).
# Seeded so repeated fits give the same tree; without random_state the tree can
# differ between runs. 42 matches the seed used for the train/test split.
trf6 = DecisionTreeClassifier(random_state=42)

In [112]:
# Preprocessing-only pipeline: the steps run in the listed order, each one
# consuming the array produced by the step before it. No estimator is attached.
pipe = Pipeline(
    steps=[
        ("trf1", trf1),
        ("trf2", trf2),
        ("trf3", trf3),
        ("trf4", trf4),
    ]
)

In [113]:
# Fit every preprocessing step on the training split and display the transformed features.
pipe.fit_transform(X_train, y=y_train)

array([[-0.42008402520840293, -0.42008402520840293, 0.0, ..., 0.0, 81,
        'Female'],
       [-0.42008402520840293, 2.3804761428476167, 0.0, ..., 0.0, 5,
        'Female'],
       [-0.42008402520840293, 2.3804761428476167, 0.0, ..., 0.0, 19,
        'Female'],
       ...,
       [-0.42008402520840293, -0.42008402520840293, 0.0, ..., 0.0, 51,
        'Male'],
       [-0.42008402520840293, -0.42008402520840293, 0.0, ..., 1.0, 82,
        'Female'],
       [-0.42008402520840293, 2.3804761428476167, 0.0, ..., 1.0, 11,
        'Female']], dtype=object)

In [114]:
# NOTE(review): this repeats the import and display setting already run in
# In [101]; it has no additional effect.
from sklearn import set_config
set_config(display="diagram")