In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the toy COVID dataset from the working directory and preview
# its first five rows.
df = pd.read_csv('covid_toy.csv')
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [3]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the features are every column except
# the 'has_covid' target.
# random_state pins the shuffle so the split -- and every output derived from
# it -- is identical on each Restart & Run All instead of changing per run.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop('has_covid', axis=1),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Display the training features (the 'has_covid' target was dropped in the split).
x_train

Unnamed: 0,age,gender,fever,cough,city
8,19,Female,100.0,Strong,Bangalore
56,71,Male,,Strong,Kolkata
29,34,Female,,Strong,Mumbai
87,47,Male,101.0,Strong,Bangalore
40,49,Female,102.0,Mild,Delhi
...,...,...,...,...,...
45,72,Male,99.0,Mild,Bangalore
22,71,Female,98.0,Strong,Kolkata
88,5,Female,100.0,Mild,Kolkata
81,65,Male,99.0,Mild,Delhi


1. AAM ZINDAGI (the ordinary, manual way: preprocess each column by hand)

~fever col

In [6]:
# Use SimpleImputer to handle the missing values in the 'fever' column
# (the null count earlier showed that 'fever' is the only column with gaps).
# SimpleImputer replaces missing values with a column statistic
# (mean / median / most frequent); with no arguments it uses the mean.

from sklearn.impute import SimpleImputer
si = SimpleImputer()
# Learn the fill value from the training rows only ...
x_train_fever = si.fit_transform(x_train[['fever']])
# ... and reuse that same value for the test rows. Refitting on x_test would
# leak test-set statistics into preprocessing and fill train and test with
# different values.
x_test_fever = si.transform(x_test[['fever']])
x_train_fever.shape


(80, 1)

~ cough col

In [7]:
# Ordinal encoding for the 'cough' column.
# The explicit category order maps Mild -> 0 and Strong -> 1.
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[['Mild', 'Strong']])
# Fit on the training rows only, then apply the already-fitted encoder to the
# test rows (transform, not fit_transform) so test data never drives fitting.
x_train_cough = oe.fit_transform(x_train[['cough']])
x_test_cough = oe.transform(x_test[['cough']])
x_train_cough.shape

(80, 1)

~ gender and city cols

In [8]:
# One-hot encoding for the 'gender' and 'city' columns.
from sklearn.preprocessing import OneHotEncoder

# One encoder per column, each fitted on the training rows only. Previously a
# single encoder was refitted four times (including on the test rows), so the
# test columns could differ in number or meaning from the training columns
# whenever the test split did not contain exactly the same categories.
ohe_gender = OneHotEncoder(sparse_output=False, drop='first')
ohe_city = OneHotEncoder(sparse_output=False, drop='first')

x_train_gender = ohe_gender.fit_transform(x_train[['gender']])
x_test_gender = ohe_gender.transform(x_test[['gender']])
x_train_city = ohe_city.fit_transform(x_train[['city']])
x_test_city = ohe_city.transform(x_test[['city']])

# Keep the original name bound (it used to end up as the city encoder) in case
# a later cell still refers to `ohe`.
ohe = ohe_city
x_train_gender.shape

(80, 1)

We could also scale the age column if we wanted to. The lecture said to leave it untouched because the data is already clean, so I am not scaling it here, but in general we should scale it.

Now we will extract the age column and concatenate all the preprocessed columns with it.

In [9]:
# extracting age
x_train_age = x_train.drop(columns=['fever', 'cough', 'gender', 'city']).values
x_test_age = x_test.drop(columns=['fever', 'cough', 'gender', 'city']).values
x_train_age.shape

(80, 1)

In [10]:
#now combine all the preprocessed columns together(concatenate)
x_train_final = np.concatenate([x_train_age, 
                                x_train_fever, 
                                x_train_cough,
                                x_train_gender,
                                x_train_city], axis=1)  
x_test_final = np.concatenate([x_test_age, 
                               x_test_fever,
                               x_test_cough,
                               x_test_gender,
                               x_test_city], axis=1)
x_train_final.shape

(80, 7)

BUT THIS IS A VERY LONG PROCESS. WE CAN DO ALL OF IT IN ONE STEP USING A COLUMN TRANSFORMER.

In [11]:
from sklearn.compose import ColumnTransformer

# The same per-column preprocessing as the manual cells, declared in one place:
#   tnf1 - fill the missing 'fever' values
#   tnf2 - ordinal-encode 'cough' with the order Mild, Strong
#   tnf3 - one-hot encode 'gender' and 'city', dropping the first category
fever_step = ('tnf1', SimpleImputer(), ['fever'])
cough_step = ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough'])
onehot_step = ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city'])

transformer = ColumnTransformer(
    transformers=[fever_step, cough_step, onehot_step],
    # 'age' needs no preprocessing, so it is passed through unchanged.
    remainder='passthrough',
)

# Fit on the training rows only, then apply the fitted transformer to the test rows.
x_train_final_ct = transformer.fit_transform(x_train)
x_test_final_ct = transformer.transform(x_test)


In [12]:
# Shape of the ColumnTransformer output for the training rows.
x_train_final_ct.shape

(80, 7)

In [13]:
# Display the transformed test rows (a plain NumPy array; the passthrough
# 'age' values appear in the last column).
x_test_final_ct

array([[ 98.        ,   0.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  31.        ],
       [100.        ,   1.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  47.        ],
       [ 98.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   0.        ,  73.        ],
       [102.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   1.        ,  74.        ],
       [104.        ,   1.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  56.        ],
       [100.        ,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  55.        ],
       [ 98.        ,   1.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  10.        ],
       [ 99.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   1.        ,  14.        ],
       [ 98.        ,   0.        ,   0.        ,   0.        ,
          1.    