# Data Preprocessing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing Dataset

In [6]:
'''
                                          ABOUT THE DATASET

The dataset has 6 columns in total, with 5 columns for features and 1 column for output.
The features are:
Age(Num)
body_temperature(str)
chronic_disease(str)
breathing_issue (str)
Blood O2 Level in Percentage(Num)
Needed Hospitalization(str)

X is the set of inputs or independent variables or features.
In the dataset, except the last column, all are input variables or features hence splitting the dataset accordingly.
'''

data_set = pd.read_csv('/content/Covid_Data.csv')
X = data_set.iloc[:, :-2].values
y = data_set.iloc[:, -2].values

In [7]:
print(X)

[[10.0 'Normal' 'no' 'no' 97.0]
 [12.0 'Normal' 'no' 'no' 97.0]
 [15.0 'Normal' 'no' 'no' 94.0]
 [10.0 'Normal' 'no' 'no' 97.0]
 [13.0 'Moderate' 'no' 'no' 94.0]
 [12.0 'Moderate' 'no' 'no' 97.0]
 [13.0 'Moderate' 'no' 'no' 93.0]
 [15.0 'Moderate' 'no' 'no' 92.0]
 [18.0 'Moderate' 'no' 'no' 66.0]
 [19.0 'Normal' 'no' 'no' 92.0]
 [20.0 'Normal' 'no' 'no' 93.0]
 [17.0 'Normal' 'no' 'no' 93.0]
 [16.0 'Normal' 'no' 'no' 92.0]
 [18.0 'Normal' 'no' 'no' 93.0]
 [20.0 'Normal' 'no' 'no' 92.0]
 [25.0 'Moderate' 'no' 'no' 93.0]
 [24.0 'Moderate' 'no' 'no' 92.0]
 [26.0 'High' 'no' 'no' 94.0]
 [28.0 'Normal' 'no' 'no' 99.0]
 [29.0 'Normal' 'no' 'no' 93.0]
 [30.0 'Moderate' 'no' 'no' 62.0]
 [19.0 'Normal' 'no' 'no' 89.0]
 [25.0 'Normal' 'no' 'yes' 86.0]
 [26.0 'Normal' 'no' 'no' nan]
 [28.0 'Normal' 'no' 'no' 89.0]
 [30.0 'Moderate' 'yes' 'no' 86.0]
 [35.0 'Moderate' 'no' 'no' 89.0]
 [32.0 'Moderate' 'no' 'yes' 84.0]
 [nan 'Moderate' 'yes' 'no' 90.0]
 [32.0 'Moderate' 'no' 'no' 89.0]
 [35.0 'Modera

In [8]:
print(y)

['No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No'
 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'No' 'Yes' 'Yes' 'Yes']





## Handling Missing Data

In [9]:
'''
The dataset has 6 columns in total, with 5 columns for features and 1 column for output.
The features are:
Age(Num)
body_temperature(str)
chronic_disease(str)
breathing_issue (str)
Blood O2 Level in Percentage(Num)
Needed Hospitalization(str) - Output

In case of any missing data, we will use SimpleImputer library to fill it with the data based on strategy. For numeric, strategy will be mean.
Here, if you check the dataset, the missing values are for Age and Blood O2 level, hence we will use these columns only i.e., 0 and 4. The transformed array by
Panda will be having np.nan wherever values are missing.
'''

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

imputer.fit(X[:, 0:1])
X[: , 0:1] = imputer.transform(X[:, 0:1])

In [10]:
print(X)

[[10.0 'Normal' 'no' 'no' 97.0]
 [12.0 'Normal' 'no' 'no' 97.0]
 [15.0 'Normal' 'no' 'no' 94.0]
 [10.0 'Normal' 'no' 'no' 97.0]
 [13.0 'Moderate' 'no' 'no' 94.0]
 [12.0 'Moderate' 'no' 'no' 97.0]
 [13.0 'Moderate' 'no' 'no' 93.0]
 [15.0 'Moderate' 'no' 'no' 92.0]
 [18.0 'Moderate' 'no' 'no' 66.0]
 [19.0 'Normal' 'no' 'no' 92.0]
 [20.0 'Normal' 'no' 'no' 93.0]
 [17.0 'Normal' 'no' 'no' 93.0]
 [16.0 'Normal' 'no' 'no' 92.0]
 [18.0 'Normal' 'no' 'no' 93.0]
 [20.0 'Normal' 'no' 'no' 92.0]
 [25.0 'Moderate' 'no' 'no' 93.0]
 [24.0 'Moderate' 'no' 'no' 92.0]
 [26.0 'High' 'no' 'no' 94.0]
 [28.0 'Normal' 'no' 'no' 99.0]
 [29.0 'Normal' 'no' 'no' 93.0]
 [30.0 'Moderate' 'no' 'no' 62.0]
 [19.0 'Normal' 'no' 'no' 89.0]
 [25.0 'Normal' 'no' 'yes' 86.0]
 [26.0 'Normal' 'no' 'no' nan]
 [28.0 'Normal' 'no' 'no' 89.0]
 [30.0 'Moderate' 'yes' 'no' 86.0]
 [35.0 'Moderate' 'no' 'no' 89.0]
 [32.0 'Moderate' 'no' 'yes' 84.0]
 [45.130434782608695 'Moderate' 'yes' 'no' 90.0]
 [32.0 'Moderate' 'no' 'no' 89.0]

In [11]:
imputer.fit(X[:, 4:5])
X[: , 4:5] = imputer.transform(X[:, 4:5])

In [12]:
print(X)

[[10.0 'Normal' 'no' 'no' 97.0]
 [12.0 'Normal' 'no' 'no' 97.0]
 [15.0 'Normal' 'no' 'no' 94.0]
 [10.0 'Normal' 'no' 'no' 97.0]
 [13.0 'Moderate' 'no' 'no' 94.0]
 [12.0 'Moderate' 'no' 'no' 97.0]
 [13.0 'Moderate' 'no' 'no' 93.0]
 [15.0 'Moderate' 'no' 'no' 92.0]
 [18.0 'Moderate' 'no' 'no' 66.0]
 [19.0 'Normal' 'no' 'no' 92.0]
 [20.0 'Normal' 'no' 'no' 93.0]
 [17.0 'Normal' 'no' 'no' 93.0]
 [16.0 'Normal' 'no' 'no' 92.0]
 [18.0 'Normal' 'no' 'no' 93.0]
 [20.0 'Normal' 'no' 'no' 92.0]
 [25.0 'Moderate' 'no' 'no' 93.0]
 [24.0 'Moderate' 'no' 'no' 92.0]
 [26.0 'High' 'no' 'no' 94.0]
 [28.0 'Normal' 'no' 'no' 99.0]
 [29.0 'Normal' 'no' 'no' 93.0]
 [30.0 'Moderate' 'no' 'no' 62.0]
 [19.0 'Normal' 'no' 'no' 89.0]
 [25.0 'Normal' 'no' 'yes' 86.0]
 [26.0 'Normal' 'no' 'no' 82.07246376811594]
 [28.0 'Normal' 'no' 'no' 89.0]
 [30.0 'Moderate' 'yes' 'no' 86.0]
 [35.0 'Moderate' 'no' 'no' 89.0]
 [32.0 'Moderate' 'no' 'yes' 84.0]
 [45.130434782608695 'Moderate' 'yes' 'no' 90.0]
 [32.0 'Moderate' '

## Encoding Categorical Data

### Encoding independent variables

In [13]:
'''
As the machine learning algorithm will not understand the string value for required computation,
we will have to assign some numerical values to each of the string valued columns. For example, in body_temperature,
there are 3 categories: Normal, Moderate and High. So, for a person having moderate body temperature, it can be assigned
010. As it is understood, the categorical encoding will result in 3 columns for body temperature each representing Normal, Moderate and High body temperature

For encoding of categorical features or independent variables:
OneHotEncoder transforms each categorical feature with n_categories possible values into n_categories binary features, with one of them 1, and all others 0.
'''

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [14]:
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(), [1])], remainder= 'passthrough')
X = np.array(ct.fit_transform(X))

In [15]:
print(X)

[[0.0 0.0 1.0 10.0 'no' 'no' 97.0]
 [0.0 0.0 1.0 12.0 'no' 'no' 97.0]
 [0.0 0.0 1.0 15.0 'no' 'no' 94.0]
 [0.0 0.0 1.0 10.0 'no' 'no' 97.0]
 [0.0 1.0 0.0 13.0 'no' 'no' 94.0]
 [0.0 1.0 0.0 12.0 'no' 'no' 97.0]
 [0.0 1.0 0.0 13.0 'no' 'no' 93.0]
 [0.0 1.0 0.0 15.0 'no' 'no' 92.0]
 [0.0 1.0 0.0 18.0 'no' 'no' 66.0]
 [0.0 0.0 1.0 19.0 'no' 'no' 92.0]
 [0.0 0.0 1.0 20.0 'no' 'no' 93.0]
 [0.0 0.0 1.0 17.0 'no' 'no' 93.0]
 [0.0 0.0 1.0 16.0 'no' 'no' 92.0]
 [0.0 0.0 1.0 18.0 'no' 'no' 93.0]
 [0.0 0.0 1.0 20.0 'no' 'no' 92.0]
 [0.0 1.0 0.0 25.0 'no' 'no' 93.0]
 [0.0 1.0 0.0 24.0 'no' 'no' 92.0]
 [1.0 0.0 0.0 26.0 'no' 'no' 94.0]
 [0.0 0.0 1.0 28.0 'no' 'no' 99.0]
 [0.0 0.0 1.0 29.0 'no' 'no' 93.0]
 [0.0 1.0 0.0 30.0 'no' 'no' 62.0]
 [0.0 0.0 1.0 19.0 'no' 'no' 89.0]
 [0.0 0.0 1.0 25.0 'no' 'yes' 86.0]
 [0.0 0.0 1.0 26.0 'no' 'no' 82.07246376811594]
 [0.0 0.0 1.0 28.0 'no' 'no' 89.0]
 [0.0 1.0 0.0 30.0 'yes' 'no' 86.0]
 [0.0 1.0 0.0 35.0 'no' 'no' 89.0]
 [0.0 1.0 0.0 32.0 'no' 'yes' 84.0]
 [0.

### Encoding dependent variables

In [16]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [17]:
print(y)

[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1]


## Splitting data into Test set & Training Set


In [18]:
#random_state controls the shuffling applied to the data before applying the split. Pass an int for reproducible output across multiple function calls.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [19]:
print(X_train)

[[1.0 0.0 0.0 86.0 'yes' 'yes' 71.0]
 [0.0 1.0 0.0 35.0 'no' 'yes' 92.0]
 [1.0 0.0 0.0 75.0 'no' 'yes' 72.0]
 [1.0 0.0 0.0 42.0 'no' 'no' 92.0]
 [1.0 0.0 0.0 74.0 'yes' 'yes' 75.0]
 [1.0 0.0 0.0 62.0 'yes' 'yes' 68.0]
 [0.0 1.0 0.0 24.0 'no' 'no' 92.0]
 [0.0 1.0 0.0 48.0 'no' 'yes' 93.0]
 [1.0 0.0 0.0 58.0 'no' 'no' 70.0]
 [0.0 1.0 0.0 45.130434782608695 'yes' 'no' 90.0]
 [0.0 1.0 0.0 15.0 'no' 'no' 92.0]
 [0.0 0.0 1.0 70.0 'no' 'yes' 88.0]
 [1.0 0.0 0.0 52.0 'yes' 'yes' 80.0]
 [1.0 0.0 0.0 60.0 'yes' 'yes' 68.0]
 [1.0 0.0 0.0 54.0 'yes' 'yes' 70.0]
 [0.0 0.0 1.0 29.0 'no' 'no' 93.0]
 [1.0 0.0 0.0 78.0 'yes' 'yes' 71.0]
 [1.0 0.0 0.0 36.0 'yes' 'no' 90.0]
 [0.0 1.0 0.0 30.0 'yes' 'no' 86.0]
 [1.0 0.0 0.0 50.0 'yes' 'yes' 77.0]
 [0.0 0.0 1.0 18.0 'no' 'no' 93.0]
 [1.0 0.0 0.0 69.0 'no' 'yes' 53.0]
 [0.0 0.0 1.0 10.0 'no' 'no' 97.0]
 [1.0 0.0 0.0 26.0 'no' 'no' 94.0]
 [0.0 1.0 0.0 18.0 'no' 'no' 66.0]
 [1.0 0.0 0.0 62.0 'yes' 'yes' 69.0]
 [0.0 1.0 0.0 13.0 'no' 'no' 93.0]
 [1.0 0.0 0.0 4

In [20]:
print(X_test)

[[0.0 0.0 1.0 25.0 'no' 'yes' 86.0]
 [0.0 0.0 1.0 10.0 'no' 'no' 97.0]
 [1.0 0.0 0.0 68.0 'yes' 'no' 67.0]
 [0.0 1.0 0.0 13.0 'no' 'no' 94.0]
 [1.0 0.0 0.0 59.0 'yes' 'no' 68.0]
 [0.0 0.0 1.0 28.0 'no' 'no' 99.0]
 [0.0 0.0 1.0 20.0 'no' 'no' 93.0]
 [1.0 0.0 0.0 46.0 'yes' 'no' 91.0]
 [1.0 0.0 0.0 53.0 'yes' 'no' 55.0]
 [0.0 0.0 1.0 16.0 'no' 'no' 92.0]
 [1.0 0.0 0.0 38.0 'no' 'no' 75.0]
 [0.0 0.0 1.0 19.0 'no' 'no' 92.0]
 [1.0 0.0 0.0 94.0 'yes' 'yes' 64.0]
 [0.0 1.0 0.0 12.0 'no' 'no' 97.0]]


In [21]:
print(y_train)

[1 0 1 0 1 1 0 0 1 0 0 0 1 1 1 0 1 0 0 1 0 1 0 0 1 1 0 0 1 1 1 0 0 1 0 1 0
 1 1 0 0 0 1 1 0 1 1 0 1 0 0 0 1 0 0 1]


In [22]:
print(y_test)

[0 0 1 0 1 0 0 0 1 0 1 0 1 0]


## Feature Scaling

In [23]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 6:] = sc.fit_transform(X_train[:, 6:])
X_test[:, 6:] = sc.fit_transform(X_test[:, 6:])

In [24]:
print(X_train)

[[1.0 0.0 0.0 86.0 'yes' 'yes' -0.9941858494300261]
 [0.0 1.0 0.0 35.0 'no' 'yes' 0.9574354145081887]
 [1.0 0.0 0.0 75.0 'no' 'yes' -0.9012515035282064]
 [1.0 0.0 0.0 42.0 'no' 'no' 0.9574354145081887]
 [1.0 0.0 0.0 74.0 'yes' 'yes' -0.6224484658227472]
 [1.0 0.0 0.0 62.0 'yes' 'yes' -1.2729888871354853]
 [0.0 1.0 0.0 24.0 'no' 'no' 0.9574354145081887]
 [0.0 1.0 0.0 48.0 'no' 'yes' 1.0503697604100084]
 [1.0 0.0 0.0 58.0 'no' 'no' -1.087120195331846]
 [0.0 1.0 0.0 45.130434782608695 'yes' 'no' 0.7715667227045492]
 [0.0 1.0 0.0 15.0 'no' 'no' 0.9574354145081887]
 [0.0 0.0 1.0 70.0 'no' 'yes' 0.5856980309009097]
 [1.0 0.0 0.0 52.0 'yes' 'yes' -0.15777673631364833]
 [1.0 0.0 0.0 60.0 'yes' 'yes' -1.2729888871354853]
 [1.0 0.0 0.0 54.0 'yes' 'yes' -1.087120195331846]
 [0.0 0.0 1.0 29.0 'no' 'no' 1.0503697604100084]
 [1.0 0.0 0.0 78.0 'yes' 'yes' -0.9941858494300261]
 [1.0 0.0 0.0 36.0 'yes' 'no' 0.7715667227045492]
 [0.0 1.0 0.0 30.0 'yes' 'no' 0.3998293390972702]
 [1.0 0.0 0.0 50.0 'yes' '