# 2. Importing necessary libraries

In [39]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 3. Import The Dataset 

In [13]:
df = pd.read_csv( "COVID19.csv" )
#let us see first 5 rows of the dataset
df.head()

Unnamed: 0,Sno,age,gender,body temperature,Dry Cough,sour throat,weakness,breathing problem,drowsiness,pain in chest,...,diabetes,heart disease,lung disease,stroke or reduced immunity,symptoms progressed,high blood pressue,kidney disease,change in appetide,Loss of sense of smell,Corona result
0,1,20.0,Male,98.6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,No Risk
1,2,19.0,Male,99.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,No Risk
2,3,55.0,Female,102.0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,1,0,Less Risk
3,4,40.0,Female,100.0,0,0,0,0,1,1,...,1,0,0,0,1,1,0,1,0,High Risk
4,5,33.0,Male,99.2,0,1,0,1,0,0,...,0,0,1,1,1,0,0,0,1,High Risk


In [3]:
#dataset contains 21 columns(variables) and 127 records
df.shape

(127, 21)

In [4]:
#name of all the columns
df.columns

Index(['Sno', 'age', 'gender', 'body temperature', 'Dry Cough', 'sour throat',
       'weakness', 'breathing problem', 'drowsiness', 'pain in chest',
       'travel history to infected countries', 'diabetes', 'heart disease',
       'lung disease', 'stroke or reduced immunity', 'symptoms progressed',
       'high blood pressue', 'kidney disease', 'change in appetide',
       'Loss of sense of smell', 'Corona result'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127 entries, 0 to 126
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Sno                                   127 non-null    int64  
 1   age                                   123 non-null    float64
 2   gender                                127 non-null    object 
 3   body temperature                      124 non-null    float64
 4   Dry Cough                             127 non-null    int64  
 5   sour throat                           127 non-null    int64  
 6   weakness                              127 non-null    int64  
 7   breathing problem                     127 non-null    int64  
 8   drowsiness                            127 non-null    int64  
 9   pain in chest                         127 non-null    int64  
 10  travel history to infected countries  127 non-null    int64  
 11  diabetes           

In [6]:
df.isnull().sum()

Sno                                     0
age                                     4
gender                                  0
body temperature                        3
Dry Cough                               0
sour throat                             0
weakness                                0
breathing problem                       0
drowsiness                              0
pain in chest                           0
travel history to infected countries    0
diabetes                                0
heart disease                           0
lung disease                            0
stroke or reduced immunity              0
symptoms progressed                     0
high blood pressue                      0
kidney disease                          0
change in appetide                      0
Loss of sense of smell                  0
Corona result                           0
dtype: int64

# 4 .Removing the null values by removing them from dataset

In [7]:
#function to remove all records having null values.
df.dropna(inplace=True)

In [8]:
df.isnull().sum()

Sno                                     0
age                                     0
gender                                  0
body temperature                        0
Dry Cough                               0
sour throat                             0
weakness                                0
breathing problem                       0
drowsiness                              0
pain in chest                           0
travel history to infected countries    0
diabetes                                0
heart disease                           0
lung disease                            0
stroke or reduced immunity              0
symptoms progressed                     0
high blood pressue                      0
kidney disease                          0
change in appetide                      0
Loss of sense of smell                  0
Corona result                           0
dtype: int64

In [9]:
df.shape

(120, 21)

#  Replacing the null values using Pandas

In [10]:
#replacing the null values by the mean of the values in the column

df["age"].replace( np.NaN , df["age"].mean() , inplace = True )   
df["body temperature"].replace( np.NaN , df["body temperature"].mean() , inplace = True )

In [11]:
df.shape

(120, 21)

In [12]:
df.isnull().sum()

Sno                                     0
age                                     0
gender                                  0
body temperature                        0
Dry Cough                               0
sour throat                             0
weakness                                0
breathing problem                       0
drowsiness                              0
pain in chest                           0
travel history to infected countries    0
diabetes                                0
heart disease                           0
lung disease                            0
stroke or reduced immunity              0
symptoms progressed                     0
high blood pressue                      0
kidney disease                          0
change in appetide                      0
Loss of sense of smell                  0
Corona result                           0
dtype: int64

# 5.Encoding of the dataset

In [17]:
le = LabelEncoder()
df["gender"] = le.fit_transform( df["gender"] )

In [18]:
df.head(3)

Unnamed: 0,Sno,age,gender,body temperature,Dry Cough,sour throat,weakness,breathing problem,drowsiness,pain in chest,...,diabetes,heart disease,lung disease,stroke or reduced immunity,symptoms progressed,high blood pressue,kidney disease,change in appetide,Loss of sense of smell,Corona result
0,1,20.0,1,98.6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,No Risk
1,2,19.0,1,99.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,No Risk
2,3,55.0,0,102.0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,1,0,Less Risk


In [20]:
df.head(4)

Unnamed: 0,Sno,age,gender,body temperature,Dry Cough,sour throat,weakness,breathing problem,drowsiness,pain in chest,...,diabetes,heart disease,lung disease,stroke or reduced immunity,symptoms progressed,high blood pressue,kidney disease,change in appetide,Loss of sense of smell,Corona result
0,1,20.0,1,98.6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,No Risk
1,2,19.0,1,99.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,No Risk
2,3,55.0,0,102.0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,1,0,Less Risk
3,4,40.0,0,100.0,0,0,0,0,1,1,...,1,0,0,0,1,1,0,1,0,High Risk


In [22]:
le = LabelEncoder()
df["Corona result"] = le.fit_transform( df["Corona result"] )
df.head(4)

Unnamed: 0,Sno,age,gender,body temperature,Dry Cough,sour throat,weakness,breathing problem,drowsiness,pain in chest,...,diabetes,heart disease,lung disease,stroke or reduced immunity,symptoms progressed,high blood pressue,kidney disease,change in appetide,Loss of sense of smell,Corona result
0,1,20.0,1,98.6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,2,19.0,1,99.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,3,55.0,0,102.0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,1,0,1
3,4,40.0,0,100.0,0,0,0,0,1,1,...,1,0,0,0,1,1,0,1,0,0


In [23]:
bins= [0,10,20,30,40,50,60,70,80,100]  #initialising bins for age column
labels = ['a','b','c','d','e','f','g','h','i'] #initialising labels for bins
#creating a column that groups all age values into or initialised bins
Age = pd.cut(df['age'], bins=bins, labels=labels, right=False) 
#creating a column in dataset for our binned column of Age
df["Age"]=Age
#removing previous column of age
df=df.drop(['age'],axis=1)


#doing the same for body temperature
bins= [96,98.6,102,110]
labels = ['normal','fever','high fever']
Temperature = pd.cut(df['body temperature'], bins=bins, labels=labels, right=False)
df["Temperature"]=Temperature
df=df.drop(['body temperature'],axis=1)

In [24]:
df.head()

Unnamed: 0,Sno,gender,Dry Cough,sour throat,weakness,breathing problem,drowsiness,pain in chest,travel history to infected countries,diabetes,...,lung disease,stroke or reduced immunity,symptoms progressed,high blood pressue,kidney disease,change in appetide,Loss of sense of smell,Corona result,Age,Temperature
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,c,fever
1,2,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,b,fever
2,3,0,1,1,1,1,1,1,0,0,...,0,0,0,0,0,1,0,1,f,high fever
3,4,0,0,0,0,0,1,1,1,1,...,0,0,1,1,0,1,0,0,e,fever
4,5,1,0,1,0,1,0,0,1,0,...,1,1,1,0,0,0,1,0,d,fever


In [32]:
# Applying one-hot encoding using pandas dummies .

df=pd.concat([df,pd.get_dummies(df['Age'],drop_first=True)],axis=1)
df = df.drop (['Age'],axis=1)   

df=pd.concat([df,pd.get_dummies(df['Temperature'],drop_first=True)],axis=1)
df = df.drop (['Temperature'],axis=1)
df.head()

Unnamed: 0,Sno,gender,Dry Cough,sour throat,weakness,breathing problem,drowsiness,pain in chest,travel history to infected countries,diabetes,...,b,c,d,e,f,g,h,i,fever,high fever
0,1,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,2,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,3,0,1,1,1,1,1,1,0,0,...,0,0,0,0,1,0,0,0,0,1
3,4,0,0,0,0,0,1,1,1,1,...,0,0,0,1,0,0,0,0,1,0
4,5,1,0,1,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0


# 6.Splitting of Dataset

In [34]:
df.columns

Index(['Sno', 'gender', 'Dry Cough', 'sour throat', 'weakness',
       'breathing problem', 'drowsiness', 'pain in chest',
       'travel history to infected countries', 'diabetes', 'heart disease',
       'lung disease', 'stroke or reduced immunity', 'symptoms progressed',
       'high blood pressue', 'kidney disease', 'change in appetide',
       'Loss of sense of smell', 'Corona result', 'b', 'c', 'd', 'e', 'f', 'g',
       'h', 'i', 'fever', 'high fever'],
      dtype='object')

In [38]:
X = df.drop(['Corona result' , 'Sno'] , axis = 1)
y = df['Corona result']

print(X.columns)

Index(['gender', 'Dry Cough', 'sour throat', 'weakness', 'breathing problem',
       'drowsiness', 'pain in chest', 'travel history to infected countries',
       'diabetes', 'heart disease', 'lung disease',
       'stroke or reduced immunity', 'symptoms progressed',
       'high blood pressue', 'kidney disease', 'change in appetide',
       'Loss of sense of smell', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
       'fever', 'high fever'],
      dtype='object')


In [40]:
X_train , X_test , y_train , y_test = train_test_split(X , y , random_state = 2 , test_size = 0.20)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(101, 27)
(101,)
(26, 27)
(26,)


# 7. Scaling of the data

In [41]:
ss = StandardScaler()
train = ss.fit_transform(X_train)

In [42]:
train

array([[ 0.89642146, -0.99014754, -1.1155467 , ..., -0.14213381,
         0.66547513, -0.527713  ],
       [ 0.89642146,  1.00995049,  0.89642146, ..., -0.14213381,
        -1.50268577,  1.89496942],
       [ 0.89642146,  1.00995049,  0.89642146, ..., -0.14213381,
         0.66547513, -0.527713  ],
       ...,
       [-1.1155467 , -0.99014754, -1.1155467 , ..., -0.14213381,
         0.66547513, -0.527713  ],
       [-1.1155467 , -0.99014754, -1.1155467 , ..., -0.14213381,
        -1.50268577, -0.527713  ],
       [-1.1155467 , -0.99014754, -1.1155467 , ..., -0.14213381,
         0.66547513, -0.527713  ]])

In [43]:
train.shape

(101, 27)