# Final Project - Machine Learning

## Diagnosis of COVID-19 and its clinical spectrum

## General Objective 
### - Predict confirmed COVID-19 cases among suspected cases.

## Specific Objective
### - Determine the main features 
### - Compare the accuracy between different classifiers
### - GridSearch for decision of optimal parameters 
### - Combining features using PCA instead of features removal

### Importing Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

### Importing Dataset

In [2]:
dataset = pd.read_excel('datasets_574076_1040311_dataset.xlsx')

In [3]:
dataset.head(10)

Unnamed: 0,Patient ID,Patient age quantile,SARS-Cov-2 exam result,"Patient addmited to regular ward (1=yes, 0=no)","Patient addmited to semi-intensive unit (1=yes, 0=no)","Patient addmited to intensive care unit (1=yes, 0=no)",Hematocrit,Hemoglobin,Platelets,Mean platelet volume,...,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),Arteiral Fio2,Phosphor,ctO2 (arterial blood gas analysis)
0,44477f75e8169d2,13,negative,0,0,0,,,,,...,,,,,,,,,,
1,126e9dd13932f68,17,negative,0,0,0,0.236515,-0.02234,-0.517413,0.010677,...,,,,,,,,,,
2,a46b4402a0e5696,8,negative,0,0,0,,,,,...,,,,,,,,,,
3,f7d619a94f97c45,5,negative,0,0,0,,,,,...,,,,,,,,,,
4,d9e41465789c2b5,15,negative,0,0,0,,,,,...,,,,,,,,,,
5,75f16746216c4d1,9,negative,0,0,0,,,,,...,,,,,,,,,,
6,2a2245e360808d7,13,negative,0,0,0,,,,,...,,,,,,,,,,
7,509197ec73f1400,16,negative,0,0,0,,,,,...,,,,,,,,,,
8,8bb9d64f0215244,1,negative,0,1,0,-1.571682,-0.774212,1.429667,-1.672222,...,,,,,,,,,0.730393,
9,5f1ed301375586c,17,negative,0,0,0,,,,,...,,,,,,,,,,


### Couting unique values for each feature

In [4]:
dataset.nunique()

Patient ID                                               5644
Patient age quantile                                       20
SARS-Cov-2 exam result                                      2
Patient addmited to regular ward (1=yes, 0=no)              2
Patient addmited to semi-intensive unit (1=yes, 0=no)       2
Patient addmited to intensive care unit (1=yes, 0=no)       2
Hematocrit                                                176
Hemoglobin                                                 84
Platelets                                                 249
Mean platelet volume                                       48
Red blood Cells                                           211
Lymphocytes                                               318
Mean corpuscular hemoglobin concentration (MCHC)           57
Leukocytes                                                475
Basophils                                                  17
Mean corpuscular hemoglobin (MCH)                          91
Eosinoph

### Couting null values from features

In [5]:
dataset.isna().sum()

Patient ID                                                  0
Patient age quantile                                        0
SARS-Cov-2 exam result                                      0
Patient addmited to regular ward (1=yes, 0=no)              0
Patient addmited to semi-intensive unit (1=yes, 0=no)       0
Patient addmited to intensive care unit (1=yes, 0=no)       0
Hematocrit                                               5041
Hemoglobin                                               5041
Platelets                                                5042
Mean platelet volume                                     5045
Red blood Cells                                          5042
Lymphocytes                                              5042
Mean corpuscular hemoglobin concentration (MCHC)         5042
Leukocytes                                               5042
Basophils                                                5042
Mean corpuscular hemoglobin (MCH)                        5042
Eosinoph

### ... or not null values

In [6]:
dataset.count()

Patient ID                                               5644
Patient age quantile                                     5644
SARS-Cov-2 exam result                                   5644
Patient addmited to regular ward (1=yes, 0=no)           5644
Patient addmited to semi-intensive unit (1=yes, 0=no)    5644
Patient addmited to intensive care unit (1=yes, 0=no)    5644
Hematocrit                                                603
Hemoglobin                                                603
Platelets                                                 602
Mean platelet volume                                      599
Red blood Cells                                           602
Lymphocytes                                               602
Mean corpuscular hemoglobin concentration (MCHC)          602
Leukocytes                                                602
Basophils                                                 602
Mean corpuscular hemoglobin (MCH)                         602
Eosinoph

### Counting negatives and positives Covid19 exams result

In [7]:
dataset['SARS-Cov-2 exam result'].describe()

count         5644
unique           2
top       negative
freq          5086
Name: SARS-Cov-2 exam result, dtype: object

### Criteria of feature removal. 
#### --- High missing values ---> 80%
#### --- High values of missing data on positive result

In [8]:
## Removing label that will not be used
dataset = dataset.drop(columns=['Patient addmited to regular ward (1=yes, 0=no)', 'Patient addmited to semi-intensive unit (1=yes, 0=no)', 'Patient addmited to intensive care unit (1=yes, 0=no)', 'Patient ID'])

# Here we can see how much of the dataset has its cfeatures with missing values greater than 80%
columnMissing = 0
for column in dataset:
    if(dataset[column].isna().sum() > 4515):
        columnMissing +=1
columnsOver80 = columnMissing/107;
print(f'Percentage of features which missing data is greater than 85% ---> {round(columnsOver80, 3)}')

Percentage of features which missing data is greater than 85% ---> 0.822


In [9]:
#Now we're gonna remove these features.
listDrop = []
for column in dataset:
    if(dataset[column].isna().sum() > 4515):
        listDrop.append(column)
dataset = dataset.drop(columns=listDrop)

In [10]:
print(f'Remaining features: {len(dataset.columns.values)}')

Remaining features: 19


### Couting missing values greater than 80% and 70% in positives exams result

In [11]:
dataset2 = dataset
dataset2 = dataset2.loc[dataset['SARS-Cov-2 exam result'] == 'positive']
columnMissing = 0
columnMissing1 = 0
for column in dataset:
    if(dataset2[column].isnull().sum() > 446):
        columnMissing +=1
    if(dataset2[column].isnull().sum() > 390):
        columnMissing1 +=1
columnsOver80 = columnMissing/19;
columnsOver50 = columnMissing1/19;
print(f'Percentage of features which missing data is greater than 80% for positive exam result---> {100*round(columnsOver80, 3)}%')
print(f'Percentage of features which missing data is greater than 70% for positive exam result---> {100*round(columnsOver50, 3)}%')

Percentage of features which missing data is greater than 80% for positive exam result---> 0.0%
Percentage of features which missing data is greater than 70% for positive exam result---> 89.5%


### As we can see almost 90% of the features from positive exams results has missing value higher than 70%. So remove these features it's not the best solution, but imputation. 

In [12]:
dataset.dtypes

Patient age quantile            int64
SARS-Cov-2 exam result         object
Respiratory Syncytial Virus    object
Influenza A                    object
Influenza B                    object
Parainfluenza 1                object
CoronavirusNL63                object
Rhinovirus/Enterovirus         object
Coronavirus HKU1               object
Parainfluenza 3                object
Chlamydophila pneumoniae       object
Adenovirus                     object
Parainfluenza 4                object
Coronavirus229E                object
CoronavirusOC43                object
Inf A H1N1 2009                object
Bordetella pertussis           object
Metapneumovirus                object
Parainfluenza 2                object
dtype: object

In [13]:
dataset

Unnamed: 0,Patient age quantile,SARS-Cov-2 exam result,Respiratory Syncytial Virus,Influenza A,Influenza B,Parainfluenza 1,CoronavirusNL63,Rhinovirus/Enterovirus,Coronavirus HKU1,Parainfluenza 3,Chlamydophila pneumoniae,Adenovirus,Parainfluenza 4,Coronavirus229E,CoronavirusOC43,Inf A H1N1 2009,Bordetella pertussis,Metapneumovirus,Parainfluenza 2
0,13,negative,,,,,,,,,,,,,,,,,
1,17,negative,not_detected,not_detected,not_detected,not_detected,not_detected,detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected
2,8,negative,,,,,,,,,,,,,,,,,
3,5,negative,,,,,,,,,,,,,,,,,
4,15,negative,not_detected,not_detected,not_detected,not_detected,not_detected,detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected
5,9,negative,,,,,,,,,,,,,,,,,
6,13,negative,,,,,,,,,,,,,,,,,
7,16,negative,,,,,,,,,,,,,,,,,
8,1,negative,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected
9,17,negative,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected


### Up til now this analisys does not worked as expected, the remaining features has only 2 categorigal values that way imputation data will be unmeaningful and even worse these features are presence of other virus and not some indicator of Covid19. So we're gonna removal these features and start over approching new method.   

In [14]:
# Creating a list of columns names for later removal. The content of this list is the presence of other virus
listofremove = dataset.columns.values
listofremove = listofremove[2:]

In [15]:
##Iniciating a new dataset for new aproach
newdataset = pd.read_excel('datasets_574076_1040311_dataset.xlsx')
## Removing label that will not be used which includes the features related to other virus.
newdataset = newdataset.drop(columns=['Patient addmited to regular ward (1=yes, 0=no)', 'Patient addmited to semi-intensive unit (1=yes, 0=no)', 'Patient addmited to intensive care unit (1=yes, 0=no)', 'Patient ID', 'Influenza B, rapid test', 'Influenza A, rapid test'])
newdataset = newdataset.drop(columns = listofremove)

### Here we're gonna remove all columns whose values are all null 

In [16]:
#Removing columns without any values
columnMissing = 0
listofremove2 = []
for column in newdataset:
    if(newdataset[column].isnull().sum() >= 5644):
        columnMissing +=1
        listofremove2.append(column)
        
newdataset2 = newdataset.drop(columns=listofremove2)


### Here we removed all columns whose values are constant

In [17]:
#Remove unique values == 1
listofremove3 = []
for col in newdataset2.columns:
    if newdataset2[col].nunique() == 1:
        #print(f'Coluna: {col} e valores: {newdataset2[col].nunique()}')
        listofremove3.append(col)
newdataset2 = newdataset2.drop(columns=listofremove3)

### Changing categorical data into numerical data

In [18]:
## Replacing some categorical data into binary
newdataset2 = newdataset2.replace({'positive': 1, 'negative': 0, 'detected': 1, 'not_detected': 0, 'not_done': 0})


### We're dealing with rows, so rows whose missing values are greater than 75% are deleted

In [19]:
newdataset2 = newdataset2.reset_index(drop=True)
newdataset3 = newdataset2.copy()
listindex = []
for x, linhas in enumerate(newdataset2.to_numpy()):
    #npcount = sum(np.isnan(i) for i in linhas)
    npcount = np.count_nonzero(pd.isnull(linhas))
    if(npcount > len(linhas)*.75):
        listindex.append(x)

In [20]:
newdataset3.drop(newdataset3.index[listindex], inplace=True)
newdataset3 = newdataset3.reset_index(drop=True)


In [21]:
newdataset3.shape

(464, 77)

### Before we had 10% of positive exams results and now this rate increased to 15%

In [22]:
newdataset3['SARS-Cov-2 exam result'].value_counts()

0    397
1     67
Name: SARS-Cov-2 exam result, dtype: int64

### Examinating the amount of missing value for each column

In [23]:
newdataset3.isna().sum()

Patient age quantile                                  0
SARS-Cov-2 exam result                                0
Hematocrit                                            1
Hemoglobin                                            1
Platelets                                             1
Mean platelet volume                                  4
Red blood Cells                                       1
Lymphocytes                                           1
Mean corpuscular hemoglobin concentration (MCHC)      1
Leukocytes                                            1
Basophils                                             1
Mean corpuscular hemoglobin (MCH)                     1
Eosinophils                                           1
Mean corpuscular volume (MCV)                         1
Monocytes                                             2
Red blood cell distribution width (RDW)               1
Serum Glucose                                       257
Neutrophils                                     

### We have decided drop all columns with 50% or higher of its values missing

In [24]:
#Removing columns with 95%> of missing values
columnMissing = 0
listofremove3 = []
for column in newdataset3:
    if(newdataset3[column].isnull().sum() >= len(newdataset3)*0.50):
        columnMissing +=1
        listofremove3.append(column)
newdataset4 = newdataset3.drop(columns=listofremove3)

### So far we've concluded that the new aproach lead us to a new data with less missing values and a higher rate of positives exams result, the total number of features has decreased to 21 and the samples decreased for 464 aganist 5644 

In [25]:
newdataset4.to_csv('preprocessedData.csv')

In [26]:
newdataset4.shape

(464, 22)

In [27]:
newdataset4.isna().sum()

Patient age quantile                                 0
SARS-Cov-2 exam result                               0
Hematocrit                                           1
Hemoglobin                                           1
Platelets                                            1
Mean platelet volume                                 4
Red blood Cells                                      1
Lymphocytes                                          1
Mean corpuscular hemoglobin concentration (MCHC)     1
Leukocytes                                           1
Basophils                                            1
Mean corpuscular hemoglobin (MCH)                    1
Eosinophils                                          1
Mean corpuscular volume (MCV)                        1
Monocytes                                            2
Red blood cell distribution width (RDW)              1
Neutrophils                                         90
Urea                                                71
Proteina C

In [28]:
newdataset4.head(5)

Unnamed: 0,Patient age quantile,SARS-Cov-2 exam result,Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,...,Eosinophils,Mean corpuscular volume (MCV),Monocytes,Red blood cell distribution width (RDW),Neutrophils,Urea,Proteina C reativa mg/dL,Creatinine,Potassium,Sodium
0,17,0,0.236515,-0.02234,-0.517413,0.010677,0.102004,0.318366,-0.95079,-0.09461,...,1.482158,0.166192,0.357547,-0.625073,-0.619086,1.198059,-0.147895,2.089928,-0.305787,0.862512
1,1,0,-1.571682,-0.774212,1.429667,-1.672222,-0.850035,-0.005738,3.331071,0.36455,...,1.018625,-1.336024,0.068652,-0.978899,-0.127395,-0.067309,-0.286986,-1.838623,0.93002,0.503132
2,9,0,-0.747693,-0.586244,-0.42948,-0.213711,-1.361315,-1.114514,0.542882,-0.884923,...,-0.66695,1.668409,1.276759,-1.067355,0.88057,-0.811643,,-0.908177,0.435697,-0.215628
3,9,0,0.190738,-0.147652,-0.668155,1.020415,-0.127191,0.002791,-1.249524,-1.132592,...,-0.70909,0.566783,2.012129,0.613318,-0.42241,-1.332677,,-0.908177,-0.552949,-0.575008
4,13,0,1.014726,0.854844,-0.178244,0.796029,0.489872,-0.730707,-0.353319,-0.075131,...,0.217977,0.767079,0.068652,0.171035,0.651115,-0.737209,-0.434025,-0.701411,,


### IMPUTATION ALGORITHM - Here we're following a technique of imputation of medical dataset proposed by a research group. 


#### Step 1: Selecting dataset

In [29]:
newdataset4 = newdataset4.reset_index(drop=True)
dataset = newdataset4.copy()
del dataset2
del newdataset
del newdataset2
del newdataset3
del newdataset4

### Simple Imputation

In [30]:
target = dataset["SARS-Cov-2 exam result"].to_numpy()
data = dataset.drop(columns='SARS-Cov-2 exam result').to_numpy()
#data = data.drop(columns='Patient age quantile').to_numpy()

In [31]:
from sklearn.impute import SimpleImputer, KNNImputer
#data = KNNImputer(n_neighbors=6, weights='distance', metric='nan_euclidean',).fit_transform(data
data = SimpleImputer(strategy="mean").fit_transform(data)

### Normalization

In [32]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
#data = StandardScaler().fit_transform(data)
data = MinMaxScaler().fit_transform(data)

In [33]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(data, target)

feat_importance = clf.tree_.compute_feature_importances(normalize=False)
print("feat importance = " + str(feat_importance))


feat importance = [0.00287356 0.00954731 0.         0.03665988 0.         0.02608199
 0.00090517 0.00543103 0.07102968 0.00810492 0.         0.00701459
 0.00431034 0.01616115 0.00190162 0.         0.00940846 0.01597041
 0.02058113 0.0059387  0.00517241]


In [34]:
dataset = pd.DataFrame(data)
dataset.insert(0, "target", target, True)
dataset.to_csv('processedData.csv', index=False)
#pd.DataFrame(data).to_csv('processedData.csv')
#pd.DataFrame(target).to_csv('processedDataTarget.csv')


In [35]:
from sklearn.model_selection import train_test_split as tts
xTrain, xTest, yTrain, yTest = tts(data, target, shuffle = True, test_size = 0.25)

In [36]:
from sklearn.neighbors import KNeighborsClassifier
KNN_SK = KNeighborsClassifier(n_neighbors=5)
KNN_SK.fit(xTrain, yTrain)
results_SK = KNN_SK.predict(xTest)

In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(yTest, results_SK)

0.8189655172413793