# Preprocessing workflow [Goals] :
1. Put the data in a format conducive to ML :
- Train / Test
- Encoding
- NaN Cleaning

=> Create the first ~very basic~ model, Evaluation / Diagnosis

2. Improve the performance of the model :
- Feature Selection
- Feature Engineering
- Feature Scaling
- Outliers Elimination

# Environment setup

In [160]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [161]:
%cd /content/drive/MyDrive/Colab\ Notebooks/Machine Learnia/Machine\ Learning/COVID-19\ Project
!ls

/content/drive/MyDrive/Colab Notebooks/Machine Learnia/Machine Learning/COVID-19 Project
dataset.csv  dataset.xlsx  EDA_Exploratory_Data_Analysis.ipynb	Preprocessing.ipynb


In [162]:
import warnings
warnings.filterwarnings("ignore")

# Loading the data

In [163]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [166]:
data = pd.read_excel('dataset.xlsx')

In [167]:
df = data.copy()

In [168]:
df.shape

(5644, 111)

Creating the same subsets as in EDA :

In [169]:
missing_rate = df.isna().sum()/df.shape[0]

In [170]:
blood_columns = list(df.columns[(missing_rate < 0.9) & (missing_rate > 0.88)])
viral_columns = list(df.columns[(missing_rate < 0.88) & (missing_rate > 0.75)])

In [171]:
key_columns = ['Patient age quantile', 'SARS-Cov-2 exam result']

In [172]:
df = df[key_columns + blood_columns + viral_columns]
df.head()

Unnamed: 0,Patient age quantile,SARS-Cov-2 exam result,Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,...,Adenovirus,Parainfluenza 4,Coronavirus229E,CoronavirusOC43,Inf A H1N1 2009,Bordetella pertussis,Metapneumovirus,Parainfluenza 2,"Influenza B, rapid test","Influenza A, rapid test"
0,13,negative,,,,,,,,,...,,,,,,,,,,
1,17,negative,0.236515,-0.02234,-0.517413,0.010677,0.102004,0.318366,-0.95079,-0.09461,...,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,negative,negative
2,8,negative,,,,,,,,,...,,,,,,,,,,
3,5,negative,,,,,,,,,...,,,,,,,,,,
4,15,negative,,,,,,,,,...,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,,


In [173]:
df.shape

(5644, 35)

# Traint/Test, Encoding, Cleaning :

Splitting train / test data :

In [175]:
from sklearn.model_selection import train_test_split

In [190]:
train_set, test_set = train_test_split(df, test_size=0.2, random_state=0)

In [187]:
train_set['SARS-Cov-2 exam result'].value_counts()

Series([], Name: count, dtype: int64)

In [188]:
test_set['SARS-Cov-2 exam result'].value_counts()

SARS-Cov-2 exam result
negative    1018
positive     111
Name: count, dtype: int64

Encoding data :

In [179]:
def encoding(df):
    code = {'negative':0,
            'positive':1,
            'not_detected':0,
            'detected':1}

    for col in df.select_dtypes('object').columns:
        df.loc[:,col] = df[col].map(code)

    return df

In [180]:
def imputation(df):
  return df.dropna(axis=0)

In [181]:
def preprocessing(df):

    df = encoding(df)
    df = imputation(df)

    X = df.drop('SARS-Cov-2 exam result', axis=1)
    y = df['SARS-Cov-2 exam result']

    print(y.value_counts())

    return X, y

In [191]:
X_train, y_train = preprocessing(train_set)

SARS-Cov-2 exam result
0    73
1    10
Name: count, dtype: int64


In [192]:
X_test, y_test = preprocessing(test_set)

SARS-Cov-2 exam result
0    13
1     3
Name: count, dtype: int64


# First model :

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model = DecisionTreeClassifier(random_state=0)

# Evaluation process
We'll be using F1 as evaluation metrics

In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve #Is our model showing an Over/Underfitting so we can decide what's next

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
def evaluation(model):
  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  conf = confusion_matrix(y_test, y_pred)
  sn.heatmap(conf, cmap="YlGnBu", annot=True, fmt='d')
  plt.show()

  print(classification_report(y_test, y_pred))

In [None]:
evaluation(model)