In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [52]:
# Load the raw dataset from an Excel workbook; the path is relative to the notebook's working directory.
data = pd.read_excel('data/dataset.xlsx')

In [53]:
# Work on a copy so the freshly loaded `data` frame stays available unchanged.
df = data.copy()

In [54]:
# Fraction of missing values in each column (NaN count divided by the number of rows).
missing_rate = df.isna().sum() / len(df)

# Group columns by how sparse they are; both bounds of each band are strict.
# NOTE(review): going by the variable names, the 88-90% band is presumably the
# blood-test columns and the 75-80% band the viral-test columns - confirm against the dataset.
blood_columns = df.columns[(missing_rate > 0.88) & (missing_rate < 0.9)].tolist()
viral_columns = df.columns[(missing_rate > 0.75) & (missing_rate < 0.80)].tolist()
# Columns kept regardless of their missing rate; 'SARS-Cov-2 exam result' is the label used later.
key_columns = ['Patient age quantile', 'SARS-Cov-2 exam result']

In [55]:
# Restrict the frame to the key columns plus the two column groups selected by missing rate.
df = df[key_columns + blood_columns + viral_columns]
# Bare last expression so the notebook displays the resulting (rows, columns) shape.
df.shape

(5644, 33)

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
trainset, testset = train_test_split(df, test_size = 0.2, random_state=0)

In [58]:
#encoding qualitative values into quantitative values

def encoding(df):
    """Map the categorical result strings of a frame to 0/1 codes.

    Every object-dtype column is passed through a fixed lookup table
    ('positive'/'detected' -> 1, 'negative'/'not_detected' -> 0). Values that
    are not in the table, including missing values, come out as NaN because
    ``Series.map`` yields NaN for unmapped entries.

    The encoding is applied to a copy, so the caller's frame is never
    modified. This matters when ``df`` is a slice of another frame (such as a
    train/test split): writing into such a slice in place alters the caller's
    data and can raise pandas' SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns hold the result strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``, with every
        object-dtype column replaced by its mapped codes.
    """
    code = {
        'positive' : 1,
        'negative' : 0,
        'detected' : 1,
        'not_detected' : 0
    }
    # Copy first: the input may be a view/slice owned by the caller.
    encoded = df.copy()
    for col in encoded.select_dtypes('object'):
        encoded[col] = encoded[col].map(code)

    return encoded

In [59]:
#deletion of the rows containing missing data

def na_delation(df):
    """Return ``df`` without the rows that contain at least one missing value.

    The input frame is not modified; ``dropna`` returns a new frame.
    """
    complete_rows = df.dropna(axis='index')
    return complete_rows

In [60]:
def preprocessing(df):
    """Encode, drop incomplete rows, and split a frame into features and label.

    The frame is first passed through ``encoding`` and then ``na_delation``.
    The 'SARS-Cov-2 exam result' column of the result becomes the label and
    all remaining columns become the features.

    Returns
    -------
    tuple
        ``(X, y)``: the feature DataFrame and the label Series.
    """
    target = 'SARS-Cov-2 exam result'
    cleaned = na_delation(encoding(df))

    return cleaned.drop(target, axis = 1), cleaned[target]

In [69]:
# Run the same preprocessing on each split separately to get feature/label pairs.
X_train, y_train = preprocessing(trainset)
X_test, y_test = preprocessing(testset)

In [63]:
from sklearn.tree import DecisionTreeClassifier

In [64]:
# Decision tree classifier with a fixed seed so repeated runs build the same tree.
model = DecisionTreeClassifier(random_state = 0)

In [66]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve

In [70]:
def evaluation(model):
    """Fit ``model`` on the training split and print its test-set metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The model is fitted in place, then the confusion matrix and
    the classification report for its test predictions are printed.
    Nothing is returned.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Each metric is computed and printed in turn: confusion matrix first, then the report.
    for metric in (confusion_matrix, classification_report):
        print(metric(y_test, y_pred))

In [71]:
# Fit and score the decision tree; in the recorded run below, recall on class 1 is only 0.17.
evaluation(model)

[[52  2]
 [10  2]]
              precision    recall  f1-score   support

           0       0.84      0.96      0.90        54
           1       0.50      0.17      0.25        12

    accuracy                           0.82        66
   macro avg       0.67      0.56      0.57        66
weighted avg       0.78      0.82      0.78        66

