In [1]:
from pycaret.classification import *
import numpy as np
import pandas as pd
import imblearn

In [2]:
# Read the raw COVID patient table from the working directory.
covid_df = pd.read_csv(filepath_or_buffer='covid_with_NA.csv')

In [4]:
# Discard the patient identifier and the leftover CSV index column.
df1 = covid_df.drop(columns=['to_patient_id', 'Unnamed: 0'])

In [5]:
# One-hot encode every predictor; the target column is left out so it is
# not turned into dummy columns.
data = pd.get_dummies(df1.drop(columns=['last.status']))

In [6]:
# Re-attach the untouched target column to the encoded predictors
# (pandas aligns the assignment on the shared row index).
data['last.status'] = df1['last.status']

#### Comparing models with PyCaret

In [7]:
def comparemodel(dataset, target='last.status', sort='Accuracy'):
    """Run a PyCaret model comparison on ``dataset`` and return the score grid.

    Parameters
    ----------
    dataset : DataFrame handed to PyCaret ``setup`` as ``data``.
    target : name of the label column (defaults to ``'last.status'``).
    sort : metric passed to ``compare_models`` for ranking
        (defaults to ``'Accuracy'``).

    Returns
    -------
    Whatever ``pull()`` yields immediately after ``compare_models`` has run,
    i.e. the comparison grid that PyCaret just displayed.
    """
    # session_id pins PyCaret's random seed so repeated runs are comparable.
    # NOTE(review): `silent` is a PyCaret 2.x option - confirm the pinned version.
    setup(
        data=dataset,
        target=target,
        silent=True,
        session_id=1,
    )
    # The best estimator itself is not needed here, only the printed grid.
    compare_models(sort=sort)
    return pull()

In [8]:
def createmodel(dataset, model, target='last.status'):
    """Fit one PyCaret model on ``dataset``, plot its confusion matrix, return it.

    Parameters
    ----------
    dataset : DataFrame handed to PyCaret ``setup`` as ``data``.
    model : model identifier or estimator forwarded to ``create_model``.
    target : name of the label column (defaults to ``'last.status'``).

    Returns
    -------
    The object returned by ``create_model`` so callers can reuse the fitted
    model instead of losing it when the function exits.
    """
    # Same experiment configuration as comparemodel: fixed seed, no prompt.
    # NOTE(review): `silent` is a PyCaret 2.x option - confirm the pinned version.
    setup(
        data=dataset,
        target=target,
        silent=True,
        session_id=1,
    )
    fitted_model = create_model(model)
    # Visual check of the per-class errors for the fitted model.
    plot_model(fitted_model, 'confusion_matrix')
    return fitted_model

#### Method 1: Delete all rows containing NAs

In [11]:
# Keep only the rows that have no missing value in any column.
df1_base = df1.dropna()
# Encode the predictors of the complete rows, then put the target back.
base_data = pd.get_dummies(df1_base.drop(columns=['last.status']))
base_data['last.status'] = df1_base['last.status']

#### Save data after deleting the NAs

In [None]:
# Persist the NA-free, encoded table without the row index.
base_data.to_csv(path_or_buf='base_data.csv', index=False)

#### Method 2: Oversample the minority class

In [14]:
# Split the NA-free table into the label vector and the predictor matrix.
y_train = base_data['last.status']
X_train = base_data.drop(columns=['last.status'])

In [15]:
# SMOTE draws synthetic minority samples at random; seed it (same value as
# the PyCaret session_id) so the resampled data and the saved CSV are
# reproducible on a fresh kernel.
X_train_resampled, y_train_resampled = imblearn.over_sampling.SMOTE(
    random_state=1
).fit_resample(X_train, y_train)

In [16]:
# Work on a copy: assigning through a plain alias would also add the target
# column to X_train_resampled and silently corrupt the feature matrix.
imbalanced_data = X_train_resampled.copy()
imbalanced_data['last.status'] = y_train_resampled

#### Save data after oversampling the minority class

In [None]:
# Persist the oversampled table without the row index.
imbalanced_data.to_csv(path_or_buf='imbalanced_data.csv', index=False)

#### Compare different models

In [48]:
# Model leaderboard for the NA-dropped data.
base_results = comparemodel(dataset=base_data)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Ridge Classifier,0.9293,0.0,0.9891,0.936,0.9617,0.4859,0.5362,0.007
1,K Neighbors Classifier,0.922,0.7587,0.9918,0.9267,0.9581,0.3839,0.4529,0.0053
2,Logistic Regression,0.9195,0.9143,0.9756,0.9378,0.9561,0.4585,0.4895,0.046
3,Light Gradient Boosting Machine,0.9171,0.9189,0.9703,0.9402,0.9547,0.4603,0.4828,0.0517
4,Linear Discriminant Analysis,0.9122,0.91,0.9592,0.9444,0.9513,0.4755,0.4976,0.0087
5,CatBoost Classifier,0.9122,0.9234,0.9783,0.9282,0.9525,0.3741,0.4177,1.4621
6,Ada Boost Classifier,0.9098,0.9045,0.9621,0.9395,0.9504,0.4404,0.4609,0.1612
7,Gradient Boosting Classifier,0.9098,0.9196,0.9675,0.9349,0.9506,0.4153,0.4459,0.1293
8,Random Forest Classifier,0.9073,0.8145,0.9675,0.9327,0.9495,0.3724,0.3906,0.1127
9,Extreme Gradient Boosting,0.9024,0.9113,0.9621,0.9321,0.9465,0.3685,0.39,0.0991


In [None]:
# Model leaderboard for the SMOTE-oversampled data.
imbalanced_results = comparemodel(dataset=imbalanced_data)

#### Merge the compare_models results into a single dataset

In [49]:
# Drop the metrics that are not carried into the merged summary.
base_results_new = base_results.drop(
    columns=['Recall', 'Prec.', 'Kappa', 'MCC', 'TT (Sec)']
)

In [50]:
# Drop the metrics that are not carried into the merged summary.
imbalanced_results_new = imbalanced_results.drop(
    columns=['Recall', 'Prec.', 'Kappa', 'MCC', 'TT (Sec)']
)

In [51]:
# Tag each leaderboard with the preprocessing method it came from so the
# rows stay distinguishable once the two tables are stacked.
imbalanced_results_new['Method'] = 'Imbalanced'
base_results_new['Method'] = 'base'

In [54]:
# Stack the two labelled leaderboards row-wise (original row labels kept).
dataset_all = pd.concat(objs=[base_results_new, imbalanced_results_new], axis=0)

In [57]:
# Reshape to long form: one row per (Model, Method, metric) combination.
dataset_long = dataset_all.melt(
    id_vars=['Model', 'Method'],
    value_vars=['Accuracy', 'AUC', 'F1'],
)

In [61]:
# Preview the first five rows of the long-format table.
dataset_long.head(5)

Unnamed: 0,Model,Method,variable,value
0,Ridge Classifier,base,Accuracy,0.9293
1,K Neighbors Classifier,base,Accuracy,0.922
2,Logistic Regression,base,Accuracy,0.9195
3,Light Gradient Boosting Machine,base,Accuracy,0.9171
4,Linear Discriminant Analysis,base,Accuracy,0.9122


In [69]:
# Persist the long-format comparison table without the row index.
dataset_long.to_csv(path_or_buf='delete_sampling_results.csv', index=False)