In [149]:
# import libraries algorithm training
from sklearn.model_selection import cross_validate, train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report, roc_curve, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np

In [150]:
# load the dataset
df = pd.read_csv("cleaned_covid.csv")
SEED = 42

In [151]:
# check for duplicates
df = df[df["Result"] != "INDETERMINATE"]
df.duplicated().sum()

np.int64(0)

In [152]:
df.head(2)

Unnamed: 0,Sex,Birth Year,Chest pain,Cough,Diarrhea,Fatigue or general weakness,Fever,Headache,Thorax (sore throat),Nausea,Runny nose,Sore throat or pharyngitis,Vomiting,Loss of Taste,Loss of Smell,Result,age
0,MALE,1988,NO,NO,NO,NO,YES,NO,NO,NO,NO,NO,NO,NO,NO,NEGATIVE,37
1,FEMALE,1978,NO,YES,NO,NO,NO,NO,YES,NO,YES,YES,NO,NO,NO,NEGATIVE,47


In [153]:
df.columns

Index(['Sex', 'Birth Year', 'Chest pain', 'Cough', 'Diarrhea',
       'Fatigue or general weakness', 'Fever', 'Headache',
       'Thorax (sore throat)', 'Nausea', 'Runny nose',
       'Sore throat or pharyngitis', 'Vomiting', 'Loss of Taste',
       'Loss of Smell', 'Result', 'age'],
      dtype='object')

In [154]:
def label_encode_columns(df, columns):
    """
    Label encodes the specified categorical columns in a DataFrame without using sklearn.
    
    Args:
    df (pd.DataFrame): The input DataFrame.
    columns (list): List of column names to label encode.
    
    Returns:
    pd.DataFrame: DataFrame with label-encoded columns.
    """
    df = df.copy()  # Avoid modifying the original DataFrame
    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype("category").cat.codes  # Convert to category codes
    return df

In [155]:
# Columns to encode
columns_to_encode = ['Chest pain', 'Cough', 'Diarrhea','Fatigue or general weakness', 'Fever', 'Headache',
                    'Thorax (sore throat)', 'Nausea', 'Runny nose', 'Sore throat or pharyngitis', 'Vomiting',
                    'Loss of Taste', 'Loss of Smell', 'Result']

# Apply function
df_encoded = label_encode_columns(df, columns_to_encode)

In [156]:
df_encoded

Unnamed: 0,Sex,Birth Year,Chest pain,Cough,Diarrhea,Fatigue or general weakness,Fever,Headache,Thorax (sore throat),Nausea,Runny nose,Sore throat or pharyngitis,Vomiting,Loss of Taste,Loss of Smell,Result,age
0,MALE,1988,0,0,0,0,1,0,0,0,0,0,0,0,0,0,37
1,FEMALE,1978,0,1,0,0,0,0,1,0,1,1,0,0,0,0,47
2,MALE,1995,0,1,0,0,0,0,1,0,1,1,0,0,0,0,30
3,MALE,1967,0,0,0,0,1,0,0,0,0,0,0,0,0,0,58
4,MALE,2015,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1822,FEMALE,1990,0,0,0,1,0,1,0,0,0,0,0,0,0,0,35
1823,MALE,1995,0,1,0,0,1,0,0,0,0,0,0,0,0,0,30
1824,FEMALE,1999,1,0,1,0,0,0,0,0,0,0,1,0,1,0,26
1825,FEMALE,1995,1,0,0,1,0,1,0,0,0,0,0,1,0,0,30


In [157]:
def one_hot_encode_columns(df, columns):
    """
    One-hot encodes the specified categorical columns in a DataFrame without using sklearn.

    Args:
    df (pd.DataFrame): The input DataFrame.
    columns (list): List of column names to one-hot encode.

    Returns:
    pd.DataFrame: DataFrame with one-hot encoded columns.
    """
    df = df.copy()  # Avoid modifying the original DataFrame
    return pd.get_dummies(df, columns=columns, drop_first=False)  # Keep all categories

In [158]:
# Columns to encode
columns_to_encode = ['Sex']

# Apply function
df_encoded = one_hot_encode_columns(df_encoded, columns_to_encode)

In [159]:
df_encoded.head(2)

Unnamed: 0,Birth Year,Chest pain,Cough,Diarrhea,Fatigue or general weakness,Fever,Headache,Thorax (sore throat),Nausea,Runny nose,Sore throat or pharyngitis,Vomiting,Loss of Taste,Loss of Smell,Result,age,Sex_FEMALE,Sex_MALE
0,1988,0,0,0,0,1,0,0,0,0,0,0,0,0,0,37,False,True
1,1978,0,1,0,0,0,0,1,0,1,1,0,0,0,0,47,True,False


In [160]:
# 
X = df_encoded.drop(["Result", "Birth Year"], axis = 1)
y = df_encoded["Result"]
y.value_counts()

Result
0    1320
1     504
Name: count, dtype: int64

In [161]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = SEED) 

In [162]:
# Scale the Data (Normalization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Experimentation with cross validation

In [163]:
# initialize the algorithms for training
lrc = LogisticRegression()
rfc = RandomForestClassifier()
dtc = DecisionTreeClassifier()

#### training of logistics regression

In [164]:
# train a Logistic Regression Model
lrc.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [165]:
# Make Predictions
y_pred = lrc.predict(X_test_scaled)



In [166]:
class_report = classification_report(y_test, y_pred)
# ROC Curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print("\n Accuracy of Logistics Regression:\n", accuracy_score(y_test, y_pred))
print("\n Precision of Logistics Regression:\n", precision_score(y_test, y_pred))
print("\n Recall of Logistics Regression:\n", recall_score(y_test, y_pred))
print("\n F1_score of Logistics Regression:\n", f1_score(y_test, y_pred))
print("\n roc_au of Logistics Regression:\n", roc_auc)
print("\n Logistics Regression Classification Report:\n", class_report)


 Accuracy of Logistics Regression:
 0.5748175182481752

 Precision of Logistics Regression:
 0.3389830508474576

 Recall of Logistics Regression:
 0.5095541401273885

 F1_score of Logistics Regression:
 0.4071246819338422

 roc_au of Logistics Regression:
 0.5552885790151009

 Logistics Regression Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.60      0.67       391
           1       0.34      0.51      0.41       157

    accuracy                           0.57       548
   macro avg       0.55      0.56      0.54       548
weighted avg       0.63      0.57      0.59       548



#### training of decision tree

In [167]:
# train a Decision Tree Model
dtc.fit(X_train, y_train)

In [168]:
# Make Predictions
y_pred_dtc = dtc.predict(X_test_scaled)



In [169]:
# ROC Curve and AUC
fpr_dtc, tpr_dtc, thresholds_dtc = roc_curve(y_test, y_pred_dtc)
roc_auc_dtc = auc(fpr_dtc, tpr_dtc)
# classification report
class_report_dtc = classification_report(y_test, y_pred_dtc)
print("\n Accuracy of Decision Tree:\n", accuracy_score(y_test, y_pred_dtc))
print("\n Precision of Decision Tree:\n", precision_score(y_test, y_pred_dtc))
print("\n Recall of Decision Tree:\n", recall_score(y_test, y_pred_dtc))
print("\n F1_score of Decision Tree:\n", f1_score(y_test, y_pred_dtc))
print("\n roc_auc of Decision Tree:\n", roc_auc_dtc)
print("\nDecision Tree Classification Report:\n", class_report_dtc)


 Accuracy of Decision Tree:
 0.6551094890510949

 Precision of Decision Tree:
 0.3048780487804878

 Recall of Decision Tree:
 0.1592356687898089

 F1_score of Decision Tree:
 0.20920502092050208

 roc_auc of Decision Tree:
 0.5067278088194569

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.85      0.78       391
           1       0.30      0.16      0.21       157

    accuracy                           0.66       548
   macro avg       0.51      0.51      0.49       548
weighted avg       0.60      0.66      0.62       548



#### training of random forest

In [170]:
# train a Random Forest Model
rfc.fit(X_train, y_train)

In [171]:
# Make Predictions
y_pred_rfc = rfc.predict(X_test_scaled)



In [172]:
# ROC Curve and AUC
fpr_rfc, tpr_rfc, thresholds_rfc = roc_curve(y_test, y_pred)
roc_auc_rfc = auc(fpr_rfc, tpr_rfc)
# classification report
class_report_rfc = classification_report(y_test, y_pred_rfc)
print("\n Accuracy of Random Forest:\n", accuracy_score(y_test, y_pred_rfc))
print("\n Precision of Random Forest:\n", precision_score(y_test, y_pred_rfc))
print("\n Recall of Random Forest:\n", recall_score(y_test, y_pred_rfc))
print("\n F1_score of Random Forest:\n", f1_score(y_test, y_pred_rfc))
print("\n roc_auc of Random Forest:\n", roc_auc_rfc)
print("\n Random Forest Classification Report:\n", class_report_rfc)


 Accuracy of Random Forest:
 0.6824817518248175

 Precision of Random Forest:
 0.373134328358209

 Recall of Random Forest:
 0.1592356687898089

 F1_score of Random Forest:
 0.22321428571428573

 roc_auc of Random Forest:
 0.5552885790151009

 Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.89      0.80       391
           1       0.37      0.16      0.22       157

    accuracy                           0.68       548
   macro avg       0.55      0.53      0.51       548
weighted avg       0.62      0.68      0.64       548



#### Experimentation with cross validation

In [173]:
k = 5 
kf = KFold(n_splits=k, shuffle=True, random_state=SEED)

In [174]:
# Scale Features
X_scaled = scaler.fit_transform(X)

#### optimizing logistics regression using kfold

In [175]:
# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold split
scores = cross_val_score(lrc, X_scaled, y, cv=kf, scoring='f1_macro')  # Compute accuracy for each fold

In [176]:
# Print Results
print(f"Cross-Validation Scores: {scores}")
print(f"Mean f1 score: {np.mean(scores):.2f}")

Cross-Validation Scores: [0.45811241 0.51252893 0.46712474 0.48321197 0.48266203]
Mean f1 score: 0.48


#### optimizing decision tree using kfold

In [177]:
# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold split
scores_dtc = cross_val_score(dtc, X_scaled, y, cv=kf, scoring='f1_macro')  # Compute accuracy for each fold

In [178]:
# Print Results
print(f"Cross-Validation Scores: {scores_dtc}")
print(f"Mean f1 score: {np.mean(scores_dtc):.2f}")

Cross-Validation Scores: [0.47347669 0.49657049 0.48481506 0.46393438 0.47109235]
Mean f1 score: 0.48


#### optimizing random forest using kfold

In [179]:
# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold split
scores_rfc = cross_val_score(rfc, X_scaled, y, cv=kf, scoring='f1_macro')  # Compute accuracy for each fold

In [180]:
# Print Results
print(f"Cross-Validation Scores: {scores_rfc}")
print(f"Mean f1 score: {np.mean(scores_rfc):.2f}")

Cross-Validation Scores: [0.48055028 0.49850828 0.48239103 0.4877193  0.47266909]
Mean f1 score: 0.48
