<a href="https://colab.research.google.com/github/NeuralNetGeek/monkeypox-diagnosis/blob/main/Monkey_Pox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV
import shap
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

DATA COLLECTION

The dataset was sourced from https://www.kaggle.com/datasets/muhammad4hmed/monkeypox-patients-dataset

DATA CLEANING, EXPLORATION AND PREPROCESSING

In [63]:
cols = ['Patient_ID', 'Systemic Illness', 'Rectal Pain', 'Sore Throat', 'Penile Oedema', 'Oral Lesions', 'Solitary Lesion', 'Swollen Tonsils', 'HIV Infection', 'Sexually Transmitted Infection', 'MonkeyPox']
df = pd.read_csv('monkey.data.csv')
df.columns = cols
df.drop(['Patient_ID', 'Systemic Illness'], axis=1, inplace=True)

df['Genital Symptoms'] = df[['Rectal Pain', 'Penile Oedema']].max(axis=1)

features = ['Genital Symptoms', 'Oral Lesions', 'Sore Throat', 'Swollen Tonsils', 'Solitary Lesion', 'HIV Infection']

df[features] = df[features].astype(int)

df['Monkey Pox'] = df['MonkeyPox'].map({'Positive': 1, 'Negative': 0})
df = df.drop(['Rectal Pain', 'Penile Oedema', 'Sexually Transmitted Infection', 'MonkeyPox'], axis=1)
df.drop_duplicates(inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 128 entries, 0 to 1659
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Sore Throat       128 non-null    int64
 1   Oral Lesions      128 non-null    int64
 2   Solitary Lesion   128 non-null    int64
 3   Swollen Tonsils   128 non-null    int64
 4   HIV Infection     128 non-null    int64
 5   Genital Symptoms  128 non-null    int64
 6   Monkey Pox        128 non-null    int64
dtypes: int64(7)
memory usage: 8.0 KB


In [64]:
df['Monkey Pox'].value_counts()

Unnamed: 0_level_0,count
Monkey Pox,Unnamed: 1_level_1
0,64
1,64


In [65]:
features_encoded = ['Genital Symptoms', 'Oral Lesions', 'Sore Throat', 'Swollen Tonsils', 'Solitary Lesion', 'HIV Infection']
target = 'Monkey Pox'


X = df[features_encoded]
y = df[target]

X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.3, stratify=y_temp, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

TRAIN, VALID AND TEST

In [66]:
def evaluate_classifiers(models, X_scaled, y, show_reports=True, show_confusion=True):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    summary = []

    for name, model in models.items():
        try:
            cv_scores = cross_validate(model, X_scaled, y, cv=skf, scoring='f1')
            mean_cv_score = cv_scores['test_score'].mean()

            model.fit(X_scaled, y)
            y_pred = model.predict(X_scaled)

            acc = accuracy_score(y, y_pred)
            prec = precision_score(y, y_pred, zero_division=0)
            rec = recall_score(y, y_pred, zero_division=0)
            f1 = f1_score(y, y_pred, zero_division=0)

            roc_auc = np.nan
            if hasattr(model, "predict_proba"):
                y_prob = model.predict_proba(X_scaled)[:, 1]
                if len(np.unique(y)) > 1:
                    roc_auc = roc_auc_score(y, y_prob)
                else:
                    roc_auc = np.nan

            summary.append({
                'Model': name,
                'Accuracy': acc,
                'Precision': prec,
                'Recall': rec,
                'F1 Score': f1,
                'ROC-AUC': roc_auc,
                'Cross-Validation F1 Mean': mean_cv_score
            })

            if show_reports:
                print(f"\nModel: {name}")
                print(f"Cross-Validation F1 Mean: {mean_cv_score:.4f}")
                if show_confusion:
                  print("Confusion Matrix:")
                  print(confusion_matrix(y, y_pred))
                  print("\nClassification Report:")
                  print(classification_report(y, y_pred, zero_division=0))

        except Exception as e:
            print(f"\nError with {name}: {e}")
            summary.append({
                'Model': name,
                'Accuracy': np.nan,
                'Precision': np.nan,
                'Recall': np.nan,
                'F1 Score': np.nan,
                'ROC-AUC': np.nan,
                'Cross-Validation F1 Mean': np.nan
            })

    return pd.DataFrame(summary)

models = {
    "Logistic Regression": LogisticRegression(penalty='l1', C=1, solver='liblinear', class_weight='balanced', max_iter=200),
    "K-Nearest Neighbors": KNeighborsClassifier(weights='uniform', n_neighbors=3),
    "Naive Bayes": GaussianNB(var_smoothing=1e-08),
    "Support Vector Machine": SVC(C=0.1, kernel='linear', gamma='scale', class_weight='balanced', probability=True)
}


results = evaluate_classifiers(models, X_train_scaled, y_train)
print("\nModel Performance Summary:")
print(results)


Model: Logistic Regression
Cross-Validation F1 Mean: 0.4184
Confusion Matrix:
[[20 15]
 [11 25]]

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.57      0.61        35
           1       0.62      0.69      0.66        36

    accuracy                           0.63        71
   macro avg       0.64      0.63      0.63        71
weighted avg       0.63      0.63      0.63        71


Model: K-Nearest Neighbors
Cross-Validation F1 Mean: 0.4332
Confusion Matrix:
[[21 14]
 [15 21]]

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.60      0.59        35
           1       0.60      0.58      0.59        36

    accuracy                           0.59        71
   macro avg       0.59      0.59      0.59        71
weighted avg       0.59      0.59      0.59        71


Model: Naive Bayes
Cross-Validation F1 Mean: 0.4268
Confusion Matrix:
[[20 15]
 [12 24]]

Classificat

In [67]:
def evaluate_model(model, X_valid_scaled, y_valid, show_confusion=True):
    y_pred = model.predict(X_valid_scaled)

    acc = accuracy_score(y_valid, y_pred)
    prec = precision_score(y_valid, y_pred, zero_division=0)
    rec = recall_score(y_valid, y_pred, zero_division=0)
    f1 = f1_score(y_valid, y_pred, zero_division=0)

    y_prob = None
    roc_auc = 0.0
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_valid_scaled)[:, 1]
        if len(np.unique(y_valid)) > 1:
            roc_auc = roc_auc_score(y_valid, y_prob)

    if show_confusion:
        print("Confusion Matrix:")
        print(confusion_matrix(y_valid, y_pred))
        print("Class Distribution:", dict(pd.Series(y_valid).value_counts()))

    return {
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1 Score': f1,
        'ROC-AUC': roc_auc
    }

lr_valid_results = evaluate_model(models["Logistic Regression"], X_valid_scaled, y_valid)
knn_valid_results = evaluate_model(models["K-Nearest Neighbors"], X_valid_scaled, y_valid)
nb_valid_results = evaluate_model(models["Naive Bayes"], X_valid_scaled, y_valid)

comparison_df = pd.DataFrame({
    "Logistic Regression": lr_valid_results,
    "K-Nearest Neighbors": knn_valid_results,
    "Naive Bayes": nb_valid_results
})
comparison_df = comparison_df.T

print("\nValidation Set Performance Comparison:")
print(comparison_df.round(4))

Confusion Matrix:
[[ 5 11]
 [ 9  6]]
Class Distribution: {0: np.int64(16), 1: np.int64(15)}
Confusion Matrix:
[[ 5 11]
 [ 5 10]]
Class Distribution: {0: np.int64(16), 1: np.int64(15)}
Confusion Matrix:
[[ 6 10]
 [ 8  7]]
Class Distribution: {0: np.int64(16), 1: np.int64(15)}

Validation Set Performance Comparison:
                     Accuracy  Precision  Recall  F1 Score  ROC-AUC
Logistic Regression    0.3548     0.3529  0.4000    0.3750   0.3750
K-Nearest Neighbors    0.4839     0.4762  0.6667    0.5556   0.4271
Naive Bayes            0.4194     0.4118  0.4667    0.4375   0.3833


In [68]:
svm_base = SVC(C=0.1, kernel='linear', gamma='scale', class_weight='balanced', probability=True, random_state=42)
calibrated_svm = CalibratedClassifierCV(svm_base, method='sigmoid', cv=5)

calibrated_svm.fit(X_train_scaled, y_train)

svm_valid_results = evaluate_model(calibrated_svm, X_valid_scaled, y_valid, show_confusion=True)

comparison_df.loc["Calibrated SVM"] = pd.Series(svm_valid_results)

print("\nUpdated Validation Set Performance Comparison:")
print(comparison_df.round(4))

Confusion Matrix:
[[7 9]
 [8 7]]
Class Distribution: {0: np.int64(16), 1: np.int64(15)}

Updated Validation Set Performance Comparison:
                     Accuracy  Precision  Recall  F1 Score  ROC-AUC
Logistic Regression    0.3548     0.3529  0.4000    0.3750   0.3750
K-Nearest Neighbors    0.4839     0.4762  0.6667    0.5556   0.4271
Naive Bayes            0.4194     0.4118  0.4667    0.4375   0.3833
Calibrated SVM         0.4516     0.4375  0.4667    0.4516   0.4333
