In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [19]:
# Load the dataset CSV into `df` and preview its first rows.
raw_csv_path = "data/Covid Dataset.csv"
df = pd.read_csv(raw_csv_path)
df.head()


Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,Fatigue,Gastrointestinal,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
0,Yes,Yes,Yes,Yes,Yes,No,No,No,No,Yes,...,Yes,Yes,No,Yes,No,Yes,Yes,No,No,Yes
1,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,No,No,...,Yes,No,No,No,Yes,Yes,No,No,No,Yes
2,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Yes,...,Yes,Yes,Yes,No,No,No,No,No,No,Yes
3,Yes,Yes,Yes,No,No,Yes,No,No,Yes,Yes,...,No,No,Yes,No,Yes,Yes,No,No,No,Yes
4,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,...,No,Yes,No,Yes,No,Yes,No,No,No,Yes


In [20]:
# Columns excluded from the feature set used for modelling.
columns_to_drop = [
    'Asthma', 'Chronic Lung Disease', 'Headache', 'Heart Disease', 'Diabetes',
    'Hyper Tension', 'Abroad travel', 'Attended Large Gathering',
    'Family working in Public Exposed Places', 'Sanitization from Market'
]
# `df` is reassigned here, so on a second execution of this cell the columns
# are already gone and a plain drop() would raise KeyError. errors="ignore"
# keeps the cell re-runnable; the trade-off is that a misspelled column name
# is skipped silently, so check the preview below after editing the list.
df = df.drop(columns=columns_to_drop, errors="ignore")
df.head()

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Fatigue,Gastrointestinal,Contact with COVID Patient,Visited Public Exposed Places,Wearing Masks,COVID-19
0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Yes
1,Yes,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes
2,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,No,No,Yes
3,Yes,Yes,Yes,No,No,No,No,No,Yes,No,Yes
4,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,No,Yes


In [7]:
# Encode target and features
# Every column is cast to str and integer-encoded. Each column gets its own
# encoder, stored in `encoders`, so its label mapping stays available after
# the loop; a single shared encoder would be refit on every iteration and only
# remember the last column. `le` still ends up bound to the encoder of the
# last column, as before.
encoders = {}
for col in df.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

# Features and target
X = df.drop("COVID-19", axis=1)
y = df["COVID-19"]

# Split dataset
# The split happens BEFORE scaling so that the scaler's mean/variance are
# learned from the training rows only. Same test_size and random_state as the
# previous version of this cell.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features
# Fit on the training split only, then apply the same transform to the test
# split; fitting on the full dataset would leak test-set statistics into
# training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later cell that expects the fully scaled feature matrix; it now
# uses the train-fitted scaler.
X_scaled = scaler.transform(X)

In [8]:
def eval_metrics(y_true, y_pred):
    """Score one set of predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict: Metric name -> score, with the keys ``"accuracy"``,
        ``"precision"``, ``"recall"`` and ``"f1"`` in that order. Each scorer
        is called with its default arguments.
    """
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}


In [9]:
# Prepare models
os.makedirs("models", exist_ok=True)

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "RandomForest": RandomForestClassifier(class_weight='balanced'),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=3.0)
}

best_model = None
best_score = 0

mlflow.set_experiment("COVID_Prediction")

for name, model in models.items():
    with mlflow.start_run(run_name=name):
        mlflow.set_tag("Dataset", "Covid Dataset")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        metrics = eval_metrics(y_test, y_pred)

        mlflow.log_params(model.get_params())
        mlflow.log_metrics(metrics)

        input_example = X_test[:5]
        signature = infer_signature(X_train, model.predict(X_train))

        model_path = f"models/{name}.pkl"
        joblib.dump(model, model_path)
        mlflow.sklearn.log_model(model, name + "_model", signature=signature, input_example=input_example)

        print(f"\n{name} - F1 Score: {metrics['f1']:.4f}")

        if metrics["f1"] > best_score:
            best_score = metrics["f1"]
            best_model = model


2025/06/10 18:11:02 INFO mlflow.tracking.fluent: Experiment with name 'COVID_Prediction' does not exist. Creating a new experiment.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet




LogisticRegression - F1 Score: 0.9393

RandomForest - F1 Score: 0.9733


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost - F1 Score: 0.9746


In [21]:

# Save best model
# Fail loudly instead of silently pickling `None` when the training loop did
# not select any model.
if best_model is None:
    raise RuntimeError("best_model is None: no model was selected by the training loop")
joblib.dump(best_model, "models/best_model.pkl")
# The models were fitted on StandardScaler-transformed features, so the fitted
# scaler is saved next to the model; without it the pickle cannot be applied
# to unscaled inputs.
joblib.dump(scaler, "models/scaler.pkl")
print(f"\nBest Model Name: {best_model.__class__.__name__}")



Best Model Name: XGBClassifier


In [12]:
print("\n\n========= All Model Evaluation Metrics =========")

# Re-score every fitted model on the test split and print one indented line
# per metric, formatted to four decimal places.
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n{name}:")
    report_lines = [
        f"  {metric.capitalize()}: {value:.4f}"
        for metric, value in eval_metrics(y_test, y_pred).items()
    ]
    print("\n".join(report_lines))




LogisticRegression:
  Accuracy: 0.9043
  Precision: 0.9710
  Recall: 0.9095
  F1: 0.9393

RandomForest:
  Accuracy: 0.9577
  Precision: 0.9976
  Recall: 0.9502
  F1: 0.9733

XGBoost:
  Accuracy: 0.9577
  Precision: 0.9505
  Recall: 1.0000
  F1: 0.9746
