# Importing Necessary Libraries

In [1]:
import os
import json
import pandas as pd
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import auc, precision_recall_curve
import mlflow
import mlflow.sklearn

# Ignore Warnings

In [2]:
warnings.filterwarnings("ignore")

# Defining Important Variables and Constants

In [3]:
DATA_FOLDER  = os.path.abspath("./data")
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
VAL_FILE = "validation.csv"

MODEL_PARAMETERS = {
    "Logistic Regression": {
        "C": 1.0,  # Inverse of regularization strength
        "solver": "lbfgs",  # Algorithm to use in the optimization problem
        "max_iter": 1000,  # Maximum number of iterations for optimization
    },
    "Random Forest": {
        "n_estimators": 100,  # Number of trees in the forest
        "max_depth": None,  # Maximum depth of the tree
        "min_samples_split": 2,  # Minimum number of samples required to split an internal node
        "min_samples_leaf": 1,  # Minimum number of samples required to be at a leaf node
    },
    "Gradient Boosting": {
        "n_estimators": 100,  # Number of boosting stages
        "learning_rate": 0.1,  # Learning rate shrinks the contribution of each tree
        "max_depth": 3,  # Maximum depth of the individual estimators
        "min_samples_split": 2,  # Minimum number of samples required to split an internal node
        "min_samples_leaf": 1,  # Minimum number of samples required to be at a leaf node
        "max_features": None,  # Number of features to consider when looking for the best split
        "subsample": 1.0,  # Fraction of samples used for fitting the individual base learners
    },
}

MODELS = [
    ("Logistic Regression", LogisticRegression(**MODEL_PARAMETERS["Logistic Regression"])),
    ("Random Forest", RandomForestClassifier(**MODEL_PARAMETERS["Random Forest"])),
    ("Gradient Boosting", GradientBoostingClassifier(**MODEL_PARAMETERS["Gradient Boosting"])),
]

BEST_MODEL_NAME = None
BEST_MODEL_AUCPR = -float("inf")

# Defining Important Functions

In [4]:
def load_data(train_file, test_file, val_file):
    train_df = pd.read_csv(os.path.join(DATA_FOLDER, train_file))
    test_df = pd.read_csv(os.path.join(DATA_FOLDER, test_file))
    val_df = pd.read_csv(os.path.join(DATA_FOLDER, val_file))
    return train_df, test_df, val_df

def train_evaluate_and_save_model(model_name, model, X_train, y_train, X_val, y_val):
    print(f"Training, Evaluating and Registering Model: {model_name}")
    model_parameters = MODEL_PARAMETERS[model_name]
    print(f"\nModel Parameters:\n{json.dumps(model_parameters, indent = 4)}\n")

    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_val)[:, 1]
        precision, recall, _ = precision_recall_curve(y_val, y_pred_proba)
        aucpr = auc(recall, precision)
        mlflow.log_param("model_name", model_name)
        mlflow.log_params(model_parameters)
        mlflow.log_metric("AUCPR", aucpr)
        mlflow.sklearn.log_model(model, model_name)

    print(f"Registered Model: {model_name}")

def load_and_evaluate_model(model_name, X_val_tfidf, y_val):
    global BEST_MODEL_NAME, BEST_MODEL_AUCPR
    runs = mlflow.search_runs(filter_string=f"params.model_name = \"{model_name}\"")
    if not runs.empty:
        run_id = runs.iloc[0].run_id
        model_uri = f"runs:/{run_id}/{model_name}"
        loaded_model = mlflow.sklearn.load_model(model_uri)
        y_pred_proba = loaded_model.predict_proba(X_val_tfidf)[:, 1]
        precision, recall, _ = precision_recall_curve(y_val, y_pred_proba)
        aucpr = auc(recall, precision)
        if aucpr > BEST_MODEL_AUCPR:
            BEST_MODEL_NAME = model_name
            BEST_MODEL_AUCPR = aucpr
        print(f"{model_name} AUCPR: {aucpr}")

# Generating Train and Test Set

In [5]:
train_df, test_df, val_df = load_data(TRAIN_FILE, TEST_FILE, VAL_FILE)

X_train, y_train = train_df["text"], train_df["spam"]
X_test, y_test = test_df["text"], test_df["spam"]
X_val, y_val = val_df["text"], val_df["spam"]

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

model_parameters = {}

# Training, Evaluating and Registering Logistic Regression Model

In [6]:
train_evaluate_and_save_model(*MODELS[0], X_train_tfidf, y_train, X_val_tfidf, y_val)

Training, Evaluating and Registering Model: Logistic Regression

Model Parameters:
{
    "C": 1.0,
    "solver": "lbfgs",
    "max_iter": 1000
}



Registered Model: Logistic Regression


# Training, Evaluating and Registering Random Forest Model

In [7]:
train_evaluate_and_save_model(*MODELS[1], X_train_tfidf, y_train, X_val_tfidf, y_val)

Training, Evaluating and Registering Model: Random Forest

Model Parameters:
{
    "n_estimators": 100,
    "max_depth": null,
    "min_samples_split": 2,
    "min_samples_leaf": 1
}

Registered Model: Random Forest


# Training, Evaluating and Registering Gradient Boosting Model

In [8]:
train_evaluate_and_save_model(*MODELS[2], X_train_tfidf, y_train, X_val_tfidf, y_val)

Training, Evaluating and Registering Model: Gradient Boosting

Model Parameters:
{
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 3,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": null,
    "subsample": 1.0
}

Registered Model: Gradient Boosting


# Model Evaluation for Logistic Regression

In [9]:
load_and_evaluate_model(MODELS[0][0], X_val_tfidf, y_val)

Logistic Regression AUCPR: 0.9988594292001929


# Model Evaluation for Random Forest

In [10]:
load_and_evaluate_model(MODELS[1][0], X_val_tfidf, y_val)

Random Forest AUCPR: 0.995232000963588


# Model Evaluation for Gradient Boosting

In [11]:
load_and_evaluate_model(MODELS[2][0], X_val_tfidf, y_val)

Gradient Boosting AUCPR: 0.9892071201001098


# Running MLFlow UI

In [12]:
!mlflow ui

[2024-02-21 15:48:34 +0530] [122118] [INFO] Starting gunicorn 21.2.0
[2024-02-21 15:48:34 +0530] [122118] [INFO] Listening at: http://127.0.0.1:5000 (122118)
[2024-02-21 15:48:34 +0530] [122118] [INFO] Using worker: sync
[2024-02-21 15:48:34 +0530] [122128] [INFO] Booting worker with pid: 122128
[2024-02-21 15:48:34 +0530] [122129] [INFO] Booting worker with pid: 122129
[2024-02-21 15:48:34 +0530] [122130] [INFO] Booting worker with pid: 122130
[2024-02-21 15:48:35 +0530] [122131] [INFO] Booting worker with pid: 122131
^C
[2024-02-21 15:49:41 +0530] [122118] [INFO] Handling signal: int
[2024-02-21 15:49:41 +0530] [122130] [INFO] Worker exiting (pid: 122130)
[2024-02-21 15:49:41 +0530] [122129] [INFO] Worker exiting (pid: 122129)
[2024-02-21 15:49:41 +0530] [122131] [INFO] Worker exiting (pid: 122131)
[2024-02-21 15:49:41 +0530] [122128] [INFO] Worker exiting (pid: 122128)


# Best Model by AUCPR

In [13]:
print(f"Best Model: {BEST_MODEL_NAME}")
print(f"Best Model AUCPR: {BEST_MODEL_AUCPR}")

Best Model: Logistic Regression
Best Model AUCPR: 0.9988594292001929
