In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, precision_recall_curve
import mlflow
import mlflow.sklearn

# Load data
# The three CSV splits are listed in one place so the locations are easy to
# spot and change. NOTE: these are absolute paths on one specific machine.
_CSV_PATHS = {
    "train": r"C:\Users\asust\Assignments\assignment\train.csv",
    "test": r"C:\Users\asust\Assignments\assignment\test.csv",
    "valid": r"C:\Users\asust\Assignments\assignment\validation.csv",
}
train_df = pd.read_csv(_CSV_PATHS["train"])
test_df = pd.read_csv(_CSV_PATHS["test"])
valid_df = pd.read_csv(_CSV_PATHS["valid"])

EXPERIMENT_NAME = "mlflow-spam-detection_101"

# mlflow.create_experiment raises when an experiment with this name already
# exists, so calling it unconditionally makes the cell fail on every re-run.
# Look the experiment up first and create it only when it is missing; either
# way EXPERIMENT_ID ends up holding the experiment's ID.
_existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if _existing_experiment is None:
    EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    EXPERIMENT_ID = _existing_experiment.experiment_id

# Define benchmark models
# Each entry maps a model name to the keyword arguments passed to
# LogisticRegression; an empty dict means scikit-learn defaults.
_MODEL_KWARGS = {
    "LR_Count": {},
    "LR_Tfidf": {},
    "LR_Count_Balanced": {"class_weight": "balanced"},
}
# Insertion order is preserved, so the models are iterated in the order above.
models = {
    name: LogisticRegression(**kwargs)
    for name, kwargs in _MODEL_KWARGS.items()
}

# Train and log benchmark models
#
# For each entry in `models`: vectorize the text, fit the classifier on the
# training split, score the test split, and log the area under the
# precision-recall curve (AUCPR), the fitted classifier and the vectorizer
# name to an MLflow run.
#
# Runs are started with experiment_id=EXPERIMENT_ID so they are recorded in
# the experiment set up above. Without it they went to whichever experiment
# was active (by default MLflow's default experiment) and EXPERIMENT_ID was
# never used.
#
# NOTE: only the classifier is logged; the fitted vectorizer is not part of
# the logged artifact, so the logged model cannot score raw text on its own.
run_ids = {}  # model name -> ID of the MLflow run that holds its results
for i, (model_name, model) in enumerate(models.items()):
    with mlflow.start_run(
        run_name="Model {}".format(i+1),
        experiment_id=EXPERIMENT_ID,
    ) as run:
        # "LR_Tfidf" uses TF-IDF features; every other model uses raw counts.
        vectorizer = TfidfVectorizer() if model_name == "LR_Tfidf" else CountVectorizer()
        # Fit the vocabulary on the training text only, then reuse it for test.
        X_train = vectorizer.fit_transform(train_df["Text"])
        X_test = vectorizer.transform(test_df["Text"])
        model.fit(X_train, train_df["Label"])
        # Column 1 of predict_proba is the probability of the second class in
        # model.classes_ (assumed to be the positive/spam label - TODO confirm).
        y_pred = model.predict_proba(X_test)[:, 1]
        precision, recall, thresholds = precision_recall_curve(test_df["Label"], y_pred)
        aucpr = auc(recall, precision)
        mlflow.log_metric("AUCPR", aucpr)
        mlflow.sklearn.log_model(model, model_name)
        # Derive the logged name from the object actually used, so the
        # parameter cannot drift from the vectorizer choice above.
        mlflow.log_param("vectorizer", type(vectorizer).__name__)
        run_ids[model_name] = run.info.run_id

# Print AUCPR for each model
# Read the metric back through the run IDs captured above instead of searching
# experiment "0" by run name: that search misses runs stored in another
# experiment (IndexError on an empty result) and is ambiguous when several
# runs share the same name.
client = mlflow.tracking.MlflowClient()
for model_name, run_id in run_ids.items():
    aucpr = client.get_metric_history(run_id, "AUCPR")[-1].value
    print("AUCPR for {}: {:.4f}".format(model_name, aucpr))


AUCPR for LR_Count: 0.9907
AUCPR for LR_Tfidf: 0.9881
AUCPR for LR_Count_Balanced: 0.9900


In [6]:
# Launch the MLflow tracking UI from the notebook's shell to browse the logged
# runs. The `!` command runs in the foreground, so the cell stays busy until
# the process is interrupted (the `^C` in this cell's output).
!mlflow ui

^C
