In [37]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, auc, precision_recall_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import mlflow
import mlflow.sklearn
from mlflow.models import ModelSignature
from mlflow.models.signature import infer_signature

In [38]:
import warnings
warnings.filterwarnings("ignore")

In [39]:
# Initialize MLflow
mlflow.set_tracking_uri('file:mlruns')  # Local directory for tracking
mlflow.set_experiment('Spam Classifier Experiment')

<Experiment: artifact_location=('file:///c:/Users/naren/OneDrive/Documents/CMI/Semester 4/Applied Machine '
 'Learning/Assignment_2/mlruns/712774972690327265'), creation_time=1740668958894, experiment_id='712774972690327265', last_update_time=1740668958894, lifecycle_stage='active', name='Spam Classifier Experiment', tags={}>

In [40]:
# Load data splits
train = pd.read_csv('train.csv').dropna()
validation = pd.read_csv('validation.csv').dropna()
test = pd.read_csv('test.csv').dropna()

# Vectorizing the text data using TF-IDF
def vectorize_data(train_data, validation_data, test_data):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X_train = vectorizer.fit_transform(train_data['message'])
    X_validation = vectorizer.transform(validation_data['message'])
    X_test = vectorizer.transform(test_data['message'])
    return X_train, X_validation, X_test, vectorizer

X_train, X_validation, X_test, vectorizer = vectorize_data(train, validation, test)

# Function to calculate AUCPR
def calculate_aucpr(model, X, y_true):
    y_scores = model.predict_proba(X)[:, 1] if hasattr(model, 'predict_proba') else model.decision_function(X)
    precision, recall, _ = precision_recall_curve(y_true, y_scores, pos_label='spam')
    aucpr = auc(recall, precision)
    return aucpr

# Model training, tracking, and registration with MLflow
def train_and_track_model(model_name, model, param_grid, X_train, y_train, X_validation, y_validation):
    with mlflow.start_run(run_name=model_name) as run:
        # Hyperparameter tuning with GridSearchCV
        grid_search = GridSearchCV(model, param_grid, scoring='recall', cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        # Evaluation on validation set
        val_precision, val_recall, val_accuracy = evaluate_model(best_model, X_validation, y_validation)
        aucpr = calculate_aucpr(best_model, X_validation, y_validation)

        print(f'{model_name} - AUCPR: {aucpr:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, Accuracy: {val_accuracy:.4f}')

        # Log parameters, metrics, and model in MLflow
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metrics({
            'precision': val_precision,
            'recall': val_recall,
            'accuracy': val_accuracy,
            'aucpr': aucpr
        })

        # Log and register the model
        signature = infer_signature(X_validation, best_model.predict(X_validation))
        mlflow.sklearn.log_model(best_model, 'model', signature=signature)
        mlflow.set_tag('model_name', model_name)

        model_uri = f'runs:/{run.info.run_id}/model'
        mlflow.register_model(model_uri=model_uri, name='SpamClassifierModel')

        return best_model

# Evaluate model
def evaluate_model(model, X, y, average='binary'):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    recall = recall_score(y, y_pred, pos_label='spam', average=average)
    precision = precision_score(y, y_pred, pos_label='spam', average=average)
    return precision, recall, accuracy

In [None]:
# Define models and hyperparameter grids
models = {
    'Naive Bayes': (MultinomialNB(), {'alpha': [0.01, 0.1, 1, 10]}),
    'Logistic Regression': (LogisticRegression(), {'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear']}),
    'SVM': (SVC(probability=True), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']})
}

# Train, track, and register all models
for model_name, (model, param_grid) in models.items():
    train_and_track_model(model_name, model, param_grid, X_train, train['label'], X_validation, validation['label'])

Naive Bayes - AUCPR: 0.9609, Precision: 0.9245, Recall: 0.8448, Accuracy: 0.9709
Registered model 'SpamClassifierModel' already exists. Creating a new version of this model...
Created version '4' of model 'SpamClassifierModel'.
Logistic Regression - AUCPR: 0.9472, Precision: 0.8901, Recall: 0.7823, Accuracy: 0.8700
Registered model 'SpamClassifierModel' already exists. Creating a new version of this model...
Created version '5' of model 'SpamClassifierModel'.
SVM - AUCPR: 0.9513, Precision: 1.0000, Recall: 0.4483, Accuracy: 0.9283
Registered model 'SpamClassifierModel' already exists. Creating a new version of this model...
Created version '6' of model 'SpamClassifierModel'.


### Loading SVM Model

In [42]:
logged_model = 'runs:/d1df0f4f00c24fcc8128171054513663/model'
model = mlflow.pyfunc.load_model(logged_model)

# Get predictions and convert 'spam' to 1 and 'ham' to 0
y_scores = model.predict(X_test)
y_scores = (y_scores == 'spam').astype(int)

# Convert true labels to 1 for 'spam' and 0 for 'ham'
y_true = (test['label'] == 'spam').astype(int)

# Calculate precision, recall, and AUCPR
precision, recall, _ = precision_recall_curve(y_true, y_scores)
aucpr = auc(recall, precision)

# Print the model selection metric AUCPR
print(f"Model AUCPR: {aucpr:.4f}")

Model AUCPR: 0.7900


### Loading Logistic Regression Model

In [43]:
# Load the model using the run ID
logged_model = 'runs:/b8e832fdb09d4dd0a7fa704c263f4851/model'
model = mlflow.pyfunc.load_model(logged_model)

# Get predictions and convert 'spam' to 1 and 'ham' to 0
y_scores = model.predict(X_test)
y_scores = (y_scores == 'spam').astype(int)

# Convert true labels to 1 for 'spam' and 0 for 'ham'
y_true = (test['label'] == 'spam').astype(int)

# Calculate precision, recall, and AUCPR
precision, recall, _ = precision_recall_curve(y_true, y_scores)
aucpr = auc(recall, precision)

# Print the model selection metric AUCPR
print(f"Model AUCPR: {aucpr:.4f}")

Model AUCPR: 0.5650


### Loading Naive Bayes Model

In [44]:
# Load the model using the run ID
logged_model = 'runs:/ce9bc088e0904920a1b0038d6e5077a5/model'
model = mlflow.pyfunc.load_model(logged_model)

# Get predictions and convert 'spam' to 1 and 'ham' to 0
y_scores = model.predict(X_test)
y_scores = (y_scores == 'spam').astype(int)

# Convert true labels to 1 for 'spam' and 0 for 'ham'
y_true = (test['label'] == 'spam').astype(int)

# Calculate precision, recall, and AUCPR
precision, recall, _ = precision_recall_curve(y_true, y_scores)
aucpr = auc(recall, precision)

# Print the model selection metric AUCPR
print(f"Model AUCPR: {aucpr:.4f}")

Model AUCPR: 0.9356


### Best Model: **Naive Bayes** with AUCPR: 0.9356