In [2]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.20.3-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.20.3 (from mlflow)
  Downloading mlflow_skinny-2.20.3-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.20.3->mlflow)
  Downloading databricks_sdk-0.44.1-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Colle

In [13]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import LabelEncoder


# Load the Excel file

In [14]:
excel_file = 'sms_data.xlsx'
raw_data = pd.read_excel(excel_file, header=None, names=['label', 'message'])

# Save it as a CSV file

In [15]:
raw_data.to_csv('raw_data.csv', index=False)
X_text = raw_data['message']
y = raw_data['label']

# Convert Text to Numerical Features using TF-IDF

In [16]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(X_text.astype(str))

# Convert labels to numerical using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Split Data

In [17]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set MLflow Tracking URI (Local Directory)

In [18]:

mlflow.set_tracking_uri("file:///content/mlruns")
mlflow.set_experiment("SMS_Spam_Detection")


<Experiment: artifact_location='file:///content/mlruns/727595653916590589', creation_time=1741138284855, experiment_id='727595653916590589', last_update_time=1741138284855, lifecycle_stage='active', name='SMS_Spam_Detection', tags={}>

# Define Models

In [19]:

models = {
    "LogisticRegression": LogisticRegression(),
    "RandomForest": RandomForestClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

aucpr_scores = {}
run_ids = {}

for name, model in models.items():
    print(f"\n Running {name} Model...")
    with mlflow.start_run(run_name=name) as run:
        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1]

        # Compute AUCPR
        aucpr = average_precision_score(y_test, y_pred, pos_label=1)
        aucpr_scores[name] = aucpr

        # Infer model signature
        signature = infer_signature(X_train[:5].toarray(), model.predict_proba(X_train[:5])[:, 1])

        # Log Model Parameters and Metrics
        mlflow.log_param("model_type", name)
        mlflow.log_metric("AUCPR", aucpr)

        # Log Model with Signature
        input_example = X_train[:1].toarray()
        mlflow.sklearn.log_model(model, name, signature=signature, input_example=input_example)
        run_ids[name] = run.info.run_id

        print(f" {name} AUCPR: {aucpr:.4f}")
        print(f" Run ID: {run.info.run_id}")
        print(f" Tracking URL: {mlflow.get_tracking_uri()}/0/{run.info.experiment_id}/{run.info.run_id}")



 Running LogisticRegression Model...
 LogisticRegression AUCPR: 0.9761
 Run ID: a2c7479dad6f4b21ae0e3d411a127833
 Tracking URL: file:///content/mlruns/0/727595653916590589/a2c7479dad6f4b21ae0e3d411a127833

 Running RandomForest Model...
 RandomForest AUCPR: 0.9826
 Run ID: 32ebfb544c5549f5bd2b9a01b7318129
 Tracking URL: file:///content/mlruns/0/727595653916590589/32ebfb544c5549f5bd2b9a01b7318129

 Running XGBoost Model...


Parameters: { "use_label_encoder" } are not used.



 XGBoost AUCPR: 0.9602
 Run ID: d41a83932d0048ef94ddf45c2afb036b
 Tracking URL: file:///content/mlruns/0/727595653916590589/d41a83932d0048ef94ddf45c2afb036b


# Print Final AUCPR Summary

In [20]:

print("\n **Final AUCPR Scores:**")
for model_name, score in aucpr_scores.items():
    print(f"{model_name}: AUCPR = {score:.4f}")


 **Final AUCPR Scores:**
LogisticRegression: AUCPR = 0.9761
RandomForest: AUCPR = 0.9826
XGBoost: AUCPR = 0.9602
