In [1]:
# Importing all the required libraries in the first step:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
import joblib

# Configure MLflow
print("Setting up MLflow...")
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("spam_detection")
print("MLflow setup complete!")
print(np.__version__)

Setting up MLflow...
MLflow setup complete!
1.24.3


In [3]:
# Loading the preprocessed data:
print("Loading data...")
df = pd.read_csv('../data/preprocessed_data.csv')
print(f"Original dataset shape: {df.shape}")

# Handle NaN values
df['cleaned_text'] = df['cleaned_text'].fillna('')
print("\nChecked and handled any NaN values")

# Show sample data
print("\nFirst few examples:")
display(df[['cleaned_text', 'label']].head())

# Show class distribution
print("\nClass distribution (%):")
print(df['label'].value_counts(normalize=True) * 100)

Loading data...
Original dataset shape: (5574, 5)

Checked and handled any NaN values

First few examples:


Unnamed: 0,cleaned_text,label
0,go jurong point crazy available bugis n great ...,0
1,ok lar joking wif u oni,0
2,free entry wkly comp win fa cup final tkts st ...,1
3,u dun say early hor u c already say,0
4,nah dont think goes usf lives around though,0



Class distribution (%):
label
0    86.598493
1    13.401507
Name: proportion, dtype: float64


In [5]:
# Create features using TF-IDF
print("Creating text features...")
vectorizer = TfidfVectorizer(
    max_features=5000,
    min_df=2,
    max_df=0.95,
    ngram_range=(1, 2)
)

# Transform text to features
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print("Features created successfully!")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Creating text features...
Features created successfully!
Training set shape: (4459, 5000)
Testing set shape: (1115, 5000)


In [7]:
# Define parameters for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create base model
rf = RandomForestClassifier(random_state=42)

# Setup grid search
print("Setting up Grid Search...")
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

print("\nParameters to be tried:")
for param, values in param_grid.items():
    print(f"{param}: {values}")

Setting up Grid Search...

Parameters to be tried:
n_estimators: [50, 100, 200]
max_depth: [10, 20, 30]
min_samples_split: [2, 5, 10]
min_samples_leaf: [1, 2, 4]


In [9]:
# Train model with MLflow tracking
print("Starting model training...")
with mlflow.start_run(run_name="spam_detection_training"):
    # Perform grid search
    grid_search.fit(X_train, y_train)
    
    # Get best parameters
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    
    print(f"\nBest parameters found:")
    for param, value in best_params.items():
        print(f"{param}: {value}")
    print(f"\nBest CV score: {best_score:.4f}")
    
    # Log parameters
    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_score", best_score)
    
    # Get best model
    best_model = grid_search.best_estimator_
    
    # Test set evaluation
    y_pred = best_model.predict(X_test)
    
    # Calculate all metrics
    metrics = {
        "test_accuracy": accuracy_score(y_test, y_pred),
        "test_precision": precision_score(y_test, y_pred),
        "test_recall": recall_score(y_test, y_pred),
        "test_f1": f1_score(y_test, y_pred)
    }
    
    # Log metrics
    for metric_name, value in metrics.items():
        mlflow.log_metric(metric_name, value)
    
    print("\nTest Set Performance:")
    print(classification_report(y_test, y_pred))
    
    # Create models directory if it doesn't exist
    import os
    os.makedirs('../models', exist_ok=True)
    
    # Save model and vectorizer
    model_data = {
        'vectorizer': vectorizer,
        'model': best_model,
        'best_params': best_params,
        'metrics': metrics
    }
    
    model_path = '../models/spam_classifier.joblib'
    joblib.dump(model_data, model_path)
    print(f"\nModel saved to: {model_path}")
    
    # Log model in MLflow
    mlflow.sklearn.log_model(best_model, "spam_classifier")

print("\nTraining complete! Check MLflow UI for details")

Starting model training...
Fitting 5 folds for each of 81 candidates, totalling 405 fits

Best parameters found:
max_depth: 30
min_samples_leaf: 1
min_samples_split: 10
n_estimators: 50

Best CV score: 0.9590

Test Set Performance:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.75      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115


Model saved to: ../models/spam_classifier.joblib




🏃 View run spam_detection_training at: http://127.0.0.1:5000/#/experiments/219111585967021436/runs/4db30027376a4bffa6e27739f71b3dcb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/219111585967021436

Training complete! Check MLflow UI for details
