In [3]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # For text vectorization
from sklearn.model_selection import train_test_split, GridSearchCV  # For model selection
from sklearn.ensemble import RandomForestClassifier  # Our chosen classifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score  # For evaluation
import mlflow  # For experiment tracking
import mlflow.sklearn  # For saving sklearn models
import joblib  # For model persistence

# Configure MLflow tracking
# Setting tracking URI to localhost allows us to view experiments in the MLflow UI
mlflow.set_tracking_uri("http://localhost:5000")
# Create or set the experiment name for organization
mlflow.set_experiment("spam_detection")
print("MLflow setup complete!")
print(np.__version__)

MLflow setup complete!
2.0.2


In [4]:
# Load the preprocessed data from CSV
print("Loading data...")
df = pd.read_csv('../data/preprocessed_data.csv')
print(f"Dataset shape: {df.shape}")

# Handle any missing values in the cleaned text
# Empty strings are used instead of NaN to ensure compatibility with text processing
df['cleaned_text'] = df['cleaned_text'].fillna('')

# Display sample data to verify loading
print("\nFirst few examples:")
display(df[['cleaned_text', 'label']].head())

# Show class distribution to check for imbalance
# This helps understand if we need class balancing techniques
print("\nClass distribution (%):")
print(df['label'].value_counts(normalize=True) * 100)

Loading data...
Dataset shape: (5574, 5)

First few examples:


Unnamed: 0,cleaned_text,label
0,go jurong point crazy available bugis n great ...,0
1,ok lar joking wif u oni,0
2,free entry wkly comp win fa cup final tkts st ...,1
3,u dun say early hor u c already say,0
4,nah dont think goes usf lives around though,0



Class distribution (%):
label
0    86.598493
1    13.401507
Name: proportion, dtype: float64


In [5]:
# Create TF-IDF features from text data
print("Creating text features...")

# Configure the TF-IDF vectorizer
# - max_features=5000: Only include the 5000 most informative terms
# - min_df=2: Ignore terms that appear in fewer than 2 documents (removes rare terms)
# - max_df=0.95: Ignore terms that appear in more than 95% of documents (removes common terms)
# - ngram_range=(1,2): Include both single words and 2-word phrases
vectorizer = TfidfVectorizer(
    max_features=5000,
    min_df=2,
    max_df=0.95,
    ngram_range=(1, 2)
)

# Transform text data into numerical feature vectors
# X will be a sparse matrix with dimensions: (num_samples, num_features)
X = vectorizer.fit_transform(df['cleaned_text'])

# Extract target variable
y = df['label']

# Split data into training and testing sets
# - test_size=0.2: Use 80% for training, 20% for testing
# - random_state=42: Ensures reproducibility
# - stratify=y: Maintain the same class distribution in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Features created successfully!")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Creating text features...
Features created successfully!
Training set shape: (4459, 5000)
Testing set shape: (1115, 5000)


In [6]:
# Define hyperparameters for grid search
# We'll test different combinations of these parameters to find the best model
param_grid = {
    'n_estimators': [100],        # Number of trees in the forest
    'max_depth': [20],            # Maximum depth of each tree
    'min_samples_split': [2],     # Minimum samples required to split a node
    'min_samples_leaf': [1]       # Minimum samples required at a leaf node
}

# Create the base model
# random_state ensures reproducibility
rf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV for hyperparameter tuning
print("Setting up Grid Search...")
grid_search = GridSearchCV(
    estimator=rf,                 # The model to tune
    param_grid=param_grid,        # Hyperparameter combinations to test
    cv=5,                         # 5-fold cross-validation
    scoring='accuracy',           # Metric to optimize
    n_jobs=-1,                    # Use all available CPU cores
    verbose=2                     # Print progress during training
)

# Display the parameters that will be tested
print("\nParameters to be tried:")
for param, values in param_grid.items():
    print(f"{param}: {values}")

Setting up Grid Search...

Parameters to be tried:
n_estimators: [100]
max_depth: [20]
min_samples_split: [2]
min_samples_leaf: [1]


In [7]:
# Train the model with MLflow tracking
print("Starting model training...")
with mlflow.start_run(run_name="spam_detection_training"):
    # Perform grid search (trains multiple models with different hyperparameters)
    # This will train cv * len(param_combinations) models in total
    grid_search.fit(X_train, y_train)
    
    # Extract the best hyperparameters found during grid search
    best_params = grid_search.best_params_
    print(f"\nBest parameters: {best_params}")
    
    # Log the best parameters to MLflow for tracking
    mlflow.log_params(best_params)
    
    # Get the best model from grid search
    best_model = grid_search.best_estimator_
    
    # Evaluate the model on the test set
    y_pred = best_model.predict(X_test)
    
    # Calculate performance metrics
    metrics = {
        "test_accuracy": accuracy_score(y_test, y_pred),    # Overall accuracy
        "test_precision": precision_score(y_test, y_pred),  # Precision (minimize false positives)
        "test_recall": recall_score(y_test, y_pred),        # Recall (minimize false negatives)
        "test_f1": f1_score(y_test, y_pred)                 # F1 (harmonic mean of precision and recall)
    }
    
    # Log the performance metrics to MLflow
    for metric_name, value in metrics.items():
        mlflow.log_metric(metric_name, value)
    
    # Print detailed classification report
    print("\nTest Set Performance:")
    print(classification_report(y_test, y_pred))
    
    # Create models directory if it doesn't exist
    import os
    os.makedirs('../models', exist_ok=True)
    
    # Save both the model and the vectorizer together
    # This ensures we can preprocess new text correctly during inference
    model_data = {
        'vectorizer': vectorizer,      # To transform new text data
        'model': best_model,           # The trained classifier
        'best_params': best_params,    # The best hyperparameters
        'metrics': metrics             # Performance metrics
    }
    
    # Save to disk using joblib
    model_path = '../models/spam_classifier.joblib'
    joblib.dump(model_data, model_path)
    print(f"\nModel saved to: {model_path}")
    
    # Also log the model in MLflow's format
    mlflow.sklearn.log_model(best_model, "spam_classifier")

print("\nTraining complete! Check MLflow UI for details")

Starting model training...
Fitting 5 folds for each of 1 candidates, totalling 5 fits

Best parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Test Set Performance:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       966
           1       1.00      0.60      0.75       149

    accuracy                           0.95      1115
   macro avg       0.97      0.80      0.86      1115
weighted avg       0.95      0.95      0.94      1115


Model saved to: ../models/spam_classifier.joblib




🏃 View run spam_detection_training at: http://localhost:5000/#/experiments/878681816147613169/runs/4f1f3783fa184ff4856319651ba891b9
🧪 View experiment at: http://localhost:5000/#/experiments/878681816147613169

Training complete! Check MLflow UI for details


In [6]:
# Define a more comprehensive parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],         # More trees options
    'max_depth': [10, 20, 30],              # Different depth options
    'min_samples_split': [2, 5, 10],        # Different split thresholds
    'min_samples_leaf': [1, 2, 4],          # Different leaf sizes
    'max_features': ['sqrt', 'log2']        # Feature selection methods
}

# Create base model
rf = RandomForestClassifier(random_state=42)

# Setup grid search with cross-validation
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,                        # 5-fold cross validation
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    return_train_score=True      # This will log training scores too
)

# Train with MLflow tracking
with mlflow.start_run(run_name="hyperparameter_tuning"):
    print("Starting hyperparameter tuning...")
    
    # Perform grid search
    grid_search.fit(X_train, y_train)
    
    # Log all combinations tried
    for idx, params in enumerate(grid_search.cv_results_['params']):
        with mlflow.start_run(run_name=f"trial_{idx}", nested=True):
            # Log parameters
            mlflow.log_params(params)
            
            # Log metrics for this combination
            metrics = {
                "cv_accuracy": grid_search.cv_results_['mean_test_score'][idx],
                "cv_std": grid_search.cv_results_['std_test_score'][idx],
                "train_accuracy": grid_search.cv_results_['mean_train_score'][idx],
            }
            mlflow.log_metrics(metrics)
    
    # Get best parameters
    best_params = grid_search.best_params_
    print(f"\nBest parameters found: {best_params}")
    
    # Train final model with best parameters
    with mlflow.start_run(run_name="best_model", nested=True):
        best_model = grid_search.best_estimator_
        
        # Evaluate on test set
        y_pred = best_model.predict(X_test)
        
        # Calculate final metrics
        final_metrics = {
            "test_accuracy": accuracy_score(y_test, y_pred),
            "test_precision": precision_score(y_test, y_pred),
            "test_recall": recall_score(y_test, y_pred),
            "test_f1": f1_score(y_test, y_pred)
        }
        
        # Log final metrics
        mlflow.log_metrics(final_metrics)
        
        # Log best model
        mlflow.sklearn.log_model(best_model, "spam_classifier")
        
        print("\nFinal Model Performance:")
        print(classification_report(y_test, y_pred))

# Save the model with vectorizer
model_data = {
    'vectorizer': vectorizer,
    'model': best_model,
    'best_params': best_params,
    'metrics': final_metrics
}

# Save to file
joblib.dump(model_data, '../models/spam_classifier.joblib')
print("\nModel saved successfully!")

Starting hyperparameter tuning...
Fitting 5 folds for each of 162 candidates, totalling 810 fits
🏃 View run trial_0 at: http://localhost:5000/#/experiments/878681816147613169/runs/0ce5464559a24dc5ba8f14163df4a0bd
🧪 View experiment at: http://localhost:5000/#/experiments/878681816147613169
🏃 View run trial_1 at: http://localhost:5000/#/experiments/878681816147613169/runs/4d9a624df003450f9fc6f055048f6992
🧪 View experiment at: http://localhost:5000/#/experiments/878681816147613169
🏃 View run trial_2 at: http://localhost:5000/#/experiments/878681816147613169/runs/91a4b79d9cd2445899e99319728494ae
🧪 View experiment at: http://localhost:5000/#/experiments/878681816147613169
🏃 View run trial_3 at: http://localhost:5000/#/experiments/878681816147613169/runs/068062eddeb940f49be94cb0a69dfb57
🧪 View experiment at: http://localhost:5000/#/experiments/878681816147613169
🏃 View run trial_4 at: http://localhost:5000/#/experiments/878681816147613169/runs/cda1284b59e14e2aad2476f9aa27724a
🧪 View experime




Final Model Performance:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.75      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

🏃 View run best_model at: http://localhost:5000/#/experiments/878681816147613169/runs/140664715d5240948dd240b445dc345d
🧪 View experiment at: http://localhost:5000/#/experiments/878681816147613169
🏃 View run hyperparameter_tuning at: http://localhost:5000/#/experiments/878681816147613169/runs/b2993efe1dfa4783af732b7f6bb9c452
🧪 View experiment at: http://localhost:5000/#/experiments/878681816147613169

Model saved successfully!


In [8]:
# Train the model with MLflow tracking
print("Starting model training...")
with mlflow.start_run(run_name="spam_detection_training"):
    # Perform grid search (trains multiple models with different hyperparameters)
    # This will train cv * len(param_combinations) models in total
    grid_search.fit(X_train, y_train)
    
    # Extract the best hyperparameters found during grid search
    best_params = grid_search.best_params_
    print(f"\nBest parameters: {best_params}")
    
    # Log the best parameters to MLflow for tracking
    mlflow.log_params(best_params)
    
    # Get the best model from grid search
    best_model = grid_search.best_estimator_
    
    # Evaluate the model on the test set
    y_pred = best_model.predict(X_test)
    
    # Calculate performance metrics
    metrics = {
        "test_accuracy": accuracy_score(y_test, y_pred),    # Overall accuracy
        "test_precision": precision_score(y_test, y_pred),  # Precision (minimize false positives)
        "test_recall": recall_score(y_test, y_pred),        # Recall (minimize false negatives)
        "test_f1": f1_score(y_test, y_pred)                 # F1 (harmonic mean of precision and recall)
    }
    
    # Log the performance metrics to MLflow
    for metric_name, value in metrics.items():
        mlflow.log_metric(metric_name, value)
    
    # Print detailed classification report
    print("\nTest Set Performance:")
    print(classification_report(y_test, y_pred))
    
    # Create models directory if it doesn't exist
    import os
    os.makedirs('../models', exist_ok=True)
    
    # Save both the model and the vectorizer together
    # This ensures we can preprocess new text correctly during inference
    model_data = {
        'vectorizer': vectorizer,      # To transform new text data
        'model': best_model,           # The trained classifier
        'best_params': best_params,    # The best hyperparameters
        'metrics': metrics             # Performance metrics
    }
    
    # Save to disk using joblib
    model_path = '../models/spam_classifier.joblib'
    joblib.dump(model_data, model_path)
    print(f"\nModel saved to: {model_path}")
    
    # Also log the model in MLflow's format
    mlflow.sklearn.log_model(best_model, "spam_classifier")

print("\nTraining complete! Check MLflow UI for details")

Starting model training...
Fitting 5 folds for each of 1 candidates, totalling 5 fits

Best parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Test Set Performance:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       966
           1       1.00      0.60      0.75       149

    accuracy                           0.95      1115
   macro avg       0.97      0.80      0.86      1115
weighted avg       0.95      0.95      0.94      1115


Model saved to: ../models/spam_classifier.joblib




🏃 View run spam_detection_training at: http://localhost:5000/#/experiments/878681816147613169/runs/b2f646084b764b9cbdaf5dc122bdcc8c
🧪 View experiment at: http://localhost:5000/#/experiments/878681816147613169

Training complete! Check MLflow UI for details
