In [1]:
# Standard library
import ast

# Third-party: persistence, plotting, numerics, modelling
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Project-local: the pipeline builder from src/pipeline.py, imported so the
# notebook reuses the project's pipeline structure instead of redefining it.
from src.pipeline import build_pipeline

# Notebook-wide display and plotting defaults.
pd.options.display.max_rows = 999
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)


In [None]:
# --- 1. Load and Prepare Data ---
print("Loading raw data...")
# Keep only the first record per request_id. Chaining the call (instead of
# inplace=True) makes raw_df a single assignment that is safe to re-run.
raw_df = pd.read_json("dataset.json").drop_duplicates(
    subset=["request_id"], keep="first"
)

# Target is the pizza outcome; every remaining column is passed on as input.
y = raw_df["requester_received_pizza"]
X = raw_df.drop(columns="requester_received_pizza")

# Stratified 80/20 split so both partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Data split: {len(X_train)} training samples, {len(X_test)} test samples.")


# --- 2. Build a Preprocessing-Only Pipeline ---
# NOTE(review): build_pipeline() is used exactly as returned -- no step is
# removed in this cell. fit_transform/transform are called on it below, so it
# is presumably transformer-only; confirm in src/pipeline.py.
preprocessor_pipeline = build_pipeline()

# --- 3. Fit the Preprocessor and Transform Data for FLAML ---
# Fit on the training split only; the test split is transformed with the
# already-fitted pipeline.
print("Fitting the preprocessor on the training data...")
X_train_processed = preprocessor_pipeline.fit_transform(X_train, y_train)
X_test_processed = preprocessor_pipeline.transform(X_test)

Loading raw data...
Data split: 3232 training samples, 808 test samples.
Fitting the preprocessor on the training data...


# 5. Building a machine learning model

In [10]:
# Imports for the model-comparison cells that follow.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier  # Gradient boosting model
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.naive_bayes import GaussianNB

# Seed shared by the model cells.
random_state = 42

# Results keyed by model display name; each model cell stores a dict with
# its test-set "auc" and positive-class "probs" for later comparison.
model_results = {}


## Exploration of base classification models

In [None]:
# --- Model 1: Logistic Regression ---
print("--- Training Logistic Regression (Baseline) ---")

# Fit the linear baseline on the preprocessed training matrix.
lr_model = LogisticRegression(max_iter=1000, random_state=random_state).fit(
    X_train_processed, y_train
)

# Hard labels feed the classification report; the positive-class column of
# predict_proba feeds the ROC AUC computation.
lr_preds = lr_model.predict(X_test_processed)
lr_probs = lr_model.predict_proba(X_test_processed)[:, 1]

# Score on the holdout split and keep the result for the final comparison.
lr_auc = roc_auc_score(y_test, lr_probs)
model_results["Logistic Regression"] = dict(auc=lr_auc, probs=lr_probs)

print(f"Logistic Regression ROC AUC: {lr_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, lr_preds))


--- Training Logistic Regression (Baseline) ---
Logistic Regression ROC AUC: 0.6237

Classification Report:
              precision    recall  f1-score   support

       False       0.76      0.97      0.85       609
        True       0.47      0.09      0.14       199

    accuracy                           0.75       808
   macro avg       0.62      0.53      0.50       808
weighted avg       0.69      0.75      0.68       808



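Recall for the positive class is very low (0.09): the model misses most of the requests that actually received a pizza, i.e. it produces many false negatives. This was expected given the class imbalance (only about 25% of requests are positive). The Logistic Regression ROC AUC of 0.6237 will be our baseline.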

In [12]:
# --- Model 2: Gaussian Naive Bayes ---
print("\n--- Training Gaussian Naive Bayes ---")

# 1. Initialize and train the model
# Naive Bayes requires a dense array, not a sparse matrix. Densify each split
# exactly once and reuse it, instead of calling .toarray() before every call.
X_train_dense = X_train_processed.toarray()
X_test_dense = X_test_processed.toarray()

gnb_model = GaussianNB()
gnb_model.fit(X_train_dense, y_train)

# 2. Make predictions
# Positive-class probabilities for ROC AUC, hard labels for the report.
gnb_probs = gnb_model.predict_proba(X_test_dense)[:, 1]
gnb_preds = gnb_model.predict(X_test_dense)

# 3. Evaluate and store results
gnb_auc = roc_auc_score(y_test, gnb_probs)
model_results["Naive Bayes"] = {"auc": gnb_auc, "probs": gnb_probs}
print(f"Gaussian Naive Bayes ROC AUC: {gnb_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, gnb_preds))



--- Training Gaussian Naive Bayes ---
Gaussian Naive Bayes ROC AUC: 0.5750

Classification Report:
              precision    recall  f1-score   support

       False       0.79      0.53      0.63       609
        True       0.28      0.57      0.38       199

    accuracy                           0.54       808
   macro avg       0.54      0.55      0.51       808
weighted avg       0.67      0.54      0.57       808



The Naive Bayes model scores lower than our baseline with a 0.5750 AUC. It finds more pizza recipients (higher recall) but makes many more mistakes in the process (lower precision).

In [13]:
# --- Model 3: Random Forest ---
print("\n--- Training Random Forest ---")

# 1. Initialize and train the model
# Seeded from the notebook-wide `random_state` (same value the baseline uses)
# so changing the seed in one place affects every model.
# n_jobs=-1 uses all available CPU cores to speed up training.
rf_model = RandomForestClassifier(random_state=random_state, n_jobs=-1)
rf_model.fit(X_train_processed, y_train)

# 2. Make predictions
# Positive-class probabilities for ROC AUC, hard labels for the report.
rf_probs = rf_model.predict_proba(X_test_processed)[:, 1]
rf_preds = rf_model.predict(X_test_processed)

# 3. Evaluate and store results
rf_auc = roc_auc_score(y_test, rf_probs)
model_results["Random Forest"] = {"auc": rf_auc, "probs": rf_probs}
print(f"Random Forest ROC AUC: {rf_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, rf_preds))



--- Training Random Forest ---
Random Forest ROC AUC: 0.6061

Classification Report:
              precision    recall  f1-score   support

       False       0.76      0.99      0.86       609
        True       0.50      0.03      0.06       199

    accuracy                           0.75       808
   macro avg       0.63      0.51      0.46       808
weighted avg       0.69      0.75      0.66       808



With default settings, the Random Forest model's score of 0.6061 AUC is slightly below our baseline. It is very cautious and barely predicts that anyone will receive a pizza.

In [14]:
# --- Model 4: LightGBM ---
print("\n--- Training LightGBM ---")

# 1. Initialize and train the model
# Seeded from the notebook-wide `random_state` (same value the baseline uses)
# so changing the seed in one place affects every model.
lgbm_model = LGBMClassifier(random_state=random_state)
lgbm_model.fit(X_train_processed, y_train)

# 2. Make predictions
# Positive-class probabilities for ROC AUC, hard labels for the report.
lgbm_probs = lgbm_model.predict_proba(X_test_processed)[:, 1]
lgbm_preds = lgbm_model.predict(X_test_processed)

# 3. Evaluate and store results
lgbm_auc = roc_auc_score(y_test, lgbm_probs)
model_results["LightGBM"] = {"auc": lgbm_auc, "probs": lgbm_probs}
print(f"LightGBM ROC AUC: {lgbm_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, lgbm_preds))



--- Training LightGBM ---
[LightGBM] [Info] Number of positive: 795, number of negative: 2437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004958 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 29578
[LightGBM] [Info] Number of data points in the train set: 3232, number of used features: 945
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.245978 -> initscore=-1.120181
[LightGBM] [Info] Start training from score -1.120181
LightGBM ROC AUC: 0.6085

Classification Report:
              precision    recall  f1-score   support

       False       0.77      0.93      0.84       609
        True       0.43      0.16      0.23       199

    accuracy                           0.74       808
   macro avg       0.60      0.55      0.54       808
weighted avg       0.69      0.74      0.69       808



Surprisingly, LightGBM also scores just below the baseline with a 0.6085 AUC. This suggests that even powerful models need tuning to perform well on this dataset.

In [24]:
# Imports used only by the MLP grid-search cell.
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report
import warnings

# --- Model 5: Scikit-learn MLP Classifier (Tuned) ---
print("\n--- Starting GridSearchCV for Scikit-learn MLP ---")
print("This may take a few minutes...")

# 1. Initialize the MLP Classifier
# These parameters stay fixed across the whole search.
# Seeded from the notebook-wide `random_state` so every model shares one seed.
mlp_model = MLPClassifier(
    random_state=random_state,
    max_iter=500,  # Upper bound on training epochs
    early_stopping=True,  # Stop training if validation score doesn't improve
    n_iter_no_change=10,  # How many epochs to wait before stopping
)

# 2. Define a small grid of parameters for a quick search
# 4 architectures x 2 alphas x 3 learning rates = 24 candidates.
param_grid = {
    "hidden_layer_sizes": [
        (64,),  # One hidden layer
        (32, 16),  # Two hidden layers, narrow
        (64, 32),  # Two hidden layers, wider
        (128, 64, 32),  # Three hidden layers
    ],
    "alpha": [0.0001, 0.001],  # L2 regularization strength
    "learning_rate_init": [0.005, 0.001, 0.0001],  # Initial learning rate
}

# 3. Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=mlp_model,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,  # 3-fold cross-validation
    n_jobs=-1,  # Use all available CPU cores
    verbose=1,
)

# 4. Run the Grid Search
grid_search.fit(X_train_processed, y_train)

# 5. Print the best results and evaluate the final model
# best_score_ is the mean cross-validated ROC AUC on the training data; the
# holdout number printed further down is the one comparable to other models.
print("\n--- GridSearchCV Results ---")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best ROC AUC score during cross-validation: {grid_search.best_score_:.4f}")

print("\n--- Final Evaluation on Holdout Test Set ---")
best_mlp_model = grid_search.best_estimator_
mlp_probs = best_mlp_model.predict_proba(X_test_processed)[:, 1]
mlp_preds = best_mlp_model.predict(X_test_processed)
mlp_auc = roc_auc_score(y_test, mlp_probs)

# Add to our model results dictionary for comparison
model_results["MLP Classifier (Tuned)"] = {"auc": mlp_auc, "probs": mlp_probs}

print(f"Scikit-learn MLP ROC AUC on Test Set: {mlp_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, mlp_preds))


--- Starting GridSearchCV for Scikit-learn MLP ---
This may take a few minutes...
Fitting 3 folds for each of 24 candidates, totalling 72 fits

--- GridSearchCV Results ---
Best parameters found: {'alpha': 0.001, 'hidden_layer_sizes': (128, 64, 32), 'learning_rate_init': 0.001}
Best ROC AUC score during cross-validation: 0.6248

--- Final Evaluation on Holdout Test Set ---
Scikit-learn MLP ROC AUC on Test Set: 0.5922

Classification Report:
              precision    recall  f1-score   support

       False       0.76      0.99      0.86       609
        True       0.50      0.04      0.07       199

    accuracy                           0.75       808
   macro avg       0.63      0.51      0.47       808
weighted avg       0.70      0.75      0.66       808



Even after a systematic hyperparameter search, the neural network's best cross-validation score of 0.6248 is only on par with our simple baseline, and its ROC AUC on the holdout test set (0.5922) falls below it. This suggests that even with tuning, neural networks find it difficult to extract a strong predictive signal from this dataset.

## Optimization with an AutoML tool

My initial tests showed that advanced models like Random Forest and LightGBM did not perform well with their default settings. This indicates that their performance is highly dependent on finding the right configuration for this specific dataset. Instead of a slow and limited manual tuning process, I chose to use an AutoML tool like FLAML. This allows me to efficiently and automatically search across many different models and their settings to find the best combination, which is a more robust and modern approach to model selection.

In [27]:
# --- 4. Run FLAML to Find the Best Model ---
print("\nStarting FLAML search on pre-processed data...")
automl = AutoML()
settings = {
    "time_budget": 200,  # Search budget in seconds
    "metric": "roc_auc",
    "task": "classification",
    "log_file_name": "flaml_run.log",
    "seed": random_state,  # Notebook-wide seed shared with the other models
}
automl.fit(X_train=X_train_processed, y_train=y_train, **settings)
print("FLAML search complete.")


# --- 5. Evaluate the Best Model from FLAML ---
print("\n--- FLAML Results ---")
best_flaml_model = automl.model.estimator
print(f"Best model found: {best_flaml_model.__class__.__name__}")
# FLAML minimizes 1 - roc_auc, so the validation AUC is 1 - best_loss.
print(f"Best ROC AUC on internal validation: {1 - automl.best_loss:.4f}")

# Evaluate on the holdout test set to confirm performance.
# AutoML.score() without a `metric` argument delegates to the fitted
# estimator's default score (accuracy for classifiers), not to ROC AUC, so the
# AUC is computed from positive-class probabilities like every other model.
flaml_probs = automl.predict_proba(X_test_processed)[:, 1]
test_score = roc_auc_score(y_test, flaml_probs)
model_results["FLAML"] = {"auc": test_score, "probs": flaml_probs}
print(f"ROC AUC on holdout test set: {test_score:.4f}")



Starting FLAML search on pre-processed data...
[flaml.automl.logger: 06-26 17:09:29] {1752} INFO - task = classification
[flaml.automl.logger: 06-26 17:09:29] {1763} INFO - Evaluation method: holdout
[flaml.automl.logger: 06-26 17:09:29] {1862} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 06-26 17:09:29] {1979} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 06-26 17:09:29] {2282} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 06-26 17:09:30] {2417} INFO - Estimated sufficient time budget=3381s. Estimated necessary time budget=78s.
[flaml.automl.logger: 06-26 17:09:30] {2466} INFO -  at 0.4s,	estimator lgbm's best error=0.3478,	best estimator lgbm's best error=0.3478
[flaml.automl.logger: 06-26 17:09:30] {2282} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 06-26 17:09:30] {2466} INFO -  at 0.4s,	estimator lgbm's best error=0.3439,	best estimator 