In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from datetime import datetime


from google.colab import drive
import os


We start by loading final_df from final_features.parquet. This dataframe contains one row per customer, their engineered features (income, tenure, engagement, portfolio metrics, etc.), and a set of binary product indicators such as saving_account, loans, mortgage, credit_card, pensions, direct_debit, and guarantees. These 0/1 columns act as your “ground truth” labels: 1 means the customer currently owns that product; 0 means they do not.

In [25]:
from google.colab import drive
import os

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
final_df = pd.read_parquet('/content/drive/My Drive/Colab Notebooks/DSC678-Capstone/Banking_Project/project_dataset/final_features.parquet')
print("Final features loaded.")
print(f"Shape: {final_df.shape}")
print("\nColumns:")
print(final_df.columns.tolist())

final_df.head(10)

Final features loaded.
Shape: (159288, 48)

Columns:
['customer_id', 'residence_country', 'gender', 'age', 'first_join_date', 'residence_index', 'channel_entrance', 'activity_status', 'household_gross_income', 'saving_account', 'guarantees', 'junior_account', 'loans', 'credit_card', 'pensions', 'direct_debit', 'mortgage', 'employment_status', 'employment_status_int', 'personal_income', 'current_loan_amount', 'credit_score', 'customer_segment_model', 'years_calc', 'total_products_owned', 'junior_guarantee', 'customer_tenure_months', 'current_products_owned', 'total_adoptions', 'portfolio_value', 'avg_adoption_value', 'adoption_value_std', 'total_cancellations', 'net_product_growth', 'product_churn_rate', 'adoption_value_cv', 'category_diversity', 'product_diversity', 'active_months', 'adoption_frequency', 'avg_days_between_adoptions', 'norm_adoptions', 'norm_portfolio', 'norm_growth', 'norm_diversity', 'norm_frequency', 'engagement_score', 'engagement_category']


Unnamed: 0,customer_id,residence_country,gender,age,first_join_date,residence_index,channel_entrance,activity_status,household_gross_income,saving_account,...,active_months,adoption_frequency,avg_days_between_adoptions,norm_adoptions,norm_portfolio,norm_growth,norm_diversity,norm_frequency,engagement_score,engagement_category
0,15891,ES,0,59,2020-07-28,Y,KAT,1,121425.66,0,...,1.0,1.0,0.0,0.034483,0.001049,0.0,0.066667,0.071429,0.029597,Very Low
1,15899,ES,1,57,2000-01-16,Y,KAT,1,130835.64,0,...,1.0,6.0,0.0,0.206897,0.046782,0.4,0.4,0.428571,0.267705,Low
2,15900,ES,1,48,2000-01-16,Y,KAT,1,105327.03,0,...,14.166667,0.494118,70.833333,0.241379,0.011474,0.2,0.266667,0.035294,0.148508,Very Low
3,15902,ES,0,57,2000-01-16,Y,KAT,1,230408.25,0,...,1.0,2.0,0.0,0.068966,0.034943,0.066667,0.133333,0.142857,0.080739,Very Low
4,15906,ES,0,55,2006-02-16,Y,KAT,1,81005.49,0,...,13.233333,0.982368,33.083333,0.448276,0.581025,0.4,0.533333,0.070169,0.427851,Medium
5,15910,ES,1,53,2000-01-16,Y,KAT,0,77912.91,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
6,15915,ES,1,56,2000-07-21,Y,KAT,0,331242.63,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
7,15916,ES,0,54,2000-01-16,Y,KAT,1,465589.68,0,...,14.166667,0.635294,53.125,0.310345,0.053074,0.533333,0.533333,0.045378,0.284328,Low
8,15918,ES,0,50,2000-01-16,Y,KAT,1,298795.08,0,...,1.0,8.0,0.0,0.275862,0.538669,0.533333,0.533333,0.571429,0.476014,Medium
9,15919,ES,1,55,2000-01-16,Y,KAT,1,318796.59,0,...,1.0,5.0,0.0,0.172414,0.031273,0.266667,0.333333,0.357143,0.207826,Low


In [27]:
#Printing all data types

print("=== FULL DATA TYPE SUMMARY ===")
print(final_df.dtypes)

print("\n=== COUNT OF FEATURES BY DTYPE ===")
print(final_df.dtypes.value_counts())

print("\n=== OBJECT (string-like) COLUMNS ===")
obj_cols = final_df.select_dtypes(include=['object']).columns.tolist()
print(obj_cols)

print("\n=== CATEGORY COLUMNS ===")
cat_cols = final_df.select_dtypes(include=['category']).columns.tolist()
print(cat_cols)

print("\n=== NUMERIC COLUMNS ===")
num_cols = final_df.select_dtypes(include=["int64", "float64", "int32", "float32", "bool"]).columns.tolist()
print(num_cols)

=== FULL DATA TYPE SUMMARY ===
customer_id                            int64
residence_country                     object
gender                                 int64
age                                    int64
first_join_date               datetime64[ns]
residence_index                       object
channel_entrance                      object
activity_status                        int64
household_gross_income               float64
saving_account                         int64
guarantees                             int64
junior_account                         int64
loans                                  int64
credit_card                            int64
pensions                               int64
direct_debit                           int64
mortgage                               int64
employment_status                   category
employment_status_int                   int8
personal_income                      float64
current_loan_amount                  float64
credit_score            

In this step, we first identify and remove two categories of columns from our dataset: leakage features and features removed for practical reasons. Leakage features are columns that contain information that would not realistically be available at the time of making a prediction. For example, features that summarize customer activity over time—such as the total number of adoptions, churn metrics, portfolio growth, or adoption frequency—are directly influenced by whether a customer already adopted a product in the future. Keeping such fields would artificially boost model accuracy during training, while making the model completely unreliable in real-world use. Removing these prevents hidden “future knowledge” from sneaking into the training process.

The second group of removed features includes columns that either suffer from extremely low data availability or cause inconsistencies between the training and test datasets. For example, “guarantees” and “junior_account” have so few positive examples that the model cannot learn meaningful patterns. We also remove all loan-related features, including mortgage and current loan amount, because the test dataset does not contain the transaction-level information required to make loan predictions. Finally, we drop the existing “credit_score” because we generate a new, consistent credit score later using only the features that are available in both datasets.

By combining both groups into a single list of columns to drop, we ensure the dataset is clean, consistent, and suitable for training stable models. This step strengthens the overall reliability of our machine learning pipeline. The downside is that removing too many features may slightly reduce predictive power, but in this case the trade-off is worth it because it eliminates misleading signals and ensures that our model behaves realistically when applied to unseen customer data.

In [28]:
# Features removed due to data leakage
leakage_features = [
    'total_products_owned', 'current_products_owned', 'total_adoptions',
    'portfolio_value', 'avg_adoption_value', 'adoption_value_std',
    'total_cancellations', 'net_product_growth', 'product_churn_rate',
    'adoption_value_cv', 'category_diversity', 'product_diversity',
    'active_months', 'adoption_frequency', 'avg_days_between_adoptions',
    'norm_adoptions', 'norm_portfolio', 'norm_growth',
    'norm_diversity', 'norm_frequency',
    'engagement_score'
]

# Other features removed for practical reasons
# - 'guarantees': target is extremely rare → unstable / not useful for prediction
# - 'junior_account': target is extremely rare → unstable / not useful for prediction
# - loans and mortgage: not enough transaction data in test data to make recommendation
# - current_loan_amount: because we are dropping loan and mortgage
# - credit_score: as we are generating one using the features that are present
other_removed_features = [
    'guarantees',
    'junior_guarantee',
    'junior_account',
    'loans',
    'mortgage',
    'current_loan_amount',
    'credit_score'



]

# Combine everything to drop (and deduplicate just in case)
cols_to_drop = sorted(set(leakage_features + other_removed_features))

print("Columns that will be removed:")
print(cols_to_drop)


Columns that will be removed:
['active_months', 'adoption_frequency', 'adoption_value_cv', 'adoption_value_std', 'avg_adoption_value', 'avg_days_between_adoptions', 'category_diversity', 'credit_score', 'current_loan_amount', 'current_products_owned', 'engagement_score', 'guarantees', 'junior_account', 'junior_guarantee', 'loans', 'mortgage', 'net_product_growth', 'norm_adoptions', 'norm_diversity', 'norm_frequency', 'norm_growth', 'norm_portfolio', 'portfolio_value', 'product_churn_rate', 'product_diversity', 'total_adoptions', 'total_cancellations', 'total_products_owned']


In [29]:
final_clean_df = final_df.drop(columns=cols_to_drop, errors='ignore')

print("Shape after removing leakage columns:", final_clean_df.shape)
print("Remaining columns:", final_clean_df.columns.tolist())


Shape after removing leakage columns: (159288, 20)
Remaining columns: ['customer_id', 'residence_country', 'gender', 'age', 'first_join_date', 'residence_index', 'channel_entrance', 'activity_status', 'household_gross_income', 'saving_account', 'credit_card', 'pensions', 'direct_debit', 'employment_status', 'employment_status_int', 'personal_income', 'customer_segment_model', 'years_calc', 'customer_tenure_months', 'engagement_category']


In this part of the workflow, we first create a separate copy of the cleaned training dataset so that our original data remains untouched. This ensures that any transformations performed here do not accidentally modify earlier stages of preprocessing or affect other models we may want to build later. Since the credit-score formula depends on how long a customer has been with the bank, we convert the first_join_date column into a proper datetime format to make it possible to compute account age accurately.

We then define a custom credit-score generator that avoids using any product-related information—especially loan or mortgage activity—so that there is no data leakage. The score is intentionally built using features that would realistically be available in real life: demographic information, household income, personal income, and account activity status. Each of these components contributes logically to a credit score. Higher income leads to a higher score, active customers get credit for longer account histories, and inactive accounts get a penalty. Finally, the calculated score is clamped into a realistic FICO-style range from 300 to 850.

Once the scoring function is defined, we apply it to every row in the dataset to generate a brand-new credit_score column. This overwrites the existing credit score so that both the training and test datasets use a consistent methodology. The purpose of this step is to avoid hidden leakage, ensure fairness between datasets, and create a stable, well-behaved numeric feature that can be used by the XGBoost model. This improves model reliability and prevents the model from accidentally learning patterns based on post-product behavior or missing values.

In [30]:
# ------------------------------------------------------------
# Work on a COPY of final_clean_df so original is untouched
# ------------------------------------------------------------
model_df = final_clean_df.copy()

# Make sure first_join_date is in datetime format (needed for account age)
if "first_join_date" in model_df.columns:
    model_df["first_join_date"] = pd.to_datetime(
        model_df["first_join_date"], errors="coerce"
    )

# ------------------------------------------------------------
# Define a credit_score generator (NO product/loan leakage)
# Uses only demographics + income + account activity
# ------------------------------------------------------------
def generate_credit_score(row):
    base_score = 650.0

    # --- Income factor (0–150 pts) ---
    hh_income = float(row.get("household_gross_income", 0) or 0)
    temp = np.clip(hh_income / 10_000.0, 0, 10)      # cap at 10
    base_score += temp * 15                          # up to +150

    # add a smaller effect for personal income if present
    pers_income = float(row.get("personal_income", 0) or 0)
    if pers_income > 0:
        temp_pi = np.clip(pers_income / 10_000.0, 0, 10)
        base_score += temp_pi * 5                    # up to +50

    # --- Account history / activity ---
    if row.get("activity_status", 0) == 1:
        join_date = row.get("first_join_date")
        if pd.isna(join_date):
            account_age = 0.0
        else:
            account_age = (pd.to_datetime("today") - join_date).days / 365.0
        base_score += min(account_age * 5, 50)       # up to +50
    else:
        base_score -= 40                             # inactive penalty

    # Clamp to FICO-like 300–850 range
    return max(300, min(int(base_score), 850))

# ------------------------------------------------------------
# Compute / overwrite credit_score in the copied dataframe
# ------------------------------------------------------------
model_df["credit_score"] = model_df.apply(generate_credit_score, axis=1)

print("Credit score summary (model_df):")
print(model_df["credit_score"].describe())

Credit score summary (model_df):
count    159288.000000
mean        816.221297
std          42.527727
min         614.000000
25%         801.000000
50%         825.000000
75%         850.000000
max         850.000000
Name: credit_score, dtype: float64


From this dataframe we then define label_cols as the set of product columns that we want to model, and feature_cols as all the remaining useful predictors, excluding identifiers like customer_id and non-numeric or target-like fields such as engagement_category and first_join_date. This separation between features (X) and labels (y) is crucial because XGBoost expects numeric input features and a clear binary target for each individual model.

Note: Loan and mortgage predictions were excluded from the final recommendation engine because the available production-scoring dataset does not contain loan-related financial behavior or transaction history, which are essential predictors. Instead of imputing synthetic values or producing unreliable recommendations, we limited our predictions to savings accounts, credit cards, pensions, and direct debits — products that can be accurately modeled using demographic and household-level features

In [31]:
# All potential product columns from your design
all_product_cols = [
    # Savings
    "saving_account",  "home_account",
    # Investment
    "short_term_deposits", "medium_term_deposits", "long_term_deposits",
    "funds", "securities", "derivatives",
    # Credit
    "credit_card",
    # Retirement
    "pensions", "pension_2",
    # Digital
    "e_account",
    # Services
    "direct_debit", "payroll", "taxes"
]

# Intersect with columns that actually exist in final_df
product_cols = [c for c in all_product_cols if c in model_df.columns]

print("Product label columns used:")
print(product_cols)


Product label columns used:
['saving_account', 'credit_card', 'pensions', 'direct_debit']


In [32]:
#Define label and feature columns

label_cols = [c for c in all_product_cols if c in model_df.columns]
print("Product label columns used:", label_cols)

# Exclude id/date/engagement + TEXT category employment_status
feature_cols = [
    c for c in model_df.columns
    if c not in label_cols + ["customer_id", "engagement_category", "first_join_date", "employment_status"]
]


X_raw = model_df[feature_cols]
y = model_df[label_cols]

print("X_raw shape:", X_raw.shape)
print("y shape    :", y.shape)


Product label columns used: ['saving_account', 'credit_card', 'pensions', 'direct_debit']
X_raw shape: (159288, 13)
y shape    : (159288, 4)


In [33]:
# Encode object/category columns for XGBoost
obj_cols = X_raw.select_dtypes(include=["object"]).columns.tolist()
cat_cols = X_raw.select_dtypes(include=["category"]).columns.tolist()

print("Object cols in X:", obj_cols)
print("Category cols in X:", cat_cols)

X = X_raw.copy()

# Encode object → category codes
for col in obj_cols:
    X[col] = X[col].astype("category").cat.codes

# Encode category → codes (if any slipped in)
for col in cat_cols:
    X[col] = X[col].cat.codes

# Final dtypes check
print("\nDtypes after encoding:")
print(X.dtypes.value_counts())

Object cols in X: ['residence_country', 'residence_index', 'channel_entrance', 'customer_segment_model']
Category cols in X: []

Dtypes after encoding:
int8       4
int64      4
float64    3
int16      1
int32      1
Name: count, dtype: int64


Next, we loop through each product in label_cols and train a separate binary classifier for each one. Each product (e.g., loans, credit cards, pensions) is treated as its own prediction task. For every product, we extract the target values from the training and validation sets (y_tr and y_va) and count how many customers have the product (pos) versus how many do not (neg). This class balance check is essential because many products are rare, and a model could appear accurate simply by predicting “no” for everyone. If a label contains only one class, we skip it because a model cannot be trained meaningfully.

We then determine whether to use SMOTE to handle class imbalance. SMOTE (Synthetic Minority Oversampling Technique) generates artificial examples of the minority class to help the model learn patterns more evenly. We apply SMOTE only when there are enough positive samples (greater than 5) and when the minority class is truly underrepresented. If SMOTE is used, we avoid applying class weights because SMOTE already balances the training data. If SMOTE is not appropriate (for extremely rare products), we rely on scale_pos_weight, which tells XGBoost to give more importance to the minority class without modifying the dataset. In simple terms, SMOTE makes more minority samples; scale_pos_weight makes each minority sample count more.

To evaluate stability, we run a 2-fold stratified cross-validation for each product. During cross-validation, we apply SMOTE only to the training fold (never to the validation fold) to avoid leaking synthetic data. We train a smaller XGBoost model on each fold and compute the AUC (Area Under the ROC Curve). Averaging AUC across folds allows us to check whether the model is consistently performing well rather than getting lucky on a single split.

After confirming stability, we train a final, more robust XGBoost model for each product using the entire training split. When SMOTE is enabled, we resample the entire training set; otherwise we retain the original data and use class weighting. The final model uses stronger learning parameters (400 trees, lower learning rate, max depth 4) that help improve predictive power without overfitting.

Once the model is trained, we evaluate it on the validation set and compute the predicted probabilities. To convert probabilities into actual recommendations (0 or 1), we need a threshold. Rather than using a fixed threshold (like 0.5), we use a more intelligent, product-specific method based on the precision–recall curve. For each product, we generate the PR curve and compute the F1-score (balance between precision and recall) at every threshold. We select the threshold that maximizes F1 on this PR curve—this becomes our “base threshold.” We then refine this further by searching a smaller, local grid around that base threshold to find the best possible cutoff. This gives each product its own optimized threshold that balances recommending too much versus too little.

Finally, for each product, we store the model, the chosen threshold, and key performance metrics such as AUC, best F1-score, class counts, and whether SMOTE was used. At the end, we print a summary for all products to compare their difficulty, predictive strength, and thresholds.

In [34]:
# ---------- Train/valid split ----------
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=None  # multi-label → can't stratify on all labels
)

print("Train shape:", X_train.shape, y_train.shape)
print("Valid shape:", X_valid.shape, y_valid.shape)

models = {}
best_thresholds = {}
metrics = {}

# Finer threshold grid (0.01–0.99)
threshold_grid = np.linspace(0.01, 0.99, 99)

for label in label_cols:
    print("\n" + "="*60)
    print(f"Training model for product: {label}")
    print("="*60)

    y_tr = y_train[label].astype(int)
    y_va = y_valid[label].astype(int)

    pos = y_tr.sum()
    neg = len(y_tr) - pos

    if y_tr.nunique() < 2:
        print(f"Skipping {label}: only one class in training (pos={pos}, neg={neg}).")
        continue

    print(f"Original class balance for {label}: pos={pos}, neg={neg}")

    # ----- Decide whether to use SMOTE -----
    use_smote = (pos > 5) and (pos < neg)  # need at least 6 positives for SMOTE to work

    if use_smote:
        # k_neighbors must be < number of positives
        k_neighbors = min(5, pos - 1)
        smote = SMOTE(
            sampling_strategy="auto",
            random_state=42,
            k_neighbors=k_neighbors
        )
        print(f"Using SMOTE for {label} with k_neighbors={k_neighbors}")
        scale_pos_weight = 1.0   # SMOTE already balances classes
    else:
        # Fall back to class weights if we can't safely use SMOTE
        scale_pos_weight = (neg / pos) if pos > 0 else 1.0
        print(f"Not using SMOTE for {label}, scale_pos_weight={scale_pos_weight:.2f}")

    # ----- Quick 2-fold CV AUC -----
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    cv_aucs = []
    for fold, (idx_tr, idx_te) in enumerate(skf.split(X_train, y_tr), 1):
        X_tr_cv, X_te_cv = X_train.iloc[idx_tr], X_train.iloc[idx_te]
        y_tr_cv, y_te_cv = y_tr.iloc[idx_tr], y_tr.iloc[idx_te]

        # Apply SMOTE only on the training fold
        if use_smote:
            X_tr_cv_res, y_tr_cv_res = smote.fit_resample(X_tr_cv, y_tr_cv)
        else:
            X_tr_cv_res, y_tr_cv_res = X_tr_cv, y_tr_cv

        model_cv = XGBClassifier(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="binary:logistic",
            eval_metric="logloss",
            random_state=42,
            n_jobs=-1,
            scale_pos_weight=scale_pos_weight,
            tree_method="hist",  # usually faster in Colab
        )

        model_cv.fit(X_tr_cv_res, y_tr_cv_res)
        proba_cv = model_cv.predict_proba(X_te_cv)[:, 1]

        if y_te_cv.nunique() > 1:
            cv_auc = roc_auc_score(y_te_cv, proba_cv)
            cv_aucs.append(cv_auc)

    if cv_aucs:
        print(f"  CV AUC (2-fold) for {label}: mean={np.mean(cv_aucs):.4f}, std={np.std(cv_aucs):.4f}")
    else:
        print(f"  CV AUC (2-fold) for {label}: not defined (only one class in fold).")

    # ----- Final model fit on full train split -----
    if use_smote:
        X_train_res, y_tr_res = smote.fit_resample(X_train, y_tr)
    else:
        X_train_res, y_tr_res = X_train, y_tr

    model = XGBClassifier(
        n_estimators=400,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,
        scale_pos_weight=scale_pos_weight,
        tree_method="hist",
    )
    model.fit(X_train_res, y_tr_res)

    # ----- Validation probs + AUC -----
    proba_valid = model.predict_proba(X_valid)[:, 1]
    try:
        auc = roc_auc_score(y_va, proba_valid)
    except ValueError:
        auc = np.nan

    # ----- Threshold search: PR-based + local fine grid -----
    # get precision–recall curve for this product
    precision, recall, pr_thresholds = precision_recall_curve(y_va, proba_valid)

    # precision, recall have length N; pr_thresholds has length N-1
    # We'll compute F1 for each threshold point (except the last PR point with no threshold)
    f1_pr = []
    for p, r in zip(precision[:-1], recall[:-1]):
        if p + r == 0:
            f1_pr.append(0.0)
        else:
            f1_pr.append(2 * p * r / (p + r))

    if len(f1_pr) > 0:
        # base threshold = argmax F1 on PR curve
        best_idx_pr = int(np.argmax(f1_pr))
        base_thr = pr_thresholds[best_idx_pr]
    else:
        # Fallback if something weird happens
        base_thr = 0.5

    # refine with a local fine grid around base_thr
    lower = max(0.01, base_thr - 0.15)
    upper = min(0.99, base_thr + 0.15)
    local_grid = np.linspace(lower, upper, 51)

    best_f1 = -1.0
    best_t  = base_thr

    for t in local_grid:
        preds_t = (proba_valid >= t).astype(int)

        # avoid degenerate all-0/all-1 both sides
        if preds_t.sum() == 0 and y_va.sum() == 0:
            continue

        f1 = f1_score(y_va, preds_t, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_t  = t

    print(f"AUC: {auc:.4f}")
    print(f"Best F1 (PR-based): {best_f1:.4f} at threshold {best_t:.3f}")


    print(f"AUC: {auc:.4f}")
    print(f"Best F1: {best_f1:.4f} at threshold {best_t:.2f}")

    models[label] = model
    best_thresholds[label] = float(best_t)
    metrics[label] = {
        "auc": float(auc),
        "best_f1": float(best_f1),
        "threshold": float(best_t),
        "pos": int(pos),
        "neg": int(neg),
        "scale_pos_weight": float(scale_pos_weight),
        "used_smote": bool(use_smote),
    }

print("\nSummary of thresholds per product:")
for lbl, info in metrics.items():
    print(
        f"{lbl:15s}  pos={info['pos']:5d}  AUC={info['auc']:.3f}  "
        f"F1={info['best_f1']:.3f}  thr={info['threshold']:.2f}  "
        f"SMOTE={info['used_smote']}"
    )

Train shape: (127430, 13) (127430, 4)
Valid shape: (31858, 13) (31858, 4)

Training model for product: saving_account
Original class balance for saving_account: pos=50, neg=127380
Using SMOTE for saving_account with k_neighbors=5
  CV AUC (2-fold) for saving_account: mean=0.6406, std=0.0305
AUC: 0.6277
Best F1 (PR-based): 0.0036 at threshold 0.385
AUC: 0.6277
Best F1: 0.0036 at threshold 0.38

Training model for product: credit_card
Original class balance for credit_card: pos=14396, neg=113034
Using SMOTE for credit_card with k_neighbors=5
  CV AUC (2-fold) for credit_card: mean=0.7675, std=0.0008
AUC: 0.7675
Best F1 (PR-based): 0.3374 at threshold 0.256
AUC: 0.7675
Best F1: 0.3374 at threshold 0.26

Training model for product: pensions
Original class balance for pensions: pos=3744, neg=123686
Using SMOTE for pensions with k_neighbors=5
  CV AUC (2-fold) for pensions: mean=0.7690, std=0.0000
AUC: 0.7746
Best F1 (PR-based): 0.1260 at threshold 0.526
AUC: 0.7746
Best F1: 0.1260 at thresh

In [35]:
print("\n" + "="*95)
print("      TRAINING MODEL SUMMARY      ")
print("="*95)

# Define fixed widths for each column
col_widths = {
    "product": 20,
    "pos": 8,
    "neg": 8,
    "smote": 8,
    "auc": 8,
    "f1": 8,
    "thr": 8,
}

# Header
header = (
    f"{'Product':{col_widths['product']}s} | "
    f"{'Pos':>{col_widths['pos']}s} | "
    f"{'Neg':>{col_widths['neg']}s} | "
    f"{'SMOTE':>{col_widths['smote']}s} | "
    f"{'AUC':>{col_widths['auc']}s} | "
    f"{'F1':>{col_widths['f1']}s} | "
    f"{'Thresh':>{col_widths['thr']}s}"
)
print(header)
print("-"*95)

# Data rows
for lbl, info in metrics.items():
    row = (
        f"{lbl:{col_widths['product']}s} | "
        f"{info['pos']:{col_widths['pos']}d} | "
        f"{info['neg']:{col_widths['neg']}d} | "
        f"{str(info['used_smote']):>{col_widths['smote']}s} | "
        f"{info['auc']:{col_widths['auc']}.3f} | "
        f"{info['best_f1']:{col_widths['f1']}.3f} | "
        f"{info['threshold']:{col_widths['thr']}.3f}"
    )
    print(row)



      TRAINING MODEL SUMMARY      
Product              |      Pos |      Neg |    SMOTE |      AUC |       F1 |   Thresh
-----------------------------------------------------------------------------------------------
saving_account       |       50 |   127380 |     True |    0.628 |    0.004 |    0.385
credit_card          |    14396 |   113034 |     True |    0.768 |    0.337 |    0.256
pensions             |     3744 |   123686 |     True |    0.775 |    0.126 |    0.526
direct_debit         |    28656 |    98774 |     True |    0.826 |    0.584 |    0.358


We performed a sanity-check step to ensure that none of the input features were accidentally leaking target information or were excessively correlated with the product labels. To do this, we calculated the correlation between each feature and each product target (saving account, junior account, credit card, loans, mortgage, pensions, and direct debit). We specifically looked for very high correlations (above 0.95), because such extreme values often indicate leakage—meaning the model may be indirectly “cheating” by learning information that would not be available at prediction time.

The results showed no features with correlations above 0.95 for any of the targets, which is a strong sign that the cleaned training set does not contain direct or obvious leakage. This confirms that the features we kept after removing our leakage list are behaving normally and not unintentionally encoding future or target-related behavior.



In [36]:
# Check correlations between features and each target
high_corr = {}

for label in label_cols:
    corr_values = X.corrwith(y[label]).abs().sort_values(ascending=False)
    high_corr[label] = corr_values[corr_values > 0.95]
    print(f"\nHigh correlations with {label}:")
    print(high_corr[label])



High correlations with saving_account:
Series([], dtype: float64)

High correlations with credit_card:
Series([], dtype: float64)

High correlations with pensions:
Series([], dtype: float64)

High correlations with direct_debit:
Series([], dtype: float64)


In this step, we are examining the feature importances produced by the XGBoost model trained for the saving_account prediction task. This is essentially a sanity check to make sure the model is learning from the right types of information and not relying on anything suspicious or leakage-related.

To do this, we take the trained model for saving_account, extract its feature importance values, and list the top 20 features ranked from most to least influential. These values show how much each feature contributed to the model’s decisions during training. High importance means the model relied on that feature heavily when determining whether a customer is likely to adopt a savings account.

The results here make intuitive sense:

- credit_score, employment_status_int, channel_entrance, and activity_status are among the strongest predictors, which aligns with how banks assess eligibility or likelihood of adoption.

- age, income-related variables, and behavioral characteristics also appear meaningfully, which is expected.

- Columns like residence_country correctly show low importance—nothing in the model suggests leakage or unrealistic patterns.

Overall, this step helps confirm that the model is using logical and valid signals rather than accidental artifacts or leaked data. It reassures us that our data cleaning, feature selection, and credit-score generation steps were done correctly.

In [37]:
import pandas as pd

model = models['saving_account']
importances = pd.Series(model.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(20))


credit_score              0.163006
channel_entrance          0.123256
employment_status_int     0.119934
activity_status           0.111575
age                       0.085343
gender                    0.082008
household_gross_income    0.075117
years_calc                0.074688
customer_tenure_months    0.069544
personal_income           0.069173
residence_index           0.021874
customer_segment_model    0.004482
residence_country         0.000000
dtype: float32


In this step, we train our final machine learning models using the entire labeled dataset. Using all available data allows the models to learn from every example, which usually leads to stronger performance when making predictions on new customers. Because we are building separate models for each product, each model gets its own full dataset to learn the patterns that are relevant to that specific product.

Some products in our dataset have very few positive cases compared to negative ones. For example, many customers do not own a savings account and only a small portion do. When the classes are heavily imbalanced like this, a model can easily become biased toward always predicting “no product,” simply because that outcome appears more often. To prevent this, we use SMOTE whenever possible. SMOTE creates synthetic examples of the minority class, giving the model more balanced data to learn from. This helps the model understand what a positive case looks like rather than ignoring it. When a product has too few positive cases for SMOTE to work safely, the model instead uses class weighting, which adjusts the importance of positive cases during training.

By retraining the models using the full dataset together with SMOTE or class weighting, we ensure that the final models are as strong and stable as possible. They benefit from seeing every real customer example, and they also learn from a balanced representation of both “yes” and “no” outcomes. This approach typically improves real-world performance because the model becomes better at identifying customers who are actually likely to adopt a product, even when those customers are rare in the data. The final result is a set of models that are more reliable, less biased, and better prepared to generate accurate recommendations on the new test dataset.

In [38]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# ============================================================
# FINAL MODELS ON 100% OF LABELED DATA (WITH SMOTE LOGIC)
# ============================================================

final_models = {}

for label in label_cols:
    print(f"\nTraining FINAL model for: {label}")

    # Full label vector for this product
    y_full = y[label].astype(int)
    pos = y_full.sum()
    neg = len(y_full) - pos

    if y_full.nunique() < 2:
        print(f"Skipping {label}: only one class in full dataset (pos={pos}, neg={neg}).")
        continue

    print(f"Full-data class balance for {label}: pos={pos}, neg={neg}")

    # Decide whether to use SMOTE on the full dataset
    use_smote = (pos > 5) and (pos < neg)  # at least 6 positives and still imbalanced

    if use_smote:
        k_neighbors = min(5, pos - 1)
        smote = SMOTE(
            sampling_strategy="auto",
            random_state=42,
            k_neighbors=k_neighbors
        )
        print(f"  → Using SMOTE for {label} with k_neighbors={k_neighbors}")
        X_full_res, y_full_res = smote.fit_resample(X, y_full)
        scale_pos_weight = 1.0  # SMOTE has already balanced the classes
    else:
        print("  → Not using SMOTE, falling back to class weights.")
        X_full_res, y_full_res = X, y_full
        scale_pos_weight = (neg / pos) if pos > 0 else 1.0

    # FINAL model: slightly more trees since we now use all data
    model = XGBClassifier(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,
        scale_pos_weight=scale_pos_weight,
        tree_method="hist",
    )

    model.fit(X_full_res, y_full_res)
    final_models[label] = model
    print(f"  ✓ FINAL model trained for {label}")

print("\nAll FINAL models trained with SMOTE / class-weight methodology.")



Training FINAL model for: saving_account
Full-data class balance for saving_account: pos=66, neg=159222
  → Using SMOTE for saving_account with k_neighbors=5
  ✓ FINAL model trained for saving_account

Training FINAL model for: credit_card
Full-data class balance for credit_card: pos=17988, neg=141300
  → Using SMOTE for credit_card with k_neighbors=5
  ✓ FINAL model trained for credit_card

Training FINAL model for: pensions
Full-data class balance for pensions: pos=4666, neg=154622
  → Using SMOTE for pensions with k_neighbors=5
  ✓ FINAL model trained for pensions

Training FINAL model for: direct_debit
Full-data class balance for direct_debit: pos=35883, neg=123405
  → Using SMOTE for direct_debit with k_neighbors=5
  ✓ FINAL model trained for direct_debit

All FINAL models trained with SMOTE / class-weight methodology.


In this part of the workflow, we start by loading the customer test dataset — the data for which we want to generate new product recommendations. Since this file comes directly from the project dataset and has a different structure than our training data, the first thing we do is check whether the test file contains all the feature columns that our XGBoost models expect. If any features are missing, we fill numeric ones using the median values calculated from the training data. Medians are used because they are stable and less affected by extreme values, which makes the test data consistent and prevents the model from encountering missing inputs.

After aligning the test schema, we make sure the first_join_date column is properly formatted as a date. This is important because we use that date to estimate how long a customer has been active, which later becomes part of the credit score. We then rebuild the credit score for each customer. The test file does not include a credit score column, so we generate it in the same way we did during training. This makes the feature compatible and avoids any leakage from product ownership. The credit score is computed using only demographic factors, income, and account activity — the same logic used on the training data — so the model sees consistent inputs at prediction time.

Once all missing columns are added and the credit score is generated, we construct the final test feature matrix (X_test). This matrix follows the exact feature order the model expects and includes only the features used during training. We also keep the customer_id so we can link each prediction back to the correct customer later. This entire process ensures that the test data looks and behaves exactly like the training data from the model’s perspective. Without this alignment step, the model would misinterpret inputs, leading to incorrect predictions or even errors.



In [39]:
# ============================================================
# LOAD CUSTOMER TEST DATA
# ============================================================
drive.mount('/content/drive')

test_model = pd.read_parquet(
    '/content/drive/My Drive/Colab Notebooks/DSC678-Capstone/Banking_Project/project_dataset/customer_test.parquet'
)

print("Test data loaded.")
print(test_model.shape)
print(test_model.columns.tolist())

# ============================================================
# ALIGN TEST SCHEMA WITH TRAINING SCHEMA
# ============================================================

# (a) Check which training features are missing in test
missing = [c for c in feature_cols if c not in test_model.columns]
if missing:
    print("\nMissing columns from test_df (will be imputed):")
    print(missing)

# (b) Compute training medians for numeric features
train_medians = X.median(numeric_only=True)

# (c) Add any missing feature columns to test_model
for col in missing:
    if col in train_medians.index:
        # Impute numeric features with training median
        test_model[col] = train_medians[col]
    else:
        # Safe default for non-numeric or unexpected cols
        test_model[col] = 0

# Make sure first_join_date is datetime (needed for account age in credit_score)
if "first_join_date" in test_model.columns:
    test_model["first_join_date"] = pd.to_datetime(
        test_model["first_join_date"], errors="coerce"
    )

# ============================================================
# REBUILD / OVERWRITE CREDIT SCORE (SAME LOGIC AS TRAIN)
#    Uses ONLY demographics + income + activity
# ============================================================

def generate_credit_score(row):
    base_score = 650.0

    # --- Household income factor (0–150 pts) ---
    hh_income = float(row.get("household_gross_income", 0) or 0)
    temp = np.clip(hh_income / 10_000.0, 0, 10)   # cap at 10
    base_score += temp * 15                       # up to +150

    # --- Personal income (smaller effect, up to +50 pts) ---
    pers_income = float(row.get("personal_income", 0) or 0)
    if pers_income > 0:
        temp_pi = np.clip(pers_income / 10_000.0, 0, 10)
        base_score += temp_pi * 5                 # up to +50

    # --- Account history / activity ---
    if row.get("activity_status", 0) == 1:
        join_date = row.get("first_join_date")
        if pd.isna(join_date):
            account_age = 0.0
        else:
            account_age = (pd.to_datetime("today") - join_date).days / 365.0
        base_score += min(account_age * 5, 50)    # up to +50
    else:
        base_score -= 40                          # inactive penalty

    # NO loan_amount / LTI terms here, same as training

    # Clamp to FICO-like 300–850 range
    return max(300, min(int(base_score), 850))

# Only recompute credit_score if it is actually used as a feature
if "credit_score" in feature_cols:
    test_model["credit_score"] = test_model.apply(generate_credit_score, axis=1)
    print("\nTest credit_score summary:")
    print(test_model["credit_score"].describe())

# ============================================================
# BUILD X_test WITH THE FINAL FEATURE ORDER
# ============================================================

X_test = test_model[feature_cols].copy()
customer_ids_test = test_model["customer_id"]

print("\nX_test shape:", X_test.shape)
print("First few feature columns:", X_test.columns[:10].tolist())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Test data loaded.
(104580, 16)
['transaction_date', 'customer_id', 'employment_status', 'residence_country', 'gender', 'age', 'first_join_date', 'residence_index', 'channel_entrance', 'activity_status', 'household_gross_income', 'employment_status_int', 'personal_income', 'customer_tenure_months', 'customer_segment_model', 'years_calc']

Missing columns from test_df (will be imputed):
['credit_score']

Test credit_score summary:
count    104580.000000
mean        817.052132
std          42.354669
min         614.000000
25%         802.000000
50%         833.000000
75%         850.000000
max         850.000000
Name: credit_score, dtype: float64

X_test shape: (104580, 13)
First few feature columns: ['residence_country', 'gender', 'age', 'residence_index', 'channel_entrance', 'activity_status', 'household_gross_income', 'employment_status_int', 'personal_income

In [40]:
# --------- Build category mappings from TRAIN (final_df) ----------
# These are the columns that were object in training:
obj_cols_train = ['residence_country', 'residence_index', 'channel_entrance', 'customer_segment_model']

category_maps = {}
for col in obj_cols_train:
    cat = final_df[col].astype("category")
    category_maps[col] = cat.cat.categories
    print(f"{col}: {len(category_maps[col])} categories")

# --------- Build X_test aligned with feature_cols ----------
X_test = test_model[feature_cols].copy()

# Encode test object columns using the SAME categories
for col in obj_cols_train:
    if col not in X_test.columns:
        continue  # safety

    X_test[col] = pd.Categorical(
        X_test[col],
        categories=category_maps[col]   # same ordering as train
    ).codes   # unseen categories → -1

print("\nDtypes in X_test after encoding:")
print(X_test.dtypes.value_counts())


residence_country: 87 categories
residence_index: 2 categories
channel_entrance: 144 categories
customer_segment_model: 3 categories

Dtypes in X_test after encoding:
int8       4
int64      4
float64    3
int16      1
int32      1
Name: count, dtype: int64


In this step, we use the final XGBoost models to score each customer in the test dataset and generate product recommendations. For every product, the model takes the preprocessed X_test features and outputs a probability between 0 and 1. This value represents how likely the customer is to adopt that product. We store all of these probabilities in the proba_test dataframe so we can see the model’s confidence levels.

Next, we translate these probabilities into actual recommendations. Instead of using a default cutoff like 0.5, we apply the product-specific thresholds that were optimized earlier during validation. These thresholds are chosen because they produced the best F1-score for each product during training, which helps balance precision and recall. If a customer’s probability is greater than the threshold, we mark that product as recommended (1); otherwise, we mark it as not recommended (0). These final recommendation flags are stored in preds_test.

The sample outputs confirm that the system is behaving correctly. Products with moderately high probabilities become recommendations, while very low probabilities do not. This per-product threshold strategy prevents over-recommending and ensures each product has an appropriate level of strictness. As a result, the output becomes a clean, reliable list showing which products each customer is most likely to adopt based on learned patterns.

In [41]:
#Score the test customers + generate product recommendations
# DataFrames to hold probabilities and binary recommendations
proba_test = pd.DataFrame(index=X_test.index)
preds_test = pd.DataFrame(index=X_test.index)

for label in label_cols:
    if label not in final_models:
        print(f"No final model for {label}, skipping.")
        continue

    model = final_models[label]
    thr   = best_thresholds[label]

    # Predict probabilities
    proba = model.predict_proba(X_test)[:, 1]
    proba_test[label] = proba

    # Apply product-specific threshold
    preds = (proba >= thr).astype(int)
    preds_test[label] = preds

print("\nSample test probabilities:")
display(proba_test.head())

print("\nSample test binary recommendations:")
display(preds_test.head())



Sample test probabilities:


Unnamed: 0,saving_account,credit_card,pensions,direct_debit
9,3.4e-05,0.094844,0.015534,0.273034
10,0.000147,0.077726,0.014568,0.409046
12,8e-06,0.055591,0.036347,0.15756
14,0.000362,0.075532,0.179557,0.065366
17,3e-06,0.001248,0.000661,0.003822



Sample test binary recommendations:


Unnamed: 0,saving_account,credit_card,pensions,direct_debit
9,0,0,0,0
10,0,0,0,1
12,0,0,0,0
14,0,0,0,0
17,0,0,0,0


In [42]:
#Attaching customer id to recommendation
recommendations = pd.concat(
    [customer_ids_test.reset_index(drop=True),
     proba_test.reset_index(drop=True),
     preds_test.reset_index(drop=True)],
    axis=1
)

recommendations.head()

Unnamed: 0,customer_id,saving_account,credit_card,pensions,direct_debit,saving_account.1,credit_card.1,pensions.1,direct_debit.1
0,15899,3.4e-05,0.094844,0.015534,0.273034,0,0,0,0
1,15900,0.000147,0.077726,0.014568,0.409046,0,0,0,1
2,15902,8e-06,0.055591,0.036347,0.15756,0,0,0,0
3,15906,0.000362,0.075532,0.179557,0.065366,0,0,0,0
4,15910,3e-06,0.001248,0.000661,0.003822,0,0,0,0


In this step, we transform our recommendation results into a more analysis-friendly format. Initially, the preds_test dataframe is in a wide structure: one row per customer and one column per product (such as saving_account, credit_card, pensions, etc.), with each cell holding a 0 or 1 indicating whether we recommend that product. This format is good for modeling, but not ideal for reporting or analysis.

To fix this, we reshape the data using .melt() to convert the wide table into a long table called long_recs. In this long format, each row represents a single (customer_id, product) combination. After reshaping, we filter to keep only rows where recommend == 1, meaning we keep only the recommended products. We then drop the temporary recommend column so the final result has only two fields: customer_id and product.

This long format is much easier to use for analysis and visualization. It allows us to quickly count recommendations per product, merge with other customer attributes, and load the data into dashboards or BI tools. While this step does not evaluate model accuracy (that was handled earlier through metrics like AUC and F1-score), it produces a clean and useful structure for summarizing and presenting recommendations. The sample output confirms the format: one row per recommended product for each customer.

In [43]:
# Long-form table: one row per (customer, product) that is recommended
long_recs = (
    preds_test
    .reset_index(drop=True)
    .assign(customer_id=test_model["customer_id"].values)
    .melt(id_vars="customer_id", var_name="product", value_name="recommend")
)

long_recs = long_recs[long_recs["recommend"] == 1].drop(columns="recommend")
display(long_recs.head(20))


Unnamed: 0,customer_id,product
670,18438,saving_account
1705,21909,saving_account
4795,32666,saving_account
9005,47784,saving_account
11187,54550,saving_account
12600,59184,saving_account
14957,66646,saving_account
15006,66800,saving_account
15282,67743,saving_account
16070,70165,saving_account


In [44]:
# ============================================================
# Primary product recommendation
# ============================================================

# Map technical column names to human-readable product names
product_name_map = {
    "saving_account": "Savings Account",
    "credit_card": "Credit Card",
    "pensions": "Pension Plan",
    "direct_debit": "Direct Debit",
}

# List of products we are actually predicting/recommending
primary_product_candidates = ["saving_account", "credit_card", "pensions", "direct_debit"]

# Safety: keep only those that exist in preds_test
primary_product_candidates = [c for c in primary_product_candidates if c in preds_test.columns]

# Decide a single "primary" recommendation per customer
primary_reco = []

for i in preds_test.index:
    # products that are recommended (binary = 1) for this customer
    recommended_cols = [
        col for col in primary_product_candidates
        if preds_test.loc[i, col] == 1
    ]

    if not recommended_cols:
        primary_reco.append("No Product Recommended")
    else:
        # among recommended products, pick the one with highest probability
        best_col = max(
            recommended_cols,
            key=lambda c: proba_test.loc[i, c]
        )
        primary_reco.append(product_name_map.get(best_col, best_col))

# Store on preds_test
preds_test["primary_recommendation"] = primary_reco


In this step, we take all the model predictions and turn them into a clean, human-friendly recommendation summary for every customer. Up to this point, our model generated two things: (1) a probability that each customer might adopt a given product, and (2) a binary yes/no recommendation for each product based on optimized thresholds. However, these raw predictions are not something a business team can easily interpret. So we reorganize the outputs into a structured format that clearly answers the question: “What financial products are most suitable for this customer?”

We start by choosing a single primary recommendation for each customer. Some customers qualify for multiple products, so we look at the predicted probabilities and select the product with the highest likelihood of adoption. This ensures each customer receives one clear, strongest recommendation rather than a scattered set of options. On the other hand, if a customer does not meet the threshold for any product, we assign the label “No Product Recommended.” This is an important part of the workflow, because it shows the model is being selective and only recommending products when the confidence is high enough. It also respects the idea that not every customer fits into a targeted marketing effort at a given time.

Next, we also build a complete list of all products that were recommended for each customer. This gives a broader view of potential interest and is useful for teams who want to design multi-product offers, bundle opportunities, or understand customer needs more holistically. Keeping both the “primary recommendation” and the “full recommended products list” allows different business functions to use the model results in flexible, practical ways.

Finally, we gather everything into one organized output table called recommendations, where each row corresponds to one customer. It includes each product’s probability, the binary recommendation flags, the customer’s primary recommended product, and the full list of recommended products. This table is structured in a way that is easy to interpret, easy to filter, and ready to feed into dashboards, CRM systems, or marketing automation tools.

For customers with no recommended products, the model indicates that there is currently insufficient evidence to justify a targeted product offer. These customers can instead be included in general marketing or engagement campaigns until stronger signals appear. This ensures the system remains respectful, avoids irrelevant outreach, and maintains the trustworthiness of the recommendation process.

In [45]:
# ============================================================
# Build a “recommended products” list per customer
# ============================================================

def gather_recos(row):
    recos = []
    for col in label_cols:   # label_cols should now only include the 4 products
        if col in row and row[col] == 1:
            recos.append(product_name_map.get(col, col))
    return recos

preds_test["recommended_products"] = preds_test.apply(gather_recos, axis=1)

# Final recommendation frame
recommendations = pd.DataFrame({
    "customer_id": customer_ids_test,
})

# Add per-product probability + binary recommendation
for col in label_cols:
    if col in preds_test.columns:
        recommendations[col + "_proba"] = proba_test[col]
        recommendations[col + "_recommend"] = preds_test[col]

# Add the new primary recommendation + list of all recommended products
recommendations["primary_recommendation"] = preds_test["primary_recommendation"]
recommendations["recommended_products"]   = preds_test["recommended_products"]

print("\nSample final recommendation output:")
display(recommendations.head(100))



Sample final recommendation output:


Unnamed: 0,customer_id,saving_account_proba,saving_account_recommend,credit_card_proba,credit_card_recommend,pensions_proba,pensions_recommend,direct_debit_proba,direct_debit_recommend,primary_recommendation,recommended_products
9,15899,3.363100e-05,0,0.094844,0,0.015534,0,0.273034,0,No Product Recommended,[]
10,15900,1.466140e-04,0,0.077726,0,0.014568,0,0.409046,1,Direct Debit,[Direct Debit]
12,15902,7.914568e-06,0,0.055591,0,0.036347,0,0.157560,0,No Product Recommended,[]
14,15906,3.615001e-04,0,0.075532,0,0.179557,0,0.065366,0,No Product Recommended,[]
17,15910,2.866757e-06,0,0.001248,0,0.000661,0,0.003822,0,No Product Recommended,[]
...,...,...,...,...,...,...,...,...,...,...,...
209,16183,8.288953e-06,0,0.047316,0,0.005395,0,0.142384,0,No Product Recommended,[]
210,16185,1.883583e-07,0,0.001896,0,0.002171,0,0.001551,0,No Product Recommended,[]
212,16189,3.258896e-06,0,0.000542,0,0.000182,0,0.000403,0,No Product Recommended,[]
215,16194,1.659735e-03,0,0.064404,0,0.021869,0,0.381624,1,Direct Debit,[Direct Debit]


In this step, we calculate how many customers were recommended each product by summing the binary prediction columns in preds_test. Each product column contains 1 if the model recommended that product for a customer and 0 if it did not. By adding up all the 1s for each product, we get a total count of how many times the model recommended that product across the entire test population.

We do this for two main reasons. First, it gives us a quick sanity check on the overall behavior of the model—whether it is recommending products too aggressively, too conservatively, or in a balanced way. If a product receives extremely few recommendations or an unusually large number, it may indicate an issue with thresholds, class imbalance, or feature alignment between train and test data. Second, these counts help teams understand expected marketing volume. Knowing how many customers are eligible for each product allows business teams to plan campaigns, allocate budgets, and manage capacity more effectively.

Overall, this step provides a simple but important summary of model outputs, helping ensure the recommendations make sense at a population level before they are used operationally.

In [46]:
print("\nTotal number of recommendations per product:")
product_recommendation_counts = preds_test[label_cols].sum().sort_values(ascending=False)
print(product_recommendation_counts)


Total number of recommendations per product:
credit_card       2382
direct_debit      1067
pensions           503
saving_account     123
dtype: int64
