In [1]:

# Utils
# ==============================================================================
import warnings

# Plot
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing
# ==============================================================================
import pandas as pd
import numpy as np

# Model
# ==============================================================================
from sklearn.model_selection import (
    train_test_split,
    ParameterGrid,
    KFold,
    GridSearchCV
)

from sklearn.multioutput import RegressorChain

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


# Metrics
# ==============================================================================
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)




# **Info**
---

**@By**: Steven Bernal

**@Nickname**: Kaiziferr

**@Git**: https://github.com/Kaiziferr

# **Objectives**
---
1. Predict Monthly_Expenses based on Age and Monthly_Income.
2. Predict Number_of_Transactions using Monthly_Expenses and Average_Transaction_Amount.
3. Finally, predict Total_Payments using all the previous variables.


# **Data dictionary**
---


- **Customer_ID**: unique identifier of the customer.

- **Age**: customer’s age.

- **Monthly_Income**: how much the customer earns per month.

- **Monthly_Expenses**: average monthly spending.

- **Number_of_Transactions**: number of financial transactions during the month.

- **Average_Transaction_Amount**: average value of each transaction.

- **Late_Payment_History**: number of times the customer has been late on payments.

- **Current_Credit**: available credit balance.

- **Total_Payments** (target variable): total amount paid during the month.

# **Config**
---

In [2]:
random_seed = 12354
warnings.filterwarnings('ignore')
sns.set(style='darkgrid')
title_data = 'Financial Data'
paleta = sns.color_palette('tab10').as_hex()

# **Utils**
---

In [3]:
def get_data(X:pd.DataFrame, feature_X:list, feature_Y: list)->tuple:
  """
    Filter the data by columns

    Args:
      X (DataFrame):    training data
      feature_X (list): Filtered training data columns
      feature_Y (list): validation column(s)

    Returns:
      tuple: Column-filtered data
  """
  try:
    return X[feature_X], X[feature_Y]
  except Exception as e:
    print(e)


def model_optimization(
    X:pd.DataFrame,
    y:pd.DataFrame,
    model:object,
    dict_params:dict)->dict:
    """
      Find the best combination of parameters for a random forest
      using the OOB score

      Args:
        X (DataFrame):      training data
        y (DataFrame)       labeled data
        model (object):     random forest regressor o Random forest classification
        dict_params (dict): dictionary with the best parameters

      Returns:
        dict: Best combination of parameters
    """
    try:
      results = {'params': [], 'metric': []}

      for params in dict_params:
        model_oobscore = model(**params)
        model_oobscore.fit(X, y)
        results['params'].append(params)
        results['metric'].append(model_oobscore.oob_score_)

      resultados_scores = pd.DataFrame(results)
      resultados_scores = pd.concat(
          [resultados_scores,
           resultados_scores['params'].apply(pd.Series)], axis=1)
      resultados_scores = resultados_scores.drop(columns = 'params')
      if isinstance(params['oob_score'], bool):
        pass
      else:
        resultados_scores['oob_score'] = resultados_scores['oob_score'].apply(
          lambda x: x.__name__)
      return resultados_scores
    except Exception as e:
      print(e)



def prediction_model(
    X:pd.DataFrame,
    y:pd.DataFrame,
    model:object,
    metric:object,
    param_metric:dict={},
    **kwards)->tuple:

  """
    Predicts the target value using Out-Of-Fold

    Args:
      X (DataFrame):       training data
      y (DataFrame)        labeled data
      model (object):      model
      metric (object):     model evaluation metric or loss
      param_metric (dict): dictionary with the parameters of metric
      kwards:              Model parameters

    Returns:
      tuple: oof_predictions, fold_score
  """

  try:
    oof_predictions = np.zeros(len(X))
    fold_score = []
    kf = KFold(n_splits=10, shuffle=True, random_state=random_seed)
    for train_index, val_index in kf.split(X):
      X_train, X_val = X.iloc[train_index], X.iloc[val_index]
      y_train, y_val = y.iloc[train_index], y.iloc[val_index]
      model_f = model(**kwards)
      model_f.fit(X_train, y_train)
      oof_predictions[val_index] = model_f.predict(X_val)
      fold_score.append(
          metric(y_val, model_f.predict(X_val), **param_metric)
      )
    return pd.Series(oof_predictions, index=X.index), fold_score
  except Exception as e:
    print(e)

# **Data**

---



In [4]:
url='https://drive.google.com/file/d/1JfAhOxwo2a78MYYiQexK8nFQQ3dKFyKW/view?usp=sharing'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
data = pd.read_csv(url)

In [5]:
data.head()

Unnamed: 0,Customer_ID,Age,Monthly_Income,Monthly_Expenses,Number_of_Transactions,Average_Transaction_Amount,Late_Payment_History,Current_Credit,Total_Payments
0,C0001,56,1393.55,1112.29,13,88.59,1,1145.48,1147.74
1,C0002,69,3203.46,1550.97,30,52.02,4,746.5,1776.32
2,C0003,46,2243.65,1176.36,13,88.33,0,743.05,1202.06
3,C0004,32,1577.75,1042.95,12,84.25,1,400.52,1152.9
4,C0005,60,2353.43,1748.83,43,40.48,2,1892.39,1759.59


In [6]:
data = data.drop('Customer_ID', axis=1)

# **Data Split**
---


The dataset is split into training and validation sets for the entire process. It must be done this way because chain regression will be applied

In [7]:
data_train, data_test, _, _ = train_test_split(
    data,
    data.iloc[:,0],
    train_size=0.85,
    random_state=random_seed
)

Global parameters of the model to be implemented. In this case, it is for the random forest.

In [8]:
dict_params = ParameterGrid(
    {
        "n_estimators": [50, 100, 150, 200],
        "max_features": [0.75, 1, 'sqrt', 'log2'],
        "max_depth": [None, 6, 12, 18],
        "criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"],
        "oob_score": [mean_absolute_error, r2_score],
        "n_jobs": [-1],
        "random_state": [random_seed]
    }
)

# **First chain**
---


The training and test sets of this first chain are filtered by Age, Monthly_Income, and Monthly_Expenses

In [9]:
X_train, y_train = get_data(data_train, ['Age', 'Monthly_Income'], ['Monthly_Expenses'])
X_test, y_test = get_data(data_test, ['Age', 'Monthly_Income'], ['Monthly_Expenses'])

Search for the best parameters using the OOB score.

In [10]:
resultados_scores = model_optimization(X_train, y_train, RandomForestRegressor, dict_params)

In [11]:
resultados_scoresT = resultados_scores[
    resultados_scores['oob_score'] == 'mean_absolute_error'].sort_values(
        'metric', ascending=True)

In [12]:
resultados_scoresT.head()

Unnamed: 0,metric,criterion,max_depth,max_features,n_estimators,n_jobs,oob_score,random_state
48,386.751363,squared_error,6.0,sqrt,50,-1,mean_absolute_error,12354
56,386.751363,squared_error,6.0,log2,50,-1,mean_absolute_error,12354
40,386.751363,squared_error,6.0,1,50,-1,mean_absolute_error,12354
32,386.751363,squared_error,6.0,0.75,50,-1,mean_absolute_error,12354
288,386.751363,friedman_mse,6.0,0.75,50,-1,mean_absolute_error,12354


The following are the best parameters

In [13]:
resultados_scoresT.iloc[0, 1:].to_dict()

{'criterion': 'squared_error',
 'max_depth': 6.0,
 'max_features': 'sqrt',
 'n_estimators': 50,
 'n_jobs': -1,
 'oob_score': 'mean_absolute_error',
 'random_state': 12354}

In [14]:
best_params = {
  **resultados_scoresT.iloc[0, 1:].to_dict(),
  'max_depth': 6,
  'oob_score': mean_absolute_error}
best_params

{'criterion': 'squared_error',
 'max_depth': 6,
 'max_features': 'sqrt',
 'n_estimators': 50,
 'n_jobs': -1,
 'oob_score': <function sklearn.metrics._regression.mean_absolute_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average')>,
 'random_state': 12354}

Prediction using Out-Of-Fold cross-validation

In [15]:
predict_first, score = prediction_model(X_train, y_train, RandomForestRegressor, mean_absolute_error, **best_params)

In [16]:
np.mean(score), np.std(score)

(np.float64(389.3335562889553), np.float64(15.22492666255742))

The model exhibits a mean absolute error of approximately 389 monetary units in predicting monthly expenses, estimated using 10-fold cross-validation. The low standard deviation across folds (15.22) suggests that the model’s performance is stable and consistent across different data partitions; it is neither fragile nor sensitive to sampling.

In [17]:
y_train.describe()

Unnamed: 0,Monthly_Expenses
count,850.0
mean,1682.8056
std,697.989938
min,257.56
25%,1166.4425
50%,1588.625
75%,2118.67
max,4332.16


Most monthly expenses are centered around $1,682.81 ± $697.99.

In [18]:
mean_absolute_error(y_train, predict_first)

389.3335562889553

The model predicts monthly expenses with an average error of 389 units, equivalent to 23% of the typical expense. Although this error is smaller than the natural variability of the data (standard deviation of 698), it still represents over half of it (56%), indicating that the model captures approximately 44% of the actual dispersion. In summary, the model learns important patterns, provides reasonable estimates, and still has room to improve its accuracy.

**Model for deploymen**

The Random Forest model in production should be configured with the best hyperparameters from the previous process.

In [19]:
model = RandomForestRegressor(**best_params)
model.fit(X_train, y_train)

In [20]:
y_predict_train = pd.DataFrame(model.predict(X_train),  index=X_train.index)
y_predict_test = pd.DataFrame(model.predict(X_test), index=X_test.index)

In [21]:
mean_absolute_error(y_train, y_predict_train)

329.9968810191357

The model predicts monthly expenses with an average error of 329 units, equivalent to 19% of the typical expense. Although this error is smaller than the natural variability of the data (standard deviation of 698), it still represents over half of it (42%), indicating that the model captures approximately 58% of the actual dispersion. In summary, the model learns important patterns and provides reasonable estimates

In [22]:
mean_absolute_error(y_test, y_predict_test)

390.6292703982103

The test MAE is 18% higher than the training MAE, indicating a slight loss of accuracy when generalizing. However, since the error is still lower than the natural variability of the data, there is no evidence of overfitting. The model remains useful and reliable, as the error is still moderate compared to the natural variability of expenses

In [23]:
y_predict_train_monthly_expenses = y_predict_train
y_predict_test_monthly_expenses = y_predict_test

# **Second chain.**
---


The training and test sets of this second chain are filtered by Average Transaction Amount and Number of Transactions, but concatenate the prediction of the first model(first chain)

In [24]:
X_train, y_train = get_data(data_train, ['Average_Transaction_Amount'], ['Number_of_Transactions'])
X_test, y_test = get_data(data_test, ['Average_Transaction_Amount'], ['Number_of_Transactions'])

X_train['Monthly_Expenses'] = y_predict_train
X_test['Monthly_Expenses'] = y_predict_test


Search for the best parameters using the OOB score.

In [25]:
resultados_scores = model_optimization(X_train, y_train, RandomForestRegressor, dict_params)

In [26]:
resultados_scoresT = resultados_scores[
    resultados_scores['oob_score'] == 'mean_absolute_error'].sort_values(
        'metric', ascending=True)

The following are the best parameters

In [27]:
resultados_scoresT.iloc[0, 1:].to_dict()

{'criterion': 'poisson',
 'max_depth': 6.0,
 'max_features': 'sqrt',
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': 'mean_absolute_error',
 'random_state': 12354}

In [28]:
best_params = {
  **resultados_scoresT.iloc[0, 1:].to_dict(),
  'max_depth': 6,
  'oob_score': mean_absolute_error}
best_params

{'criterion': 'poisson',
 'max_depth': 6,
 'max_features': 'sqrt',
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': <function sklearn.metrics._regression.mean_absolute_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average')>,
 'random_state': 12354}

Prediction using Out-Of-Fold cross-validation

In [29]:
predict_second, score = prediction_model(X_train, y_train, RandomForestRegressor, mean_absolute_error, **best_params)

In [30]:
np.mean(score), np.std(score)

(np.float64(5.035730918990834), np.float64(0.3604555431231253))

The model exhibits a mean absolute error of approximately 5 transactions, estimated using 10-fold cross-validation. The low standard deviation across folds (0.3) suggests that the model’s performance is stable and consistent across different data partitions; it is neither fragile nor sensitive to sampling.

In [31]:
y_train.describe()

Unnamed: 0,Number_of_Transactions
count,850.0
mean,24.597647
std,13.912328
min,3.0
25%,15.0
50%,22.0
75%,30.0
max,100.0


Most number transactions are centered around  24.597647± 13.912328.

In [32]:
mean_absolute_error(y_train, predict_second)

5.035730918990835

The model predicts number transactions with an average error of 5 units, equivalent to 20% of the number transactions . Although this error is smaller than the natural variability of the data (standard deviation of 14), it still represents over half of it (36%), indicating that the model captures approximately 64% of the actual dispersion. In summary, the model learns important patterns

**Model for deploymen**

The Random Forest model in production should be configured with the best hyperparameters from the previous process.

In [33]:
model = RandomForestRegressor(**best_params)
model.fit(X_train, y_train)

In [34]:
y_predict_train = pd.DataFrame(model.predict(X_train),  index=X_train.index)
y_predict_test = pd.DataFrame(model.predict(X_test), index=X_test.index)

In [35]:
mean_absolute_error(y_train, y_predict_train)

3.9752318048188617

The model predicts number transactions with an average error of 4 units, equivalent to 16% of the number transactions . Although this error is smaller than the natural variability of the data (standard deviation of 14), it still represents over half of it (28%), indicating that the model captures approximately 64% of the actual dispersion. In summary, the model learns important patterns


In [36]:
mean_absolute_error(y_test, y_predict_test)

5.246082700678527

In [37]:
(5.246082700678527-3.9752318048188617)/3.9752318048188617

0.31969227412577866

The test MAE is 32% higher than the training MAE, indicating a slight loss of accuracy when generalizing. However, since the error is still lower than the natural variability of the data, there is no evidence of overfitting. The model  useful and reliable, as the error is still moderate compared to the natural variability of number transactions

In [38]:
(5.24-3.97)/3.97

0.31989924433249367

In [39]:
y_predict_train_number_transactions = y_predict_train
y_predict_test_number_transactions = y_predict_test

# **Third chain**
---

The training and test sets of this third chain will use all variables, but they concatenate the predictions from the previous models, in order to predict Total Payments.

In [40]:
X_train = data_train.drop(['Monthly_Expenses', 'Number_of_Transactions', 'Total_Payments'], axis=1)
X_test = data_test.drop(['Monthly_Expenses', 'Number_of_Transactions', 'Total_Payments'], axis=1)

y_train = data_train['Total_Payments']
y_test = data_test['Total_Payments']

In [41]:
X_train['Monthly_Expenses'] = y_predict_train_monthly_expenses
X_test['Monthly_Expenses'] =  y_predict_test_monthly_expenses

X_train['Number_of_Transactions'] = y_predict_train_number_transactions
X_test['Number_of_Transactions'] = y_predict_test_number_transactions

Search for the best parameters using the OOB score.

In [42]:
resultados_scores = model_optimization(X_train, y_train, RandomForestRegressor, dict_params)

In [43]:
resultados_scoresT = resultados_scores[
    resultados_scores['oob_score'] == 'mean_absolute_error'].sort_values(
        'metric', ascending=True)

The following are the best parameters

In [44]:
resultados_scoresT.iloc[0, 1:].to_dict()

{'criterion': 'friedman_mse',
 'max_depth': 18.0,
 'max_features': 0.75,
 'n_estimators': 50,
 'n_jobs': -1,
 'oob_score': 'mean_absolute_error',
 'random_state': 12354}

In [45]:
best_params = {
  **resultados_scoresT.iloc[0, 1:].to_dict(),
  'max_depth': 18,
  'oob_score': mean_absolute_error}
best_params

{'criterion': 'friedman_mse',
 'max_depth': 18,
 'max_features': 0.75,
 'n_estimators': 50,
 'n_jobs': -1,
 'oob_score': <function sklearn.metrics._regression.mean_absolute_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average')>,
 'random_state': 12354}

Prediction using Out-Of-Fold cross-validation

In [46]:
predict_third, score = prediction_model(
    X_train,
    y_train,
    RandomForestRegressor,
    mean_absolute_error,
    **best_params)

In [47]:
np.mean(score), np.std(score)

(np.float64(304.6654730694516), np.float64(22.41784524295324))

The model shows an average MAE of approximately 304.66 total payments in cross-validation, with a standard deviation of 22.42 across folds, indicating consistent performance. Compared to the standard deviation of total payments (701.92), the error is relatively small, suggesting that the model captures relevant patterns in the variability of the target variable.

In [48]:
y_train.describe()

Unnamed: 0,Total_Payments
count,850.0
mean,1757.742965
std,701.919095
min,280.13
25%,1256.7575
50%,1663.835
75%,2180.2175
max,4401.38


Most total payments are centered around 1757.742965 ± 6701.919095.

In [49]:
mean_absolute_error(y_train, predict_third)

304.66547306945165

The model predicts payments with an average error of 304.66 total payments. This error corresponds to approximately 43% of the typical range of variation in payments and about 17% of the average payment (this 17% refers to the mean error across all predictions and does not imply that each individual prediction has a 17% error).

Overall, the model captures relevant underlying patterns, delivers reasonably accurate estimates

**Model for deploymen**

In [50]:
model = RandomForestRegressor(**best_params)
model.fit(X_train, y_train)

In [51]:
y_predict_train = pd.DataFrame(model.predict(X_train),  index=X_train.index)
y_predict_test = pd.DataFrame(model.predict(X_test), index=X_test.index)

In [52]:
mean_absolute_error(y_train, y_predict_train)

112.86999387437787

In [53]:
mean_absolute_error(y_test, y_predict_test)

397.09331997373744

The model is clearly overfitted: it has a very low training error (MAE 112), but the test error increases drastically (MAE 397), indicating poor generalization. Although the test error is still lower than the natural dispersion of the data, the model has learned patterns specific to the training set and fails to predict new data accurately.

# **Overfitting resolution**
---

To mitigate overfitting, an XGBRegressor is trained. For this purpose, a hyperparameter search is conducted to determine an improved configuration, while ensuring the constraint of using all available features is respected.

In [54]:
dict_params = {
    "n_estimators": [200, 400, 600],
    "learning_rate": [0.01, 0.05, 0.1],

    "max_depth": [3, 4, 6],
    "min_child_weight": [1, 5, 10],

    "gamma": [0, 0.1, 0.3],

    "subsample": [0.6, 0.8],
    "colsample_bytree": [0.6, 0.8]
}

In [55]:
model = XGBRegressor(
    random_state = random_seed
)

In [56]:
grid = GridSearchCV(
    model,
    dict_params,
    scoring='neg_mean_absolute_error',
    cv=3,
    return_train_score=True,
    verbose=0
)

In [57]:
grid.fit(X_train, y_train)

In [58]:
resultados = pd.DataFrame(grid.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)')\
    .drop(columns = 'params')\
    .sort_values('mean_test_score', ascending = False).head()

Unnamed: 0,param_colsample_bytree,param_gamma,param_learning_rate,param_max_depth,param_min_child_weight,param_n_estimators,param_subsample,mean_test_score,std_test_score,mean_train_score,std_train_score
694,0.8,0.1,0.01,6,5,600,0.6,-306.70182,8.63153,-139.420848,3.429703
532,0.8,0.0,0.01,6,5,600,0.6,-306.70182,8.63153,-139.420848,3.429703
856,0.8,0.3,0.01,6,5,600,0.6,-306.70182,8.63153,-139.420848,3.429703
538,0.8,0.0,0.01,6,10,600,0.6,-307.116215,7.021856,-173.03335,3.17552
862,0.8,0.3,0.01,6,10,600,0.6,-307.116215,7.021856,-173.03335,3.17552


In [59]:
print("-----------------------------------")
print("Best hyperparameters found")
print("-----------------------------------")
print(f"{grid.best_params_} : {-1*grid.best_score_} ({grid.scoring})")

-----------------------------------
Best hyperparameters found
-----------------------------------
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 600, 'subsample': 0.6} : 306.7018203779853 (neg_mean_absolute_error)


The model is trained using the best hyperparameters, with the objective of minimizing the error.

In [60]:
model = grid.best_estimator_.fit(X_train, y_train)
model

In [61]:
y_predict_train = pd.DataFrame(model.predict(X_train))
y_predict_test = pd.DataFrame(model.predict(X_test))

In [62]:
mean_absolute_error(y_train, y_predict_train)

154.98321600988052

In [63]:
mean_absolute_error(y_test, y_predict_test)

391.0948281005859

The new model does not reduce overfitting, as the test error is significantly higher than the training error.

A variation of ensemble voting is implemented.


In [64]:
kf = KFold(n_splits=10, shuffle=True, random_state=random_seed)
score_prediction_train = []
score_prediction_val = []
models_last = []

for train_id, val_id in kf.split(X_train):
  X_train_fold, X_val_fold = X_train.iloc[train_id], X_train.iloc[val_id]
  y_train_fold, y_val_fold = y_train.iloc[train_id], y_train.iloc[val_id]
  model = XGBRegressor(
        max_depth=4,
        learning_rate=0.005,
        n_estimators=800,
        subsample=0.5,
        colsample_bytree=0.5,
        reg_alpha=5,
        reg_lambda=5
    )

    # Entrenar con early stopping
  model.fit(
      X_train_fold, y_train_fold,
      eval_set=[(X_val_fold, y_val_fold)],
      #early_stopping_rounds=50,
      verbose=False
  )

  y_pred_tra = model.predict(X_train_fold)
  y_pred_val = model.predict(X_val_fold)
  score_train = mean_absolute_error(y_train_fold, y_pred_tra)
  score_val = mean_absolute_error(y_val_fold, y_pred_val)
  models_last.append(model)
  score_prediction_train.append(score_train)
  score_prediction_val.append(score_val)

In [65]:
test_train = np.mean([m.predict(X_train) for m in models_last], axis=0)

A variant of the voting strategy is then applied, which consists of instantiating multiple manually tuned XGBRegressor models, with variations in the training samples and feature subsets used, in order to average the predictions.

In [66]:
mean_absolute_error(y_train, test_train)

278.4154617934283

In [67]:
test_preds = np.mean([m.predict(X_test) for m in models_last], axis=0)

In [68]:
mean_absolute_error(y_test, test_preds)

385.2805902018229

The test error is 38%. The model is useful and reliable, as the error remains moderate relative to the natural variability of total payments.

# **Insights / Conclusions**
---

In conclusion, the chained regression approach allows for a structured modeling of the relationship between average payments, the number of transactions, and total payments. The model responsible for predicting the number of transactions shows a significantly better performance in terms of mean absolute error, indicating a stable relationship among the intermediate variables.

Although the models predicting average payments and total payments exhibit higher errors, these results are consistent with the continuous nature and high variability of these variables. Additionally, an error propagation effect along the chain is observed, which is inherent to this type of architecture.

Nevertheless, the overall performance of the system suggests that using predicted intermediate variables is appropriate and provides relevant information for estimating total payments, thereby validating the usefulness of the proposed approach.

# **Info**
---

**@By**: Steven Bernal

**@Nickname**: Kaiziferr

**@Git**: https://github.com/Kaiziferr