In [14]:

# Utils
# ==============================================================================
import warnings

# Plot
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing
# ==============================================================================
import pandas as pd
import numpy as np

# Model
# ==============================================================================
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.multioutput import RegressorChain

from sklearn.ensemble import RandomForestRegressor


# Metrics
# ==============================================================================
from sklearn.metrics import mean_absolute_error




# **Info**
---

**@By**: Steven Bernal

**@Nickname**: Kaiziferr

**@Git**: https://github.com/Kaiziferr

# **Nota**
---

The interpretation and conclusions for each cell are presented immediately below it. ⚠️

# **Objectives**
---
Predict Total_Payments using all available features. To achieve this, it is necessary to first model Monthly_Expenses and subsequently Number_of_Transactions, under the assumption that these variables are correlated.



# **Data dictionary**
---


- **Customer_ID**: unique identifier of the customer.

- **Age**: customer’s age.

- **Monthly_Income**: how much the customer earns per month.

- **Monthly_Expenses**: average monthly spending.

- **Number_of_Transactions**: number of financial transactions during the month.

- **Average_Transaction_Amount**: average value of each transaction.

- **Late_Payment_History**: number of times the customer has been late on payments.

- **Current_Credit**: available credit balance.

- **Total_Payments** (target variable): total amount paid during the month.

# **Config**
---

In [15]:
random_seed = 12354
warnings.filterwarnings('ignore')
sns.set(style='darkgrid')
title_data = 'Financial Data'
paleta = sns.color_palette('tab10').as_hex()

# **Utils**
---

# **Data**

---



In [16]:
url='https://drive.google.com/file/d/1JfAhOxwo2a78MYYiQexK8nFQQ3dKFyKW/view?usp=sharing'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
data = pd.read_csv(url)

In [17]:
data.head()

Unnamed: 0,Customer_ID,Age,Monthly_Income,Monthly_Expenses,Number_of_Transactions,Average_Transaction_Amount,Late_Payment_History,Current_Credit,Total_Payments
0,C0001,56,1393.55,1112.29,13,88.59,1,1145.48,1147.74
1,C0002,69,3203.46,1550.97,30,52.02,4,746.5,1776.32
2,C0003,46,2243.65,1176.36,13,88.33,0,743.05,1202.06
3,C0004,32,1577.75,1042.95,12,84.25,1,400.52,1152.9
4,C0005,60,2353.43,1748.83,43,40.48,2,1892.39,1759.59


In [18]:
data = data.drop('Customer_ID', axis=1)

# **Data Split**
---


Within the dataset, the data are split into training, test, and validation sets, assuming that the latter does not come from the same distribution

In [19]:
data_train, data_test, _, _ = train_test_split(
    data,
    data.iloc[:,0],
    train_size=0.7,
    random_state=random_seed
)

In [20]:
data_test, data_validation, _, _ = train_test_split(
    data_test,
    data_test.iloc[:,0],
    train_size=0.65,
    random_state=random_seed
)

In [21]:
X_masiva_train = data_train.drop(['Age', 'Monthly_Income', 'Late_Payment_History', 'Current_Credit'], axis=1)
X_masiva_test = data_test.drop(['Age', 'Monthly_Income', 'Late_Payment_History', 'Current_Credit'], axis=1)
X_masiva_validation  = data_validation.drop(['Age', 'Monthly_Income', 'Late_Payment_History', 'Current_Credit'], axis=1)

y_masiva_train = data_train[['Monthly_Expenses', 'Number_of_Transactions', 'Total_Payments']]
y_masiva_test = data_test[['Monthly_Expenses', 'Number_of_Transactions', 'Total_Payments']]
y_masiva_validation = data_validation[['Monthly_Expenses', 'Number_of_Transactions', 'Total_Payments']]

A bagging model is employed to reduce the need for preprocessing.

In [22]:
model = RandomForestRegressor()

The RegressorChain method is applied to perform chained predictions. While it is useful, it requires the use of all variables when a reasonable level of correlation exists; Otherwise, if the variables are independent, it is not a good option.

In [23]:
wrapper = RegressorChain(model, cv=10, random_state=random_seed, order=[0,1,2])
wrapper.fit(X_masiva_train, y_masiva_train)

In [24]:
y_predict = wrapper.predict(X_masiva_train)
mean_absolute_error(y_masiva_train, y_predict)

1.2420017142857456

In [25]:
y_predict = wrapper.predict(X_masiva_test)
mean_absolute_error(y_masiva_test, y_predict)

2.7382147008547584

In [26]:
y_predict = wrapper.predict(X_masiva_validation)
mean_absolute_error(y_masiva_validation, y_predict)

3.50063936507935

# **Info**
---

**@By**: Steven Bernal

**@Nickname**: Kaiziferr

**@Git**: https://github.com/Kaiziferr