# Continuação do projeto de um modelo de regressão para determinação do crédito de clientes bancários

Nesta etapa, será realizado o treinamento de diferentes modelos, o tunning de hiperparâmetros e, por fim, será treinado um modelo de redes neurais para comparação com os modelos padrões. As métricas avaliadas serão o r², MSE e RMSE.

In [33]:
# Trabalho com o dataset, visualização e métricas de avaliação
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Modelos de regressão
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# Hiperparametização
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('df_regressao.csv')

In [3]:
df.head()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,...,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,cluster_gmm_0,cluster_gmm_1,cluster_gmm_2,cluster_gmm_3
0,-0.819705,0.818182,-0.68748,-0.597929,-0.440112,-0.559575,0.166667,0.0,0.083333,0.0,...,-0.607648,1000.0,-0.829268,-0.655746,0.0,12,0,1,0,0
1,1.185767,1.0,0.260167,0.813451,-0.695456,-0.559575,1.0,1.0,0.0,0.0,...,0.080223,7500.0,-0.400762,0.433366,0.0,12,0,1,0,0
2,-0.184937,1.0,-0.798496,-0.568721,-0.695456,-0.559575,0.083333,0.083333,0.0,0.0,...,-0.676435,1200.0,-0.343391,-0.420672,0.0,12,1,0,0,0
3,0.625762,1.0,1.043304,-0.597929,2.873147,-0.559575,0.666667,0.0,0.583333,0.0,...,-0.194926,1800.0,0.392485,4.407691,0.0,12,0,0,0,1
4,0.637058,1.0,-0.210979,-0.597929,0.472059,-0.559575,1.0,0.0,1.0,0.0,...,0.080223,2300.0,-0.342646,0.220688,0.0,12,0,1,0,0


# Separação dos dados e funções para métricas

In [4]:
X = df.drop('CREDIT_LIMIT', axis=1)
y = df['CREDIT_LIMIT']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
def avaliar_modelo(nome_modelo, y_teste, previsao):
    r2 = r2_score(y_teste, previsao)
    MSE = mean_absolute_error(y_teste, previsao)
    RSME = mean_squared_error(y_teste, previsao)
    return f'Modelo: {nome_modelo}\nR2: {r2:.2f}\nMSE: {MSE:.2f}\nRSME: {RSME:.2f}\n'

# Treinamento de modelos

In [12]:
modelo_rf = RandomForestRegressor()
modelo_lr = LinearRegression()
modelo_et = ExtraTreesRegressor()
modelo_sg = SGDRegressor()
modelo_grad = GradientBoostingRegressor()
modelo_hub = HuberRegressor()
modelo_ada = AdaBoostRegressor()
modelo_svr = SVR()

modelos = {'RandomForest': modelo_rf,
          'LinearRegression': modelo_lr,
          'ExtraTrees': modelo_et,
          'SGDRegressor': modelo_sg,
          'GradientBoostingRegressor': modelo_grad,
          'HuberRegressor': modelo_hub,
          'AdaBoostRegressor': modelo_ada,
          'SVR': modelo_svr}

for nome_modelo, modelo in modelos.items():
    # treinar
    modelo.fit(X_train, y_train)
    #testar
    previsao = modelo.predict(X_test)
    print(avaliar_modelo(nome_modelo, y_test, previsao))

Modelo: RandomForest
R2: 0.31
MSE: 1678.68
RSME: 5707194.55

Modelo: LinearRegression
R2: 0.27
MSE: 1790.77
RSME: 6017553.86

Modelo: ExtraTrees
R2: 0.30
MSE: 1675.22
RSME: 5744797.93

Modelo: SGDRegressor
R2: 0.26
MSE: 1721.91
RSME: 6056324.19

Modelo: GradientBoostingRegressor
R2: 0.34
MSE: 1663.36
RSME: 5468472.04

Modelo: HuberRegressor
R2: 0.24
MSE: 1681.21
RSME: 6245921.24



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Modelo: AdaBoostRegressor
R2: -0.18
MSE: 2682.15
RSME: 9735376.52

Modelo: SVR
R2: -0.04
MSE: 2058.37
RSME: 8595607.42



# Escolhendo o modelo para hiperparametização  

Nenhum modelo performou muito bem, mas será realizada a hiperparametização de dois modelos, afim de verificar se os modelos performam de forma melhor.  
Os modelos escolhidos foram o de Gradient Boosting e Random Forest

## Gradient Boosting

In [18]:
param_grid={'n_estimators':[100, 500, 600, 800, 1000], 
            'learning_rate': [0.1, 0.05, 0.02, 0.01, 0.005, 0.001],
            'max_depth':[1, 3, 5, 8, 10, 15], 
            'min_samples_leaf':[5, 7, 10, 13, 15, 20]}

clf = GradientBoostingRegressor()

grid_search = GridSearchCV(clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits


GridSearchCV(cv=3, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.05, 0.02, 0.01, 0.005, 0.001],
                         'max_depth': [1, 3, 5, 8, 10, 15],
                         'min_samples_leaf': [5, 7, 10, 13, 15, 20],
                         'n_estimators': [50, 100, 500]},
             verbose=2)

In [27]:
best_gbr = grid_search.best_params_
best_gbr

{'learning_rate': 0.01,
 'max_depth': 5,
 'min_samples_leaf': 13,
 'n_estimators': 500}

In [28]:
clf_gbr = grid_search.best_estimator_

In [29]:
previsao = clf_gbr.predict(X_test)
print(avaliar_modelo('CLF Best Gradient Bossting', y_test, previsao))

Modelo: CLF Best Gradient Bossting
R2: 0.34
MSE: 1640.43
RSME: 5413400.84



## Random Forest

In [7]:
params_grid = {
    'n_estimators':[50, 250, 500],
    'max_depth':[None, 2, 4, 5, 10],
    'min_samples_split':[2, 3, 5, 10],
    'max_features':['auto'],
    'bootstrap':[True, False]
}

clf = RandomForestRegressor()

grid_search_rf = GridSearchCV(clf, param_grid=params_grid, cv=3, n_jobs=-1, verbose=2)

grid_search_rf.fit(X_train, y_train)

Fitting 3 folds for each of 120 candidates, totalling 360 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [None, 2, 4, 5, 10],
                         'max_features': ['auto'],
                         'min_samples_split': [2, 3, 5, 10],
                         'n_estimators': [50, 250, 500]},
             verbose=2)

In [9]:
best_rfr = grid_search_rf.best_params_
best_rfr

{'bootstrap': True,
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_split': 10,
 'n_estimators': 500}

In [11]:
clf_rfr = grid_search_rf.best_estimator_

In [12]:
previsao = clf_rfr.predict(X_test)
print(avaliar_modelo('CLF Best Random Forest', y_test, previsao))

Modelo: CLF Best Random Forest
R2: 0.33
MSE: 1649.41
RSME: 5517897.45



## Redes Neurais

### 1 camada, 3 neurônios

In [27]:
n_layers = 1
n_elements_layer = 3 
layer_size = (n_elements_layer, )
for i in range(0,n_layers-1):
    layer_size += (n_elements_layer, )

model = MLPRegressor(random_state = 42, max_iter = 100000, 
                     hidden_layer_sizes = layer_size, 
                     learning_rate = 'adaptive', activation = 'relu')
reg = model.fit(X_train, y_train)

previsao = reg.predict(X_test)

print(avaliar_modelo('Rede neural MLP', y_test, previsao))

Modelo: Rede neural MLP
R2: 0.27
MSE: 1795.53
RSME: 6023149.65



### 3 camadas, 3 neurônios

In [28]:
n_layers = 3
n_elements_layer = 3
layer_size = (n_elements_layer, )
for i in range(0,n_layers-1):
    layer_size += (n_elements_layer, )

model = MLPRegressor(random_state = 42, max_iter = 100000, 
                     hidden_layer_sizes = layer_size, 
                     learning_rate = 'adaptive', activation = 'relu')
reg = model.fit(X_train, y_train)

previsao = reg.predict(X_test)

print(avaliar_modelo('Rede neural MLP', y_test, previsao))

Modelo: Rede neural MLP
R2: 0.29
MSE: 1721.51
RSME: 5810831.81



### 3 camadas, 20 neurônios

In [29]:
n_layers = 3
n_elements_layer = 20 
layer_size = (n_elements_layer, )
for i in range(0,n_layers-1):
    layer_size += (n_elements_layer, )

model = MLPRegressor(random_state = 42, max_iter = 100000, 
                     hidden_layer_sizes = layer_size, 
                     learning_rate = 'adaptive', activation = 'relu')
reg = model.fit(X_train, y_train)

previsao = reg.predict(X_test)

print(avaliar_modelo('Rede neural MLP', y_test, previsao))

Modelo: Rede neural MLP
R2: 0.34
MSE: 1631.11
RSME: 5439906.54



### 3 camadas, 50 neurônios

In [26]:
n_layers = 3
n_elements_layer = 50
layer_size = (n_elements_layer, )
for i in range(0,n_layers-1):
    layer_size += (n_elements_layer, )

model = MLPRegressor(random_state = 42, max_iter = 100000, 
                     hidden_layer_sizes = layer_size, 
                     learning_rate = 'adaptive', activation = 'relu')
reg = model.fit(X_train, y_train)

previsao = reg.predict(X_test)

print(avaliar_modelo('Rede neural MLP', y_test, previsao))

Modelo: Rede neural MLP
R2: 0.34
MSE: 1610.08
RSME: 5416870.63



### 5 camadas, 20 neurônios

In [30]:
n_layers = 5
n_elements_layer = 20 
layer_size = (n_elements_layer, )
for i in range(0,n_layers-1):
    layer_size += (n_elements_layer, )

model = MLPRegressor(random_state = 42, max_iter = 100000, 
                     hidden_layer_sizes = layer_size, 
                     learning_rate = 'adaptive', activation = 'relu')
reg = model.fit(X_train, y_train)

previsao = reg.predict(X_test)

print(avaliar_modelo('Rede neural MLP', y_test, previsao))

Modelo: Rede neural MLP
R2: 0.34
MSE: 1649.65
RSME: 5420358.53



### 5 camadas, 50 neurônios

In [31]:
n_layers = 5
n_elements_layer = 50
layer_size = (n_elements_layer, )
for i in range(0,n_layers-1):
    layer_size += (n_elements_layer, )

model = MLPRegressor(random_state = 42, max_iter = 100000, 
                     hidden_layer_sizes = layer_size, 
                     learning_rate = 'adaptive', activation = 'relu')
reg = model.fit(X_train, y_train)

previsao = reg.predict(X_test)

print(avaliar_modelo('Rede neural MLP', y_test, previsao))

Modelo: Rede neural MLP
R2: 0.34
MSE: 1655.15
RSME: 5395376.86



### 7 camadas, 20 neurônios

In [24]:
n_layers = 7
n_elements_layer = 20
layer_size = (n_elements_layer, )
for i in range(0,n_layers-1):
    layer_size += (n_elements_layer, )

model = MLPRegressor(random_state = 42, max_iter = 100000, 
                     hidden_layer_sizes = layer_size, 
                     learning_rate = 'adaptive', activation = 'relu')
reg = model.fit(X_train, y_train)

previsao = reg.predict(X_test)

print(avaliar_modelo('Rede neural MLP', y_test, previsao))

Modelo: Rede neural MLP
R2: 0.35
MSE: 1580.04
RSME: 5375366.88



### 7 camadas, 50 neurônios

In [25]:
n_layers = 7
n_elements_layer = 50
layer_size = (n_elements_layer, )
for i in range(0,n_layers-1):
    layer_size += (n_elements_layer, )

model = MLPRegressor(random_state = 42, max_iter = 100000, 
                     hidden_layer_sizes = layer_size, 
                     learning_rate = 'adaptive', activation = 'relu')
reg = model.fit(X_train, y_train)

previsao = reg.predict(X_test)

print(avaliar_modelo('Rede neural MLP', y_test, previsao))

Modelo: Rede neural MLP
R2: 0.35
MSE: 1616.12
RSME: 5361501.67



# Conclusões parciais

Nenhum modelo performou bem, mesmo com o tuning de hiperparâmetros ou com redes neurais.  

Uma alternativa a ser realizada é escalonar o target (CREDIT_LIMIT), repetindo os procedimentos.

In [34]:
df.head()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,...,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,cluster_gmm_0,cluster_gmm_1,cluster_gmm_2,cluster_gmm_3
0,-0.819705,0.818182,-0.68748,-0.597929,-0.440112,-0.559575,0.166667,0.0,0.083333,0.0,...,-0.607648,1000.0,-0.829268,-0.655746,0.0,12,0,1,0,0
1,1.185767,1.0,0.260167,0.813451,-0.695456,-0.559575,1.0,1.0,0.0,0.0,...,0.080223,7500.0,-0.400762,0.433366,0.0,12,0,1,0,0
2,-0.184937,1.0,-0.798496,-0.568721,-0.695456,-0.559575,0.083333,0.083333,0.0,0.0,...,-0.676435,1200.0,-0.343391,-0.420672,0.0,12,1,0,0,0
3,0.625762,1.0,1.043304,-0.597929,2.873147,-0.559575,0.666667,0.0,0.583333,0.0,...,-0.194926,1800.0,0.392485,4.407691,0.0,12,0,0,0,1
4,0.637058,1.0,-0.210979,-0.597929,0.472059,-0.559575,1.0,0.0,1.0,0.0,...,0.080223,2300.0,-0.342646,0.220688,0.0,12,0,1,0,0


In [36]:
df_norm = df.copy(deep=True)

In [37]:
scaler = StandardScaler()
variaveis_scalling = ['CREDIT_LIMIT']
df_norm[variaveis_scalling] = scaler.fit_transform(df_norm[variaveis_scalling])

In [39]:
df_norm['CREDIT_LIMIT'].head()

0   -0.916830
1    1.283455
2   -0.849129
3   -0.646025
4   -0.476773
Name: CREDIT_LIMIT, dtype: float64

In [40]:
X = df_norm.drop('CREDIT_LIMIT', axis=1)
y = df_norm['CREDIT_LIMIT']

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Treinamento de modelos com o target normalizado

In [42]:
modelo_rf = RandomForestRegressor()
modelo_lr = LinearRegression()
modelo_et = ExtraTreesRegressor()
modelo_sg = SGDRegressor()
modelo_grad = GradientBoostingRegressor()
modelo_hub = HuberRegressor()
modelo_ada = AdaBoostRegressor()
modelo_svr = SVR()

modelos = {'RandomForest': modelo_rf,
          'LinearRegression': modelo_lr,
          'ExtraTrees': modelo_et,
          'SGDRegressor': modelo_sg,
          'GradientBoostingRegressor': modelo_grad,
          'HuberRegressor': modelo_hub,
          'AdaBoostRegressor': modelo_ada,
          'SVR': modelo_svr}

for nome_modelo, modelo in modelos.items():
    # treinar
    modelo.fit(X_train, y_train)
    #testar
    previsao = modelo.predict(X_test)
    print(avaliar_modelo(nome_modelo, y_test, previsao))

Modelo: RandomForest
R2: 0.31
MSE: 0.56
RSME: 0.65

Modelo: LinearRegression
R2: 0.27
MSE: 0.61
RSME: 0.69

Modelo: ExtraTrees
R2: 0.30
MSE: 0.57
RSME: 0.66

Modelo: SGDRegressor
R2: 0.27
MSE: 0.61
RSME: 0.69

Modelo: GradientBoostingRegressor
R2: 0.34
MSE: 0.56
RSME: 0.63

Modelo: HuberRegressor
R2: 0.24
MSE: 0.57
RSME: 0.71



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Modelo: AdaBoostRegressor
R2: -0.23
MSE: 0.93
RSME: 1.16

Modelo: SVR
R2: 0.25
MSE: 0.54
RSME: 0.71



## Tuning do modelo de Gradient Boosting

In [44]:
param_grid={'n_estimators':[100, 500, 800, 1000], 
            'learning_rate': [0.1, 0.05, 0.01],
            'max_depth':[3, 5, 10, 15], 
            'min_samples_leaf':[7, 10, 13, 15]}

clf = GradientBoostingRegressor()

grid_search = GridSearchCV(clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 192 candidates, totalling 576 fits


GridSearchCV(cv=3, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.05, 0.01],
                         'max_depth': [3, 5, 10, 15],
                         'min_samples_leaf': [7, 10, 13, 15],
                         'n_estimators': [100, 500, 800, 1000]},
             verbose=2)

In [45]:
best_gbr = grid_search.best_params_
best_gbr

{'learning_rate': 0.01,
 'max_depth': 5,
 'min_samples_leaf': 13,
 'n_estimators': 800}

In [46]:
clf_gbr = grid_search.best_estimator_

In [47]:
previsao = clf_gbr.predict(X_test)
print(avaliar_modelo('CLF Best Gradient Bossting', y_test, previsao))

Modelo: CLF Best Gradient Bossting
R2: 0.34
MSE: 0.55
RSME: 0.62



## Redes Neurais

Foram escolhidos os parâmetros que performaram melhor para os dados não escalonados. Que são: 3 camadas e 50 neurônios, 5 camadas e 20 neurônios, 7 camadas e 20 neurônios

### 3 camadas e 50 neurônios

In [48]:
n_layers = 3
n_elements_layer = 50
layer_size = (n_elements_layer, )
for i in range(0,n_layers-1):
    layer_size += (n_elements_layer, )

model = MLPRegressor(random_state = 42, max_iter = 100000, 
                     hidden_layer_sizes = layer_size, 
                     learning_rate = 'adaptive', activation = 'relu')
reg = model.fit(X_train, y_train)

previsao = reg.predict(X_test)

print(avaliar_modelo('Rede neural MLP', y_test, previsao))

Modelo: Rede neural MLP
R2: 0.22
MSE: 0.61
RSME: 0.73



### 5 camadas e 20 neurônios

In [49]:
n_layers = 5
n_elements_layer = 20
layer_size = (n_elements_layer, )
for i in range(0,n_layers-1):
    layer_size += (n_elements_layer, )

model = MLPRegressor(random_state = 42, max_iter = 100000, 
                     hidden_layer_sizes = layer_size, 
                     learning_rate = 'adaptive', activation = 'relu')
reg = model.fit(X_train, y_train)

previsao = reg.predict(X_test)

print(avaliar_modelo('Rede neural MLP', y_test, previsao))

Modelo: Rede neural MLP
R2: 0.35
MSE: 0.54
RSME: 0.62



### 7 camadas e 20 neurônios

In [50]:
n_layers = 7
n_elements_layer = 20
layer_size = (n_elements_layer, )
for i in range(0,n_layers-1):
    layer_size += (n_elements_layer, )

model = MLPRegressor(random_state = 42, max_iter = 100000, 
                     hidden_layer_sizes = layer_size, 
                     learning_rate = 'adaptive', activation = 'relu')
reg = model.fit(X_train, y_train)

previsao = reg.predict(X_test)

print(avaliar_modelo('Rede neural MLP', y_test, previsao))

Modelo: Rede neural MLP
R2: 0.34
MSE: 0.55
RSME: 0.62



# Conclusão

Mesmo após a normalização dos dados de crédito, o resultado ainda foi ruim, com um baixo r². Entre os modelos testados, a hiperparametização do modelo de Gradient Boosting resultou em um modelo com um desempenho semelhante a um modelo de redes neurais com 5 camadas e 20 neurônios por camada.