In [165]:
import pandas as pd
import plotly.express as px
import numpy as np


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet, HuberRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

### Carregar os dados

In [166]:
# Carregar os dados ja tratados
df_costs = pd.read_csv('..\datasets\healthcost_cleaned.csv')


invalid escape sequence '\d'


invalid escape sequence '\d'


invalid escape sequence '\d'



In [167]:
df_costs.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552
5,31,female,25.74,0,0,southeast,3756.6216
6,46,female,33.44,1,0,southeast,8240.5896
7,37,female,27.74,3,0,northwest,7281.5056
8,37,male,29.83,2,0,northeast,6406.4107
9,60,female,25.84,0,0,northwest,28923.13692


In [168]:
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   int64  
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


# Preparação dos Dados

In [169]:
# Preparar os dados para o modelo
X = df_costs.drop(columns=['medical charges'])
y = df_costs['medical charges']

In [170]:
numeric_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features) # handle_unknown='ignore', no caso de o algoritmo considerar um dado desconhecido na hora de fazer o slip, quando não é encontrado nas duas porções.
])

In [171]:
preprocessor

In [172]:
# Dividir os dados entre treino e teste
X_train, X_test, y_train_, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [173]:
# Aplicar preprocessor em treinamento e test
# Treinamento = fit e transform
# Teste = tranform, considerando o treinamento que foi feito anteriormente
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [174]:
# Mostrar as dimensões dos conjuntos
print(f'Treinamento: {X_train.shape}\nTeste: {X_test.shape}')

Treinamento: (1070, 10)
Teste: (268, 10)


### Treinamento do modelo Stacking

In [175]:
# Criar o modelo de StackingRegressor
# Algoritmo base
lr_model = LinearRegression()
elastic_model = ElasticNet(alpha=1.0, l1_ratio=0., random_state=51)
tree_model = DecisionTreeRegressor(random_state=51)

# Meta-Modelo ou Meta-Learner
huber_model = HuberRegressor()

# Stacking Model
stacking_model = StackingRegressor(
    estimators=[
        ('linear regression', lr_model),
        ('elastc', elastic_model),
        ('decision tree', tree_model)
    ],
    final_estimator=huber_model,
    # passthrough=False (padrão) -> Usa apenas as predições dos estimadores
    # passthrough=True -> Usa as predições dos estimadores mais o conjunto de treinamento
    passthrough=True
)

In [176]:
# Treinar o modelo
stacking_model.fit(X_train, y_train_)


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.685e+10, tolerance: 1.494e+07 Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.




Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.716e+10, tolerance: 1.182e+07 Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.830e+10, tolerance: 1.232e+07 Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.767e+10, tolerance: 1.209e+07 Linear regression models with null

### Análise de Resultados

In [177]:
# Fazer as predições com base no modelo treinado
y_pred = stacking_model.predict(X_test)

In [178]:
# Mostrar as predições 
y_pred

array([ 7714.86094874, 43360.22269362, 20520.07233295, 23282.31000515,
       38524.54822763,  9702.5315971 ,  7856.75114845, 12182.28958738,
        5549.70107692,  9424.00185955,  8709.92986076, 11497.18369581,
        7318.89017813,  2455.59603691,  4553.1228242 , 12491.97598569,
        3093.51706767,  6938.61387257, 18351.64816483, 20120.83998499,
        4900.60225072,  6822.55849919, 53523.70836454, 10875.92272229,
        5626.51829488, 15005.69969858, 11475.74950226,  1576.11109667,
       30816.46290083, 19318.78666651,  1449.57463429, 23009.52786018,
        2480.40874417,  2806.5645809 ,  7053.36214053, 24963.03928055,
        7369.54783128,  1419.98375861, 11647.41753469,  7257.69070907,
       11210.99237409,  1234.02179056,  3677.06221198,  1463.21718408,
       12228.4855513 , 12743.41467307, 11567.75063504, 40214.34828015,
        8301.09275914, 12548.909254  ,  4748.5702539 , 37477.73639613,
        8823.13242561, 46787.99185837, 18813.67910174, 33236.61710648,
      

In [179]:
# Avaliar Métricas do modelo
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [180]:
# Mostrar o Error e R2 do modelo
print(f'RMSE: {rmse:.2f}\nR2 Score: {r2:.2f}')

RMSE: 6637.38
R2 Score: 0.75


In [181]:
# Calcular a imporância considerando os modelos do stacking regressor

importances = []

for estimador in stacking_model.estimators_:
    if hasattr(estimador, 'coef_'):
        # Uso em modelos Lineares
        importances.append(np.abs(estimador.coef_))
    elif hasattr(estimador, 'feature_importances_'):
        # Uso em modelos de árvore
        importances.append(estimador.feature_importances_)
    else:
        print(f'Não foi possivel carregar a importância das variáveis do modeo {estimador}')

In [182]:
# Calculando a média das importâncias
importancia_media = np.mean(importances, axis=0)

# Obter os nomes da features
features_name = preprocessor.get_feature_names_out()

In [183]:
# Normalizar as importâncias
feaeture_importance = importancia_media / np.sum(importancia_media)

In [184]:
# Criar um dataframe com as importâncias e os nomes das features
importance_df = pd.DataFrame({'feature': features_name, 'importance': feaeture_importance})

In [185]:
# Ordernar o DataFrame pela importância
importance_df = importance_df.sort_values(by='importance', ascending=True)

In [186]:
# Criar o gráfico de barras para apresentar a importância das features
fig = px.bar(importance_df,
             x='importance',
             y='feature',
             title='Importância das Features - Stacking Regressor',
             orientation='h')
fig.show()

### Propriedades do modelo

In [187]:
# Mostrar a ecidência do Stacking Regressor

# Selecionar um amostra para fazer a predição
X_sample = X_test[20].reshape(1, -1)

# Predições individuais dos estimadores
linear_pred = stacking_model.named_estimators_['linear regression'].predict(X_sample)
elastic_pred =stacking_model.named_estimators_['elastc'].predict(X_sample)
tree_pred = stacking_model.named_estimators_['decision tree'].predict(X_sample)

#Prefição final do Stacking Regressor
stacking_pred = stacking_model.predict(X_sample)
# Mostrar as Predições dos estimadores
print(f'PREDIÇÕES\nRegressão Linerar: {linear_pred[0]:.2f}\nElasticNet: {elastic_pred[0]:.2f}\nÁrvore de Decisão:{tree_pred[0]:.2f}')
print(f'PREDIÇÃO FINAL\nStacking Regressor: {stacking_pred[0]:.2f}')

PREDIÇÕES
Regressão Linerar: 10354.97
ElasticNet: 12010.30
Árvore de Decisão:3987.93
PREDIÇÃO FINAL
Stacking Regressor: 4900.60
