In [69]:
# Importar as bibliotecas 

# EDA
import pandas as pd
import plotly.express as px
import numpy as np

# Machine Learning

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

### Carregar os Dados

In [70]:
# Carregar os dados já tratados
df_costs = pd.read_csv('./datasets/healthcosts_cleaned.csv')

In [71]:
# Mostrar as primeiras linhas
df_costs.head(10)

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,0,19,female,27.9,0,1,southwest,16884.924
1,1,18,male,33.77,1,0,southeast,1725.5523
2,2,28,male,33.0,3,0,southeast,4449.462
3,3,33,male,22.705,0,0,northwest,21984.47061
4,4,32,male,28.88,0,0,northwest,3866.8552
5,5,31,female,25.74,0,0,southeast,3756.6216
6,6,46,female,33.44,1,0,southeast,8240.5896
7,7,37,female,27.74,3,0,northwest,7281.5056
8,8,37,male,29.83,2,0,northeast,6406.4107
9,9,60,female,25.84,0,0,northwest,28923.13692


In [72]:
# Mostrar as últimas linhas
df_costs.tail(10)

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
1328,1328,23,female,24.225,2,0,northeast,22395.74424
1329,1329,52,male,38.6,2,0,southwest,10325.206
1330,1330,57,female,25.74,2,0,southeast,12629.1656
1331,1331,23,female,33.4,0,0,southwest,10795.93733
1332,1332,52,female,44.7,3,0,southwest,11411.685
1333,1333,50,male,30.97,3,0,northwest,10600.5483
1334,1334,18,female,31.92,0,0,northeast,2205.9808
1335,1335,18,female,36.85,0,0,southeast,1629.8335
1336,1336,21,female,25.8,0,0,southwest,2007.945
1337,1337,61,female,29.07,0,1,northwest,29141.3603


In [73]:
# Mostrar a estrutura do dataset
df_costs.info(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1338 non-null   int64  
 1   age              1338 non-null   int64  
 2   sex              1338 non-null   object 
 3   bmi              1338 non-null   float64
 4   children         1338 non-null   int64  
 5   smoker           1338 non-null   int64  
 6   region           1338 non-null   object 
 7   medical charges  1338 non-null   float64
dtypes: float64(2), int64(4), object(2)
memory usage: 83.8+ KB


### Preparação dos dados

In [74]:
# Preparar dados - separar X e y
X = df_costs.drop(columns=['medical charges'])
y = df_costs['medical charges']

In [75]:
# Carregar o preprocessor
import joblib
preprocessor = joblib.load('./preprocessor_dataset_healthcosts.pkl')

In [76]:
# Divir o dataset entre treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=51)

In [77]:
# Aplicar preprocessor
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [78]:
# Mostrar os conjuntos
print(f'Dados de Treinamento: {X_train.shape}')
print(f'Dados de Teste: {X_test.shape}')


Dados de Treinamento: (267, 10)
Dados de Teste: (1071, 10)


### Treinamento do Modelo

In [79]:
# Criar o objeto de Voting Regressor
lr_model = LinearRegression()
elastic_model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=51)
tree_model = DecisionTreeRegressor(random_state=51)

voting_model = VotingRegressor(
    estimators=[
        ('linear regression', lr_model),
        ('elastic', elastic_model),
        ('decision tree', tree_model)
    ]
)

In [80]:
# Treinar o modelo
voting_model.fit(X_train, y_train)

0,1,2
,estimators,"[('linear regression', ...), ('elastic', ...), ...]"
,weights,
,n_jobs,
,verbose,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False

0,1,2
,alpha,1.0
,l1_ratio,0.5
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,51

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,51
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Análise dos Resultados

In [81]:
# Realizar predição 
y_pred = voting_model.predict(X_test)

In [82]:
# Mostrar predição
y_pred


array([ 9317.74707749, 36286.37522074,  9511.08236705, ...,
       12739.71557102,  8490.59806613, 22082.20032736], shape=(1071,))

In [83]:
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [84]:
# Mostrar o Erro e R2 do modelo
print(f'RMSE: {rmse}')
print(f'R2: {r2}')

RMSE: 5783.256063552278
R2: 0.7774140545637298


In [85]:

# Carregar as importâncias dos modelo

importances = []

for estimador in voting_model.estimators_:
        # Para modelo lineares, obtenha coef
    if hasattr(estimador, 'coef_'):
        importances.append(np.abs(estimador.coef_))
        # Para modelo basedos em árvores obtenha feature_importances
    elif hasattr(estimador, 'feature_importances_'):
        importances.append(estimador.feature_importances_)
    else:
        print(f'Não foi possível carregar a importância das variáves do modelo {type(estimador).__name__}')

In [86]:
# Calcular a média das importâncias
importancia_media = np.mean(importances, axis=0)

In [87]:
# Normalizar as importâncias
feature_importance = importancia_media / np.sum(importancia_media)

In [88]:
# Obter os nome das features
feature_names = preprocessor.get_feature_names_out()

In [89]:
# Criar um Dataframe com nomes e importância das features
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_names})

In [90]:
# Ordenar o Dataframe pela importância
importance_df = importance_df.sort_values(by='importance', ascending=True)

In [91]:
# Criar uma gráfico para mostrar importância
fig = px.bar(
    importance_df,
    x='importance',
    y='feature',
    title='Importância das features - Voting Regressor',
    orientation='h'
)
fig.show()

In [92]:
# Mostrar evidência do Hard Voting (Média aritmetica dos valores dos estimadores)

# Selecionar um registro para predição
X_sample = X_test[7].reshape(1, -1)

In [93]:
linear_pred = voting_model.named_estimators_['linear regression'].predict(X_sample)
elastic_pred = voting_model.named_estimators_['elastic'].predict(X_sample)
tree_pred = voting_model.named_estimators_['decision tree'].predict(X_sample)


In [94]:
# Predição final Voting 
voting_pred = voting_model.predict(X_sample)

In [95]:
# Media das predições dos estimadores
mean_pred = np.mean([linear_pred[0], elastic_pred[0], tree_pred[0]])

In [97]:
# Exibir os resultados
print(f'Predição da Regressão Linear {linear_pred[0]}')
print(f'Predição da ElasticNet {elastic_pred[0]}')
print(f'Predição da Árvore de Decisão {tree_pred[0]}')

print(f'Média das Predições {mean_pred}')
print(f'Predição final do Hard Voting {linear_pred[0]}')



Predição da Regressão Linear 12781.372554584777
Predição da ElasticNet 12495.399733492653
Predição da Árvore de Decisão 11356.6609
Média das Predições 12211.144396025811
Predição final do Hard Voting 12781.372554584777
