In [1]:
import pandas as pd

DATA_PATH = 'insurance.xlsx'

df = pd.read_excel(DATA_PATH)
df.head(2)

Unnamed: 0,idade,sexo,imc,quantidade_filhos,fumante,regiao,custos_seguro
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523


In [2]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

df['sexo']    = encoder.fit_transform(df['sexo'])
df['regiao']  = encoder.fit_transform(df['regiao'])
df['fumante'] = encoder.fit_transform(df['fumante'])

In [3]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

df_norm = pd.DataFrame(scaler.fit_transform(df), index=df.index, columns=df.columns)
df_norm

Unnamed: 0,idade,sexo,imc,quantidade_filhos,fumante,regiao,custos_seguro
0,0.021739,0.0,0.321227,0.0,1.0,1.000000,0.264777
1,0.000000,0.5,0.479150,0.2,0.0,0.666667,0.027059
2,0.217391,0.5,0.473500,0.6,0.0,0.666667,0.069773
3,0.326087,0.5,0.181464,0.0,0.0,0.333333,0.344744
4,0.304348,0.5,0.347592,0.0,0.0,0.333333,0.060637
...,...,...,...,...,...,...,...
1336,0.695652,0.5,0.403820,0.6,0.0,0.333333,0.166230
1337,0.000000,0.0,0.429379,0.0,0.0,0.000000,0.034593
1338,0.000000,0.0,0.562012,0.0,0.0,0.666667,0.025558
1339,0.065217,0.0,0.264730,0.0,0.0,1.000000,0.031487


In [4]:
import statsmodels.api as sm

df_const = sm.add_constant(df_norm.drop(columns=['custos_seguro']))
sm_model = sm.OLS(df_norm['custos_seguro'], df_const, hasconst=True).fit()
print(sm_model.summary())

                            OLS Regression Results                            
Dep. Variable:          custos_seguro   R-squared:                       0.750
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     667.8
Date:                Tue, 02 Apr 2024   Prob (F-statistic):               0.00
Time:                        19:23:31   Log-Likelihood:                 1255.2
No. Observations:                1341   AIC:                            -2496.
Df Residuals:                    1334   BIC:                            -2460.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -0.0289      0.00

In [5]:
import statsmodels.formula.api as smf

func  = "custos_seguro~idade+sexo+imc+quantidade_filhos+fumante+regiao"
model = smf.ols(formula=func, data=df_norm).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          custos_seguro   R-squared:                       0.750
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     667.8
Date:                Tue, 02 Apr 2024   Prob (F-statistic):               0.00
Time:                        19:23:31   Log-Likelihood:                 1255.2
No. Observations:                1341   AIC:                            -2496.
Df Residuals:                    1334   BIC:                            -2460.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -0.0289      0.00

In [6]:
X = df_norm.drop(columns=['custos_seguro', 'sexo'])
y = df_norm['custos_seguro']

In [7]:
import numpy as np

features = np.asarray(X.columns)
features

array(['idade', 'imc', 'quantidade_filhos', 'fumante', 'regiao'],
      dtype=object)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

print(f'R² -> {lr.score(X, y)}\
      \nMAE_train -> {metrics.mean_absolute_error(y_train, lr.predict(X_train))}\
      \nMAE_test  -> {metrics.mean_absolute_error(y_test, lr.predict(X_test))}')

R² -> 0.7493409266155526      
MAE_train -> 0.06368967656240831      
MAE_test  -> 0.06707534198100804


In [10]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

print(f'R² -> {rf.score(X, y)}\
      \nMAE_train -> {metrics.mean_absolute_error(y_train, rf.predict(X_train))}\
      \nMAE_test  -> {metrics.mean_absolute_error(y_test, rf.predict(X_test))}')

R² -> 0.9472257740827179      
MAE_train -> 0.015920333747126498      
MAE_test  -> 0.04494192522379433


In [11]:
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor()
ada.fit(X_train, y_train)

print(f'R² -> {ada.score(X, y)}\
      \nMAE_train -> {metrics.mean_absolute_error(y_train, ada.predict(X_train))}\
      \nMAE_test  -> {metrics.mean_absolute_error(y_test, ada.predict(X_test))}')

R² -> 0.8317864127597348      
MAE_train -> 0.060568960615932955      
MAE_test  -> 0.062120677231235155


In [12]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)

print(f'R² -> {gbr.score(X, y)}\
      \nMAE_train -> {metrics.mean_absolute_error(y_train, gbr.predict(X_train))}\
      \nMAE_test  -> {metrics.mean_absolute_error(y_test, gbr.predict(X_test))}')

R² -> 0.8977511259093132      
MAE_train -> 0.031505186002482775      
MAE_test  -> 0.039869351735219015


In [13]:
from sklearn.model_selection import GridSearchCV

parameters = { "max_depth": [5],
              "min_samples_leaf": [4],
              "min_samples_split": [2],
              "n_estimators": [200]}

grid_search = GridSearchCV(gbr, parameters, scoring="r2", cv=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [14]:
print(grid_search.best_estimator_)
print(grid_search.best_params_)

GradientBoostingRegressor(max_depth=5, min_samples_leaf=4, n_estimators=200)
{'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [15]:
best_model = grid_search.best_estimator_
best_model.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [16]:
gbr_tunned = GradientBoostingRegressor(alpha=0.9,
ccp_alpha=0.0,
criterion='friedman_mse',
init=None,
learning_rate=0.1,
loss='squared_error',
max_depth=5,
max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_samples_leaf=4,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=200,
n_iter_no_change=None,
random_state=None,
subsample=1.0,
tol=0.0001,
validation_fraction=0.1,
verbose=0,
warm_start=False)
gbr_tunned.fit(X_train, y_train)

print(f'R² -> {gbr_tunned.score(X, y)}\
      \nMAE_train -> {metrics.mean_absolute_error(y_train, gbr_tunned.predict(X_train))}\
      \nMAE_test  -> {metrics.mean_absolute_error(y_test, gbr_tunned.predict(X_test))}')

R² -> 0.9295433455722835      
MAE_train -> 0.022761683279318195      
MAE_test  -> 0.0462353040295609
