<a href="https://colab.research.google.com/github/LucasJFaust/bibliotecas_py/blob/main/Case_Seguros.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Case Previsão - Seguros



In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
seguro = pd.read_excel("insurance.xlsx")

In [3]:
seguro.head()

Unnamed: 0,idade,sexo,imc,quantidade_filhos,fumante,regiao,custos_seguro
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.56,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
seguro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1341 entries, 0 to 1340
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   idade              1341 non-null   int64  
 1   sexo               1338 non-null   object 
 2   imc                1341 non-null   float64
 3   quantidade_filhos  1341 non-null   int64  
 4   fumante            1341 non-null   object 
 5   regiao             1341 non-null   object 
 6   custos_seguro      1341 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.5+ KB


- Tratando os valores nulos

In [5]:
seguro.dropna(inplace=True)

In [6]:
seguro.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1338 entries, 0 to 1340
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   idade              1338 non-null   int64  
 1   sexo               1338 non-null   object 
 2   imc                1338 non-null   float64
 3   quantidade_filhos  1338 non-null   int64  
 4   fumante            1338 non-null   object 
 5   regiao             1338 non-null   object 
 6   custos_seguro      1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


- Tratando as variáveis categóricas

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
le = LabelEncoder()

le.fit(seguro.sexo)
seguro.sexo = le.transform(seguro.sexo)

le.fit(seguro.fumante)
seguro.fumante = le.transform(seguro.fumante)

le.fit(seguro.regiao)
seguro.regiao = le.transform(seguro.regiao)

In [9]:
seguro = pd.DataFrame(seguro)

- Agora vamos balancear/normalizar as variáveis transformadas

In [10]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

seguro_norm = pd.DataFrame(scaler.fit_transform(seguro), index=seguro.index, columns=seguro.columns)
seguro_norm.head()


Unnamed: 0,idade,sexo,imc,quantidade_filhos,fumante,regiao,custos_seguro
0,0.021739,0.0,0.321227,0.0,1.0,1.0,0.251611
1,0.0,1.0,0.47915,0.2,0.0,0.666667,0.009636
2,0.217391,1.0,0.4735,0.6,0.0,0.666667,0.053115
3,0.326087,1.0,0.181464,0.0,0.0,0.333333,0.33301
4,0.304348,1.0,0.347592,0.0,0.0,0.333333,0.043816


- Modelo OLS

In [12]:
import statsmodels.formula.api as smf
function = "custos_seguro~idade+sexo+imc+quantidade_filhos+fumante+regiao"
model = smf.ols(formula=function, data=seguro_norm).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          custos_seguro   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.750
Method:                 Least Squares   F-statistic:                     668.1
Date:                Mon, 05 Feb 2024   Prob (F-statistic):               0.00
Time:                        20:02:43   Log-Likelihood:                 1230.4
No. Observations:                1338   AIC:                            -2447.
Df Residuals:                    1331   BIC:                            -2410.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -0.0479      0.00

In [13]:
X = seguro_norm[["idade", "imc", "quantidade_filhos", "fumante", "regiao"]]

y = seguro_norm[["custos_seguro"]]

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

lr = LinearRegression()

lr.fit(X_train, y_train)

In [20]:
from sklearn import metrics
r_sq = lr.score(X,y)
print(r_sq)

0.7505629738411864


In [22]:
y_pred_train = lr.predict(X_train)

print("MAE:", metrics.mean_absolute_error(y_train, y_pred_train))

MAE: 0.06760649849946623


In [23]:
y_pred_test = lr.predict(X_test)

print("MAE:", metrics.mean_absolute_error(y_test, y_pred_test))

MAE: 0.06275502946015028


- Agora vamos analisar a performance com o modelos de ML

In [25]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

  rf.fit(X_train, y_train)


In [26]:
from sklearn import metrics
r_sq = rf.score(X,y)
print(r_sq)

y_pred_train = rf.predict(X_train)
print("MAE:", metrics.mean_absolute_error(y_train, y_pred_train))

y_pred_test = rf.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_test))

0.9518517408830581
MAE: 0.01635731215834568
MAE: 0.043301675391957235


In [27]:
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor()
ada.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [28]:
ada_pred_train = ada.predict(X_train)

r_sq = ada.score(X,y)
print(r_sq)

y_pred_train = ada.predict(X_train)
print("MAE:", metrics.mean_absolute_error(y_train, y_pred_train))

y_pred_test = ada.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_test))

0.8237278063467659
MAE: 0.06612408455375543
MAE: 0.0653858503773663


In [30]:
from sklearn.ensemble import GradientBoostingRegressor
grb = GradientBoostingRegressor()

grb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [31]:
r_sq = grb.score(X,y)
print(r_sq)

y_pred_train = grb.predict(X_train)
print("MAE:", metrics.mean_absolute_error(y_train, y_pred_train))

y_pred_test = grb.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_test))

0.8972853894243809
MAE: 0.03346236453828395
MAE: 0.03922575406413379


- Considerando que o erro ficou muito mais próximo que o Random, vamos tunar esse modelo

In [33]:
from sklearn.model_selection import GridSearchCV

parameters = {"bootstrap": [True],
              "max_depth": [5,10,20],
              "min_samples_leaf": [2,3,4],
              "min_samples_split": [2,3],
              "n_estimators": [50, 100, 200]}

grid_search = GridSearchCV(rf, parameters, scoring="r2", cv=2, n_jobs=-1)

In [34]:
grid_search.fit(X_train, y_train)

  self.best_estimator_.fit(X, y, **fit_params)


In [35]:
print(grid_search.best_estimator_)
print(grid_search.best_params_)

RandomForestRegressor(max_depth=5, min_samples_leaf=4, min_samples_split=3)
{'bootstrap': True, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 3, 'n_estimators': 100}


In [42]:
parameters = {
              "max_depth": [5],
              "min_samples_leaf": [4],
              "min_samples_split": [3],
              "n_estimators": [100]}

grid_search = GridSearchCV(grb, parameters, scoring="r2", cv=2, n_jobs=-1)

In [43]:
grid_search.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [44]:
best_model = grid_search.best_estimator_

In [45]:
best_model.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 4,
 'min_samples_split': 3,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [47]:
grb_tunned = GradientBoostingRegressor (alpha = 0.9,
 ccp_alpha =0.0,
 criterion = 'friedman_mse',
 init = None,
 learning_rate = 0.1,
 loss = 'squared_error',
 max_depth = 5,
 max_features = None,
 max_leaf_nodes = None,
 min_impurity_decrease = 0.0,
 min_samples_leaf = 4,
 min_samples_split = 3,
 min_weight_fraction_leaf = 0.0,
 n_estimators = 100,
 n_iter_no_change = None,
 random_state = None,
 subsample = 1.0,
 tol = 0.0001,
 validation_fraction = 0.1,
 verbose = 0,
 warm_start = False)

In [49]:
grb_tunned.fit(X_train, y_train)

rf_tunned_pred_train = grb_tunned.predict(X_train)
rf_tunned_pred_test = grb_tunned.predict(X_test)

  y = column_or_1d(y, warn=True)


In [51]:
r_sq = grb_tunned.score(X,y)
r_sq


0.9160594927822325

In [53]:
print("MAE:", metrics.mean_absolute_error(y_train, rf_tunned_pred_train))

MAE: 0.028222400707827323


In [54]:
print("MAE:", metrics.mean_absolute_error(y_test, rf_tunned_pred_test))

MAE: 0.04296056618957654
