In [32]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import pandas as pd

In [None]:
#https://www.kaggle.com/datasets/teertha/ushealthinsurancedataset?resource=download
df = pd.read_csv('data/insurance.csv')

In [5]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Tratamento

In [6]:
df['smoker'] = df['smoker'].astype('category')
df['region'] = df['region'].astype('category')
df['sex'] = df['sex'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       1338 non-null   int64   
 1   sex       1338 non-null   category
 2   bmi       1338 non-null   float64 
 3   children  1338 non-null   int64   
 4   smoker    1338 non-null   category
 5   region    1338 non-null   category
 6   charges   1338 non-null   float64 
dtypes: category(3), float64(2), int64(2)
memory usage: 46.3 KB


In [10]:
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,False,True,False,False,True
1,18,33.77,1,1725.5523,True,False,False,True,False
2,28,33.0,3,4449.462,True,False,False,True,False
3,33,22.705,0,21984.47061,True,False,True,False,False
4,32,28.88,0,3866.8552,True,False,True,False,False


In [11]:
X = df.drop(['charges'], axis=1)
y = df['charges']

In [13]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

## Testa modelos

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
models = {
    'LinearRegression': LinearRegression(),
    'ElasticNet': ElasticNet(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
}

In [18]:
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = (mse, r2)

In [19]:
for name, scores in results.items():
    print(f'{name}> mse: {scores[0]} > r2: {scores[1]}')

LinearRegression> mse: 33596915.85136147 > r2: 0.7835929767120723
ElasticNet> mse: 49129161.80917044 > r2: 0.6835454864133704
Ridge> mse: 33604956.54337763 > r2: 0.7835411843918458
Lasso> mse: 33601151.98665181 > r2: 0.7835656905935344


Modelos muito parecidos, será utilizado o LinearRegression

In [None]:
models_grid = {
    'ElasticNet': ElasticNet(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
}

params_grid = {
    'ElasticNet': {'alpha': [0.1, 1.0, 10, 30, 100], 'l1_ratio': [0.1, 0.2, 0.5, 0.7, 1]},
    'Ridge': {'alpha': [0.1, 1, 10, 100]},
    'Lasso': {'alpha': [0.1, 1, 10, 100]},
}

In [28]:
best_models = {}

for name, model in models_grid.items():
    grid_search = GridSearchCV(model, param_grid=params_grid[name], cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    best_models[name] = grid_search.best_estimator_
    print(f'{name}: {grid_search.best_estimator_}')

ElasticNet: ElasticNet(alpha=100, l1_ratio=1)
Ridge: Ridge(alpha=10)
Lasso: Lasso(alpha=100)


Modelo Final

In [29]:
model = LinearRegression()
model.fit(X_train, y_train)