In [177]:
# Data Wrangling

import numpy as np
import pandas as pd

# Seleção e validação de modelos
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Pré-processamento para incluir no Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Pipilines para tratar dados futuros
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# Modelos
from sklearn.linear_model import  LinearRegression

# Manter saída do processamento do sklearn como DataFrame pandas
from sklearn import set_config
set_config(transform_output='pandas')

In [178]:
df = pd.read_csv('insurance.csv')
df.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523


In [179]:
# DF_futuro
df_futuro = pd.DataFrame({
    'age' : [37, 40, 58, 21, 43],
    'sex' : ['male', 'female', 'male', 'female', 'female'],
    'bmi' : [46.53, 32.395, 28.595, 21.89, 24.7],
    'children' : [3, 1, 0, 2, 2],
    'smoker' : ['no', 'no', 'no', 'no', 'yes'],
    'region' : ['southeast', 'northeast', 'northwest', 'souteast', 'north']
})

df_futuro.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,37,male,46.53,3,no,southeast
1,40,female,32.395,1,no,northeast


In [180]:
# Base de treino
X = df.drop(columns='charges')
y = df['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2023)


In [181]:
# Dummies DF
X_train_transformed = pd.get_dummies(X_train)
X_train_transformed.head(2)

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
632,29,35.53,0,True,False,True,False,False,False,True,False
895,61,44.0,0,True,False,True,False,False,False,False,True


In [182]:
# LinearRegression
lr = LinearRegression()
lr.fit(X_train_transformed, y_train)

# Predict Dammies DF
lr.predict(X_train_transformed)

array([ 6297.81640554, 17604.53144584,  8564.16097047, ...,
        8171.64062054, 11728.69287826,  1928.89483764])

In [183]:
# Dummies futuro
df_futuro_transformed = pd.get_dummies(df_futuro)

#Erro lr.predict(df_futuro_transformed)  # Percebe que deu erro na região, pois o north do df_futuro não está no df de origem

In [184]:
# OneHot Encoder
onehot = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
onehot.fit(X_train)

In [185]:
# Transform X_train do DF
onehot.transform(X_train).head(2)

Unnamed: 0,age_18,age_19,age_20,age_21,age_22,age_23,age_24,age_25,age_26,age_27,...,children_1,children_2,children_3,children_4,children_5,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [186]:
# Transform X_train do DF_futuro
onehot.transform(df_futuro).head(2)  # Perceba que ele fez a transformação, mas deixou o aviso de unknown da regiao



Unnamed: 0,age_18,age_19,age_20,age_21,age_22,age_23,age_24,age_25,age_26,age_27,...,children_1,children_2,children_3,children_4,children_5,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [187]:
# Transformando apenas com as colunas categoricas do DF
categorical_columns = ['sex', 'smoker', 'region']

onehot.fit(X_train[categorical_columns])
X_train_transformed = onehot.transform(X_train[categorical_columns])
X_train_transformed.head(2)

Unnamed: 0,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
632,0.0,0.0,0.0,0.0,1.0,0.0
895,0.0,0.0,0.0,0.0,0.0,1.0


In [188]:
# Join com as colunas não categoricas
X_train_transformed = X_train_transformed.join(X_train.drop(columns=categorical_columns))
X_train_transformed.head(2)

Unnamed: 0,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children
632,0.0,0.0,0.0,0.0,1.0,0.0,29,35.53,0
895,0.0,0.0,0.0,0.0,0.0,1.0,61,44.0,0


In [189]:
# Transformando apenas com as colunas categoricas do DF_futuro
categorical_columns = ['sex', 'smoker', 'region']

onehot.fit(X_train[categorical_columns])
df_futuro_transformed = onehot.transform(df_futuro[categorical_columns])
df_futuro_transformed  # Perceba que fez está com a region_north
df_futuro_transformed.head(2)



Unnamed: 0,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0


In [190]:
# Join com as colunas não categoricas do DF_futuro
df_futuro_transformed = df_futuro_transformed.join(df_futuro.drop(columns=categorical_columns))
df_futuro_transformed.head(2)

Unnamed: 0,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children
0,1.0,0.0,0.0,0.0,1.0,0.0,37,46.53,3
1,0.0,0.0,1.0,0.0,0.0,0.0,40,32.395,1


In [191]:
# LinearRegression com OneHot do DF
lr.fit(X_train_transformed, y_train)
lr.predict(X_train_transformed)

array([ 6297.81640554, 17604.53144584,  8564.16097047, ...,
        8171.64062054, 11728.69287826,  1928.89483764])

In [195]:
# LinearRegression com OnHot do DF_futuro
lr.predict(df_futuro_transformed)

array([13453.33806874,  9492.69052556, 12798.58335217,   903.40011556,
       31451.96072786])

In [201]:
# Criando pré-processador com ColumTransformer
onehot = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer([
    ('onehot', onehot, ['sex', 'smoker', 'region']),
    ('scale', StandardScaler(),['age', 'bmi', 'children'])
])

preprocessor.fit(X_train, y_train)

In [206]:
# View preprocessor X_train
preprocessor.transform(X_train).head(2)

Unnamed: 0,onehot__sex_male,onehot__smoker_yes,onehot__region_northeast,onehot__region_northwest,onehot__region_southeast,onehot__region_southwest,scale__age,scale__bmi,scale__children
632,0.0,0.0,0.0,0.0,1.0,0.0,-0.734484,0.783894,-0.89775
895,0.0,0.0,0.0,0.0,0.0,1.0,1.550704,2.160687,-0.89775


In [207]:
# View preprocessor df_futuro
preprocessor.transform(df_futuro).head(2)



Unnamed: 0,onehot__sex_male,onehot__smoker_yes,onehot__region_northeast,onehot__region_northwest,onehot__region_southeast,onehot__region_southwest,scale__age,scale__bmi,scale__children
0,1.0,0.0,0.0,0.0,1.0,0.0,-0.163187,2.571937,1.629218
1,0.0,0.0,1.0,0.0,0.0,0.0,0.051049,0.274302,-0.055427


In [209]:
# Criando modelo com Pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lr_model', LinearRegression())
])

model_pipeline.fit(X_train, y_train)

In [216]:
# Teste modelo, OK
model_pipeline.predict(df_futuro)



array([ 1.36320000e+04,  9.34400000e+03,  1.27680000e+04, -1.58142173e+17,
       -1.58142173e+17])

In [219]:
# Modelo completo considerando também a transformação de dados NaN
onehot = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)

numeric_processor = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
categorical_processor = make_pipeline(SimpleImputer(strategy='most_frequent'), onehot)

preprocessor = ColumnTransformer([
    ('onehot', categorical_processor, ['sex', 'smoker', 'region']),
    ('scale', numeric_processor, ['age', 'bmi', 'children'])
])

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lr_model', LinearRegression())
])

model_pipeline.fit(X_train, y_train)

In [221]:
# Colocando alguns NaNs no df_futuro
df_futuro = pd.DataFrame({
    'age' : [37, np.nan, 58, 21, 43],
    'sex' : ['male', 'female', 'male', 'female', 'female'],
    'bmi' : [46.53, 32.395, 28.595, 21.89, 24.7],
    'children' : [3, 1, 0, 2, 2],
    'smoker' : ['no', np.nan, 'no', 'no', 'yes'],
    'region' : ['southeast', 'northeast', 'northwest', np.nan, 'north']
})

# Utilizando modelo com NaNs
model_pipeline.predict(df_futuro)



array([ 1.36320000e+04,  9.15200000e+03,  1.27680000e+04,  5.12000000e+02,
       -1.58142173e+17])