In [3]:
from PipelineClass import PipelineClass

# Data wrangling
import numpy as np
import pandas as pd

# Model selection and validation
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Preprocessing steps to include in the pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Pipelines for handling future data
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# Models
from sklearn.linear_model import LinearRegression

# Keep the output of sklearn transformers as pandas DataFrames
from sklearn import set_config
set_config(transform_output="pandas")



In [4]:
# Insurance dataset (CSV) read directly from its GitHub URL; 'charges' is the
# column later used as the regression target.
URL_DATA = 'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv'
df = pd.read_csv(URL_DATA)
df


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [5]:
# df_futuro = pd.DataFrame ({
#     'age': [37, 40, 58,21,43],
#     'sex': ['male', 'female', 'male', 'female', 'female'],
#     'bmi': [46.53, 32.395, 28.595, 21.89, 24.7],
#     'children': [3, 1, 0, 2 ,2],
#     'smoker': ['no', 'no', 'no', 'no', 'yes'],
#     'region': ['southeast', 'northeast', 'northwest', 'southeast', 'north']
# })

In [6]:
# Hand-built "future" observations used to stress the preprocessing steps:
# several cells are deliberately missing (np.nan) and the last row carries a
# region label ('north') that is not among the regions seen in the training split.
df_futuro = pd.DataFrame(
    dict(
        age=[37, np.nan, 58, 21, 43],
        sex=['male', 'female', np.nan, 'female', 'female'],
        bmi=[46.53, 32.395, 28.595, 21.89, 24.7],
        children=[3, 1, 0, 2, 2],
        smoker=['no', np.nan, 'no', 'no', 'yes'],
        region=['southeast', 'northeast', 'northwest', np.nan, 'north'],
    )
)

df_futuro

Unnamed: 0,age,sex,bmi,children,smoker,region
0,37.0,male,46.53,3,no,southeast
1,,female,32.395,1,,northeast
2,58.0,,28.595,0,no,northwest
3,21.0,female,21.89,2,no,
4,43.0,female,24.7,2,yes,north


In [7]:
# Plain linear-regression estimator; it is fitted in the cells further down.
lr = LinearRegression()

In [8]:
# Separate the regression target (charges) from the predictor columns.
y = df['charges']
X = df.drop(columns='charges')

# Hold out a test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2023,
)
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
632,29,female,35.53,0,no,southeast
895,61,female,44.0,0,no,southwest
1173,38,male,29.26,2,no,northwest
1131,27,male,45.9,2,no,southwest
363,21,female,26.4,1,no,southwest


In [9]:
# One-hot encode the string columns with pandas; as the displayed result shows,
# age, bmi and children are carried over unchanged.
X_train_transformed = pd.get_dummies(X_train)
X_train_transformed

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
632,29,35.530,0,1,0,1,0,0,0,1,0
895,61,44.000,0,1,0,1,0,0,0,0,1
1173,38,29.260,2,0,1,1,0,0,1,0,0
1131,27,45.900,2,0,1,1,0,0,0,0,1
363,21,26.400,1,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
884,25,26.695,4,0,1,1,0,0,1,0,0
515,58,35.700,0,0,1,1,0,0,0,0,1
695,26,40.185,0,1,0,1,0,0,1,0,0
454,32,46.530,2,0,1,1,0,0,0,1,0


In [10]:
# Fit the regression on the get_dummies-encoded training features.
lr.fit(X_train_transformed, y_train)

In [11]:
# In-sample predictions: the same rows the model was just fitted on.
lr.predict(X_train_transformed)

array([ 6297.81640554, 17604.53144584,  8564.16097047, ...,
        8171.64062054, 11728.69287826,  1928.89483764])

In [12]:
# df_futuro_transformed = pd.get_dummies(df_futuro)
# lr.predict(df_futuro_transformed)

In [13]:
# Encoder settings: two-level features collapse to a single column, categories
# not seen during fit are ignored at transform time, and the output is dense.
onehot = OneHotEncoder(
    drop='if_binary',
    handle_unknown='ignore',
    sparse_output=False,
)
# Fitted on every column of X_train, so the numeric columns (age, bmi, children)
# are treated as categories as well.
onehot.fit(X_train)

In [14]:
# Encode the training frame; because the encoder was fitted on all columns,
# each distinct numeric value gets its own indicator column (age_18, age_19, ...).
onehot.transform(X_train)

Unnamed: 0,age_18,age_19,age_20,age_21,age_22,age_23,age_24,age_25,age_26,age_27,...,children_1,children_2,children_3,children_4,children_5,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
363,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
# Apply the encoder fitted on X_train to the hand-built frame, which contains
# NaNs and a region label the encoder has not seen.
onehot.transform(df_futuro)



Unnamed: 0,age_18,age_19,age_20,age_21,age_22,age_23,age_24,age_25,age_26,age_27,...,children_1,children_2,children_3,children_4,children_5,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [16]:
# String-valued columns; only these are sent through the one-hot encoder.
categorical_columns = ['sex', 'region', 'smoker']

# Fit the encoder on the training categoricals and encode them in one step
# (previously the name X_train_transformed briefly held the fitted encoder
# itself before being overwritten with the encoded data).
# The encoded result is a DataFrame because set_config(transform_output="pandas")
# is active, so .join can re-attach the untouched numeric columns by row index.
X_train_transformed = (
    onehot.fit_transform(X_train[categorical_columns])
    .join(X_train.drop(columns=categorical_columns))
)

In [17]:
# Encode the categorical columns of the future frame with the already-fitted
# encoder, then re-attach its numeric columns by row index. Missing numeric
# values (e.g. the NaN age) are carried through unchanged.
df_futuro_transformed = (
    onehot.transform(df_futuro[categorical_columns])
    .join(df_futuro.drop(columns=categorical_columns))
)
df_futuro_transformed



Unnamed: 0,sex_male,region_northeast,region_northwest,region_southeast,region_southwest,smoker_yes,age,bmi,children
0,1.0,0.0,0.0,1.0,0.0,0.0,37.0,46.53,3
1,0.0,1.0,0.0,0.0,0.0,0.0,,32.395,1
2,0.0,0.0,1.0,0.0,0.0,0.0,58.0,28.595,0
3,0.0,0.0,0.0,0.0,0.0,0.0,21.0,21.89,2
4,0.0,0.0,0.0,0.0,0.0,1.0,43.0,24.7,2


In [18]:
# Refit the same estimator, now on the OneHotEncoder-based training features.
lr.fit(X_train_transformed, y_train)

In [19]:
# In-sample predictions on the OneHotEncoder-based training features.
lr.predict(X_train_transformed)

array([ 6297.81640554, 17604.53144584,  8564.16097047, ...,
        8171.64062054, 11728.69287826,  1928.89483764])

In [26]:
# lr.predict(df_futuro_transformed)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [21]:
# Fresh, unfitted encoder to be used inside the pipeline.
# NOTE(review): drop='if_binary' keeps all four region indicator columns, and the
# recorded predictions for X_test are all whole numbers -- possibly a sign of
# collinearity with the intercept; worth checking the fitted coefficients.
onehot = OneHotEncoder(
    drop='if_binary',
    handle_unknown='ignore',
    sparse_output=False,
)

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_processor = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)
# Categorical branch: fill missing values with the most frequent label, then one-hot encode.
categorical_processor = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    onehot,
)

# Hand-written sklearn equivalent kept for reference; PipelineClass presumably
# assembles something similar (its transformer names differ: the preprocessor
# output columns are prefixed 'encoder__' / 'scaler__') -- verify against PipelineClass.
# preprocessor = ColumnTransformer([
#     ('onehot', categorical_processor, ['sex','smoker','region']),
#     ('scale', numeric_processor, ['age', 'bmi', 'children'])
# ])

# model_pipeline = Pipeline(steps=[
#     ("preprocessor", preprocessor),
#     ('lr_model', LinearRegression())
# ])

# Arguments are positional; presumably numeric columns, categorical columns,
# estimator, categorical processor, numeric processor -- TODO confirm against PipelineClass.
pipe = PipelineClass(
    ['age', 'bmi', 'children'],
    ['sex', 'smoker', 'region'],
    LinearRegression(),
    categorical_processor,
    numeric_processor,
)
model_pipeline = pipe.get_pipeline()

# The raw training frame is passed in; no manual encoding is done beforehand.
model_pipeline.fit(X_train, y_train)

In [22]:
# Predict charges for the held-out split, passing the raw (un-encoded) X_test.
model_pipeline.predict(X_test)

array([11672., 11448., 26776.,  7076.,  7736.,  5052.,  6996.,  2220.,
       11932., 33188., 12100., 33544., 40136., 32220.,  7256., 40128.,
        9956., 11148.,  4256.,  3288., 27336.,  2660.,  6036.,  2180.,
        2128.,  8940., 29224., 11500., 37408., 14004.,  8648.,    68.,
       26024.,  4644., 11140., 34204.,  9900., 30436., 11848.,  8328.,
        8680.,  8008.,  2140.,  7388., 31104., 12708.,  8780., 11012.,
        5748., 12768.,  4212., 25556., 37176., 12960., 10416., 26764.,
        5600., 13952.,  -252.,  9764.,  3440., 10184., 11452., 13984.,
        7232., 37600., 11308., 26752.,  7680., 12932., 18140.,  7764.,
       12848.,  2500., 25112., 28788.,  8656.,  2452., 16192.,  5040.,
        9360., 12480., 23884., 33940.,  9488., 13584.,  7016., 14800.,
       26496.,  4920., 14216.,  2960., 11804., 11640.,  7312.,  6776.,
        1564.,  4476., 17612.,  7572.,  4800.,  8192., 14712., 13460.,
       11236., 31412., 13432.,  4308.,  5100., 28028., 17884., 38168.,
      

In [23]:
# Take the preprocessor object from the PipelineClass wrapper and fit it on the
# training data by itself, so its transformed output can be inspected below.
preprocessor = pipe.get_preprocessor()
preprocessor.fit(X_train, y_train)

In [24]:
# Inspect the preprocessed training features (one-hot columns plus scaled numerics).
preprocessor.transform(X_train)

Unnamed: 0,encoder__sex_male,encoder__smoker_yes,encoder__region_northeast,encoder__region_northwest,encoder__region_southeast,encoder__region_southwest,scaler__age,scaler__bmi,scaler__children
632,0.0,0.0,0.0,0.0,1.0,0.0,-0.734484,0.783894,-0.897750
895,0.0,0.0,0.0,0.0,0.0,1.0,1.550704,2.160687,-0.897750
1173,1.0,0.0,0.0,1.0,0.0,0.0,-0.091775,-0.235291,0.786896
1131,1.0,0.0,0.0,0.0,0.0,1.0,-0.877309,2.469531,0.786896
363,0.0,0.0,0.0,0.0,0.0,1.0,-1.305781,-0.700182,-0.055427
...,...,...,...,...,...,...,...,...,...
884,1.0,0.0,0.0,1.0,0.0,0.0,-1.020133,-0.652230,2.471541
515,1.0,0.0,0.0,0.0,0.0,1.0,1.336468,0.811527,-0.897750
695,0.0,0.0,0.0,1.0,0.0,0.0,-0.948721,1.540561,-0.897750
454,1.0,0.0,0.0,0.0,1.0,0.0,-0.520248,2.571937,0.786896


In [25]:
# Run the hand-built frame (with NaNs and an unseen region) through the fitted
# preprocessor; the displayed result contains no missing values.
preprocessor.transform(df_futuro)



Unnamed: 0,encoder__sex_male,encoder__smoker_yes,encoder__region_northeast,encoder__region_northwest,encoder__region_southeast,encoder__region_southwest,scaler__age,scaler__bmi,scaler__children
0,1.0,0.0,0.0,0.0,1.0,0.0,-0.163187,2.571937,1.629218
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.274302,-0.055427
2,1.0,0.0,0.0,1.0,0.0,0.0,1.336468,-0.343386,-0.89775
3,0.0,0.0,0.0,0.0,1.0,0.0,-1.305781,-1.433279,0.786896
4,0.0,1.0,0.0,0.0,0.0,0.0,0.265286,-0.976516,0.786896
