In [46]:
# Data Wrangling
import pandas as pd 
import numpy as np

# Model selection and validation
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Pré- Processamento e validação dos modelos
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Pipeline for processing future data
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# Modelos 
from sklearn.linear_model import LinearRegression

# Keep the output of scikit-learn transformers as pandas DataFrames
# (instead of NumPy arrays), so transformed data retains column names and index.
from sklearn import set_config
set_config(transform_output="pandas")



In [47]:
# Load the insurance dataset straight from its raw CSV on GitHub (requires
# network access). The `charges` column is used as the regression target below.
ulr_data = 'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv'
df = pd.read_csv(ulr_data)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [48]:
# Hand-built "future" records used to stress the preprocessing steps below.
# They contain missing values (age, smoker, region) and a region label,
# 'north', that is not among the region categories of the training data.
df_futuro = pd.DataFrame(
    data={
        'age': [37, np.nan, 58, 21, 43],
        'sex': ['male', 'female', 'male', 'female', 'female'],
        'bmi': [46.53, 32.395, 28.595, 21.89, 24.7],
        'children': [3, 1, 0, 2, 2],
        'smoker': ['no', np.nan, 'no', 'no', 'yes'],
        'region': ['southeast', 'northeast', 'northwest', np.nan, 'north'],
    }
)

df_futuro

Unnamed: 0,age,sex,bmi,children,smoker,region
0,37.0,male,46.53,3,no,southeast
1,,female,32.395,1,,northeast
2,58.0,male,28.595,0,no,northwest
3,21.0,female,21.89,2,no,
4,43.0,female,24.7,2,yes,north


In [49]:
# Unfitted linear regression estimator; it is trained further down.
lr = LinearRegression()

In [50]:
# Model selection: separate the target (`charges`) from the features and hold
# out a test set. The fixed random_state keeps the split reproducible.
y = df['charges']
X = df.drop(columns=['charges'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2023)
X_test.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
748,47,female,36.0,1,no,southwest
745,50,female,30.115,1,no,northwest
57,18,male,31.68,2,yes,southeast
546,28,male,35.435,0,no,northeast
279,51,female,21.56,1,no,southeast


In [51]:
# First attempt: one-hot encode the text columns with pandas. get_dummies
# derives the dummy columns from whatever categories appear in the frame it
# receives, so the resulting column set depends on the data passed in.
X_train_trasformed = pd.get_dummies(X_train)
X_train_trasformed

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
632,29,35.530,0,True,False,True,False,False,False,True,False
895,61,44.000,0,True,False,True,False,False,False,False,True
1173,38,29.260,2,False,True,True,False,False,True,False,False
1131,27,45.900,2,False,True,True,False,False,False,False,True
363,21,26.400,1,True,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
884,25,26.695,4,False,True,True,False,False,True,False,False
515,58,35.700,0,False,True,True,False,False,False,False,True
695,26,40.185,0,True,False,True,False,False,True,False,False
454,32,46.530,2,False,True,True,False,False,False,True,False


In [52]:
# Train the regression on the dummy-encoded training features.
lr.fit(X_train_trasformed, y_train)

In [53]:
# Sanity check: predict on the same rows the model was trained on
# (this is not a held-out evaluation).
lr.predict(X_train_trasformed)

array([ 6297.81640554, 17604.53144584,  8564.16097047, ...,
        8171.64062054, 11728.69287826,  1928.89483764])

In [54]:
# Encode the new records the same way. Because get_dummies only sees this
# frame's categories, it produces `region_north` and no `region_southwest`,
# i.e. a different column set from the training matrix.
df_futuro_transformed = pd.get_dummies(df_futuro)
df_futuro_transformed

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_north,region_northeast,region_northwest,region_southeast
0,37.0,46.53,3,False,True,True,False,False,False,False,True
1,,32.395,1,True,False,False,False,False,True,False,False
2,58.0,28.595,0,False,True,True,False,False,False,True,False
3,21.0,21.89,2,True,False,True,False,False,False,False,False
4,43.0,24.7,2,True,False,False,True,True,False,False,False


In [55]:
# Expected failure, kept as a demonstration: the columns produced by
# pd.get_dummies on df_futuro do not match the ones the model was fitted on
# (`region_north` is new, `region_southwest` is missing), so scikit-learn
# rejects the input. The error is caught and printed so that the notebook
# still runs top to bottom with "Restart Kernel -> Run All".
try:
    print(lr.predict(df_futuro_transformed))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- region_north
Feature names seen at fit time, yet now missing:
- region_southwest


In [56]:
# Show the training matrix again -- presumably to compare its columns with the
# ones reported in the feature-name mismatch error.
X_train_trasformed

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
632,29,35.530,0,True,False,True,False,False,False,True,False
895,61,44.000,0,True,False,True,False,False,False,False,True
1173,38,29.260,2,False,True,True,False,False,True,False,False
1131,27,45.900,2,False,True,True,False,False,False,False,True
363,21,26.400,1,True,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
884,25,26.695,4,False,True,True,False,False,True,False,False
515,58,35.700,0,False,True,True,False,False,False,False,True
695,26,40.185,0,True,False,True,False,False,True,False,False
454,32,46.530,2,False,True,True,False,False,False,True,False


In [57]:
# Second attempt: a fitted encoder remembers the categories seen during fit.
#   drop='if_binary'        -> keep a single column for two-category features
#   handle_unknown='ignore' -> categories unseen during fit encode as all zeros
#   sparse_output=False     -> dense output
# NOTE(review): fitting on the whole X_train also one-hot encodes the numeric
# columns (the transform output below shows age_18, age_19, ..., children_1, ...).
# This looks like an intentional intermediate step; the cell that fits only on
# the categorical columns corrects it.
onehot = OneHotEncoder(drop='if_binary', handle_unknown='ignore',sparse_output=False)
onehot.fit(X_train)

In [58]:
# Encode the new records with the encoder fitted on the training data: the
# output columns come from the training categories, not from df_futuro.
onehot.transform(df_futuro)



Unnamed: 0,age_18,age_19,age_20,age_21,age_22,age_23,age_24,age_25,age_26,age_27,...,children_1,children_2,children_3,children_4,children_5,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [59]:
# Restrict one-hot encoding to the text columns and keep the numeric ones as-is.
categorical_columns = ['sex', 'smoker', 'region']

# Fit the encoder on the training categories, encode them, then put the
# untouched numeric columns back next to the encoded ones (aligned on the index).
X_train_trasformed = (
    onehot.fit_transform(X_train[categorical_columns])
    .join(X_train.drop(columns=categorical_columns))
)
X_train_trasformed

Unnamed: 0,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children
632,0.0,0.0,0.0,0.0,1.0,0.0,29,35.530,0
895,0.0,0.0,0.0,0.0,0.0,1.0,61,44.000,0
1173,1.0,0.0,0.0,1.0,0.0,0.0,38,29.260,2
1131,1.0,0.0,0.0,0.0,0.0,1.0,27,45.900,2
363,0.0,0.0,0.0,0.0,0.0,1.0,21,26.400,1
...,...,...,...,...,...,...,...,...,...
884,1.0,0.0,0.0,1.0,0.0,0.0,25,26.695,4
515,1.0,0.0,0.0,0.0,0.0,1.0,58,35.700,0
695,0.0,0.0,0.0,1.0,0.0,0.0,26,40.185,0
454,1.0,0.0,0.0,0.0,1.0,0.0,32,46.530,2


In [60]:
# Encode the new records with the encoder already fitted on the training data
# (transform only, no refit) and re-attach their numeric columns. Unknown or
# missing categories come out as all-zero dummy columns.
df_futuro_transformed = (
    onehot.transform(df_futuro[categorical_columns])
    .join(df_futuro.drop(columns=categorical_columns))
)
df_futuro_transformed



Unnamed: 0,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children
0,1.0,0.0,0.0,0.0,1.0,0.0,37.0,46.53,3
1,0.0,0.0,1.0,0.0,0.0,0.0,,32.395,1
2,1.0,0.0,0.0,1.0,0.0,0.0,58.0,28.595,0
3,0.0,0.0,0.0,0.0,0.0,0.0,21.0,21.89,2
4,0.0,1.0,0.0,0.0,0.0,0.0,43.0,24.7,2


In [61]:
# Start from a fresh estimator and train it on the encoder-based training matrix.
lr = LinearRegression()
lr.fit(X_train_trasformed, y_train)

In [62]:
# Predict on the training rows again to check the refitted model.
lr.predict(X_train_trasformed)

array([ 6297.81640554, 17604.53144584,  8564.16097047, ...,
        8171.64062054, 11728.69287826,  1928.89483764])

In [64]:
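# Left disabled on purpose: df_futuro_transformed still contains a missing
# `age` value (NaN), which LinearRegression does not accept -- presumably why
# this call is commented out. Missing values are imputed by the pipeline below.
# lr.predict(df_futuro_transformed)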

In [66]:
# Full preprocessing + model pipeline, so raw data (including missing values
# and unseen categories) can be passed directly to fit/predict.
onehot = OneHotEncoder(
    drop='if_binary',
    handle_unknown='ignore',
    sparse_output=False,
)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categoric_processor = make_pipeline(SimpleImputer(strategy='most_frequent'), onehot)
# Numeric columns: fill gaps with the column mean, then standardise.
numeric_processor = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

# The identifier `prepocessor` (sic) is kept as-is because the same spelling is
# used as the pipeline step name below.
prepocessor = ColumnTransformer(
    transformers=[
        ('onehot', categoric_processor, ['sex', 'smoker', 'region']),
        ('scale', numeric_processor, ['age', 'bmi', 'children']),
    ]
)
model_pipeline = Pipeline(
    steps=[
        ('prepocessor', prepocessor),
        ("lr_model", LinearRegression()),
    ]
)

model_pipeline.fit(X_train, y_train)

In [68]:
# Predict charges for the held-out test rows; the pipeline applies the
# preprocessing learned from the training data before the regression.
model_pipeline.predict(X_test)

array([ 1.16163736e+04,  1.15012201e+04,  2.68177810e+04,  7.05905948e+03,
        7.78191815e+03,  5.01429872e+03,  7.04617329e+03,  2.22023941e+03,
        1.18240459e+04,  3.32418104e+04,  1.21250921e+04,  3.34656405e+04,
        4.01997674e+04,  3.21522892e+04,  7.25481272e+03,  4.01764884e+04,
        1.00259858e+04,  1.12013188e+04,  4.16113446e+03,  3.26169568e+03,
        2.73575744e+04,  2.65622169e+03,  5.93757039e+03,  2.23108761e+03,
        2.18444446e+03,  8.98638483e+03,  2.92518585e+04,  1.15213697e+04,
        3.74926967e+04,  1.40203318e+04,  8.65944669e+03,  9.02188953e+00,
        2.61116275e+04,  4.69659124e+03,  1.11830680e+04,  3.42245754e+04,
        9.79720504e+03,  3.04683155e+04,  1.18948076e+04,  8.22452563e+03,
        8.69321850e+03,  7.90705607e+03,  2.15239531e+03,  7.44095108e+03,
        3.11010610e+04,  1.27207881e+04,  8.77689232e+03,  1.10247871e+04,
        5.75446604e+03,  1.28197810e+04,  4.19645461e+03,  2.56283201e+04,
        3.72611915e+04,  