In [72]:
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

In [73]:
df = pd.read_csv('../data/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [74]:
X = df.drop(columns=['charges'])
y = df['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=11)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print("#" * 30)
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (936, 6)
y_train shape: (936,)
##############################
X_test shape: (402, 6)
y_test shape: (402,)


In [75]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_vars = ['age', 'bmi', 'children']
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy='mean')),
           ("scaler", StandardScaler())]
)

cat_vars = ['sex', 'smoker', 'region']
categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy='most_frequent')),
           ("ohe", OneHotEncoder(handle_unknown="ignore", drop='first'))])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_vars),
        ("cat", categorical_transformer, cat_vars)
    ]
)
preprocessor



In [76]:
polyreg = make_pipeline(PolynomialFeatures(degree=3, include_bias=True), LinearRegression())

In [77]:
pipe = Pipeline([('preprocessor', preprocessor),
                 ('polyreg', polyreg)])

pipe

In [78]:
pipe.fit(X_train, y_train)

In [79]:
poly = pipe.fit(X_train, y_train)
poly

In [80]:
poly_predict_train = poly.predict(X_train)

In [81]:
poly_predict_test = poly.predict(X_test)

In [82]:
r2_score_train = poly.score(X_train, y_train)
mse_train = mean_squared_error(poly_predict_train, y_train)
mae_train = mean_absolute_error(poly_predict_train, y_train)

r2_score_test = r2_score(poly_predict_test, y_test)
mae = mean_absolute_error(poly_predict_test, y_test)
mse = mean_squared_error(poly_predict_test, y_test)

In [83]:
print(f"R2 Score is: {r2_score_train}")
print(f"Mean Squared error: {mse_train}")
print(f"Mean Absolute Error: {mae_train} ")

R2 Score is: 0.8645142828184043
Mean Squared error: 20211515.442576416
Mean Absolute Error: 2759.6576423526185 


In [84]:
print(f"R2 Score is: {r2_score_test}")
print(f"Mean Squared error: {mse}")
print(f"Mean Absolute Error: {mae} ")

R2 Score is: 0.7960781349730756
Mean Squared error: 25836313.3710352
Mean Absolute Error: 3192.820837299965 


In [85]:
with open(r"../models/model.pickle", "wb") as model:
    pickle.dump(pipe, model)