In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Fit a linear regression that predicts `charges` from the remaining columns
# of the insurance dataset, then report its error on a held-out test split.
df = pd.read_csv("/kaggle/input/insurance/insurance.csv")

# 'region' is excluded from the feature set.
# NOTE(review): the reason for dropping it is not shown here — confirm it is
# intentional rather than a shortcut to avoid encoding another categorical.
df = df.drop(columns=['region'])

# Target is `charges`; every other remaining column is a feature.
X = df.drop(columns=['charges'])
y = df['charges']

categorical_features = ['sex', 'smoker']
numerical_features = ['age', 'bmi', 'children']

# Standardize the numeric columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])


linear_regression = LinearRegression()


# Wrapping preprocessing and the model in one pipeline means the scaler and
# encoder are fitted on the training split only when `pipeline.fit` is called.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', linear_regression)
])


# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


pipeline.fit(X_train, y_train)


y_pred = pipeline.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print(f"Linear Regression: MSE = {mse:.2f}")
# R^2 was previously computed but never shown; report it alongside MSE since
# it is scale-independent and easier to interpret than the raw squared error.
print(f"Linear Regression: R^2 = {r2:.4f}")


Linear Regression: MSE = 33979257.05
