In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the insurance CSV from the current working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="insurance.csv")

# Preview the first five rows (rendered as the cell output).
df.head(n=5)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Target is the "charges" column; every other column is a candidate feature.
y = df["charges"]
X = df.drop(columns="charges")

# Column groups routed to different transformers below.
numeric_cols = ["age", "bmi", "children"]
categorical_cols = ["sex", "smoker", "region"]

# One-hot encode the categorical columns (dropping the first level of each)
# and standardize the numeric ones. Columns not listed in either group are
# discarded, since ColumnTransformer's default is remainder="drop".
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first"), categorical_cols),
        ("num", StandardScaler(), numeric_cols),
    ]
)


In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [5]:
# Linear regression: preprocessing followed by an ordinary least-squares fit.
# Pipeline.fit returns the fitted pipeline, so construction and training are
# chained into a single expression.
lr_model = Pipeline(
    steps=[
        ("prep", preprocess),
        ("lr", LinearRegression()),
    ]
).fit(X_train, y_train)

# Predictions on the held-out split
y_pred_lr = lr_model.predict(X_test)

# Error metrics on the held-out split (RMSE is derived from MSE)
r2_lr = r2_score(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

# Report
print(" Linear Regression:")
print("MAE: ", mae_lr)
print("MSE: ", mse_lr)
print("RMSE:", rmse_lr)
print("R²:  ", r2_lr)


 Linear Regression:
MAE:  4181.194473753646
MSE:  33596915.85136135
RMSE: 5796.2846592762635
R²:   0.7835929767120731


In [6]:
# KNN regression pipeline (k = 5 neighbours).
#
# The preprocessing step is an unfitted *copy* of `preprocess`. Pipeline keeps
# the step objects it is given without copying them, so passing `preprocess`
# itself would make this fit() call refit the very transformer that the
# already-trained `lr_model` uses — silently changing that model if the two
# were ever trained on different data. clone() copies only the parameters,
# so the transformation learned from X_train here is the same as before.
knn_model = Pipeline([
    ("prep", clone(preprocess)),
    ("knn", KNeighborsRegressor(n_neighbors=5))
])

# Train on the training split, then predict the held-out split
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

# Error metrics on the held-out split (RMSE is derived from MSE)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
r2_knn = r2_score(y_test, y_pred_knn)

print(" KNN Regression:")
print("MAE: ", mae_knn)
print("MSE: ", mse_knn)
print("RMSE:", rmse_knn)
print("R²:  ", r2_knn)


 KNN Regression:
MAE:  3891.0524894365667
MSE:  46285808.77767338
RMSE: 6803.367458668787
R²:   0.7018603093698963


In [7]:
# Side-by-side comparison of both models, one row per model.
# Built from row records; key order fixes the column order
# (Model, MAE, RMSE, R²).
results = pd.DataFrame(
    [
        {"Model": "Linear Regression", "MAE": mae_lr, "RMSE": rmse_lr, "R²": r2_lr},
        {"Model": "KNN Regression", "MAE": mae_knn, "RMSE": rmse_knn, "R²": r2_knn},
    ]
)

results


Unnamed: 0,Model,MAE,RMSE,R²
0,Linear Regression,4181.194474,5796.284659,0.783593
1,KNN Regression,3891.052489,6803.367459,0.70186
