# Import the needed libraries

In [32]:
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, StandardScaler, RobustScaler, OrdinalEncoder

# Load the data set

In [33]:
# Read the insurance records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='insurance.csv')

In [34]:
# Preview the first five rows to sanity-check the columns and their values.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Identify missing values and deal with them

In [35]:
# Count the missing entries in each column.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Remove unneeded features

In [36]:
# Columns listed (uncommented) below are removed from the frame; the
# commented-out names are the features that are kept for modelling.
#
# errors="ignore" makes this cell safe to re-run: because the result is
# assigned back to `df`, a second execution would otherwise raise a KeyError
# for 'region', which is already gone after the first run.
df = df.drop(
    columns=[
        # 'age',
        # 'sex',
        # 'bmi',
        # 'children',
        # 'smoker',
        'region',
    ],
    errors="ignore",
)

# Encode Categorical Data

In [37]:
# Separate the prediction target ("charges") from the input features.
y = df["charges"]
X = df.drop("charges", axis="columns")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Plain-text dump of the training features.
print(X_train)

      age     sex     bmi  children smoker
560    46  female  19.950         2     no
1285   47  female  24.320         0     no
1142   52  female  24.860         0     no
969    39  female  34.320         5     no
486    54  female  21.470         3     no
...   ...     ...     ...       ...    ...
1095   18  female  31.350         4     no
1130   39  female  23.870         5     no
1294   58    male  25.175         0     no
860    37  female  47.600         2    yes
1126   55    male  29.900         0     no

[1070 rows x 5 columns]


#### Encode X

In [38]:
# Column groups: each group gets its own preprocessing step below.
numeric_cols = ["age", "bmi", "children"]
boolean_cols = ["smoker"]

# Every feature not named in the two lists above is treated as categorical.
categorical_cols = list(X.columns.drop(numeric_cols + boolean_cols))

# One transformer per column group:
#   "cat"  -> one-hot encoding, dense output, categories unseen at fit time are ignored
#   "num"  -> standard scaling
#   "bool" -> ordinal encoding
ct = ColumnTransformer(
    [
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_cols,
        ),
        ("num", StandardScaler(), numeric_cols),
        ("bool", OrdinalEncoder(), boolean_cols),
    ]
)

# Learn the encodings/scaling from the training split only, then apply the
# already-fitted transformer to the test split.
X_train_processed = ct.fit_transform(X_train)
X_test_processed = ct.transform(X_test)

#### Encode y

In [39]:
# Standardise the target. The scaler is fitted on the training targets only
# and then reused, unchanged, for the test targets.
# Each Series is reshaped to a single-column 2-D array before scaling.
ss = StandardScaler()
y_train_encoded = ss.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test_encoded = ss.transform(y_test.to_numpy().reshape(-1, 1))

# Training The Linear Regression Model

In [40]:
# LinearRegression and the metric functions are imported in the notebook's
# import cell at the top rather than here, so all dependencies are declared
# in one place.

# Fit a linear regression on the preprocessed features and the standardised target.
model = LinearRegression()
model.fit(X_train_processed, y_train_encoded)

# The model was trained on the scaled target, so its predictions are in the
# same standardised units.
y_pred = model.predict(X_test_processed)

# Back to dollars
y_pred_real = ss.inverse_transform(y_pred)
y_test_real = ss.inverse_transform(y_test_encoded)

# MAE is reported both in dollars and in standardised units; MSE and R² are
# computed on the standardised values.
print("MAE ($):", mean_absolute_error(y_test_real, y_pred_real))
print("MAE (std):", mean_absolute_error(y_test_encoded, y_pred))
print("MSE (std):", mean_squared_error(y_test_encoded, y_pred))
print("R²:", r2_score(y_test_encoded, y_pred))



MAE ($): 4213.484797807132
MAE (std): 0.35071769349014875
MSE (std): 0.2354216335438922
R²: 0.7811302113434097
