# Import Nessesary Libraries

In [167]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score


# Load and Inspect Dataset
We start by loading the `insurance.csv` dataset into a Pandas DataFrame. We then remove any rows with missing values to prevent errors during model training.

In [153]:
df = pd.read_csv('insurance.csv')
df.dropna(inplace=True)

In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [155]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


# Encode Categorical Variables
- `sex`: male -> 0, female -> 1
- `smoker`: yes -> 1, no -> 0
- `region`: one-hot encoded using `pd.get_dummies()` to convert categories into separate numeric columns. `drop_first=True` avoids multicollinearity.

In [156]:
pd.set_option('future.no_silent_downcasting', True)
df['sex'] = df['sex'].replace({'male': 0, 'female': 1}).astype(int)
df['smoker'] = df['smoker'].replace({'yes': 1, 'no': 0}).astype(int)
df = pd.get_dummies(df, columns=['region'], drop_first=True)


In [157]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   sex               1338 non-null   int32  
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   int32  
 5   expenses          1338 non-null   float64
 6   region_northwest  1338 non-null   bool   
 7   region_southeast  1338 non-null   bool   
 8   region_southwest  1338 non-null   bool   
dtypes: bool(3), float64(2), int32(2), int64(2)
memory usage: 56.3 KB


In [158]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,expenses,region_northwest,region_southeast,region_southwest
0,19,1,27.9,0,1,16884.92,False,False,True
1,18,0,33.8,1,0,1725.55,False,True,False
2,28,0,33.0,3,0,4449.46,False,True,False
3,33,0,22.7,0,0,21984.47,True,False,False
4,32,0,28.9,0,0,3866.86,True,False,False


In [159]:
features = ["age", "sex", "bmi", "children", "smoker", "region_northwest", "region_southeast", "region_southwest"]
target = ["expenses"]

# Standardize Features
We scale features to have mean=0 and standard deviation=1 using `StandardScaler`. This ensures that all features contribute equally to the model.

In [160]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(df[features])

# Split Data into Training and Testing Sets
80% of the data is used for training and 20% for testing. This ensures the model is evaluated on unseen data.

In [161]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, df[target], test_size=0.2, random_state=42)

# Train Linear Regression Model
We initialize a `LinearRegression` model and fit it on the training data.

In [162]:
model = LinearRegression()
model.fit(x_train, y_train)

# Evaluate Model Performance
- R² Score: Indicates the proportion of variance explained by the model.
- MAE (Mean Absolute Error): Average absolute difference between predicted and actual values.
- RMSE (Root Mean Squared Error): Measures prediction error, penalizing larger errors more than MAE.

In [173]:
y_pred = model.predict(x_test)  # Xtest = features for test set


r_2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(
    f"""
        R^2 Score: {r_2}
        MAE: {mae}
        RMSE: {rmse}
    """
)


        R^2 Score: 0.7835726930039906
        MAE: 4181.561524000788
        RMSE: 5796.556335884076
    


# Predict Expenses for a New Sample
We prepare a new input sample, scale it using the same scaler as training data, and predict insurance expenses using our trained model.

In [174]:
# 25, male, 26.2, 0, no, northeast, 2721.32

input_data = pd.DataFrame([[25, 0, 26.2, 0, 0, 0, 1, 0]], columns=features)
input_scaled = scaler.transform(input_data)
prediction = model.predict(input_scaled)

print(f"Predicted expenses: {prediction[0][0]:.2f}")

Predicted expenses: 2646.41
