<a href="https://colab.research.google.com/github/Hafizur-Rahman-SD/ML-with-Python-FCC-Course-/blob/main/Linear_Regression_Health_Costs_Calculator_FCC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Cell 1 — Import libraries and load dataset
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Download dataset (if not already)
!wget -q https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv

# Load dataset correctly (ignore the extra index column)
dataset = pd.read_csv('insurance.csv', index_col=0)

print(dataset.head())
print("✅ Columns:", dataset.columns.tolist())


        sex   bmi  children smoker     region  expenses
age                                                    
19   female  27.9         0    yes  southwest  16884.92
18     male  33.8         1     no  southeast   1725.55
28     male  33.0         3     no  southeast   4449.46
33     male  22.7         0     no  northwest  21984.47
32     male  28.9         0     no  northwest   3866.86
✅ Columns: ['sex', 'bmi', 'children', 'smoker', 'region', 'expenses']


In [11]:
# Cell 2 — Data preprocessing
# Make all column names lowercase (safety step)
dataset.columns = dataset.columns.str.strip().str.lower()

# Convert categorical columns to numeric
dataset = pd.get_dummies(dataset, columns=['sex', 'smoker', 'region'], drop_first=True)
print (dataset.head())

# Split into train/test
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

# Separate labels
train_labels = train_dataset.pop('expenses')
test_labels = test_dataset.pop('expenses')

# Normalize numeric columns for better performance
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_dataset))


      bmi  children  expenses  sex_male  smoker_yes  region_northwest  \
age                                                                     
19   27.9         0  16884.92     False        True             False   
18   33.8         1   1725.55      True       False             False   
28   33.0         3   4449.46      True       False             False   
33   22.7         0  21984.47      True       False              True   
32   28.9         0   3866.86      True       False              True   

     region_southeast  region_southwest  
age                                      
19              False              True  
18               True             False  
28               True             False  
33              False             False  
32              False             False  


In [12]:
# Cell 3 — Build linear regression model
model = keras.Sequential([
    normalizer,
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss='mean_absolute_error',
    metrics=['mae', 'mse']
)

model.summary()


In [19]:
# Cell 4 — Train model
EPOCHS = 150

history = model.fit(
    train_dataset, train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=1
)


Epoch 1/150
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 4228.5840 - mae: 4228.5840 - mse: 37223100.0000 - val_loss: 4644.2720 - val_mae: 4644.2720 - val_mse: 39444484.0000
Epoch 2/150
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 4304.4429 - mae: 4304.4429 - mse: 35785628.0000 - val_loss: 4637.0469 - val_mae: 4637.0469 - val_mse: 39324352.0000
Epoch 3/150
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 4518.8242 - mae: 4518.8242 - mse: 40120292.0000 - val_loss: 4629.8052 - val_mae: 4629.8052 - val_mse: 39253756.0000
Epoch 4/150
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 4470.8506 - mae: 4470.8506 - mse: 40331884.0000 - val_loss: 4635.8813 - val_mae: 4635.8813 - val_mse: 39344460.0000
Epoch 5/150
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 4327.8647 - mae: 4327.8647 - mse: 36106340.0000 - val_loss: 4623.2290 -

In [24]:
# ✅ Cell 5 — Evaluate model performance and visualize results

# Make sure index alignment is correct
test_dataset = test_dataset.reset_index(drop=True)
test_labels = test_labels.reset_index(drop=True)

# Evaluate the model
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)
print("📊 Testing set Mean Abs Error (MAE): {:5.2f} expenses".format(mae))

# Check if challenge passed
if mae < 3500:
    print("✅ You passed the challenge! Great job 🎉")
else:
    print("❌ The Mean Abs Error must be less than 3500. Keep trying!")

# Predict expenses using the test dataset
test_predictions = model.predict(test_dataset).flatten()

# Plot predictions vs true values
import matplotlib.pyplot as plt

plt.figure(figsize=(7,7))
plt.scatter(test_labels, test_predictions, alpha=0.7)
plt.xlabel('True Values (Expenses)')
plt.ylabel('Predicted Values (Expenses)')
plt.title('True vs Predicted Health Costs')

# Line showing perfect prediction
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
plt.plot(lims, lims, 'r')
plt.grid(True)
plt.show()




ValueError: math domain error