<a href="https://colab.research.google.com/github/MagretAdekunle/Machine-Learning-Projects-freeCodeCamp/blob/main/Linear%20Regression%20Health%20Costs%20Calculator/fcc_predict_health_costs_with_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load libraries

In [None]:
# Import libraries. You may or may not use all of these.
!pip install -q git+https://github.com/tensorflow/docs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

# Load dataset

In [None]:
# Import data
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')

In [None]:
# Preview the first few rows of the loaded data.
dataset.head()

In [None]:
# Summary statistics of the loaded data (pandas' default describe() output).
dataset.describe()

# Visualizations for Exploratory Data Analysis (EDA)


In [None]:

# @title Distribution of Age
# Histogram of policy-holder ages (20 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=dataset, x='age', bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# @title Distribution of BMI
# Histogram of BMI values (20 bins, green) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=dataset, x='bmi', bins=20, kde=True, color='green', ax=ax)
ax.set_title('Distribution of BMI')
ax.set_xlabel('BMI')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# @title Number of Children vs. Expenses (Scatter Plot)
# One point per row; colour encodes the 'sex' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=dataset,
    x='children',
    y='expenses',
    hue='sex',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Number of Children vs. Expenses')
ax.set_xlabel('Number of Children')
ax.set_ylabel('Medical Expenses')
plt.show()

In [None]:
# @title Expenses by Region (Box Plot)
# Spread of expenses within each region.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=dataset, x='region', y='expenses', ax=ax)
ax.set_title('Expenses by Region')
ax.set_xlabel('Region')
ax.set_ylabel('Medical Expenses')
plt.show()

In [None]:
# @title Sex vs. Expenses (Box Plot)
# Spread of expenses for each value of the 'sex' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=dataset, x='sex', y='expenses', ax=ax)
ax.set_title('Expenses by Sex')
ax.set_xlabel('Sex')
ax.set_ylabel('Medical Expenses')
plt.show()

In [None]:
# @title Smoker vs. Expenses (Box Plot)
# Spread of expenses for each value of the 'smoker' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=dataset, x='smoker', y='expenses', ax=ax)
ax.set_title('Expenses by Smoker')
ax.set_xlabel('Smoker')
ax.set_ylabel('Medical Expenses')
plt.show()

In [None]:
# @title Expenses vs. Number of Children by Region

# Mean expenses for every (children, region) pair, pivoted so that each
# region becomes its own column and therefore its own line in the plot.
children_region_expenses = (
    dataset.groupby(['children', 'region'])['expenses'].mean().unstack()
)

ax = children_region_expenses.plot(kind='line', figsize=(10, 6), marker='o')
ax.set_title('Expenses vs. Number of Children by Region')
ax.set_xlabel('Number of Children')
ax.set_ylabel('Average Expenses')
# One tick per child count actually present in the data, rather than a
# hard-coded range(6).
ax.set_xticks(list(children_region_expenses.index))
_ = ax.legend(title='Region')

In [None]:
# @title Correlation Heatmap
# Pairwise correlations between the listed columns, annotated to two decimals.
heatmap_columns = ['age', 'bmi', 'children', 'expenses']
corr = dataset[heatmap_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# @title Age, BMI, and Children vs Expenses

# Numeric feature columns only ('expenses' is the shared y-axis target).
# Testing for "is numeric" rather than "is not object" keeps text columns out
# even when pandas stores them with a dedicated string dtype.
cols = [
    f for f in dataset.columns
    if f != 'expenses' and pd.api.types.is_numeric_dtype(dataset[f])
]

# One panel per selected feature, so the grid always matches len(cols).
# squeeze=False + ravel() gives a flat array of axes even for a single panel.
fig, axes = plt.subplots(
    1, len(cols), figsize=(6 * len(cols), 4), squeeze=False
)
axes = axes.ravel()

# Create scatter plots for each numerical feature vs 'expenses'
for ax, feature in zip(axes, cols):
    dataset.plot.scatter(x=feature, y='expenses', ax=ax)
    ax.set_title(f'{feature} vs Expenses')
    ax.set_xlabel(feature)
    ax.set_ylabel('Expenses')

plt.show()


In [None]:
# Work on a copy so the preprocessing below does not alter `dataset`.
model_dataset = dataset.copy()

# Preprocessing categorical data

In [None]:
# Preprocessing categorical data by converting them into numerical codes.
# `feature_columns` records, for each encoded column, the {code: label}
# lookup so the integer codes can be mapped back to the original labels.
feature_columns = {}
for col_name in model_dataset.columns:
    # Encode every non-numeric column. Checking "not numeric" instead of
    # dtype == 'object' also catches text stored with pandas' string dtype,
    # which would otherwise reach the model as raw strings.
    if not pd.api.types.is_numeric_dtype(model_dataset[col_name]):
        c = model_dataset[col_name].astype('category')
        model_dataset[col_name] = c.cat.codes
        feature_columns[col_name] = dict(enumerate(c.cat.categories))

model_dataset.head()

# Splitting data

In [None]:
# Shuffling the dataset
# A fixed seed keeps the shuffled row order identical on every
# Restart & Run All, so anything derived from that order is reproducible.
model_dataset = model_dataset.sample(frac=1, random_state=0)

In [None]:
# Training split: a seeded 80% sample of the rows, with the 'expenses'
# target separated from the feature columns.
train_rows = model_dataset.sample(frac=0.8, random_state=0)
train_labels = train_rows['expenses']
train_dataset = train_rows.drop(columns='expenses')

In [None]:
# Testing split: every row whose index label is not in the training split,
# with the 'expenses' target separated from the feature columns.
test_rows = model_dataset.drop(index=train_dataset.index)
test_labels = test_rows['expenses']
test_dataset = test_rows.drop(columns='expenses')

# Normalizing Data

In [None]:
# Normalization layer over the last (feature) axis.
normalizer = layers.Normalization(axis=-1)
# Fit the layer's statistics on the training features only.
train_feature_matrix = train_dataset.to_numpy()
normalizer.adapt(train_feature_matrix)

# Building the Model

In [None]:
# Define the model architecture: the adapted normalizer, then three ReLU
# hidden layers (128 -> 64 -> 32) with 30% dropout after the first two,
# ending in a single unit for the regression output.
n_features = train_dataset.shape[1]
network_layers = [
    layers.Input(shape=(n_features,)),
    normalizer,
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),
]
model = keras.Sequential(network_layers)

# Compile the model: train on mean absolute error with Adam, and report
# MAE and MSE as metrics.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='mean_absolute_error',
    metrics=['mae', 'mse'],
)

model.summary()

# Training the Model

In [None]:
# Stop once validation loss has not improved for `patience` epochs and roll
# back to the best weights seen; `epochs` then acts as an upper bound instead
# of forcing a full 1000-epoch run.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=100,
    restore_best_weights=True,
)

history = model.fit(
    train_dataset,
    train_labels,
    validation_split=0.2,  # part of the training data is held out for validation
    verbose=1,
    epochs=1000,
    callbacks=[early_stop],
)

# Plotting The Loss and Validation Loss

In [None]:
# Training vs. validation loss per epoch, as recorded by model.fit().
plt.plot(history.history['loss'], label='Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')

# Add labels and legend. The model's loss is mean absolute error on the
# 'expenses' target, so label the axis accordingly (not MPG).
plt.xlabel('Epoch')
plt.ylabel('Mean Abs Error [Expenses]')

# Display legend and grid
plt.legend()
plt.grid(True)

plt.show()

In [None]:
# RUN THIS CELL TO TEST YOUR MODEL. DO NOT MODIFY CONTENTS.
# Test model by checking how well the model generalizes using the test set.
# The three unpacked values line up with the compiled loss followed by the
# 'mae' and 'mse' metrics.
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

# Pass/fail threshold of the challenge.
if mae < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Abs Error must be less than 3500. Keep trying.")

# Plot predictions.
# flatten() turns the model output into a 1-D array for plotting.
test_predictions = model.predict(test_dataset).flatten()

a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (Expenses)')
plt.ylabel('Predictions (Expenses)')
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
# Identity line: points on it are predictions equal to the true value.
_ = plt.plot(lims,lims)
