In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import tensorflow as tf
import utils as utils

In [None]:
# Read the advertising CSV through the project helper.
raw_data = utils.load_csv_data('<local-path>/data/Advertising.csv')

# Names of the columns handed to prepare_data as model inputs / target.
feature_keys = ['tv', 'radio', 'newspaper']
target_key = 'sales'

# Split the raw table into a feature collection X and a target collection y.
X, y = utils.prepare_data(raw_data, feature_keys, target_key)

# Quick inspection of both outputs: shape, length, first five entries, type.
for (shape_msg, length_msg, head_msg, type_msg), values in (
    (("X Shape: ", "X length: ", "X first 5 features: ", "X type: "), X),
    (("y Shape: ", "y length: ", "y first 5 features: ", "y type: "), y),
):
    print(shape_msg, values.shape)
    print(length_msg, len(values))
    print(head_msg, values[:5])
    print(type_msg, type(values))

In [None]:
# Draw the feature-vs-target plots twice: once for just the first five rows,
# then for the complete dataset.
for banner, features, targets in (
    ("Plot first 5 X vs y", X[:5], y[:5]),
    ("Plot entire X vs y", X, y),
):
    print(banner)
    utils.plot_features_vs_target(features, targets, feature_keys, target_key)

In [None]:
# --- Step 1: train / validation / test split --------------------------------
# Hold out 25% of the rows first; the remaining 75% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.25, random_state=55
)
# Halve the hold-out: validation and test each end up with 12.5% of the data.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=55
)

# --- Step 2: fit the normalizer ---------------------------------------------
# The Keras Normalization layer learns its statistics from the training split
# only, so no information from validation/test leaks into preprocessing.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(X_train)

# --- Step 3: transform every split ------------------------------------------
# Run each split through the same fitted layer and convert back to NumPy.
X_train_norm, X_val_norm, X_test_norm = (
    normalizer(split).numpy() for split in (X_train, X_val, X_test)
)

# Sanity check: report the shape of every array produced above.
for caption, array in (
    ("X_train_norm Shape: ", X_train_norm),
    ("y_train Shape: ", y_train),
    ("X_val_norm Shape: ", X_val_norm),
    ("y_val Shape: ", y_val),
    ("X_test_norm Shape: ", X_test_norm),
    ("y_test Shape: ", y_test),
):
    print(caption, array.shape)

In [None]:
# Linear regression baseline with scikit-learn.
# fit() returns the estimator itself, so construction and training are chained.
sklearn_model = LinearRegression().fit(X_train_norm, y_train)

# Validation split: predictions and their mean squared error.
val_predictions = sklearn_model.predict(X_val_norm)
val_loss = mean_squared_error(y_val, val_predictions)

# Test split: predictions and their mean squared error.
y_predict_sklearn = sklearn_model.predict(X_test_norm)
test_loss = mean_squared_error(y_test, y_predict_sklearn)

In [None]:
# Report the scikit-learn model's errors on the validation and test splits
print(f"Validation MSE: {val_loss:.4f}")
print(f"Test MSE: {test_loss:.4f}")

# Create a plot comparing actual vs predicted sales
utils.plot_predictions(y_test, y_predict_sklearn, 'Predicted vs Actual Sales', 'Actual Sales', 'Predicted Sales')

# Print up to the first 25 actual and predicted values for comparison.
# The header is printed once (it used to repeat on every row), and slicing
# caps the listing at the size of the test split instead of indexing past it
# when fewer than 25 rows are available.
print("Print actual vs predicted values")
for actual, predicted in zip(y_test[:25], y_predict_sklearn[:25]):
    print(f"Actual: {actual}, Predicted: {predicted:.1f}")

In [None]:
# Train Linear Regression model using TensorFlow

# Seed Python, NumPy and TensorFlow so weight initialization and batch
# shuffling are repeatable across "Restart & Run All". The value matches the
# random_state used for the train/validation/test split.
tf.keras.utils.set_random_seed(55)

# Define a simple Sequential model with a Dense layer (1 unit for linear regression)
tf_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train_norm.shape[1],)),  # one input per feature column
    tf.keras.layers.Dense(1)  # Linear regression (no activation)
])

# Display model architecture
tf_model.summary()

# Configure the model training parameters
tf_model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
    loss='mse', # Mean Squared Error loss
    metrics=['mae'] # Track Mean Absolute Error during training
)

# Train the model, monitoring the validation split after every epoch
history = tf_model.fit(
    X_train_norm, y_train,
    validation_data=(X_val_norm, y_val),
    epochs=200,
    batch_size=32
)

# Evaluate on test data; evaluate() returns the loss followed by the compiled metrics.
# NOTE(review): `test_loss` is also the name of the scikit-learn test MSE computed
# in the earlier sklearn cell, so this assignment overwrites it. Re-running the
# sklearn report cell after this one would print the TensorFlow loss instead.
test_loss, test_mae = tf_model.evaluate(X_test_norm, y_test, verbose=0)

# Generate predictions on test data
y_predict_tf = tf_model.predict(X_test_norm).flatten() # Flatten converts 2D array to 1D for easier comparison

In [None]:
# Report the TensorFlow model's loss and MAE on the test split
print(f"Test Loss (MSE): {test_loss:.4f}")
print(f"Test Mean Absolute Error: {test_mae:.4f}")

# Create a plot comparing actual vs predicted sales
utils.plot_predictions(y_test, y_predict_tf, 'Predicted vs Actual Sales', 'Actual Sales', 'Predicted Sales')

# Print up to the first 25 actual and predicted values for comparison.
# The header is printed once (it used to repeat on every row), and slicing
# caps the listing at the size of the test split instead of indexing past it
# when fewer than 25 rows are available.
print("Print actual vs predicted values")
for actual, predicted in zip(y_test[:25], y_predict_tf[:25]):
    print(f"Actual: {actual}, Predicted: {predicted:.1f}")