# Notebook 3: Building the Neural Additive Model Architecture
## Designing Networks for Interpretable Marketing Mix Modeling

**Learning Objectives:**
- Understand NAM architecture principles
- Build feature-specific neural networks
- Implement Beta-Gamma transformation layer
- Apply monotonic constraints
- Create the additive structure

---

## NAM Architecture Overview

Unlike traditional neural networks, NAM builds a separate network for each feature, then combines them additively. This preserves interpretability while capturing non-linear relationships.

In [None]:
# Import required libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

# Report which TensorFlow build this notebook is running against
print(f"TensorFlow version: {tf.__version__}")

# Read the engineered dataset (the processed CSV from Notebook 2)
data = pd.read_csv('data/processed/mmm_data_with_features.csv')
print(f"Loaded data: {data.shape}")

# A column is treated as a Beta-Gamma feature when its name contains
# either the '_adstock' or the '_log' marker
beta_gamma_features = [
    col
    for col in data.columns
    if '_adstock' in col or '_log' in col
]
print(f"Beta-Gamma features: {len(beta_gamma_features)}")

## Part 1: The Beta-Gamma Transformation Layer

This custom layer implements the marketing saturation curve:
$$f(x) = \alpha \cdot x^{\beta} \cdot e^{-\gamma x}$$

In [None]:
class _ClipToRange(keras.constraints.Constraint):
    '''
    Weight constraint that clips every element into [min_value, max_value].

    Unlike MinMaxNorm, which rescales a weight by its norm and therefore
    preserves the sign of a scalar weight, this bounds the value itself.
    '''

    def __init__(self, min_value, max_value):
        self.min_value = min_value
        self.max_value = max_value

    def __call__(self, w):
        return tf.clip_by_value(w, self.min_value, self.max_value)

    def get_config(self):
        return {'min_value': self.min_value, 'max_value': self.max_value}


class BetaGammaLayer(keras.layers.Layer):
    '''
    Custom layer implementing Beta-Gamma transformation for marketing saturation.

    Computes, element-wise on the clamped input x:

        f(x) = alpha * x^beta * exp(-gamma * x)

    Parameters learned (each a single scalar shared across all inputs):
    - alpha: Scale parameter (overall impact), initialised to 1.0, unconstrained
    - beta: Shape parameter (initial effectiveness), initialised to 0.5 and
      clipped to [0.1, 2.0]
    - gamma: Saturation parameter (diminishing returns), initialised to 0.01
      and kept non-negative
    '''

    def __init__(self, name=None, **kwargs):
        super().__init__(name=name, **kwargs)

    def build(self, input_shape):
        '''Create the three scalar weights; input_shape is not inspected.'''
        self.alpha = self.add_weight(
            name='alpha',
            shape=(1,),
            initializer=keras.initializers.Constant(1.0),
            trainable=True
        )

        # Clip the value itself: a norm-based constraint would only bound
        # |beta| and let the exponent settle at a negative value, which
        # makes x^beta explode for inputs near zero.
        self.beta = self.add_weight(
            name='beta',
            shape=(1,),
            initializer=keras.initializers.Constant(0.5),
            constraint=_ClipToRange(min_value=0.1, max_value=2.0),
            trainable=True
        )

        self.gamma = self.add_weight(
            name='gamma',
            shape=(1,),
            initializer=keras.initializers.Constant(0.01),
            constraint=keras.constraints.NonNeg(),
            trainable=True
        )

    def call(self, inputs):
        '''Apply the Beta-Gamma curve element-wise and return a tensor shaped like inputs.'''
        # Clamp to strictly positive values so the fractional power is defined
        x = tf.nn.relu(inputs) + 1e-8

        # Apply Beta-Gamma transformation
        power_term = tf.pow(x, self.beta)
        exp_term = tf.exp(-self.gamma * x)

        return self.alpha * power_term * exp_term

# Confirmation message printed once the cell defining the layer has run
print("Beta-Gamma layer defined successfully")

## Part 2: Feature-Specific Networks

Each feature gets its own neural network based on its type:
- **Marketing features**: Use Beta-Gamma transformation
- **Price features**: Apply monotonic constraints
- **Other features**: Standard neural network

In [None]:
def create_feature_network(feature_name, feature_type):
    '''
    Create a feature-specific neural network based on feature type.

    Parameters:
    - feature_name: Used to name the input layer (and the saturation /
      negation layers) so each sub-network can be identified in the model.
    - feature_type: String selecting the architecture. Checked by substring,
      in this order: 'beta_gamma', 'monotonic_negative',
      'monotonic_positive'; anything else gets the standard network.

    Returns:
    - (feature_input, output): the keras.Input of shape (1,) and the
      sub-network's output tensor. Every branch ends in a single unit so the
      feature contributes one scalar to the additive sum.
    '''

    def monotone_increasing(tensor):
        # With ReLU/softplus activations, non-negative kernels in EVERY layer
        # are what make the mapping non-decreasing in the input; constraining
        # only the last layer is not sufficient. Keras applies the constraint
        # after each optimizer update.
        hidden = layers.Dense(16, activation='relu',
                              kernel_constraint=keras.constraints.NonNeg())(tensor)
        return layers.Dense(1, activation='softplus',
                            kernel_constraint=keras.constraints.NonNeg())(hidden)

    # Single feature input
    feature_input = keras.Input(shape=(1,), name=f'input_{feature_name}')

    if 'beta_gamma' in feature_type:
        # Marketing feature with saturation
        hidden = layers.Dense(32, activation='relu')(feature_input)
        hidden = layers.Dense(16, activation='relu')(hidden)
        # Collapse to one unit before the saturation curve so the feature
        # yields a single contribution rather than 16 parallel ones.
        # Softplus keeps the value positive, the domain the curve expects.
        hidden = layers.Dense(1, activation='softplus')(hidden)
        output = BetaGammaLayer(name=f'saturation_{feature_name}')(hidden)

    elif 'monotonic_negative' in feature_type:
        # Price feature (negative relationship): negate a non-decreasing,
        # positive network to get a non-increasing, negative contribution
        positive_out = monotone_increasing(feature_input)
        output = layers.Lambda(lambda x: -x, name=f'negative_{feature_name}')(positive_out)

    elif 'monotonic_positive' in feature_type:
        # Discount feature (positive relationship)
        output = monotone_increasing(feature_input)

    else:
        # Standard feature
        hidden = layers.Dense(32, activation='relu')(feature_input)
        hidden = layers.Dense(16, activation='relu')(hidden)
        output = layers.Dense(1)(hidden)

    return feature_input, output

# Demo: build one Beta-Gamma sub-network and report its symbolic shapes
example_input, example_output = create_feature_network(
    feature_name='TV_adstock',
    feature_type='beta_gamma',
)
print(
    f"Created feature network: Input shape {example_input.shape}, "
    f"Output shape {example_output.shape}"
)

## Part 3: Building the Complete NAM Model

Now we combine all feature networks into the additive structure:

In [None]:
def build_nam_model(feature_names, feature_config):
    '''
    Build complete Neural Additive Model with feature-specific networks.

    One sub-network is created per feature name; the architecture is chosen
    from the name, first match wins:
    - contains '_adstock' or '_log'  -> 'beta_gamma'
    - contains 'Price' or 'MRP'      -> 'monotonic_negative'
    - contains 'Discount'            -> 'monotonic_positive'
    - otherwise                      -> 'standard'

    Parameters:
    - feature_names: Iterable of feature (column) names, one model input each.
    - feature_config: Currently unused; kept so existing callers keep working.

    Returns:
    - A compiled keras.Model named 'NAM_MMM' (Adam, lr 0.001, MSE loss,
      MAE and MAPE metrics) taking one input per feature.

    Raises:
    - ValueError: if feature_names is empty.
    '''

    # Materialise once so one-shot iterables can be checked for emptiness
    # and iterated without being exhausted.
    feature_names = list(feature_names)
    if not feature_names:
        raise ValueError("feature_names must contain at least one feature")

    inputs = []
    outputs = []
    beta_gamma_count = 0

    print("Building NAM Model...")
    print("-" * 60)

    for built, feature in enumerate(feature_names, start=1):
        # Determine feature type
        if any(x in feature for x in ['_adstock', '_log']):
            feature_type = 'beta_gamma'
            # Count here so the summary below cannot drift from this rule
            beta_gamma_count += 1
        elif 'Price' in feature or 'MRP' in feature:
            feature_type = 'monotonic_negative'
        elif 'Discount' in feature:
            feature_type = 'monotonic_positive'
        else:
            feature_type = 'standard'

        # Create feature network
        feature_input, feature_output = create_feature_network(feature, feature_type)
        inputs.append(feature_input)
        outputs.append(feature_output)

        if built % 10 == 0:
            print(f"  Created {built} feature networks...")

    # Combine all features additively (Add needs at least two tensors)
    if len(outputs) > 1:
        combined = layers.Add(name='additive_combination')(outputs)
    else:
        combined = outputs[0]

    # Final output layer
    final_output = layers.Dense(1, name='prediction')(combined)

    # Create model
    model = keras.Model(inputs=inputs, outputs=final_output, name='NAM_MMM')

    # Compile model
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae', 'mape']
    )

    print(f"\nModel built successfully!")
    print(f"  Total parameters: {model.count_params():,}")
    print(f"  Input features: {len(inputs)}")
    print(f"  Beta-Gamma features: {beta_gamma_count}")

    return model

# Columns left out of the model inputs: the target and the metadata fields
exclude_cols = [
    'Date',
    'GMV',
    'product_category',
    'product_subcategory',
]
feature_cols = [
    col
    for col in data.columns
    if col not in exclude_cols
]

# Only the feature count is reported here; the model itself is not built
print(f"\nBuilding model with {len(feature_cols)} features...")
# To build it for real, supply a `feature_config` object and run:
# model = build_nam_model(feature_cols, feature_config)

## Part 4: Visualizing the Architecture

Let's visualize how the NAM architecture differs from traditional neural networks:

In [None]:
def visualize_nam_architecture():
    '''Draw a two-panel sketch: a monolithic network next to the NAM layout.'''

    def rounded(fill, **extra):
        # Keyword bundle for a rounded text box with the given fill colour
        return dict(boxstyle='round', facecolor=fill, **extra)

    def finish_panel(panel, title):
        # Unit square canvas, bold title, no axis decorations
        panel.set_xlim(0, 1)
        panel.set_ylim(0, 1)
        panel.set_title(title, fontweight='bold')
        panel.axis('off')

    fig, (left, right) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: one stack of layers fed by all features at once.
    # Each row is (label, label y, box colour, y where the arrow to the
    # next row starts); the last row has no outgoing arrow.
    stack = [
        ('All Features', 0.1, 'lightblue', 0.15),
        ('Hidden Layer 1', 0.35, 'lightgray', 0.4),
        ('Hidden Layer 2', 0.6, 'lightgray', 0.65),
        ('Output', 0.85, 'lightgreen', None),
    ]
    for label, label_y, fill, arrow_y in stack:
        left.text(0.5, label_y, label, ha='center', fontsize=10,
                  bbox=rounded(fill))
        if arrow_y is not None:
            left.arrow(0.5, arrow_y, 0, 0.15, head_width=0.05, fc='black')
    finish_panel(left, 'Traditional NN (Black Box)')

    # Right panel: one column per feature, each with its own small network
    features = ['TV', 'Price', 'Season', '...']
    colors = ['lightcoral', 'lightblue', 'lightgreen', 'lightyellow']

    for slot, feat in enumerate(features):
        fill = colors[slot]
        x = 0.2 + slot * 0.2

        # Feature label and the arrow into its network
        right.text(x, 0.1, feat, ha='center', fontsize=9, bbox=rounded(fill))
        right.arrow(x, 0.15, 0, 0.25, head_width=0.02, fc='gray', alpha=0.5)

        # Per-feature network and the arrow to its contribution
        right.text(x, 0.45, 'NN', ha='center', fontsize=8,
                   bbox=rounded(fill, alpha=0.5))
        right.arrow(x, 0.5, 0, 0.15, head_width=0.02, fc='gray', alpha=0.5)

        # Contribution term
        right.text(x, 0.7, f'f({feat})', ha='center', fontsize=8)

    # Summation symbol and the final output box
    right.text(0.5, 0.8, '+', ha='center', fontsize=14, fontweight='bold')
    right.arrow(0.5, 0.82, 0, 0.08, head_width=0.05, fc='black')
    right.text(0.5, 0.95, 'Output', ha='center', fontsize=10,
               bbox=rounded('lightgreen'))
    finish_panel(right, 'NAM (Interpretable)')

    plt.suptitle('Neural Network Architectures Comparison', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

# Render the architecture comparison figure in the notebook output
visualize_nam_architecture()

## Key Takeaways

### What We've Built:
1. **Beta-Gamma Layer**: Custom transformation for marketing saturation
2. **Feature Networks**: Separate neural network per feature
3. **Monotonic Constraints**: Business logic enforcement
4. **Additive Structure**: Interpretable combination

### Architecture Benefits:
- **Interpretability**: Can visualize each feature's contribution
- **Business Logic**: Constraints ensure sensible predictions
- **Flexibility**: Different architectures for different feature types
- **Saturation Modeling**: Captures diminishing returns in marketing

### The Power of NAM:
- Unlike black-box models, we can see exactly how each feature contributes
- Marketing features properly model saturation curves
- Price relationships follow economic theory
- The additive structure maintains interpretability

### Next Steps:
In Notebook 4, we'll train this model with proper validation techniques.