In [None]:
# Essential imports
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from sklearn.datasets import make_classification, make_circles, make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import warnings
# NOTE: this silences *every* warning for the rest of the session, including
# deprecation notices from matplotlib and TensorFlow/Keras. Narrow or remove
# the filter when debugging unexpected behaviour.
warnings.filterwarnings('ignore')

# Set style
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*'; older
# releases only know 'seaborn'. Use whichever name this install provides so the
# cell does not fail on an older matplotlib; if neither exists the default
# matplotlib style is kept.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("🚀 Environment ready for linear vs non-linear exploration!")
print(f"TensorFlow version: {tf.__version__}")

# Set random seed for reproducibility
# Seeds NumPy's global generator and TensorFlow's global generator.
np.random.seed(42)
tf.random.set_seed(42)


In [None]:
# Demonstration: Linear composition is still linear
def demonstrate_linear_composition():
    """
    Show, on one concrete input, that stacking two affine layers is the same
    as applying a single affine layer.

    Two fixed 2x2 layers (weights + bias) are applied one after the other to
    a fixed input vector. The merged layer is then built from the identity
    W2 (W1 x + b1) + b2 = (W2 W1) x + (W2 b1 + b2) and applied to the same
    input. Both results and the comparison are printed.

    Returns:
        tuple: (W_combined, b_combined), the weight matrix and bias vector
        of the merged single layer, as NumPy arrays.
    """
    # Parameters of the first and the second affine layer
    first_weights, first_bias = np.array([[2, 1], [0, 3]]), np.array([1, 2])
    second_weights, second_bias = np.array([[1, -1], [2, 0]]), np.array([0, 1])

    sample = np.array([1, 2])

    # Path A: feed the sample through both layers in sequence
    hidden = first_weights @ sample + first_bias
    stacked_result = second_weights @ hidden + second_bias

    # Path B: collapse both layers into one and apply it once
    merged_weights = second_weights @ first_weights
    merged_bias = second_weights @ first_bias + second_bias
    merged_result = merged_weights @ sample + merged_bias

    outputs_match = np.allclose(stacked_result, merged_result)

    # Emit the whole report in one go; line order matches the computation order
    report_lines = [
        "🔍 Linear Composition Demonstration:",
        f"Input x: {sample}",
        f"Layer 1 output: {hidden}",
        f"Layer 2 output: {stacked_result}",
        "\nEquivalent single transformation:",
        f"Combined W: \n{merged_weights}",
        f"Combined b: {merged_bias}",
        f"Single transformation output: {merged_result}",
        f"\n✅ Outputs are identical: {outputs_match}",
    ]
    print("\n".join(report_lines))

    return merged_weights, merged_bias

# Run the demonstration once (it prints its report) and keep the merged
# weight matrix and bias vector as notebook-level variables.
W_combined, b_combined = demonstrate_linear_composition()


In [None]:
# Create datasets to demonstrate linear vs non-linear capabilities
def create_demonstration_datasets(random_state=42):
    """
    Create four 2-feature binary classification datasets used to contrast
    linear and non-linear models.

    Args:
        random_state: Integer seed (or None) passed to every scikit-learn
            generator and used to seed the local generator for the XOR
            points. The default (42) reproduces the original data.

    Returns:
        dict: Maps 'linear', 'circles', 'moons' and 'xor' to an (X, y)
        tuple. The first three hold 300 samples each, 'xor' holds 200.
    """
    # 1. Linearly separable data
    X_linear, y_linear = make_classification(
        n_samples=300, n_features=2, n_redundant=0, n_informative=2,
        random_state=random_state, n_clusters_per_class=1, class_sep=2
    )

    # 2. Non-linearly separable data (circles)
    X_circles, y_circles = make_circles(
        n_samples=300, noise=0.1, factor=0.3, random_state=random_state
    )

    # 3. Non-linearly separable data (moons)
    X_moons, y_moons = make_moons(n_samples=300, noise=0.15, random_state=random_state)

    # 4. XOR-like data (classic non-linear problem)
    # A local RandomState is used instead of np.random.seed(...) so calling
    # this function does not reset the notebook's global NumPy RNG. For the
    # same seed it yields the same stream as the seeded global generator.
    rng = np.random.RandomState(random_state)
    X_xor = rng.randn(200, 2)
    # Label 1 when both coordinates are strictly positive or both strictly
    # negative, 0 otherwise.
    same_sign = ((X_xor[:, 0] > 0) & (X_xor[:, 1] > 0)) | ((X_xor[:, 0] < 0) & (X_xor[:, 1] < 0))
    y_xor = same_sign.astype(int)

    return {
        'linear': (X_linear, y_linear),
        'circles': (X_circles, y_circles),
        'moons': (X_moons, y_moons),
        'xor': (X_xor, y_xor)
    }

# Build every demonstration dataset once; they are looked up by name below
datasets = create_demonstration_datasets()

# One scatter panel per dataset on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Datasets: Linear vs Non-linear Separability', fontsize=16, fontweight='bold')

# Parallel lists: titles[k] is the panel title for datasets[dataset_names[k]]
dataset_names = ['linear', 'circles', 'moons', 'xor']
titles = ['Linearly Separable', 'Circles (Non-linear)', 'Moons (Non-linear)', 'XOR (Non-linear)']

for idx, ax in enumerate(axes.flat):
    title, name = titles[idx], dataset_names[idx]
    X, y = datasets[name]

    # Colour each point by its class label
    scatter = ax.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu', alpha=0.7, s=30)
    ax.set(title=title, xlabel='Feature 1', ylabel='Feature 2')
    ax.grid(True, alpha=0.3)
    fig.colorbar(scatter, ax=ax)

plt.tight_layout()
plt.show()

for message in (
    "📊 Datasets created successfully!",
    "Notice how some datasets cannot be separated by a straight line...",
):
    print(message)


In [None]:
def softmax(x, axis=-1):
    """
    Numerically stable softmax.

    Args:
        x: Array-like of scores with at least one dimension (a list, a 1D
            vector, a 2D batch of row vectors, ...).
        axis: Axis along which the probabilities are normalised. The default
            (-1) normalises a 1D vector as a whole and each row of a 2D
            batch.

    Returns:
        np.ndarray with the same shape as ``x`` whose entries along ``axis``
        are non-negative and sum to 1.

    Raises:
        ValueError: If ``x`` is a scalar (0-dimensional).
    """
    x = np.asarray(x)
    if x.ndim == 0:
        raise ValueError("softmax expects an input with at least one dimension")

    # Subtract max for numerical stability: the result is unchanged because
    # softmax is shift-invariant, but np.exp never sees a large positive value.
    x_stable = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(x_stable)
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

def softmax_derivative(x):
    """
    Jacobian of the softmax function, taken over the last axis of ``x``.

    Args:
        x: Array-like of scores. A 1D vector of length n, or a batch whose
            last axis has length n (e.g. shape (m, n)).

    Returns:
        np.ndarray: For 1D input, the (n, n) matrix J with
        J[i, j] = s_i * (delta_ij - s_j), where s = softmax(x).
        For batched input of shape (..., n), an array of shape (..., n, n)
        holding one such Jacobian per vector.
    """
    s = softmax(x)
    if s.ndim == 1:
        return np.diag(s) - np.outer(s, s)

    # np.diag / np.outer do not operate per row on batched input (np.diag
    # extracts a diagonal, np.outer flattens), so build the per-vector
    # Jacobians with broadcasting: s_i * (delta_ij - s_j).
    identity = np.eye(s.shape[-1])
    return s[..., :, None] * (identity - s[..., None, :])

# Test softmax with different inputs
test_cases = [
    np.array([1.0, 2.0, 3.0]),
    np.array([0.0, 0.0, 0.0]),
    np.array([1000.0, 1001.0, 1002.0]),  # Test numerical stability
    np.array([-1000.0, -999.0, -998.0])  # Test negative large values
]

print("🧮 Softmax Function Analysis:")
print("=" * 50)

for i, test_input in enumerate(test_cases):
    softmax_output = softmax(test_input)
    derivative_matrix = softmax_derivative(test_input)

    # Inline sanity checks so a regression in either helper fails loudly
    # instead of only showing up as odd printed numbers.
    assert np.isclose(np.sum(softmax_output), 1.0), "softmax output must sum to 1"
    # Each Jacobian row is s_i * (delta_ij - s_j); summed over j that is 0.
    assert np.allclose(derivative_matrix.sum(axis=1), 0.0), "Jacobian rows must sum to 0"

    print(f"\nTest Case {i + 1}:")
    print(f"Input: {test_input}")
    print(f"Softmax output: {softmax_output}")
    print(f"Sum of outputs: {np.sum(softmax_output):.6f}")
    print(f"Max probability: {np.max(softmax_output):.6f}")
    print(f"Predicted class: {np.argmax(softmax_output)}")

# Visualize softmax behavior
x_range = np.linspace(-5, 5, 100)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# 2D softmax visualization
# Each row of `inputs` is a two-class score pair: the first score sweeps
# x_range, the second is held fixed at `offset`.
for offset in [-2, 0, 2]:
    inputs = np.column_stack([x_range, np.full_like(x_range, offset)])
    softmax_vals = softmax(inputs)
    ax1.plot(x_range, softmax_vals[:, 0], label=f'Class 1 (offset={offset})', alpha=0.7)
    ax1.plot(x_range, softmax_vals[:, 1], label=f'Class 2 (offset={offset})', linestyle='--', alpha=0.7)

ax1.set_title('Softmax Behavior with Different Offsets')
ax1.set_xlabel('Input Value for Class 1')
ax1.set_ylabel('Probability')
ax1.legend()
ax1.grid(True, alpha=0.3)
# Temperature effect on softmax
def softmax_with_temperature(x, temperature=1.0):
    """
    Softmax of ``x`` after dividing the scores by ``temperature``.

    Args:
        x: Array-like of scores, forwarded to ``softmax`` after scaling.
        temperature: Positive scaling factor applied as ``x / temperature``.

    Returns:
        Softmax probabilities of the scaled scores.

    Raises:
        ValueError: If ``temperature`` is not strictly positive. A zero
            temperature would divide by zero and produce NaN probabilities.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    # np.asarray lets plain Python lists be divided element-wise
    return softmax(np.asarray(x) / temperature)

# Fixed four-class score vector, re-normalised at several temperatures
x_test = np.array([1.0, 2.0, 3.0, 4.0])
temperatures = [0.5, 1.0, 2.0, 5.0]

# One line per temperature on the right-hand panel; x positions are class indices
for temp in temperatures:
    soft_temp = softmax_with_temperature(x_test, temp)
    ax2.plot(range(len(x_test)), soft_temp, 'o-', label=f'T={temp}', alpha=0.7)

ax2.set(
    title='Temperature Effect on Softmax',
    xlabel='Class Index',
    ylabel='Probability',
)
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Takeaways printed under the figure
observations = (
    "\n📋 Key Observations:",
    "• Softmax outputs always sum to 1",
    "• Higher temperature → more uniform distribution",
    "• Lower temperature → more peaked distribution",
    "• Numerical stability is crucial for large inputs",
)
for observation in observations:
    print(observation)


In [None]:
# Create models with different activation functions
def create_model(activation, input_dim, hidden_units=32):
    """
    Create and compile a binary classifier with two hidden layers that
    share one activation function.

    Args:
        activation: Activation identifier passed to ``keras.layers.Activation``
            for both hidden layers (e.g. 'linear', 'relu', 'tanh', 'sigmoid').
        input_dim: Number of input features per sample.
        hidden_units: Width of each of the two hidden Dense layers.

    Returns:
        A compiled ``keras.Sequential`` model with a single sigmoid output
        unit, Adam optimizer, binary cross-entropy loss and accuracy metric.
    """
    model = keras.Sequential([
        # Declare the input shape with an explicit Input object; passing
        # `input_dim=` to the first Dense layer is deprecated in Keras 3.
        keras.Input(shape=(input_dim,)),
        keras.layers.Dense(hidden_units),
        keras.layers.Activation(activation),
        keras.layers.Dense(hidden_units),
        keras.layers.Activation(activation),
        keras.layers.Dense(1, activation='sigmoid')  # Binary classification
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

# Train and evaluate models on different datasets
def train_and_evaluate(X, y, dataset_name,
                       activations=('linear', 'relu', 'tanh', 'sigmoid'),
                       epochs=100, batch_size=32, patience=20):
    """
    Train one model per activation function on a dataset and compare them.

    The data is split 80/20 (stratified, fixed seed), standardised with a
    scaler fitted on the training split only, and a fresh model from
    ``create_model`` is trained for every activation with early stopping on
    the validation loss.

    Args:
        X: Feature matrix of shape (n_samples, n_features).
        y: Binary class labels, one per row of ``X``.
        dataset_name: Label used only in the progress output.
        activations: Activation identifiers to compare, in training order.
        epochs: Maximum number of training epochs per model.
        batch_size: Mini-batch size used for training.
        patience: Epochs without validation-loss improvement before early
            stopping halts training (best weights are restored).

    Returns:
        tuple: ``(results, X_test_scaled, y_test)`` where ``results`` maps
        each activation to a dict with keys 'train_accuracy',
        'test_accuracy', 'train_loss', 'test_loss', 'history' and 'model'.
    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features; the scaler is fitted on the training split only so no
    # test-set statistics leak into training
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results = {}

    print(f"\n🧪 Training on {dataset_name} dataset...")
    print("-" * 50)

    for activation in activations:
        print(f"Training with {activation} activation...")

        # Create and train model
        model = create_model(activation, X_train_scaled.shape[1])

        # Train with early stopping to prevent overfitting; a new callback
        # per model keeps its internal state from carrying over
        early_stopping = keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=patience, restore_best_weights=True
        )

        history = model.fit(
            X_train_scaled, y_train,
            validation_split=0.2,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[early_stopping],
            verbose=0
        )

        # Evaluate on the full training split and on the held-out test split
        train_loss, train_acc = model.evaluate(X_train_scaled, y_train, verbose=0)
        test_loss, test_acc = model.evaluate(X_test_scaled, y_test, verbose=0)

        results[activation] = {
            'train_accuracy': train_acc,
            'test_accuracy': test_acc,
            'train_loss': train_loss,
            'test_loss': test_loss,
            'history': history,
            'model': model
        }

        print(f"  {activation:8} | Train: {train_acc:.4f} | Test: {test_acc:.4f}")

    return results, X_test_scaled, y_test

# Train every activation on every dataset and keep what is needed to inspect
# the runs afterwards (per-activation results plus the scaled test split)
all_results = {}
for dataset_name, (X, y) in datasets.items():
    results, X_test, y_test = train_and_evaluate(X, y, dataset_name)
    all_results[dataset_name] = dict(results=results, X_test=X_test, y_test=y_test)

# Summary table of test accuracy: one row per dataset, one column per activation
print("\n🏆 Final Results Summary:")
print("=" * 80)
header_cells = [f"{'Dataset':<12}"] + [f"{label:<8}" for label in ('Linear', 'ReLU', 'Tanh', 'Sigmoid')]
print(" | ".join(header_cells))
print("-" * 80)

for dataset_name, data in all_results.items():
    per_activation = data['results']
    accuracy_cells = "".join(
        f" {per_activation[activation]['test_accuracy']:6.4f} |"
        for activation in ['linear', 'relu', 'tanh', 'sigmoid']
    )
    print(f"{dataset_name:<12} |" + accuracy_cells)
