In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
# Silence every Python warning so library chatter does not clutter cell output.
warnings.filterwarnings(action='ignore')

# Seed NumPy and TensorFlow with one shared value so the random draws below are repeatable.
RANDOM_SEED = 42
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(RANDOM_SEED)

print("TensorFlow version:", tf.__version__)
print("Libraries imported successfully!")

# Plot appearance: the matplotlib style sheet goes first, then seaborn's "husl"
# palette is installed on top of it as the colour cycle.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [None]:
# Create datasets to test gradient problems
def create_long_dependency_task(n_samples=1000, sequence_length=50, dependency_gap=40):
    """
    Build a binary-sequence dataset whose label is copied from an early time step.

    Every sample is a vector of ``sequence_length`` random 0/1 values drawn from
    NumPy's global random state. The label is the value found ``dependency_gap``
    steps before the final time step (index ``sequence_length - dependency_gap - 1``).
    When that index would be negative, the label is 0 for every sample.

    Returns:
        (X, y): ``X`` stacks the sequences, ``y`` holds one label per sequence.
    """
    signal_index = sequence_length - dependency_gap - 1
    signal_in_range = signal_index >= 0

    # One randint call per sample, in sample order.
    samples = [np.random.randint(0, 2, sequence_length) for _ in range(n_samples)]
    labels = [sample[signal_index] if signal_in_range else 0 for sample in samples]

    return np.array(samples), np.array(labels)

def create_exploding_gradient_task(n_samples=1000, sequence_length=20):
    """
    Build a dataset of large-magnitude sequences labelled by the sign of their sum.

    Every sample is ``sequence_length`` standard-normal draws (NumPy global
    random state) scaled by 10. The label is 1 when the sample's sum is
    strictly positive and 0 otherwise.

    Returns:
        (X, y): ``X`` stacks the sequences, ``y`` holds one 0/1 label per sequence.
    """
    # One randn call per sample, in sample order; the factor 10 widens the value range.
    samples = [np.random.randn(sequence_length) * 10 for _ in range(n_samples)]
    labels = [1 if np.sum(sample) > 0 else 0 for sample in samples]

    return np.array(samples), np.array(labels)

# Create test datasets
print("Creating test datasets...")

# Geometry of the long-dependency task. Defined once so the generator call, the
# signal marker in the first panel and the difficulty sweep cannot drift apart.
LONG_SEQUENCE_LENGTH = 60
LONG_DEPENDENCY_GAP = 50

# Long dependency task
X_long, y_long = create_long_dependency_task(
    n_samples=2000,
    sequence_length=LONG_SEQUENCE_LENGTH,
    dependency_gap=LONG_DEPENDENCY_GAP,
)
X_long = X_long.reshape(X_long.shape[0], X_long.shape[1], 1)  # Add feature dimension

# Exploding gradient task
X_explode, y_explode = create_exploding_gradient_task(n_samples=1000, sequence_length=30)
X_explode = X_explode.reshape(X_explode.shape[0], X_explode.shape[1], 1)

print(f"Long dependency task: {X_long.shape}, {y_long.shape}")
print(f"Exploding gradient task: {X_explode.shape}, {y_explode.shape}")

# Split datasets (80% train / 20% test, fixed random_state for repeatability)
X_long_train, X_long_test, y_long_train, y_long_test = train_test_split(
    X_long, y_long, test_size=0.2, random_state=42
)

X_exp_train, X_exp_test, y_exp_train, y_exp_test = train_test_split(
    X_explode, y_explode, test_size=0.2, random_state=42
)

# Visualize the tasks
plt.figure(figsize=(15, 8))

# Long dependency task visualization
plt.subplot(2, 3, 1)
sample_idx = 0
sequence = X_long[sample_idx].flatten()
target = y_long[sample_idx]
# Same index formula the generator uses to pick the label-carrying time step.
signal_pos = len(sequence) - LONG_DEPENDENCY_GAP - 1

plt.plot(sequence, 'b-', alpha=0.7)
plt.axvline(signal_pos, color='red', linestyle='--', label='Signal position')
plt.axhline(sequence[signal_pos], color='red', alpha=0.5, label=f'Signal value: {sequence[signal_pos]:.1f}')
plt.title(f'Long Dependency Task\nTarget: {target}')
plt.xlabel('Time Step')
plt.ylabel('Value')
plt.legend()
plt.grid(True, alpha=0.3)

# Task difficulty analysis
plt.subplot(2, 3, 2)
# Accuracy of always predicting the more frequent class; reused in the statistics printout.
baseline_accuracy = max(np.mean(y_long), 1 - np.mean(y_long))
difficulties = []
gaps = [10, 20, 30, 40, 50]

for gap in gaps:
    # Fresh 200-sample draw per gap; only the labels are needed here.
    X_temp, y_temp = create_long_dependency_task(
        n_samples=200, sequence_length=LONG_SEQUENCE_LENGTH, dependency_gap=gap
    )
    baseline_temp = max(np.mean(y_temp), 1 - np.mean(y_temp))
    difficulties.append(1 - baseline_temp)  # Lower baseline = harder task

plt.plot(gaps, difficulties, 'o-', linewidth=2, markersize=8)
plt.title('Task Difficulty vs Dependency Gap')
plt.xlabel('Dependency Gap')
plt.ylabel('Task Difficulty (1 - baseline)')
plt.grid(True, alpha=0.3)

# Exploding gradient task visualization
plt.subplot(2, 3, 3)
sample_sequence = X_explode[0].flatten()
plt.plot(sample_sequence, 'g-', alpha=0.7)
plt.title(f'Exploding Gradient Task\nSum: {np.sum(sample_sequence):.1f}, Target: {y_explode[0]}')
plt.xlabel('Time Step')
plt.ylabel('Value')
plt.grid(True, alpha=0.3)

# Value distributions
plt.subplot(2, 3, 4)
plt.hist(X_long.flatten(), bins=50, alpha=0.7, label='Long Dependency', density=True)
plt.hist(X_explode.flatten(), bins=50, alpha=0.7, label='Exploding Gradient', density=True)
plt.title('Input Value Distributions')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()

# Target distributions
plt.subplot(2, 3, 5)
labels = ['Class 0', 'Class 1']
long_counts = [np.sum(y_long == 0), np.sum(y_long == 1)]
exp_counts = [np.sum(y_explode == 0), np.sum(y_explode == 1)]

x = np.arange(len(labels))
width = 0.35

plt.bar(x - width/2, long_counts, width, label='Long Dependency', alpha=0.7)
plt.bar(x + width/2, exp_counts, width, label='Exploding Gradient', alpha=0.7)
plt.title('Target Class Distributions')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(x, labels)
plt.legend()

# Theoretical gradient analysis
plt.subplot(2, 3, 6)
time_steps = np.arange(1, 51)
# Illustrative geometric decay, not a measurement: every step back multiplies the
# gradient by a fixed factor. 0.25 is the maximum of the sigmoid derivative; the
# same factor is reused for the tanh curve, so those two lines coincide in the plot.
tanh_gradients = np.power(0.25, time_steps)  # tanh derivative max is 1, but typically smaller
relu_gradients = np.ones_like(time_steps)    # constant factor of 1: no decay at all
sigmoid_gradients = np.power(0.25, time_steps)  # identical values to tanh_gradients

plt.semilogy(time_steps, tanh_gradients, 'o-', label='Tanh (typical)', alpha=0.7)
plt.semilogy(time_steps, relu_gradients, 's-', label='ReLU (ideal)', alpha=0.7)
plt.semilogy(time_steps, sigmoid_gradients, '^-', label='Sigmoid (typical)', alpha=0.7)
plt.title('Theoretical Gradient Decay')
plt.xlabel('Time Steps Back')
plt.ylabel('Gradient Magnitude (log scale)')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nDataset Statistics:")
print(f"Long dependency task balance: {np.mean(y_long):.3f}")
print(f"Exploding gradient task balance: {np.mean(y_explode):.3f}")
print(f"Long dependency baseline accuracy: {baseline_accuracy:.3f}")
print(f"Exploding gradient baseline accuracy: {max(np.mean(y_explode), 1-np.mean(y_explode)):.3f}")


In [None]:
# Test RNN performance on gradient-problematic tasks
def create_models_for_comparison():
    """Create different RNN architectures for comparison.

    Every model shares one layout: an explicit input for variable-length,
    single-feature sequences, one 64-unit recurrent layer that returns only its
    final state, and a single sigmoid output unit. Only the recurrent layer
    type differs between entries.

    The input shape is declared with ``keras.Input`` instead of passing
    ``input_shape=`` to the first layer, which newer Keras releases deprecate.

    Returns:
        dict: model name -> uncompiled ``keras.Sequential``, in the order
        'SimpleRNN', 'LSTM', 'GRU', 'SimpleRNN_Clipped'.
    """
    recurrent_layer_types = {
        # Simple RNN (prone to vanishing gradients)
        'SimpleRNN': keras.layers.SimpleRNN,
        # LSTM (handles vanishing gradients better)
        'LSTM': keras.layers.LSTM,
        # GRU (simpler than LSTM, but still handles gradients well)
        'GRU': keras.layers.GRU,
        # Same architecture as 'SimpleRNN'; the name is what makes
        # train_with_gradient_monitoring attach gradient clipping to the optimizer.
        'SimpleRNN_Clipped': keras.layers.SimpleRNN,
    }

    return {
        name: keras.Sequential([
            keras.Input(shape=(None, 1)),
            layer_type(64, return_sequences=False),
            keras.layers.Dense(1, activation='sigmoid'),
        ])
        for name, layer_type in recurrent_layer_types.items()
    }

# Custom training loop to monitor gradients
def train_with_gradient_monitoring(model, X_train, y_train, X_test, y_test, epochs=20, model_name="Model"):
    """Train model while monitoring gradient norms.

    Each "epoch" here is a single full-batch step: one forward pass over all of
    ``X_train``, one gradient computation, one optimizer update. After every
    step the model is evaluated on the training and the test data.

    Args:
        model: Keras model ending in a single sigmoid unit; compiled inside this function.
        X_train, y_train: training inputs and 0/1 labels (array-like).
        X_test, y_test: held-out inputs and labels used for the accuracy curve.
        epochs: number of full-batch update steps; must be at least 1.
        model_name: label used in progress output. The exact name
            "SimpleRNN_Clipped" additionally enables ``clipnorm=1.0`` on the optimizer.

    Returns:
        dict with
            'gradient_norms': global L2 norm of the gradients at each step, measured
                on the raw tape gradients before the optimizer is applied,
            'losses': training loss after each step (from ``model.evaluate``),
            'accuracies': test accuracy after each step,
            'final_accuracy': the last entry of 'accuracies'.

    Raises:
        ValueError: if ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError(f"epochs must be at least 1, got {epochs}")

    # Compile model
    if model_name == "SimpleRNN_Clipped":
        # Use gradient clipping
        optimizer = keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0)
    else:
        optimizer = keras.optimizers.Adam(learning_rate=0.001)

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Convert the training data once instead of on every step.
    # The labels are reshaped to a column so they line up element-for-element with
    # the (n_samples, 1) predictions. Passing rank-1 labels to the functional
    # binary_crossentropy would pair every label with every prediction via
    # broadcasting, or be rejected as a rank mismatch, depending on the Keras version.
    train_inputs = tf.convert_to_tensor(np.asarray(X_train, dtype=np.float32))
    train_labels = tf.convert_to_tensor(np.asarray(y_train, dtype=np.float32).reshape(-1, 1))

    # Per-step history
    gradient_norms = []
    losses = []
    accuracies = []

    for epoch in range(epochs):
        # Training step with gradient tape
        with tf.GradientTape() as tape:
            predictions = model(train_inputs, training=True)
            loss = tf.reduce_mean(keras.losses.binary_crossentropy(train_labels, predictions))

        # Calculate gradients
        gradients = tape.gradient(loss, model.trainable_variables)

        # Global L2 norm across all variables that received a gradient
        grad_norm = float(tf.linalg.global_norm([g for g in gradients if g is not None]))
        gradient_norms.append(grad_norm)

        # Apply gradients
        model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Evaluate; only training loss and test accuracy are recorded
        train_loss, _ = model.evaluate(X_train, y_train, verbose=0)
        _, test_acc = model.evaluate(X_test, y_test, verbose=0)

        losses.append(train_loss)
        accuracies.append(test_acc)

        if epoch % 5 == 0:
            print(f"{model_name} - Epoch {epoch}: Loss={train_loss:.4f}, Acc={test_acc:.4f}, GradNorm={grad_norm:.4f}")

    return {
        'gradient_norms': gradient_norms,
        'losses': losses,
        'accuracies': accuracies,
        'final_accuracy': accuracies[-1]
    }

# Test models on long dependency task
def _train_all_models(models_by_name, X_train, y_train, X_test, y_test, epochs):
    """Train every model in ``models_by_name`` on one task and collect the results.

    Prints a "Training <name>..." banner before each run and returns a dict
    mapping each model name to what train_with_gradient_monitoring returned for it.
    """
    results_by_name = {}
    for model_name, model in models_by_name.items():
        print(f"\nTraining {model_name}...")
        results_by_name[model_name] = train_with_gradient_monitoring(
            model, X_train, y_train, X_test, y_test,
            epochs=epochs, model_name=model_name,
        )
    return results_by_name


print("Testing models on long dependency task...")
models = create_models_for_comparison()
long_dependency_results = _train_all_models(
    models, X_long_train, y_long_train, X_long_test, y_long_test, epochs=30
)

# Test models on exploding gradient task
print("\n" + "="*50)
print("Testing models on exploding gradient task...")
models_exp = create_models_for_comparison()  # Fresh models
exploding_gradient_results = _train_all_models(
    models_exp, X_exp_train, y_exp_train, X_exp_test, y_exp_test, epochs=20
)

print("\nTraining completed!")


In [None]:
# Analyze and visualize results
plt.figure(figsize=(18, 12))

# Shared by both bar charts in this cell and by the cells' later summaries.
model_names = list(long_dependency_results.keys())
colors = ['red', 'blue', 'green', 'orange']
final_accs_long = [result['final_accuracy'] for result in long_dependency_results.values()]
final_accs_exp = [result['final_accuracy'] for result in exploding_gradient_results.values()]

# One entry per curve panel: (history key, title prefix, y-axis label, log-scaled y axis?)
curve_panels = [
    ('gradient_norms', 'Gradient Norms', 'Gradient Norm', True),
    ('accuracies', 'Test Accuracy', 'Accuracy', False),
    ('losses', 'Training Loss', 'Loss', False),
]

# One entry per task row of the 3x4 grid:
# (results dict, task name used in curve titles, bar-chart title, final accuracies, first subplot index)
task_rows = [
    (long_dependency_results, 'Long Dependency Task',
     'Final Accuracy - Long Dependency', final_accs_long, 1),
    (exploding_gradient_results, 'Exploding Gradient Task',
     'Final Accuracy - Exploding Gradient', final_accs_exp, 5),
]

for task_results, task_label, bar_title, final_accs, first_panel in task_rows:
    # Three per-epoch curve panels, one line per model
    for offset, (history_key, title_prefix, y_label, use_log_scale) in enumerate(curve_panels):
        plt.subplot(3, 4, first_panel + offset)
        for name, result in task_results.items():
            plt.plot(result[history_key], label=name, linewidth=2)
        plt.title(f'{title_prefix} - {task_label}')
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        if use_log_scale:
            plt.yscale('log')
        plt.legend()
        plt.grid(True, alpha=0.3)

    # Fourth panel of the row: final test accuracy per model, value printed above each bar
    plt.subplot(3, 4, first_panel + 3)
    bars = plt.bar(model_names, final_accs, color=colors, alpha=0.7)
    plt.title(bar_title)
    plt.ylabel('Test Accuracy')
    plt.xticks(rotation=45)
    for bar, acc in zip(bars, final_accs):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                 f'{acc:.3f}', ha='center', va='bottom')

# Comparative analysis
def _epochs_to_reach(accuracies, final_accuracy, fraction=0.9):
    """Index of the first epoch whose accuracy is >= fraction * final_accuracy.

    Falls back to len(accuracies) when no epoch reaches the target.
    """
    target_acc = fraction * final_accuracy
    return next((i for i, acc in enumerate(accuracies) if acc >= target_acc), len(accuracies))


def _stability_adjusted_scores(final_accs, grad_stds, penalty_weight=0.1):
    """Accuracy minus penalty_weight * (gradient std / largest gradient std).

    When the largest finite std is zero (or there is none), no penalty is
    applied; dividing by it would otherwise turn every score into NaN.
    """
    finite_stds = [std for std in grad_stds if np.isfinite(std)]
    largest_std = max(finite_stds, default=0.0)
    if largest_std <= 0:
        return list(final_accs)
    return [acc - penalty_weight * std / largest_std for acc, std in zip(final_accs, grad_stds)]


plt.subplot(3, 4, 9)
max_grad_norms_long = [max(result['gradient_norms']) for result in long_dependency_results.values()]
max_grad_norms_exp = [max(result['gradient_norms']) for result in exploding_gradient_results.values()]

# Bar positions shared by every grouped bar chart below
x = np.arange(len(model_names))
width = 0.35

plt.bar(x - width/2, max_grad_norms_long, width, label='Long Dependency', alpha=0.7)
plt.bar(x + width/2, max_grad_norms_exp, width, label='Exploding Gradient', alpha=0.7)
plt.title('Maximum Gradient Norms')
plt.ylabel('Max Gradient Norm (log scale)')
plt.yscale('log')
plt.xticks(x, model_names, rotation=45)
plt.legend()

# Gradient stability analysis
plt.subplot(3, 4, 10)
grad_std_long = [np.std(result['gradient_norms']) for result in long_dependency_results.values()]
grad_std_exp = [np.std(result['gradient_norms']) for result in exploding_gradient_results.values()]

plt.bar(x - width/2, grad_std_long, width, label='Long Dependency', alpha=0.7)
plt.bar(x + width/2, grad_std_exp, width, label='Exploding Gradient', alpha=0.7)
plt.title('Gradient Norm Stability (Std Dev)')
plt.ylabel('Standard Deviation')
plt.xticks(x, model_names, rotation=45)
plt.legend()

# Learning efficiency: first epoch at which test accuracy reaches 90% of its final value
plt.subplot(3, 4, 11)
epochs_to_convergence_long = [
    _epochs_to_reach(result['accuracies'], result['final_accuracy'])
    for result in long_dependency_results.values()
]
epochs_to_convergence_exp = [
    _epochs_to_reach(result['accuracies'], result['final_accuracy'])
    for result in exploding_gradient_results.values()
]

plt.bar(x - width/2, epochs_to_convergence_long, width, label='Long Dependency', alpha=0.7)
plt.bar(x + width/2, epochs_to_convergence_exp, width, label='Exploding Gradient', alpha=0.7)
plt.title('Epochs to Convergence')
plt.ylabel('Epochs')
plt.xticks(x, model_names, rotation=45)
plt.legend()

# Overall performance summary
plt.subplot(3, 4, 12)
# Combine accuracy and stability (lower gradient variance is better)
performance_score_long = _stability_adjusted_scores(final_accs_long, grad_std_long)
performance_score_exp = _stability_adjusted_scores(final_accs_exp, grad_std_exp)

plt.bar(x - width/2, performance_score_long, width, label='Long Dependency', alpha=0.7)
plt.bar(x + width/2, performance_score_exp, width, label='Exploding Gradient', alpha=0.7)
plt.title('Overall Performance Score')
plt.ylabel('Score (Acc - Stability Penalty)')
plt.xticks(x, model_names, rotation=45)
plt.legend()

plt.tight_layout()
plt.show()

# Print comprehensive analysis
print("\n" + "="*60)
print("COMPREHENSIVE GRADIENT ANALYSIS RESULTS")
print("="*60)

# Same three-column summary for both tasks
for section_title, task_results in (
    ("Long Dependency Task Results:", long_dependency_results),
    ("Exploding Gradient Task Results:", exploding_gradient_results),
):
    print(f"\n{section_title}")
    print("-" * 30)
    for name in model_names:
        result = task_results[name]
        print(f"{name:20s}: Accuracy={result['final_accuracy']:.3f}, "
              f"Max Grad Norm={max(result['gradient_norms']):.2e}, "
              f"Grad Stability={np.std(result['gradient_norms']):.2e}")

# NOTE(review): the insights below are fixed text; nothing in this cell checks them
# against the numbers printed above. Compare them with the measured results before
# quoting them.
print("\nKey Insights:")
print("-" * 30)
print("1. LSTM and GRU significantly outperform SimpleRNN on long dependency tasks")
print("2. Gradient clipping helps stabilize SimpleRNN training")
print("3. LSTM shows the most stable gradient behavior")
print("4. All models handle the exploding gradient task reasonably well")
print("5. The vanishing gradient problem is more severe than exploding gradients")

# Calculate improvement percentages
simple_acc_long = long_dependency_results['SimpleRNN']['final_accuracy']
lstm_acc_long = long_dependency_results['LSTM']['final_accuracy']

if simple_acc_long > 0:
    improvement = ((lstm_acc_long - simple_acc_long) / simple_acc_long) * 100
    print(f"\nLSTM improvement over SimpleRNN on long dependency task: {improvement:.1f}%")
else:
    # A relative improvement over an accuracy of 0 is undefined; avoid dividing by zero.
    improvement = float('nan')
    print("\nLSTM improvement over SimpleRNN on long dependency task: n/a (SimpleRNN accuracy is 0)")

# Accuracy of always predicting the more frequent test label. This is a
# majority-class baseline, so it is never below 0.5 and the division is safe.
baseline_long = max(np.mean(y_long_test), 1 - np.mean(y_long_test))
best_acc_long = max(final_accs_long)
print(f"Baseline accuracy (majority class): {baseline_long:.3f}")
print(f"Best model accuracy: {best_acc_long:.3f}")
print(f"Improvement over baseline: {((best_acc_long - baseline_long) / baseline_long) * 100:.1f}%")

print("\nConclusion: This demonstrates why LSTM and GRU were developed!")
print("They solve the fundamental gradient flow problems of vanilla RNNs.")
