# Basic Neural Network Topological Analysis

This notebook demonstrates the basic usage of the neural topology framework for analyzing the topological structure of neural network layers.

## Overview

We will:
1. Load the COVID-19 drug discovery dataset (a synthetic dataset is generated as a fallback if the CSV file is not found)
2. Build and train a neural network
3. Extract layer activations
4. Perform topological analysis using:
   - Persistent homology (Vietoris-Rips)
   - Mapper algorithm
   - UMAP projections
5. Visualize the results


## Setup and Imports

In [None]:
# Install required packages if needed
# !pip install -e ..[tda,tensorflow,visualization]

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Import neural topology framework
from neural_topology import (
    TopologicalAnalyzer,
    NeuralExtractor,
    DataProcessor
)

# Pin NumPy's global random state so repeated runs draw the same numbers
np.random.seed(42)

# Print the notebook banner: title line followed by its underline
print(
    "Neural Topology Framework - Basic Analysis",
    "==========================================",
    sep="\n",
)

## Data Loading and Preprocessing

In [None]:
# Read the COVID-19 drug discovery CSV; if the file is missing, build a
# synthetic stand-in so the rest of the notebook can still run.
data_path = '../DDH_Data_with_Properties.csv'

try:
    data = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Dataset not found at {data_path}")
    print("Please ensure the dataset is in the correct location")
    # Synthetic fallback: 33 standard-normal feature columns plus a uniform
    # pIC50 target in [-1, 3), 100 rows each. The generator is re-seeded here
    # so the fallback does not depend on earlier random draws.
    np.random.seed(42)
    data = pd.DataFrame(
        {f'feature_{i}': np.random.randn(100) for i in range(33)}
    ).assign(pIC50=np.random.uniform(-1, 3, 100))
    print("Using synthetic dataset for demonstration")
else:
    print(f"Dataset loaded successfully: {data.shape}")
    print(f"Columns: {list(data.columns)[:5]}...")  # Show first 5 columns

In [None]:
# Preprocess data: clean the target column, derive a binary label, split into
# train/test sets, and min-max scale the features.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Clean data
if 'pIC50' in data.columns:
    # Drop rows whose target is the literal placeholder 'BLINDED', then rows
    # with any missing value. Comparing on the string form works whether the
    # column was parsed as text or as numbers.
    is_blinded = data['pIC50'].astype(str).eq('BLINDED')
    clean_data = data.loc[~is_blinded].dropna()

    # Convert pIC50 to numeric; anything unparseable becomes NaN and is
    # dropped. `assign` builds a new frame, so the raw `data` frame is never
    # modified and no write-to-a-slice occurs.
    clean_data = clean_data.assign(
        pIC50=pd.to_numeric(clean_data['pIC50'], errors='coerce')
    ).dropna()

    print(f"Clean dataset: {clean_data.shape}")
else:
    clean_data = data
    print("Using full dataset (no pIC50 column found)")

# Fail early with a clear message instead of an obscure scaler/split error.
if clean_data.empty:
    raise ValueError(
        "No rows left after cleaning; check the dataset for missing values"
    )

# Prepare features and target: every numeric column except the target
feature_cols = [col for col in clean_data.select_dtypes(include=[np.number]).columns if col != 'pIC50']
X = clean_data[feature_cols].values

if 'pIC50' in clean_data.columns:
    y = clean_data['pIC50'].values
    y_binary = (y > np.median(y)).astype(int)  # Binary classification: above/below median
else:
    y_binary = np.random.binomial(1, 0.5, len(X))  # Random binary for demo

# Split BEFORE scaling so the scaler never sees test-set statistics
# (fitting on the full data would leak test information into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
# Test values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept so the name `X_scaled` remains available.
X_scaled = scaler.transform(X)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Class distribution: {np.bincount(y_train)}")

## Neural Network Creation and Training

In [None]:
# Build neural network model: three L2-regularized ReLU hidden layers
# (50 -> 40 -> 20 units) followed by a single sigmoid output unit.
# If TensorFlow cannot be imported, `tensorflow_available` is set to False
# and `model` is left undefined; later cells check the flag.
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    from tensorflow.keras import regularizers

    # Seeding NumPy alone does not seed TensorFlow; seed it as well so that
    # weight initialization is repeatable between runs.
    tf.random.set_seed(42)

    # Build model. The input width is declared with an explicit Input object
    # rather than the legacy `input_dim` keyword on the first Dense layer.
    model = Sequential([
        tf.keras.Input(shape=(X_train.shape[1],)),
        Dense(50, activation='relu',
              kernel_regularizer=regularizers.l2(0.05), name='dense_1'),
        Dense(40, activation='relu',
              kernel_regularizer=regularizers.l2(0.05), name='dense_2'),
        Dense(20, activation='relu',
              kernel_regularizer=regularizers.l2(0.05), name='dense_3'),
        Dense(1, activation='sigmoid',
              kernel_regularizer=regularizers.l2(0.05), name='output')
    ])

    # Compile model for binary classification
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    print("Model architecture:")
    model.summary()

    tensorflow_available = True

except ImportError:
    print("TensorFlow not available. Please install with: pip install tensorflow")
    tensorflow_available = False

In [None]:
# Fit the network, report train/test accuracy, and plot the learning curves.
# Nothing is trained when TensorFlow could not be imported.
if not tensorflow_available:
    print("Skipping model training - TensorFlow not available")
else:
    print("Training neural network...")

    # The held-out split is passed as validation data so per-epoch
    # validation curves are recorded in `history`.
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=100,
        batch_size=10,
        verbose=0,
    )

    # Evaluate model on both splits (loss first, then the accuracy metric)
    train_loss, train_acc = model.evaluate(X_train, y_train, verbose=0)
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

    print(f"Training accuracy: {train_acc:.3f}")
    print(f"Test accuracy: {test_acc:.3f}")

    # Plot training history: one panel per recorded metric
    plt.figure(figsize=(12, 4))

    history_panels = [('loss', 'Loss'), ('accuracy', 'Accuracy')]
    for panel_index, (history_key, panel_label) in enumerate(history_panels, start=1):
        plt.subplot(1, 2, panel_index)
        plt.plot(history.history[history_key], label=f'Training {panel_label}')
        plt.plot(history.history[f'val_{history_key}'], label=f'Validation {panel_label}')
        plt.title(f'Model {panel_label}')
        plt.xlabel('Epoch')
        plt.ylabel(panel_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

## Layer Activation Extraction

In [None]:
# Collect per-layer activations on the test set. Without TensorFlow there is
# no trained model, so random placeholder activations are generated instead.
if tensorflow_available:
    # Initialize neural extractor
    extractor = NeuralExtractor()

    # Extract activations from all layers
    print("Extracting layer activations...")
    activations = extractor.extract_all_layers(model, X_test)

    print(f"Extracted activations from {len(activations)} layers:")
    for layer_name, activation in activations.items():
        print(f"  {layer_name}: {activation.shape}")

    # Get layer information
    layer_info = extractor.get_layer_info(model)
    print("\nLayer Information:")
    for name, info in layer_info.items():
        print(f"  {name}: {info['type']}, Parameters: {info['params']:,}")
else:
    print("Skipping activation extraction - TensorFlow not available")
    # Create dummy activations for demonstration. Keys and widths mirror the
    # model's layer names (dense_1..dense_3, output) so the later plotting
    # cells, which relabel 'dense_' prefixes, format both paths the same way.
    # NOTE(review): assumes the extractor keys activations by Keras layer
    # name - confirm against NeuralExtractor.
    dummy_layer_widths = {'dense_1': 50, 'dense_2': 40, 'dense_3': 20, 'output': 1}
    activations = {
        dummy_name: np.random.randn(len(X_test), width)
        for dummy_name, width in dummy_layer_widths.items()
    }
    print("Using dummy activations for demonstration")

## Topological Analysis

In [None]:
# Create the analyzer for homology dimensions 0, 1 and 2
analyzer = TopologicalAnalyzer(homology_dimensions=[0, 1, 2])

print("Performing topological analysis...", "This may take a few minutes...", sep="\n")

# Run the full analysis on the layer activations, with Mapper and UMAP
# computation both switched on; the test labels are passed alongside.
results = analyzer.analyze_network_topology(
    activations=activations, labels=y_test, compute_mapper=True, compute_umap=True
)

print("\nTopological analysis complete!")
run_metadata = results.metadata
print(f"Analyzed {run_metadata['layer_count']} layers")
print(f"Sample count: {run_metadata['sample_count']}")

## Results Visualization

In [None]:
# Print, for every layer, the Betti number of each homology dimension
# (0 when a dimension is absent) followed by the sum over all dimensions.
print("Betti Numbers (Topological Features) by Layer:")
print("=" * 50)

homology_labels = {
    0: "H0 (Connected Components)",
    1: "H1 (Loops)",
    2: "H2 (Voids)",
}

for layer_name, betti_nums in results.betti_numbers.items():
    print(f"\n{layer_name.upper()}:")
    for dimension, label in homology_labels.items():
        print(f"  {label}: {betti_nums.get(dimension, 0)}")
    print(f"  Total Features: {sum(betti_nums.values())}")

In [None]:
# Grouped bar chart: one group per layer, one bar per homology dimension
layer_names = list(results.betti_numbers.keys())
h0_values, h1_values, h2_values = (
    [results.betti_numbers[name].get(dimension, 0) for name in layer_names]
    for dimension in (0, 1, 2)
)

plt.figure(figsize=(12, 6))

x_pos = range(len(layer_names))
width = 0.25

# Each series is shifted by one bar width so the three bars sit side by side
bar_series = [
    (-width, h0_values, 'H0 (Components)'),
    (0, h1_values, 'H1 (Loops)'),
    (width, h2_values, 'H2 (Voids)'),
]
for offset, series_values, series_label in bar_series:
    plt.bar([x + offset for x in x_pos], series_values, width,
            label=series_label, alpha=0.8)

tick_labels = [name.replace('dense_', 'Layer ') for name in layer_names]

plt.xlabel('Layer')
plt.ylabel('Betti Number')
plt.title('Topological Feature Evolution Across Layers')
plt.xticks(x_pos, tick_labels, rotation=45)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Plot UMAP projections for each layer, one scatter panel per layer,
# colored by the test labels.
n_layers = len(results.umap_projections)
if n_layers > 0:
    # Size the grid from the number of layers (two panels per row) so no
    # layer is silently dropped; four layers give the familiar 2x2 layout.
    n_cols = 2
    n_rows = (n_layers + n_cols - 1) // n_cols  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, (layer_name, projection) in zip(axes, results.umap_projections.items()):
        # The first two columns of each projection are used as plot coordinates
        scatter = ax.scatter(projection[:, 0], projection[:, 1],
                             c=y_test, cmap='RdYlBu', alpha=0.7, s=30)
        ax.set_title(f'UMAP: {layer_name.replace("dense_", "Layer ")}')
        ax.set_xlabel('UMAP 1')
        ax.set_ylabel('UMAP 2')
        ax.grid(True, alpha=0.3)
        plt.colorbar(scatter, ax=ax)

    # Hide unused subplots (only possible with an odd number of layers)
    for unused_ax in axes[n_layers:]:
        unused_ax.set_visible(False)

    plt.suptitle('Data Geometry Evolution Through Network Layers', fontsize=16)
    plt.tight_layout()
    plt.show()
else:
    print("No UMAP projections available")

## Compute and Display Topology Metrics

In [None]:
# Ask the analyzer for its topology metrics and print them grouped by layer
topology_metrics = analyzer.compute_topology_metrics(results)

print("Topology Metrics:")
print("=" * 40)

# Split each metric name on '_' and treat the last two tokens as the metric
# type and everything before them as the layer name.
# NOTE(review): assumes names look like '<layer>_<token>_<token>' - confirm
# against compute_topology_metrics.
layer_metrics = {}
for metric_name, value in topology_metrics.items():
    tokens = metric_name.split('_')
    layer_key = '_'.join(tokens[:-2])
    metric_key = '_'.join(tokens[-2:])
    layer_metrics.setdefault(layer_key, {})[metric_key] = value

for layer_name, metrics in layer_metrics.items():
    print(f"\n{layer_name.upper()}:")
    for metric_type, value in metrics.items():
        pretty_name = metric_type.replace('_', ' ').title()
        print(f"  {pretty_name}: {value:.3f}")

## Summary and Insights

In [None]:
# Final report: data and model figures, the layer with the highest Betti
# total, the first-vs-last layer trend, and a short reading guide.
print("ANALYSIS SUMMARY")
print("=" * 60)

print(f"\nDataset: {len(y_test)} samples, {X_test.shape[1]} features")
if tensorflow_available:
    # test_acc only exists when the model was actually trained
    print(f"Model accuracy: {test_acc:.3f}")
print(f"Layers analyzed: {len(activations)}")

print("\nKey Topological Insights:")

# Sum of Betti numbers over all dimensions, per layer
total_features_by_layer = {
    layer_name: sum(betti_nums.values())
    for layer_name, betti_nums in results.betti_numbers.items()
}

if total_features_by_layer:
    # On ties the first layer in iteration order wins
    most_complex_layer, max_complexity = max(
        total_features_by_layer.items(), key=lambda entry: entry[1]
    )
    print(f"• Most topologically complex layer: {most_complex_layer} ({max_complexity} features)")

# Compare the Betti total of the first layer with that of the last layer
layer_order = list(results.betti_numbers.keys())
if len(layer_order) >= 2:
    first_layer_features = total_features_by_layer[layer_order[0]]
    last_layer_features = total_features_by_layer[layer_order[-1]]

    if last_layer_features > first_layer_features:
        trend_message = "• Topological complexity increases through the network"
    elif last_layer_features < first_layer_features:
        trend_message = "• Topological complexity decreases through the network"
    else:
        trend_message = "• Topological complexity remains stable through the network"
    print(trend_message)

print("\nInterpretation:")
for interpretation_line in (
    "• H0 (connected components): Number of distinct clusters in the data",
    "• H1 (loops): Circular/cyclical patterns in the feature space",
    "• H2 (voids): Higher-dimensional holes or cavities in the data",
    "• UMAP projections show how data geometry evolve through layers",
):
    print(interpretation_line)

print("\nAnalysis complete!")