In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_swiss_roll, make_s_curve, load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import euclidean_distances
from sklearn.manifold import Isomap, TSNE, MDS
from sklearn.decomposition import PCA
from sklearn.metrics import trustworthiness
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import shortest_path
import networkx as nx
import time
import warnings
warnings.filterwarnings('ignore')

# Set style and random seed
# NOTE(review): 'seaborn-v0_8' is presumably the style name used by recent
# matplotlib releases; older installs may only know 'seaborn' -- confirm
# against the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
# Seed NumPy's global RNG so code that draws from np.random (e.g. the noise
# added to the hand-built spiral dataset) is repeatable between runs.
np.random.seed(42)

# Simple confirmation that the import cell ran to completion.
print("Libraries imported successfully!")


In [None]:
# Custom Isomap implementation
class CustomIsomap:
    """Educational re-implementation of the Isomap embedding algorithm.

    The pipeline has three stages:

    1. build a symmetric k-nearest-neighbour graph weighted by pairwise
       distance,
    2. approximate geodesic distances by shortest paths through that graph,
    3. embed the geodesic distance matrix with classical MDS.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbours (the point itself excluded) linked to each sample.
    n_components : int, default=2
        Dimensionality of the returned embedding.
    metric : str, default='euclidean'
        Distance metric forwarded to ``NearestNeighbors``.

    Attributes
    ----------
    neighborhood_graph_ : scipy.sparse matrix or None
        Symmetric weighted adjacency matrix built by the last fit.
    geodesic_distances_ : ndarray or None
        Dense matrix of graph shortest-path distances from the last fit.
    eigenvalues_ : ndarray or None
        The ``n_components`` leading eigenvalues of the Gram matrix, clipped
        at zero.
    embedding_ : ndarray or None
        Embedding returned by the last ``fit_transform`` call.
    """

    def __init__(self, n_neighbors=5, n_components=2, metric='euclidean'):
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.metric = metric

        # Results storage (populated by fit_transform)
        self.geodesic_distances_ = None
        self.embedding_ = None
        self.eigenvalues_ = None
        self.neighborhood_graph_ = None

    @staticmethod
    def _symmetrize(adjacency):
        """Return an undirected version of a directed weighted k-NN graph.

        An edge present in only one direction keeps its full weight; an edge
        present in both directions keeps the larger of the two weights (they
        are equal for a symmetric metric). Summing the matrix with its
        transpose and halving would wrongly shrink one-sided edges to half
        their true length, so the element-wise maximum is used instead.
        """
        return adjacency.maximum(adjacency.T).tocsr()

    def _build_neighborhood_graph(self, X):
        """Build the symmetric k-nearest-neighbour graph of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        scipy.sparse matrix of shape (n_samples, n_samples)
            Symmetric adjacency matrix whose entries are neighbour distances.
            Also stored in ``neighborhood_graph_``.
        """
        print(f"Building {self.n_neighbors}-NN graph...")

        nbrs = NearestNeighbors(n_neighbors=self.n_neighbors, metric=self.metric)
        nbrs.fit(X)
        # Calling kneighbors() without a query returns the neighbours of the
        # training points with each point excluded from its own list. This is
        # safer than dropping column 0, which is not guaranteed to be the
        # point itself when the data contains duplicates.
        distances, indices = nbrs.kneighbors()

        # Build sparse directed adjacency matrix: one row per sample,
        # n_neighbors stored entries per row.
        n_samples, n_found = indices.shape
        row_ind = np.repeat(np.arange(n_samples), n_found)
        directed = csr_matrix(
            (distances.ravel(), (row_ind, indices.ravel())),
            shape=(n_samples, n_samples),
        )

        adjacency = self._symmetrize(directed)
        self.neighborhood_graph_ = adjacency
        return adjacency

    def _compute_geodesic_distances(self, adjacency):
        """Approximate geodesic distances by graph shortest paths.

        Pairs in different connected components have infinite shortest-path
        distance; those are replaced by ten times the largest finite distance
        so that the subsequent MDS step stays numerically defined.

        Returns
        -------
        ndarray of shape (n_samples, n_samples)
            Also stored in ``geodesic_distances_``.
        """
        print("Computing geodesic distances...")

        geodesic_dist = shortest_path(adjacency, directed=False, method='auto')

        # Check for disconnected components
        unreachable = np.isinf(geodesic_dist)
        if unreachable.any():
            print("Warning: Graph has disconnected components!")
            print(f"Number of infinite distances: {unreachable.sum()}")

            # Replace infinite distances with large finite values
            max_finite = np.max(geodesic_dist[~unreachable])
            geodesic_dist[unreachable] = max_finite * 10

        self.geodesic_distances_ = geodesic_dist
        return geodesic_dist

    def _classical_mds(self, distance_matrix):
        """Embed a distance matrix with classical (Torgerson) MDS.

        Parameters
        ----------
        distance_matrix : ndarray of shape (n_samples, n_samples)

        Returns
        -------
        ndarray of shape (n_samples, n_components)
            Coordinates ``V * sqrt(lambda)`` for the leading eigenpairs of the
            double-centred Gram matrix. The eigenvalues used are stored in
            ``eigenvalues_``.
        """
        print("Applying classical MDS...")

        # Gram matrix G = -0.5 * H * D^2 * H with H = I - (1/n) * 1 * 1^T.
        # Double centring is done by subtracting row/column means rather than
        # materialising H, which avoids two dense n x n matrix products.
        D_squared = np.asarray(distance_matrix, dtype=float) ** 2
        row_mean = D_squared.mean(axis=1, keepdims=True)
        col_mean = D_squared.mean(axis=0, keepdims=True)
        G = -0.5 * (D_squared - row_mean - col_mean + D_squared.mean())

        # eigh returns eigenvalues in ascending order; keep the largest ones.
        eigenvalues, eigenvectors = np.linalg.eigh(G)
        order = np.argsort(eigenvalues)[::-1][:self.n_components]
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:, order]

        # Geodesic distances are generally not exactly Euclidean, so small
        # negative eigenvalues can appear; clip them before the square root.
        eigenvalues = np.maximum(eigenvalues, 0)

        # Column-wise scaling is equivalent to V @ diag(sqrt(lambda)).
        embedding = eigenvectors * np.sqrt(eigenvalues)

        self.eigenvalues_ = eigenvalues
        return embedding

    def fit_transform(self, X):
        """Fit Isomap on ``X`` and return the low-dimensional embedding.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_components)
            Also stored in ``embedding_``.
        """
        X = np.asarray(X)
        print(f"Fitting Custom Isomap on {X.shape[0]} samples...")

        # Step 1: Build neighborhood graph
        adjacency = self._build_neighborhood_graph(X)

        # Step 2: Compute geodesic distances
        geodesic_distances = self._compute_geodesic_distances(adjacency)

        # Step 3: Apply classical MDS
        embedding = self._classical_mds(geodesic_distances)

        self.embedding_ = embedding
        print("Custom Isomap fitting completed!")

        return embedding

    def get_graph_statistics(self):
        """Summarise the neighbourhood graph built by the last fit.

        Returns
        -------
        dict or None
            ``None`` when no graph has been built yet; otherwise a dict with
            keys ``n_nodes``, ``n_edges``, ``density``, ``is_connected``,
            ``n_components``, ``avg_clustering`` and ``avg_path_length``
            (``inf`` when the graph is disconnected).
        """
        if self.neighborhood_graph_ is None:
            return None

        graph = self.neighborhood_graph_
        n_edges = graph.nnz // 2  # each undirected edge is stored twice
        n_nodes = graph.shape[0]

        # NetworkX 3.0 removed from_scipy_sparse_matrix; prefer the newer
        # from_scipy_sparse_array and fall back for older installations.
        to_graph = getattr(nx, 'from_scipy_sparse_array', None)
        if to_graph is None:
            to_graph = nx.from_scipy_sparse_matrix
        G = to_graph(graph)

        connected = nx.is_connected(G)
        max_edges = n_nodes * (n_nodes - 1) / 2

        stats = {
            'n_nodes': n_nodes,
            'n_edges': n_edges,
            # A single-node graph has no possible edges; report zero density
            # instead of dividing by zero.
            'density': n_edges / max_edges if max_edges else 0.0,
            'is_connected': connected,
            'n_components': nx.number_connected_components(G),
            'avg_clustering': nx.average_clustering(G),
            'avg_path_length': nx.average_shortest_path_length(G) if connected else float('inf')
        }

        return stats

# Test custom Isomap implementation
def test_custom_isomap():
    """Run CustomIsomap on a Swiss Roll and compare it with sklearn's Isomap.

    Generates a 1000-point noisy Swiss Roll, standardises it, embeds it into
    two dimensions with both implementations (10 neighbours each), prints the
    neighbourhood-graph statistics and trustworthiness scores, and shows a
    three-panel figure (raw data, custom embedding, sklearn embedding).

    Returns
    -------
    tuple
        ``(custom_isomap, Y_custom, Y_sklearn)``: the fitted CustomIsomap
        instance and the two embeddings.
    """
    # Generate Swiss Roll data; `color` is the position along the roll and is
    # used only to colour the scatter plots.
    X, color = make_swiss_roll(n_samples=1000, noise=0.1, random_state=42)

    # Standardize data
    X_scaled = StandardScaler().fit_transform(X)

    print("=== Testing Custom Isomap ===")
    print(f"Dataset shape: {X_scaled.shape}")

    # Apply custom Isomap
    custom_isomap = CustomIsomap(n_neighbors=10, n_components=2)
    Y_custom = custom_isomap.fit_transform(X_scaled)

    # Get graph statistics
    graph_stats = custom_isomap.get_graph_statistics()
    print("\nGraph Statistics:")
    for key, value in graph_stats.items():
        print(f"  {key}: {value}")

    # Compare with sklearn's Isomap
    print("\nComparing with sklearn Isomap...")
    sklearn_isomap = Isomap(n_neighbors=10, n_components=2)
    Y_sklearn = sklearn_isomap.fit_transform(X_scaled)

    # Calculate trustworthiness of each embedding against the input space
    trust_custom = trustworthiness(X_scaled, Y_custom, n_neighbors=10)
    trust_sklearn = trustworthiness(X_scaled, Y_sklearn, n_neighbors=10)

    print("\nTrustworthiness Comparison:")
    print(f"Custom Isomap: {trust_custom:.4f}")
    print(f"Sklearn Isomap: {trust_sklearn:.4f}")

    # Visualization
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # Original data (3D projected to first 2 dims)
    axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], c=color, cmap='viridis', alpha=0.6)
    axes[0].set_title('Original Swiss Roll\n(First 2 Dimensions)')
    axes[0].set_xlabel('Dimension 1')
    axes[0].set_ylabel('Dimension 2')

    # Both embeddings share the same panel layout; only data and title differ.
    panels = [
        (axes[1], Y_custom, f'Custom Isomap\nTrustworthiness: {trust_custom:.3f}'),
        (axes[2], Y_sklearn, f'Sklearn Isomap\nTrustworthiness: {trust_sklearn:.3f}'),
    ]
    for ax, embedding, title in panels:
        ax.scatter(embedding[:, 0], embedding[:, 1], c=color, cmap='viridis', alpha=0.6)
        ax.set_title(title)
        ax.set_xlabel('Isomap 1')
        ax.set_ylabel('Isomap 2')

    plt.tight_layout()
    plt.show()

    return custom_isomap, Y_custom, Y_sklearn

# Run the test
# Executes immediately when this cell/module runs (prints results and opens a
# figure). The returned tuple is unpacked into the fitted CustomIsomap model,
# its embedding, and the scikit-learn embedding.
custom_isomap_model, custom_embedding, sklearn_embedding = test_custom_isomap()


In [None]:
# Comprehensive manifold learning comparison
def comprehensive_manifold_comparison():
    """Compare Isomap, PCA, t-SNE and MDS on four standardised datasets.

    Datasets: Swiss Roll, S-Curve, the first 400 samples of the digits
    dataset, and a hand-built noisy 3D spiral. Each method embeds each dataset
    into two dimensions; the run time and trustworthiness are recorded, a grid
    of scatter plots is shown, and a summary table plus the best method per
    dataset are printed. A method that raises is reported as failed and does
    not abort the comparison.

    Returns
    -------
    tuple
        ``(results, summary_df)`` where ``results[dataset][method]`` is either
        ``None`` (method failed) or a dict with keys ``'embedding'``,
        ``'trustworthiness'`` and ``'time'``, and ``summary_df`` is a
        DataFrame with columns ``Dataset``, ``Method``, ``Trustworthiness``
        and ``Time (s)`` (the last two pre-formatted as strings).
    """
    # Create diverse datasets: name -> (standardised features, colour values)
    datasets = {}

    # 1. Swiss Roll
    X_swiss, color_swiss = make_swiss_roll(n_samples=800, noise=0.1, random_state=42)
    datasets['Swiss Roll'] = (StandardScaler().fit_transform(X_swiss), color_swiss)

    # 2. S-Curve
    X_s, color_s = make_s_curve(n_samples=800, noise=0.1, random_state=42)
    datasets['S-Curve'] = (StandardScaler().fit_transform(X_s), color_s)

    # 3. Digits dataset (high-dimensional)
    digits = load_digits()
    X_digits = digits.data[:400]  # Subset for speed
    y_digits = digits.target[:400]
    datasets['Digits'] = (StandardScaler().fit_transform(X_digits), y_digits)

    # 4. 3D Spiral; noise comes from NumPy's global RNG
    t = np.linspace(0, 4*np.pi, 600)
    X_spiral = np.column_stack([
        t * np.cos(t) * 0.1,
        t * np.sin(t) * 0.1,
        t * 0.1
    ])
    X_spiral += np.random.normal(0, 0.02, X_spiral.shape)
    datasets['3D Spiral'] = (StandardScaler().fit_transform(X_spiral), t)

    # Manifold learning methods: name -> callable returning a 2D embedding
    methods = {
        'Isomap': lambda X: Isomap(n_neighbors=10, n_components=2).fit_transform(X),
        'PCA': lambda X: PCA(n_components=2).fit_transform(X),
        't-SNE': lambda X: TSNE(n_components=2, random_state=42, perplexity=30).fit_transform(X),
        'MDS': lambda X: MDS(n_components=2, random_state=42).fit_transform(X)
    }

    # Results storage
    results = {}

    for dataset_name, (X, y) in datasets.items():
        print(f"\n=== Processing {dataset_name} Dataset ===")
        print(f"Shape: {X.shape}")

        dataset_results = {}

        for method_name, method_func in methods.items():
            print(f"  Applying {method_name}...")

            # perf_counter is monotonic, so the interval cannot be skewed by
            # system clock adjustments.
            start_time = time.perf_counter()
            try:
                Y = method_func(X)
                elapsed_time = time.perf_counter() - start_time

                # Calculate trustworthiness; cap the neighbourhood size so it
                # stays small relative to the number of samples.
                trust = trustworthiness(X, Y, n_neighbors=min(10, X.shape[0]//10))

                dataset_results[method_name] = {
                    'embedding': Y,
                    'trustworthiness': trust,
                    'time': elapsed_time
                }

                print(f"    {method_name}: Trust={trust:.3f}, Time={elapsed_time:.2f}s")

            except Exception as e:
                # Best effort: one failing method must not stop the others.
                print(f"    {method_name}: Failed - {str(e)}")
                dataset_results[method_name] = None

        results[dataset_name] = dataset_results

    # Visualization: one row per dataset, first column is the raw data
    n_datasets = len(datasets)
    n_methods = len(methods)

    fig, axes = plt.subplots(n_datasets, n_methods + 1, figsize=(6*(n_methods+1), 5*n_datasets))
    if n_datasets == 1:
        axes = axes.reshape(1, -1)

    for dataset_idx, (dataset_name, (X, y)) in enumerate(datasets.items()):
        raw_ax = axes[dataset_idx, 0]

        # Plot original data (first 2 dimensions if >2D)
        if X.shape[1] >= 2:
            raw_ax.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', alpha=0.6, s=20)
        else:
            raw_ax.scatter(X[:, 0], np.zeros_like(X[:, 0]), c=y, cmap='viridis', alpha=0.6, s=20)

        raw_ax.set_title(f'{dataset_name}\nOriginal (2D projection)')
        raw_ax.set_xlabel('Dim 1')
        raw_ax.set_ylabel('Dim 2')

        # Plot each method's result
        for col_idx, method_name in enumerate(methods, start=1):
            ax = axes[dataset_idx, col_idx]
            outcome = results[dataset_name][method_name]

            if outcome is not None:
                Y = outcome['embedding']
                trust = outcome['trustworthiness']
                time_taken = outcome['time']

                ax.scatter(Y[:, 0], Y[:, 1], c=y, cmap='viridis', alpha=0.6, s=20)
                ax.set_title(f'{method_name}\nTrust: {trust:.3f}, Time: {time_taken:.2f}s')
            else:
                ax.text(0.5, 0.5, 'Failed', transform=ax.transAxes,
                        ha='center', va='center', fontsize=16)
                ax.set_title(f'{method_name}\n(Failed)')

            ax.set_xlabel('Component 1')
            ax.set_ylabel('Component 2')

    plt.tight_layout()
    plt.show()

    # Summary comparison table
    print("\n=== Method Comparison Summary ===")

    summary_data = [
        {
            'Dataset': dataset_name,
            'Method': method_name,
            'Trustworthiness': f"{method_result['trustworthiness']:.3f}",
            'Time (s)': f"{method_result['time']:.2f}"
        }
        for dataset_name, dataset_results in results.items()
        for method_name, method_result in dataset_results.items()
        if method_result is not None
    ]

    summary_df = pd.DataFrame(summary_data)
    print(summary_df.to_string(index=False))

    # Best method per dataset
    print("\n=== Best Method per Dataset (by Trustworthiness) ===")
    for dataset_name, dataset_results in results.items():
        valid_results = {k: v for k, v in dataset_results.items() if v is not None}
        if valid_results:
            best_name, best_result = max(valid_results.items(),
                                         key=lambda item: item[1]['trustworthiness'])
            print(f"{dataset_name}: {best_name} (Trust: {best_result['trustworthiness']:.3f})")

    return results, summary_df

# Run comprehensive comparison
# Executes immediately when this cell/module runs. The function returns the
# nested per-dataset/per-method results dict and the summary DataFrame, which
# are kept at module level for later inspection.
print("=== Comprehensive Manifold Learning Comparison ===")
comparison_results, comparison_summary = comprehensive_manifold_comparison()


In [None]:
# Parameter analysis for Isomap
def isomap_parameter_analysis():
    """Explore how Isomap's main parameters affect a Swiss Roll embedding.

    Three experiments are run on a standardised 600-point Swiss Roll:

    1. embed with several ``n_neighbors`` values, plot each embedding and the
       trustworthiness-vs-k curve;
    2. for each k that embedded successfully, build the symmetrised k-NN
       graph and report whether it is connected;
    3. embed with several distance metrics (k fixed at 15) and plot them.

    A configuration that raises is reported as failed and stored as ``None``.

    Returns
    -------
    tuple
        ``(neighbor_results, metric_results)``. ``neighbor_results[k]`` holds
        ``'embedding'``, ``'trustworthiness'`` and ``'eigenvalues'``;
        ``metric_results[metric]`` holds ``'embedding'`` and
        ``'trustworthiness'``. Failed entries are ``None``.
    """
    # Use Swiss Roll for parameter analysis
    X, color = make_swiss_roll(n_samples=600, noise=0.1, random_state=42)
    X_scaled = StandardScaler().fit_transform(X)

    print("=== Isomap Parameter Analysis ===")
    print(f"Dataset shape: {X_scaled.shape}")

    # Test different numbers of neighbors
    neighbor_values = [5, 10, 15, 20, 30, 50]
    neighbor_results = {}

    print("\n1. Testing different k-neighbors values:")
    for k in neighbor_values:
        print(f"  k = {k}")

        try:
            isomap = Isomap(n_neighbors=k, n_components=2)
            Y = isomap.fit_transform(X_scaled)
            trust = trustworthiness(X_scaled, Y, n_neighbors=10)

            # Look the eigenvalues up defensively so a missing attribute does
            # not turn a successful embedding into a reported failure.
            kernel_pca = getattr(isomap, 'kernel_pca_', None)

            neighbor_results[k] = {
                'embedding': Y,
                'trustworthiness': trust,
                'eigenvalues': getattr(kernel_pca, 'eigenvalues_', None)
            }
            print(f"    Trustworthiness: {trust:.4f}")

        except Exception as e:
            print(f"    Failed: {str(e)}")
            neighbor_results[k] = None

    # Visualize neighbor parameter effects
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.flatten()

    for ax, k in zip(axes, neighbor_values):
        outcome = neighbor_results[k]
        if outcome is not None:
            Y = outcome['embedding']
            trust = outcome['trustworthiness']

            ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap='viridis', alpha=0.6, s=15)
            ax.set_title(f'k = {k}\nTrustworthiness: {trust:.3f}')
            ax.set_xlabel('Isomap 1')
            ax.set_ylabel('Isomap 2')
        else:
            ax.text(0.5, 0.5, f'k={k}\nFailed', transform=ax.transAxes,
                    ha='center', va='center', fontsize=16)

    plt.suptitle('Effect of k-neighbors Parameter on Isomap', fontsize=16)
    plt.tight_layout()
    plt.show()

    # Plot trustworthiness vs k
    valid_k = [k for k in neighbor_values if neighbor_results[k] is not None]
    valid_trust = [neighbor_results[k]['trustworthiness'] for k in valid_k]

    plt.figure(figsize=(10, 6))
    plt.plot(valid_k, valid_trust, 'o-', linewidth=2, markersize=8)
    plt.xlabel('Number of Neighbors (k)')
    plt.ylabel('Trustworthiness')
    plt.title('Trustworthiness vs k-neighbors Parameter')
    plt.grid(True, alpha=0.3)

    # Add value labels
    for k, trust in zip(valid_k, valid_trust):
        plt.annotate(f'{trust:.3f}', (k, trust), xytext=(5, 5),
                     textcoords='offset points', fontsize=9)

    plt.show()

    # Analyze connectivity
    print("\n2. Graph Connectivity Analysis:")

    # NetworkX 3.0 removed from_scipy_sparse_matrix; prefer the newer
    # from_scipy_sparse_array and fall back for older installations.
    to_graph = getattr(nx, 'from_scipy_sparse_array', None)
    if to_graph is None:
        to_graph = nx.from_scipy_sparse_matrix

    for k in neighbor_values:
        if neighbor_results[k] is None:
            continue

        try:
            nbrs = NearestNeighbors(n_neighbors=k, metric='euclidean')
            nbrs.fit(X_scaled)
            # No query argument: each training point is excluded from its own
            # neighbour list, so the graph really has k neighbours per node
            # and no self-loops.
            distances, indices = nbrs.kneighbors()

            # Build graph and check connectivity
            n_samples = X_scaled.shape[0]
            row_ind = np.repeat(np.arange(n_samples), k)

            adjacency = csr_matrix((distances.ravel(), (row_ind, indices.ravel())),
                                   shape=(n_samples, n_samples))
            # Only the edge structure matters here, so summing with the
            # transpose is enough to make the graph undirected.
            adjacency = adjacency + adjacency.T

            # Convert to NetworkX and check connectivity
            G = to_graph(adjacency)
            is_connected = nx.is_connected(G)
            n_components = nx.number_connected_components(G)

            print(f"  k={k}: Connected={is_connected}, Components={n_components}")

        except Exception as e:
            print(f"  k={k}: Error in connectivity analysis - {str(e)}")

    # Distance metric comparison
    print("\n3. Distance Metric Comparison:")
    metrics = ['euclidean', 'manhattan', 'chebyshev']
    metric_results = {}

    for metric in metrics:
        print(f"  Testing {metric} metric...")
        try:
            isomap = Isomap(n_neighbors=15, n_components=2, metric=metric)
            Y = isomap.fit_transform(X_scaled)
            trust = trustworthiness(X_scaled, Y, n_neighbors=10)

            metric_results[metric] = {
                'embedding': Y,
                'trustworthiness': trust
            }
            print(f"    Trustworthiness: {trust:.4f}")

        except Exception as e:
            print(f"    Failed: {str(e)}")
            metric_results[metric] = None

    # Visualize metric comparison
    fig, axes = plt.subplots(1, len(metrics), figsize=(6*len(metrics), 5))
    if len(metrics) == 1:
        # plt.subplots returns a bare Axes for a 1x1 grid.
        axes = [axes]

    for ax, metric in zip(axes, metrics):
        outcome = metric_results[metric]
        if outcome is not None:
            Y = outcome['embedding']
            trust = outcome['trustworthiness']

            ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap='viridis', alpha=0.6)
            ax.set_title(f'{metric.title()} Metric\nTrust: {trust:.3f}')
            ax.set_xlabel('Isomap 1')
            ax.set_ylabel('Isomap 2')
        else:
            ax.text(0.5, 0.5, f'{metric}\nFailed', transform=ax.transAxes,
                    ha='center', va='center', fontsize=16)

    plt.suptitle('Effect of Distance Metric on Isomap', fontsize=16)
    plt.tight_layout()
    plt.show()

    # Summary of parameter effects
    print("\n=== Parameter Analysis Summary ===")
    print("\nOptimal k-neighbors recommendations:")
    print("- Too small (k<10): May create disconnected components")
    print("- Too large (k>30): May lose local structure, approach PCA")
    print("- Sweet spot: k=10-20 for most datasets")

    print("\nDistance metric insights:")
    for metric in metrics:
        if metric_results[metric] is not None:
            trust = metric_results[metric]['trustworthiness']
            print(f"- {metric.title()}: {trust:.4f}")

    return neighbor_results, metric_results

# Run parameter analysis
# Executes immediately when this cell/module runs. The result is the
# (neighbor_results, metric_results) tuple returned by the function.
parameter_analysis_results = isomap_parameter_analysis()
