# Data Exploration and Visualization of Cosmic Strings in CMB
This notebook explores the Cosmic Microwave Background (CMB) data to identify potential cosmic string candidates.

Initial exploration of the CMB data using Healpy and Matplotlib.

In [None]:
# Install required packages
# pip freeze > requirements.txt
!pip install -r requirements.txt

First exploring the COM_CMB_IQU-smica_2048_R3.00_full.fits file to understand its structure and contents.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from astropy.constants.codata2014 import alpha
from scipy.constants import h, c, k
import healpy as hp

# Planck's law: blackbody spectral radiance per unit frequency
def planck(f, T):
    """Return the blackbody spectral radiance at frequency ``f`` and temperature ``T``.

    Evaluates ``(2 h f^3 / c^2) / (exp(h f / (k T)) - 1)`` with the constants
    ``h``, ``c`` and ``k`` taken from ``scipy.constants`` (SI values), so ``f`` is
    expected in Hz and ``T`` in Kelvin. Scalars and NumPy arrays are both accepted;
    array inputs are evaluated element-wise.
    """
    prefactor = 2 * h * f**3 / c**2
    exponent = h * f / (k * T)
    # expm1 keeps precision when the exponent is small (low frequency / high temperature)
    return prefactor / np.expm1(exponent)


# Linear frequency grid from 10 GHz to 1000 GHz (values in Hz)
frequencies = np.linspace(1e10, 1e12, 1000)
temperature = 2.725  # CMB temperature in Kelvin

# Blackbody radiance at every sampled frequency
radiance = planck(frequencies, temperature)

# Draw the spectrum; the x axis is converted from Hz to GHz for readability
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(frequencies / 1e9, radiance, color='darkblue', label=f'T = {temperature} K')
ax.set_title('CMB Blackbody Spectrum at T = 2.725 K')
ax.set_xlabel('Frequency (GHz)')
ax.set_ylabel('Spectral Radiance B(ν) [W·sr⁻¹·m⁻²·Hz⁻¹]')
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:

from scipy import ndimage

# Read field 0 of the Planck SMICA file (used throughout as the temperature map)
cmb_map = hp.read_map('data/COM_CMB_IQU-smica_2048_R3.00_full.fits', field=0)

# Alternative input, kept for reference; not ideal for CMB structure detection:
# cmb_map = hp.read_map('data/HFI_SkyMap_100_2048_R3.01_full.fits', field=0)

# Resolution parameter of the map and the pixel count it implies
nside = hp.get_nside(cmb_map)
npix = hp.nside2npix(nside)

# Mollweide projection; return_projected_map=True also hands back the 2D image array,
# which the edge-detection and patch-extraction cells use as `img`
img = hp.mollview(
    cmb_map,
    title='CMB Temperature Map',
    cmap='inferno',
    xsize=2000,
    nest=False,
    hold=True,
    return_projected_map=True,
)


In [None]:
from pathlib import Path

from astropy.io import fits

# Edge detection with a Sobel filter on the projected CMB image.
# Non-finite pixels (NaN / ±inf) are zeroed first: otherwise they propagate through
# the filter kernel and contaminate every neighbouring pixel of the result.
edges = ndimage.sobel(np.nan_to_num(img, nan=0.0, posinf=0.0, neginf=0.0))

# Make sure the output folder exists
folder_path = Path('./edge')
if folder_path.exists():
    print(f"Folder '{folder_path}' already exists.")
else:
    folder_path.mkdir(parents=True, exist_ok=True)
    print(f"Folder '{folder_path}' created successfully!")

# Save the edge map as a 2D FITS image inside that folder
# (the path is derived from folder_path so the two cannot drift apart)
hdu = fits.PrimaryHDU(edges.astype(np.float32))
hdu.writeto(folder_path / "edge_detected_image.fits", overwrite=True)

# Plot the edges detected in the CMB map
plt.figure(figsize=(30, 15))
plt.imshow(edges, cmap='inferno')
plt.title('Possible Cosmic String Candidates (Edges in CMB)')
plt.colorbar()
plt.show()


In [None]:
import numpy as np
from scipy import ndimage
import matplotlib.pyplot as plt

# --- Step 1: the projected image `img` is produced by the map-loading cell ---

# --- Step 2: sanitise the image ---
# Every NaN, +inf and -inf pixel is replaced by 0.0
img_cleaned = np.nan_to_num(img, nan=0.0, posinf=0.0, neginf=0.0)

# --- Step 3: Sobel filter (default axis, i.e. along the last image axis) ---
edges = ndimage.sobel(img_cleaned)

# --- Step 4: report the value ranges of the cleaned image and of the edge map ---
for _caption, _array in (
    ("Cleaned CMB image range:", img_cleaned),
    ("Edge image range:", edges),
):
    print(_caption, np.min(_array), "to", np.max(_array))

# --- Step 5: Min-max scaling helper used for plotting ---
def normalize(arr):
    """Rescale ``arr`` to approximately the range [0, 1].

    NaN and ±inf entries are first replaced by 0.0. The result is
    ``(values - min) / (peak_to_peak + 1e-8)``; the small constant keeps the
    division defined when all values are equal.
    """
    finite = np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0)
    offset = np.min(finite)
    span = np.ptp(finite) + 1e-8
    return (finite - offset) / span

# Normalise both layers for plotting.
# The Sobel response is signed, so its magnitude is normalised: min-max scaling the
# signed values would place "no gradient" roughly mid-range, and almost every pixel
# would then pass the threshold below.
img_norm = normalize(img_cleaned)
edges_norm = normalize(np.abs(edges))

# Binary edge mask: 1.0 where the normalised edge strength exceeds 0.3
edges_thresh = np.where(edges_norm > 0.3, 1.0, 0.0)

# --- Step 6: Plot overlay ---
plt.figure(figsize=(14, 6))
base_layer = plt.imshow(img_norm, cmap='coolwarm', alpha=0.9, origin='lower', aspect='auto')
# Non-edge pixels are masked so they stay transparent; drawing the full binary array
# at alpha=0.9 would cover the CMB image underneath almost completely.
plt.imshow(np.ma.masked_equal(edges_thresh, 0.0), cmap='YlOrBr', alpha=0.9,
           origin='lower', aspect='auto', vmin=0.0, vmax=1.0)
# The colour bar describes the CMB layer (the edge layer is binary)
plt.colorbar(base_layer, label='Intensity')
plt.title("CMB with Edge Overlay (cleaned)")
plt.axis('off')
plt.tight_layout()
plt.savefig("cmb_edge_overlay_cleaned.png", dpi=300)
plt.show()


In [None]:
# Nonlinear rescaling of the edge strength.
# The Sobel response is signed, and a fractional power of a negative float is NaN in
# NumPy, so the exponent is applied to the magnitude. For non-negative values the
# result is unchanged. Exponents below 1 compress the dynamic range (faint edges
# become relatively brighter); exponents above 1 (e.g. 2.0) emphasise strong edges.
edges_exaggerated = np.abs(edges) ** 0.9  # or try 2.0


plt.figure(figsize=(30, 15))
plt.imshow(edges_exaggerated, cmap='turbo')
plt.title('Possible Cosmic String Candidates (Edges in CMB)')
plt.colorbar()
plt.show()

In [None]:
# Add 70 % of the edge map back onto itself to amplify the edge intensity.
# NOTE(review): this is a uniform rescaling of `edges`; because imshow auto-scales the
# colour range to the data, the picture should look the same as the unscaled edge map
# (only the colour-bar values change) — confirm that this is the intended effect.
sharp_edges = edges + 0.7 * edges

fig, ax = plt.subplots(figsize=(30, 15))
edge_image = ax.imshow(sharp_edges, cmap='inferno')
ax.set_title('Possible Cosmic String Candidates (Edges in CMB)')
fig.colorbar(edge_image, ax=ax)
plt.show()


In [None]:
# Edge detection from directional Sobel gradients with square-root compression.
# Non-finite pixels are zeroed first so they do not spread through the filter.
finite_img = np.nan_to_num(img, nan=0.0, posinf=0.0, neginf=0.0)

# Sobel derivative along each image axis. The square root compresses the dynamic
# range; it is applied to the magnitude because the derivative is signed and the
# square root of a negative float is NaN (for non-negative gradients the value is
# identical to sobel(...) ** 0.5).
dx = np.sqrt(np.abs(ndimage.sobel(finite_img, axis=0)))
dy = np.sqrt(np.abs(ndimage.sobel(finite_img, axis=1)))

# Combine both directions
edges = np.hypot(dx, dy)  # Same as sqrt(dx**2 + dy**2)

# Plot the edges detected in the CMB map
plt.figure(figsize=(30, 15))
plt.imshow(edges, cmap='inferno')
plt.title('Possible Cosmic String Candidates (Edges in CMB)')
plt.colorbar()
plt.show()


In [None]:
frequencies = np.linspace(10e9, 1000e9, 1000)  # 10 GHz to 1000 GHz

T_mean = 2.725
delta_T = 100e-6  # 100 µK typical fluctuation


def plot_fluctuation_spectrum(offsets, title):
    """Plot the Planck curve at ``T_mean`` plus one dashed curve per temperature offset.

    Parameters
    ----------
    offsets : iterable of (delta, label, color)
        ``delta`` is added to ``T_mean`` (in Kelvin); ``label`` and ``color`` style
        the corresponding dashed curve.
    title : str
        Figure title.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(frequencies / 1e9, planck(frequencies, T_mean), label="T = 2.725 K", color='blue')
    for delta, curve_label, curve_color in offsets:
        plt.plot(frequencies / 1e9, planck(frequencies, T_mean + delta), '--',
                 label=curve_label, color=curve_color, alpha=0.8)
    plt.xlabel("Frequency (GHz)")
    plt.ylabel("Spectral Radiance (W·sr⁻¹·m⁻²·Hz⁻¹)")
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


warmer = (delta_T, "+100 µK", 'green')
cooler = (-delta_T, "-100 µK", 'red')

# Both bounds together
plot_fluctuation_spectrum([warmer, cooler], "CMB Spectrum with Fluctuation Bounds (±100 µK)")

# Only the +100 µK curve (title names the single bound actually drawn)
plot_fluctuation_spectrum([warmer], "CMB Spectrum with Fluctuation Bound (+100 µK)")

# Only the -100 µK curve
plot_fluctuation_spectrum([cooler], "CMB Spectrum with Fluctuation Bound (-100 µK)")


In [None]:
# Angular power spectrum of the map; only the first 200 multipoles are drawn
cl = hp.anafast(cmb_map)

fig, ax = plt.subplots()
ax.plot(cl[:200])
ax.set_title("Low-ℓ Multipoles – Texture Signature Region")
ax.set_xlabel("Multipole ℓ [ℓ: inverse angular scale]")
ax.set_ylabel("C_ℓ [Power: variance of temperature fluctuations]")
ax.grid()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import healpy as hp
from matplotlib.ticker import FuncFormatter

# --- Angular power spectrum of the CMB map ---
cl = hp.anafast(cmb_map)         # C_ℓ estimated from the map
ell = np.arange(len(cl))         # multipole index of every C_ℓ entry

# Multipole window drawn on the plot: ℓ = 1 … 1499 (the ℓ = 0 monopole is left out)
ell_shown = ell[1:1500]
cl_shown = cl[1:1500]

# --- Begin Plot ---
fig, ax = plt.subplots(figsize=(12, 6))

# Measured spectrum
ax.loglog(ell_shown, cl_shown, label='CMB Power Spectrum', color='blue')

# Reference curve ∝ 1/(ℓ(ℓ+1)), scaled by the mean C_ℓ over ℓ = 5 … 14 and anchored at ℓ = 5
ref_value = np.mean(cl[5:15])
theory_curve = ref_value * (ell[5] * (ell[5] + 1)) / (ell_shown * (ell_shown + 1))
ax.loglog(ell_shown, theory_curve, 'r--', label='Scale-invariant (∝ 1/ℓ(ℓ+1))')

# --- Shaded multipole regions ---
ax.axvspan(2, 30, color='gray', alpha=0.15, label='Sachs-Wolfe Region')
ax.axvspan(50, 200, color='orange', alpha=0.08, label='Transition Region')
ax.axvspan(200, 1500, color='green', alpha=0.08, label='Acoustic Peaks Region')

# --- Region annotations (positions chosen by hand to avoid overlap) ---
ax.text(7, 2e-10, 'Sachs-Wolfe\nSuper-horizon modes', fontsize=10, color='black')
ax.text(60, 5e-20, 'Transition:\nEarly Oscillations &\n Projection Effects', fontsize=10, color='black')
ax.text(300, 5e-12, 'Acoustic Peaks:\nPhoton-Baryon Oscillations', fontsize=10, color='black')

# Marker for the first acoustic peak
ax.axvline(220, color='black', linestyle='--', alpha=0.5)
ax.text(235, 1e-10, '1st Peak\n(~ℓ=220)', fontsize=9, color='black')

# --- Title, axis labels, legend and grid ---
ax.set_title("CMB Angular Power Spectrum (Log Scale)")
ax.set_xlabel("Multipole ℓ [ℓ: inverse angular scale]")
ax.set_ylabel("C_ℓ [Power: variance of temperature fluctuations]")
ax.legend()
ax.grid(True, which="both", ls="--", alpha=0.2)

# --- Tick formatter producing short, human-readable numbers ---
def human_format(x, pos):
    """Format a tick value for ``matplotlib.ticker.FuncFormatter``.

    Values of at least 1 are shown as truncated integers; smaller values are shown
    with one significant digit. ``pos`` is part of the formatter call signature and
    is not used.
    """
    return f"{int(x)}" if x >= 1 else f"{x:.1g}"

# Use the human-readable formatter on both axes of the current figure
ax = plt.gca()
for _axis in (ax.xaxis, ax.yaxis):
    _axis.set_major_formatter(FuncFormatter(human_format))

plt.tight_layout()
plt.show()


In [None]:
# Linear-scale view of the first 250 multipoles (ℓ = 0 … 249)
ell = np.arange(len(cl))  # multipole index of every C_ℓ entry
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(ell[:250], cl[:250], label='CMB Power')

# Shaded multipole bands: (lower ℓ, upper ℓ, colour, opacity, legend text)
defect_bands = [
    (2, 10, 'blue', 0.3, 'Textures (ℓ ≈ 2–10)'),
    (2, 5, 'red', 0.3, 'Domain Walls (ℓ ≈ 2–5)'),
    (50, 200, 'green', 0.2, 'Cosmic Strings (ℓ ≈ 50–200)'),
    (2, 3, 'orange', 0.4, 'Monopoles (ℓ ≈ 2–3)'),
]
for band_lo, band_hi, band_color, band_alpha, band_label in defect_bands:
    ax.axvspan(band_lo, band_hi, color=band_color, alpha=band_alpha, label=band_label)

# Title, axis labels, grid and legend
ax.set_title("CMB Multipole Spectrum with Topological Defect Regions")
ax.set_xlabel("Multipole ℓ [ℓ: inverse angular scale]")
ax.set_ylabel("C_ℓ [Power: variance of temperature fluctuations]")
ax.grid()
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Zoom on the largest angular scales: the first 50 multipoles (ℓ = 0 … 49)
ell = np.arange(len(cl))  # multipole index of every C_ℓ entry
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(ell[:50], cl[:50], label='CMB Power')

# Shaded multipole bands: (lower ℓ, upper ℓ, colour, opacity, legend text)
low_ell_bands = [
    (2, 10, 'blue', 0.3, 'Textures (ℓ ≈ 2–10)'),
    (2, 5, 'red', 0.3, 'Domain Walls (ℓ ≈ 2–5)'),
    (2, 3, 'orange', 0.4, 'Monopoles (ℓ ≈ 2–3)'),
]
for band_lo, band_hi, band_color, band_alpha, band_label in low_ell_bands:
    ax.axvspan(band_lo, band_hi, color=band_color, alpha=band_alpha, label=band_label)

# Title, axis labels, grid and legend
ax.set_title("CMB Multipole Spectrum with Topological Defect Regions")
ax.set_xlabel("Multipole ℓ [ℓ: inverse angular scale]")
ax.set_ylabel("C_ℓ [Power: variance of temperature fluctuations]")
ax.grid()
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Re-uses the power spectrum `cl` computed in an earlier cell.
# Multipoles ℓ = 1 … 199; ℓ = 0 is excluded because θ = 180°/ℓ is undefined there.
ell = np.arange(1, 200)
theta_deg = 180 / ell    # angular scale in degrees associated with each ℓ

# C_ℓ entries matching the multipoles above
cl_trim = cl[1:200]

# Power as a function of angular scale
fig, ax = plt.subplots()
ax.plot(theta_deg, cl_trim)
ax.set_title("CMB Power Spectrum vs Angular Scale")
ax.set_xlabel("Angular Scale θ [degrees]")
ax.set_ylabel("C_ℓ [Power: variance of temperature fluctuations]")
ax.grid()
ax.invert_xaxis()  # put the largest angular scales on the left
plt.show()

In [None]:
# ℓ = 1 to 199 (skip ℓ = 0 to avoid division by zero).
# `ell` is defined here explicitly instead of relying on whatever another cell left in
# the variable: several cells set ell = np.arange(len(cl)), which would both divide by
# zero and no longer match the length of cl_trim.
ell = np.arange(1, 200)
theta_deg = 180 / ell
cl_trim = cl[1:200]

# Plot power vs angular scale
plt.figure(figsize=(9, 5))
plt.plot(theta_deg, cl_trim, label='CMB Power')

# Overlay topological defect regions in degrees
plt.axvspan(18, 90, color='blue', alpha=0.3, label='Textures (θ ≈ 18° to 90°)')
plt.axvspan(36, 90, color='red', alpha=0.3, label='Domain Walls (θ ≈ 36° to 90°)')
plt.axvspan(1, 4, color='green', alpha=0.2, label='Cosmic Strings (θ ≈ 1° to 4°)')
plt.axvspan(60, 90, color='orange', alpha=0.4, label='Monopoles (θ ≈ 60° to 90°)')

# Labels and formatting
plt.title("CMB Power vs Angular Scale with Topological Defect Regions")
plt.xlabel("Angular Scale θ [degrees]")
plt.ylabel("C_ℓ [Power: variance of temperature fluctuations]")
plt.grid()
plt.gca().invert_xaxis()  # Large scales (low ℓ) on left
plt.legend()
plt.tight_layout()
plt.show()


Example of exaggerated cosmic strings

In [None]:
from skimage import draw  # Changed import to use skimage.draw instead of matplotlib.pyplot.draw

def simulate_cosmic_strings(map_size, num_strings, seed=None, amplitude=100e-6):
    """Draw random straight line segments with a fixed temperature step on a square map.

    Parameters
    ----------
    map_size : int
        Side length of the square output map in pixels.
    num_strings : int
        Number of line segments to draw.
    seed : int or None, optional
        If given, a dedicated ``np.random.RandomState(seed)`` is used so the map is
        reproducible. With the default ``None`` the global ``np.random`` state is
        used, exactly as before.
    amplitude : float, optional
        Magnitude added along each segment (default 100e-6, i.e. ~100 µK in Kelvin).
        The sign is chosen at random per segment.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(map_size, map_size)``; zero everywhere except along the
        drawn segments.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    map_data = np.zeros((map_size, map_size))
    for _ in range(num_strings):
        # Random start point and random displacement of up to a quarter of the map
        x = rng.randint(0, map_size)
        y = rng.randint(0, map_size)
        dx = rng.randint(-map_size//4, map_size//4)
        dy = rng.randint(-map_size//4, map_size//4)
        rr, cc = draw.line(x, y, x+dx, y+dy)  # pixel coordinates of the segment (skimage.draw.line)
        # Segments that leave the map wrap around to the opposite side (modulo indexing)
        map_data[rr % map_size, cc % map_size] += rng.choice([-1, 1]) * amplitude
    return map_data

# Build a small synthetic map with a handful of string-like segments
map_size = 1000  # Reduced size for better visualization
num_strings = 10
cosmic_map = simulate_cosmic_strings(map_size, num_strings)

# Show it with a diverging colour map
fig, ax = plt.subplots(figsize=(10, 10))
string_image = ax.imshow(cosmic_map, cmap='RdBu_r')
fig.colorbar(string_image, ax=ax, label='Temperature fluctuation (K)')
ax.set_title('Simulated Cosmic Strings')
plt.show()


## Exploring the data with Machine Learning Methods

In this section, various machine learning techniques are applied to the CMB data to identify patterns or anomalies that might be associated with cosmic strings or other phenomena.

The next cell makes sure the data and the necessary libraries are loaded, so that the sections below do not depend on the sections above having been run.


In [None]:
# Import necessary ML libraries
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from scipy import stats

# healpy is imported here as well so this section also runs on a fresh kernel
import healpy as hp

# Make sure every input of the ML section exists; each one is checked separately,
# because an earlier session may have created `cmb_map` without `img` / `img_cleaned`.
if 'cmb_map' not in globals():
    cmb_map = hp.read_map('data/COM_CMB_IQU-smica_2048_R3.00_full.fits', field=0)

nside = hp.get_nside(cmb_map)
npix = hp.nside2npix(nside)

if 'img' not in globals():
    img = hp.mollview(cmb_map, return_projected_map=True, nest=False, title='CMB Temperature Map', cmap='inferno', xsize=2000, hold=True)

if 'img_cleaned' not in globals():
    # Replace NaN / ±inf pixels of the projected image with 0.0
    img_cleaned = np.nan_to_num(img, nan=0.0, posinf=0.0, neginf=0.0)

print(f"CMB map shape: {cmb_map.shape}")
print(f"Projected image shape: {img_cleaned.shape}")


### 1. Feature Extraction and Dimensionality Reduction

First, features are extracted from the CMB data and their dimensionality is reduced so that patterns can be visualized.


### For Single Cluster

In [None]:
# For ML analysis, we'll work with the 2D projected map (img_cleaned)
# Let's extract patches from the image to use as features

def extract_patches(image, patch_size=16, stride=8):
    """Extract flattened square patches from a 2D image on a regular grid.

    Parameters
    ----------
    image : 2D array
        Image to cut into patches.
    patch_size : int
        Side length of each square patch in pixels.
    stride : int
        Step in pixels between the top-left corners of consecutive patches.

    Returns
    -------
    patches : numpy.ndarray
        Array of shape ``(n_patches, patch_size * patch_size)``, one flattened patch
        per row. The shape is ``(0, patch_size * patch_size)`` when no patch
        qualifies, so downstream code always receives a 2D array.
    positions : list of (int, int)
        Top-left ``(row, col)`` of each kept patch, in row-major scan order.
    """
    patches = []
    positions = []
    n_rows, n_cols = image.shape

    for i in range(0, n_rows - patch_size + 1, stride):
        for j in range(0, n_cols - patch_size + 1, stride):
            patch = image[i:i+patch_size, j:j+patch_size]
            # Skip patches containing NaN or ±inf (np.isfinite is False for all three)
            if np.isfinite(patch).all():
                patches.append(patch.flatten())
                positions.append((i, j))

    if not patches:
        return np.empty((0, patch_size * patch_size), dtype=image.dtype), positions
    return np.array(patches), positions

# Extract patches from the cleaned image.
# Rough guide to the patch_size / stride trade-off:
#   patch_size = 4 , stride = 2    very localized features, fast; no texture, easy to overfit noise
#   patch_size = 8 , stride = 4    ultra-local; very sensitive to noise, good for edge detection
#   patch_size = 16, stride = 8    small overlapping patches; dense coverage, good for texture analysis
#   patch_size = 32, stride = 32   large, non-overlapping; coarse, fewer samples but more information per patch
#   patch_size = 64, stride = 16   large and overlapping; heavy computation, useful for large-scale pattern mining
#
# Suggested experiments:
#   Test  Patch size  Stride  Use case
#   A     16          8       balanced: local structure + manageable data
#   B     32          16      large structures, good for global coherence
#   C     8           4       sharp features (e.g. cosmic strings, shocks)
patch_size = 8
stride = 4
patches, positions = extract_patches(img_cleaned, patch_size, stride)

print(f"Extracted {len(patches)} patches of size {patch_size}x{patch_size}")

# Standardize every pixel position (feature) to zero mean and unit variance
scaler = StandardScaler()
patches_scaled = scaler.fit_transform(patches)

# Fit PCA with all components first to inspect the explained variance
pca_full = PCA()
pca_full.fit(patches_scaled)

# Cumulative explained variance as a function of the number of components
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

# Fraction of the original variance PCA must preserve (0.95 = 95 %);
# it balances dimensionality reduction against information loss.
target_variance = 0.95

# Smallest number of components whose cumulative variance reaches the target
n_components = np.argmax(cumulative_variance >= target_variance) + 1
print(f"Number of components to retain {target_variance*100:.0f}% variance: {n_components}")


# Fit PCA once with the selected number of components
pca = PCA(n_components=n_components)
patches_pca = pca.fit_transform(patches_scaled)

# Reconstruct the standardized patches from the reduced representation.
# (The projection from the fit above is reused; fitting the same model a second
# time on the same data only repeated the work.)
patches_reduced = patches_pca
patches_reconstructed = pca.inverse_transform(patches_reduced)

# Mean squared reconstruction error in standardized units
mse = np.mean((patches_scaled - patches_reconstructed) ** 2)
print(f"Reconstruction MSE: {mse:.2e}")

# Explained-variance curve with the threshold and the chosen component count marked
plt.figure(figsize=(10, 5))
plt.plot(cumulative_variance, label='Cumulative Explained Variance')
plt.axhline(y=target_variance, color='r', linestyle='--', label=f'{target_variance*100:.0f}% Variance')
plt.axvline(x=n_components - 1, color='g', linestyle='--', label=f'{n_components} Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.legend()
plt.grid(True)
plt.show()


# Apply t-SNE (t-Distributed Stochastic Neighbor Embedding) for a 2D visualization
tsne = TSNE(n_components=2, random_state=42)
patches_tsne = tsne.fit_transform(patches_pca)

# The points carry no colour-mapped value here, so no colour bar is drawn
plt.figure(figsize=(10, 8))
plt.scatter(patches_tsne[:, 0], patches_tsne[:, 1], alpha=0.5, s=5)
plt.title('t-SNE Visualization of CMB Patches')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()


Is This MSE Value Good?

Typical MSE Range for CMB Data:

Excellent Reconstruction: MSE < 1e-02 (near-perfect preservation of CMB features)

Good Reconstruction: MSE ~1e-02 to 1e-01 (retains most cosmological signals)

Moderate Reconstruction: MSE ~1e-01 to 5e-01 (some loss of small-scale fluctuations)

Poor Reconstruction: MSE > 5e-01 (significant smoothing/feature loss)

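Aim for MSE < 1e-02 to ensure topology-sensitive features survive dimensionality reduction. Note that the MSE is computed on standardized patches (unit variance per feature), so it is approximately the fraction of variance that PCA discards: with `target_variance = 0.95` the MSE can be at most about 5e-02, and reaching MSE < 1e-02 requires retaining at least 99% of the variance.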

In [None]:
# Grade the reconstruction error: the first band whose upper bound exceeds `mse`
# gives the verdict; anything at or above the last bound is "not good".
_mse_bands = ((1e-02, "excellent"), (1e-01, "good"), (5e-01, "moderate"))
_mse_verdict = next((name for bound, name in _mse_bands if mse < bound), "not good")
print(f"Reconstruction MSE: {mse:.2e} is {_mse_verdict} for CMB data")

### For Combined Clusters

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# --- PATCH EXTRACTION FUNCTION ---
def extract_patches(image, patch_size=16, stride=8):
    """Extract flattened square patches from a 2D image on a regular grid.

    Parameters
    ----------
    image : 2D array
        Image to cut into patches.
    patch_size : int
        Side length of each square patch in pixels.
    stride : int
        Step in pixels between the top-left corners of consecutive patches.

    Returns
    -------
    patches : numpy.ndarray
        Array of shape ``(n_patches, patch_size * patch_size)``, one flattened patch
        per row. The shape is ``(0, patch_size * patch_size)`` when no patch
        qualifies, so downstream code always receives a 2D array.
    positions : list of (int, int)
        Top-left ``(row, col)`` of each kept patch, in row-major scan order.
    """
    patches = []
    positions = []
    n_rows, n_cols = image.shape

    for i in range(0, n_rows - patch_size + 1, stride):
        for j in range(0, n_cols - patch_size + 1, stride):
            patch = image[i:i+patch_size, j:j+patch_size]
            # Skip patches containing NaN or ±inf (np.isfinite is False for all three)
            if np.isfinite(patch).all():
                patches.append(patch.flatten())
                positions.append((i, j))

    if not patches:
        return np.empty((0, patch_size * patch_size), dtype=image.dtype), positions
    return np.array(patches), positions

# --- PARAMETERS ---
patch_sizes = [8, 16, 32, 64]  # Multi-scale patch sizes
stride = 4
target_variance = 0.95

# --- COLLECT ALL FEATURES ---
all_features = []
n_components_dict = {}
explained_variances = {}
mse_dict = {}

pca_models = {}
scalers = {}
positions_by_size = {}  # top-left (row, col) of every kept patch, per patch size

# Loop variables carry an `ms_` prefix on purpose: plain names such as `patch_size`,
# `patches`, `scaler`, `pca`, `patches_pca` or `mse` belong to the single-scale
# analysis, and the single-scale clustering cell further down still reads them.
for ms_size in patch_sizes:
    print(f"\n🔍 Processing patch size: {ms_size}x{ms_size}")

    # Extract patches and remember where each one came from
    ms_patches, ms_positions = extract_patches(img_cleaned, ms_size, stride)
    positions_by_size[ms_size] = ms_positions
    print(f"Extracted {len(ms_patches)} patches of size {ms_size}x{ms_size}")

    # Standardize
    ms_scaler = StandardScaler()
    ms_scaled = ms_scaler.fit_transform(ms_patches)

    # Full PCA to inspect the explained variance
    ms_cumulative_variance = np.cumsum(PCA().fit(ms_scaled).explained_variance_ratio_)

    # Smallest number of components reaching the target variance
    ms_n_components = np.argmax(ms_cumulative_variance >= target_variance) + 1
    n_components_dict[ms_size] = ms_n_components
    explained_variances[ms_size] = ms_cumulative_variance

    print(f"✅ {ms_n_components} components retain {target_variance*100:.0f}% variance")

    # PCA with the selected number of components
    ms_pca = PCA(n_components=ms_n_components)
    ms_features = ms_pca.fit_transform(ms_scaled)

    # Reconstruction error in standardized units
    ms_reconstructed = ms_pca.inverse_transform(ms_features)
    ms_mse = np.mean((ms_scaled - ms_reconstructed) ** 2)
    mse_dict[ms_size] = ms_mse
    print(f"📉 Reconstruction MSE for {ms_size}x{ms_size} patches: {ms_mse:.2e}")

    # Store PCA and scaler for later analysis
    pca_models[ms_size] = ms_pca
    scalers[ms_size] = ms_scaler

    # Save features (still one row per patch of this size)
    all_features.append(ms_features)


# --- ALIGN THE SCALES BY PATCH POSITION ---
# Each scale yields a different number of patches per image row, so row k of one
# feature array and row k of another describe different sky positions. Cutting every
# array to the same length would therefore glue unrelated patches together. Instead,
# only patches whose top-left corner exists at every scale are kept.
common_positions = set(positions_by_size[patch_sizes[0]])
for ms_size in patch_sizes[1:]:
    common_positions &= set(positions_by_size[ms_size])

min_len = len(common_positions)
if min_len == 0:
    raise ValueError("No patch position is shared by all patch sizes; "
                     "check patch_sizes and stride against the image shape.")
print(f"\n🔧 Keeping {min_len} patch positions shared by all scales for alignment")

# extract_patches scans in row-major order at every scale, so filtering each array
# with the same position set leaves the rows in the same order everywhere.
aligned_positions = sorted(common_positions)
all_features = [
    feats[np.fromiter((pos in common_positions for pos in positions_by_size[size_key]),
                      dtype=bool, count=len(positions_by_size[size_key]))]
    for size_key, feats in zip(patch_sizes, all_features)
]

# Kept under the previous name for cells that still slice with [:min_len]
all_features_trimmed = [f[:min_len] for f in all_features]

# Concatenate the per-scale features of each position side by side
combined_features = np.concatenate(all_features_trimmed, axis=1)
print(f"🔗 Combined feature shape: {combined_features.shape}")


# --- OPTIONAL: Plot explained variance curves for each scale ---
plt.figure(figsize=(10, 6))
for size in patch_sizes:
    plt.plot(explained_variances[size], label=f'{size}x{size}')
plt.axhline(y=target_variance, color='r', linestyle='--', label='Target Variance')
plt.title('Cumulative Explained Variance per Patch Size')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.legend()
plt.grid(True)
plt.show()

# --- APPLY t-SNE ON COMBINED FEATURES ---
print("\n🎯 Applying t-SNE...")
tsne = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate='auto')
features_tsne = tsne.fit_transform(combined_features)

# --- PLOT T-SNE ---
plt.figure(figsize=(10, 8))
plt.scatter(features_tsne[:, 0], features_tsne[:, 1], alpha=0.5, s=5)
plt.title('t-SNE of Multi-Scale PCA-Reduced CMB Patches')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.grid(True)
plt.show()

# --- PRINT MSE SUMMARY ---
print("\n📊 MSE Summary by Patch Size:")
for size in patch_sizes:
    print(f"  - {size}x{size}: MSE = {mse_dict[size]:.2e}")


Is This MSE Value Good?

Typical MSE Range for CMB Data:

Excellent Reconstruction: MSE < 1e-02 (near-perfect preservation of CMB features)

Good Reconstruction: MSE ~1e-02 to 1e-01 (retains most cosmological signals)

Moderate Reconstruction: MSE ~1e-01 to 5e-01 (some loss of small-scale fluctuations)

Poor Reconstruction: MSE > 5e-01 (significant smoothing/feature loss)

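Aim for MSE < 1e-02 to ensure topology-sensitive features survive dimensionality reduction. Note that the MSE is computed on standardized patches (unit variance per feature), so it is approximately the fraction of variance that PCA discards: with `target_variance = 0.95` the MSE can be at most about 5e-02 at every patch size, and reaching MSE < 1e-02 requires retaining at least 99% of the variance.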

In [None]:
# --- Evaluate Reconstruction Quality ---
# The per-size error is kept in `size_mse` so the single-scale result stored in the
# global `mse` is not overwritten by this loop.
print("📌 MSE Quality Assessment per Patch Size:")
for size in patch_sizes:
    size_mse = mse_dict[size]
    if size_mse < 1e-02:
        print(f"  ✅ {size}x{size}: Reconstruction MSE = {size_mse:.2e} is **excellent** for CMB data")
    elif size_mse < 1e-01:
        print(f"  👍 {size}x{size}: Reconstruction MSE = {size_mse:.2e} is **good** for CMB data")
    elif size_mse < 5e-01:
        print(f"  ⚠️ {size}x{size}: Reconstruction MSE = {size_mse:.2e} is **moderate** for CMB data")
    else:
        print(f"  ❌ {size}x{size}: Reconstruction MSE = {size_mse:.2e} is **not good** for CMB data")


### 2. Unsupervised Learning: Clustering Analysis

Now we'll apply clustering algorithms to identify groups of similar patterns in the CMB data.


#### Dynamic Optimization of Clustering Parameters

To find the optimal clustering of the CMB data, a dynamic approach is used that:

1. Tries different numbers of clusters (from `min_clusters` to `max_clusters`; the cells below use 3–10, 3–20 and 3–150)
2. For each number of clusters, runs K-means multiple times with different random initializations
3. Calculates the silhouette score for each clustering attempt
4. Selects the clustering with the highest silhouette score

This approach helps find the best clustering configuration automatically, rather than manually tuning parameters. The silhouette score measures how well-separated the clusters are, with higher scores indicating better-defined clusters.


1. Expected Silhouette Score Range for CMB Data

    Good clustering:

    0.5 - 1.0 → Strong evidence of cluster structure (rare for CMB unless studying clear anomalies like cold spots or non-Gaussian features).

    0.3 - 0.5 → Reasonable separation (may indicate subtle non-Gaussianities or foreground contamination).

    Ambiguous clustering:

    0.1 - 0.3 → Weak structure (common for Gaussian CMB fluctuations; clusters may be artificial).

    No meaningful clusters:

    ≤ 0.1 or negative → Likely noise or overfitting (common if forcing clusters on Gaussian random fields).

Key Insight:
CMB is mostly Gaussian, so high silhouette scores are unexpected unless you're targeting specific anomalies or foregrounds. A score of 0.2-0.4 might be the realistic upper limit for most analyses.


### For Single Cluster

In [None]:
# Function to run K-means clustering multiple times and find the best silhouette score
def find_best_kmeans(data, min_clusters=2, max_clusters=10, n_attempts=10,
                     silhouette_sample_size=None):
    """
    Run K-means for a range of cluster counts and keep the model with the best silhouette score.

    The search is greedy rather than exhaustive. For each cluster count, up to
    ``n_attempts`` fits are run with ``random_state = 0, 1, ...``, and the remaining
    attempts for that count are skipped as soon as one of these happens:

    * the score is lower than the score of the previously accepted attempt
      (which may belong to the previous cluster count);
    * the score is <= 0.1 ("no meaningful clusters");
    * the score is at least 0.05 below the best score found so far.

    The whole search stops once more than three cluster counts in a row ended with
    the first condition.

    Parameters:
    -----------
    data : array-like
        The data to cluster
    min_clusters : int
        Minimum number of clusters to try
    max_clusters : int
        Maximum number of clusters to try
    n_attempts : int
        Number of random initializations to try for each number of clusters
    silhouette_sample_size : int or None
        If given, the silhouette score is estimated on a random subset of this many
        samples (forwarded to ``silhouette_score(sample_size=...)`` with a fixed
        ``random_state``). The default ``None`` scores all samples, as before; the
        exact score needs all pairwise distances and becomes expensive for large data.

    Returns:
    --------
    best_kmeans : KMeans
        The best KMeans model (None if no attempt was run)
    best_labels : array
        The cluster labels from the best model (None if no attempt was run)
    best_n_clusters : int
        The number of clusters in the best model (0 if no attempt was run)
    best_score : float
        The silhouette score of the best model (-1 if no attempt was run)
    """
    best_score = -1
    previous_score = -100  # score of the most recently accepted attempt
    best_kmeans = None
    best_labels = None
    best_n_clusters = 0
    consecutive_drops = 0  # cluster counts in a row that ended on a falling score

    # Try different numbers of clusters
    for n_clusters in range(min_clusters, max_clusters + 1):
        if consecutive_drops > 3:
            break
        print(f"Trying {n_clusters} clusters...")

        # Try multiple random initializations for each number of clusters
        for attempt in range(n_attempts):
            # Initialize and fit KMeans
            kmeans = KMeans(n_clusters=n_clusters, random_state=attempt)
            labels = kmeans.fit_predict(data)

            # Calculate silhouette score (random_state only matters when sub-sampling)
            score = silhouette_score(data, labels,
                                     sample_size=silhouette_sample_size, random_state=0)
            if score < previous_score:
                print(f"scored lower ({score}) than previous best, breaking out of loop")
                consecutive_drops = consecutive_drops + 1
                break
            previous_score = score
            print(f"  Attempt {attempt+1}/{n_attempts}: Silhouette Score = {score:.9f}")

            # Update best model if this one is better
            if score > best_score:
                best_score = score
                best_kmeans = kmeans
                best_labels = labels
                best_n_clusters = n_clusters
                print(f"  New best score: {best_score:.9f} with {best_n_clusters} clusters")
            if score <= 0.1:
                print(f"  No meaningful clusters found for {n_clusters} clusters")
                break
            if score <= best_score-0.05:
                print(f"  No improvement in silhouette score for {n_clusters} clusters")
                break
            # Only an attempt that passed every check resets the drop counter
            consecutive_drops = 0

    print(f"\nBest clustering: {best_n_clusters} clusters with silhouette score {best_score:.9f}")
    return best_kmeans, best_labels, best_n_clusters, best_score

# The single-scale inputs must all come from the same patch-extraction run: one row of
# `patches_pca` and `patches_tsne` per entry of `positions`. Fail early with a clear
# message if another cell has replaced one of them in the meantime.
if not (len(patches_pca) == len(patches_tsne) == len(positions)):
    raise RuntimeError(
        "patches_pca, patches_tsne and positions have different lengths "
        f"({len(patches_pca)}, {len(patches_tsne)}, {len(positions)}); "
        "re-run the single-scale feature-extraction cell before this one."
    )

# Apply K-means clustering with multiple attempts to find the best silhouette score
min_clusters = 3
max_clusters = 10
n_attempts = 5
print(f"Finding best K-means clustering (trying {min_clusters}-{max_clusters} clusters, {n_attempts} attempts each)...")
best_kmeans, cluster_labels, best_n_clusters, best_silhouette = find_best_kmeans(
    patches_pca, min_clusters=min_clusters, max_clusters=max_clusters, n_attempts=n_attempts
)

# Visualize clusters in t-SNE space
plt.figure(figsize=(12, 10))
scatter = plt.scatter(patches_tsne[:, 0], patches_tsne[:, 1], c=cluster_labels, cmap='viridis', alpha=0.7, s=10)
plt.colorbar(scatter, label='Cluster')
plt.title(f'K-means Clustering (k={best_n_clusters}, Silhouette={best_silhouette:.3f})')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

# Silhouette score as a measure of clustering quality:
#     higher → points sit in tight, well-separated clusters;
#     lower  → points may be misassigned or clusters overlap.
# 0.2-0.4 would be a good score for CMB data due to the nature of it
print(f"Best Silhouette Score: {best_silhouette:.3f} with {best_n_clusters} clusters")

# Visualize cluster centers in image space
plt.figure(figsize=(15, 3))
for i in range(best_n_clusters):
    plt.subplot(1, best_n_clusters, i+1)
    # Undo PCA, then undo the standardization, to get the center in pixel values
    center = pca.inverse_transform(best_kmeans.cluster_centers_[i])
    center = scaler.inverse_transform([center])[0]
    # Reshape to patch size
    center = center.reshape(patch_size, patch_size)
    plt.imshow(center, cmap='inferno')
    plt.title(f'Cluster {i}')
    plt.axis('off')
plt.tight_layout()
plt.show()

# Map clusters back to the original image.
# Cluster IDs are nominal, so where patches overlap each pixel takes the label that
# most of its covering patches voted for; averaging the IDs would invent values
# (e.g. labels 0 and 2 averaging to "cluster 1").
n_rows, n_cols = img_cleaned.shape
cluster_votes = np.zeros((best_n_clusters, n_rows, n_cols), dtype=np.int32)

for (i, j), label in zip(positions, cluster_labels):
    cluster_votes[label, i:i+patch_size, j:j+patch_size] += 1

# Number of patches covering each pixel
cluster_count = cluster_votes.sum(axis=0)
mask = cluster_count > 0

# Pixels not covered by any patch stay NaN and are left blank by imshow
cluster_map = np.full(img_cleaned.shape, np.nan)
cluster_map[mask] = cluster_votes.argmax(axis=0)[mask]

plt.figure(figsize=(15, 10))
plt.imshow(cluster_map, cmap='viridis', interpolation='nearest')
plt.title(f'Cluster Map of CMB Data (k={best_n_clusters}, Silhouette={best_silhouette:.3f})')
plt.colorbar(label='Cluster')
plt.show()


In [None]:
# Share of the total variance captured by the fitted PCA components.
# For CMB data the goal is ≥95% (for ordinary images 80-90% may suffice).
variance_retained = np.sum(pca.explained_variance_ratio_)
print(f"Variance retained: {variance_retained:.2%}")

## For Combined Clusters

In [None]:
# --- PREP: Store clustering results ---
best_kmeans_models = {}
cluster_labels_dict = {}

# Patch sizes to fuse: all of them here; a subset such as [8, 32] also works
selected_patch_sizes = patch_sizes

# Apply silhouette-optimized KMeans per selected patch size.
# The per-size results use their own names so that the `best_kmeans` model of the
# single-scale analysis is not replaced by this loop.
for size in selected_patch_sizes:
    print(f"\n🔍 Running KMeans with silhouette scoring on {size}x{size} patches")

    features = all_features[patch_sizes.index(size)][:min_len]  # truncate to match others
    size_kmeans, size_labels, n_clusters, silhouette = find_best_kmeans(
        features, min_clusters=3, max_clusters=20, n_attempts=10
    )

    best_kmeans_models[size] = size_kmeans
    cluster_labels_dict[size] = size_labels
    print(f"✅ Best k={n_clusters} with silhouette score={silhouette:.3f}")

# --- COMBINE MULTISCALE LABELS INTO META-FEATURE VECTOR ---
# K-means label IDs are arbitrary names, not magnitudes. Isolation Forest splits on
# numeric thresholds, so each scale's labels are one-hot encoded: one 0/1 column per
# cluster instead of a single column of integer IDs.
print("\n🔗 Building meta-feature vector from cluster labels")
meta_features = np.hstack([
    np.eye(best_kmeans_models[size].n_clusters, dtype=np.float32)[cluster_labels_dict[size]]
    for size in selected_patch_sizes
])
print(f"Meta-feature shape: {meta_features.shape}")

# --- APPLY ANOMALY DETECTION TO META-FEATURE VECTOR ---
print("\n🌌 Running Isolation Forest on cluster fusion features")
iso = IsolationForest(contamination=0.05, random_state=42)
anomaly_labels = iso.fit_predict(meta_features)

# fit_predict returns 1 for normal samples and -1 for anomalies;
# flipping the sign gives 1 = anomaly, -1 = normal
anomaly_scores = -1 * anomaly_labels
print(f"Anomaly scores: {np.unique(anomaly_scores, return_counts=True)}")


In [None]:
# --- Anomaly flags shown in the multi-scale t-SNE embedding ---
# Only as many embedding rows as there are anomaly scores are drawn
n_scored = len(anomaly_scores)
fig, ax = plt.subplots(figsize=(12, 10))
anomaly_points = ax.scatter(
    features_tsne[:n_scored, 0],
    features_tsne[:n_scored, 1],
    c=anomaly_scores,
    cmap='coolwarm',
    alpha=0.7,
    s=10,
)
ax.set_title('Anomaly Detection from Multi-Scale Cluster Fusion')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
fig.colorbar(anomaly_points, ax=ax, label='Anomaly Score')
ax.grid(True)
plt.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# K-means search settings, passed straight through to find_best_kmeans
min_clusters, max_clusters, n_attempts = 3, 150, 50

print(f"\n🔎 Finding best K-means clustering on multi-scale features ({min_clusters}-{max_clusters} clusters, {n_attempts} attempts each)...")

# Cluster the concatenated multi-scale feature matrix
kmeans_search = find_best_kmeans(
    combined_features,
    min_clusters=min_clusters,
    max_clusters=max_clusters,
    n_attempts=n_attempts,
)
best_kmeans, cluster_labels, best_n_clusters, best_silhouette = kmeans_search

# Colour the existing t-SNE embedding (features_tsne) by cluster assignment
fig, ax = plt.subplots(figsize=(12, 10))
scatter = ax.scatter(
    features_tsne[:, 0],
    features_tsne[:, 1],
    c=cluster_labels,
    cmap='viridis',
    alpha=0.7,
    s=10,
)
fig.colorbar(scatter, ax=ax, label='Cluster')
ax.set_title(f'K-means Clustering on Multi-Scale Features (k={best_n_clusters}, Silhouette={best_silhouette:.3f})')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
plt.show()

# Report silhouette quality (typical good range is 0.2–0.4 for noisy CMB)
print(f"📈 Best Silhouette Score: {best_silhouette:.3f} with {best_n_clusters} clusters")


#### Alternate Method

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
from scipy.stats import entropy

# --- PREP: per-patch-size results, each dict keyed by patch size ---
best_kmeans_models = {}
cluster_labels_dict = {}
silhouette_scores_per_size = {}
entropy_scores = {}

# --- CHOOSE PATCH SIZES FOR META-FUSION ---
selected_patch_sizes = patch_sizes  # or e.g., [4, 8, 16]

# Candidate cluster counts. Defined once so the search loop, the best-k lookup
# and the x-axis of the silhouette plot can never drift apart.
candidate_ks = list(range(3, 21))

# --- SILHOUETTE-OPTIMIZED KMEANS CLUSTERING ---
for size in selected_patch_sizes:
    print(f"\n🔍 Running KMeans on {size}x{size} features with silhouette scoring")

    features = all_features[patch_sizes.index(size)][:min_len]  # align patch count

    sil_scores = []
    all_models = []
    all_labels = []

    for k in candidate_ks:
        km = KMeans(n_clusters=k, random_state=42)
        labels = km.fit_predict(features)
        sil = silhouette_score(features, labels)
        sil_scores.append(sil)
        all_models.append(km)
        all_labels.append(labels)

    # Keep the model with the highest silhouette score
    best_idx = int(np.argmax(sil_scores))
    best_k = candidate_ks[best_idx]
    best_kmeans = all_models[best_idx]
    best_labels = all_labels[best_idx]
    best_score = sil_scores[best_idx]

    best_kmeans_models[size] = best_kmeans
    cluster_labels_dict[size] = best_labels
    silhouette_scores_per_size[size] = best_score

    # Shannon entropy of the cluster-size distribution (higher = more evenly sized clusters)
    label_counts = np.bincount(best_labels)
    probs = label_counts / np.sum(label_counts)
    ent = entropy(probs)
    entropy_scores[size] = ent

    print(f"✅ Best k={best_k} | Silhouette={best_score:.3f} | Entropy={ent:.3f}")

    # Plot silhouette curve
    plt.figure(figsize=(6, 3))
    plt.plot(candidate_ks, sil_scores, marker='o')
    plt.title(f'Silhouette Scores for {size}x{size}')
    plt.xlabel('k')
    plt.ylabel('Score')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# --- META-FEATURE COMBINATION: one label column per patch size ---
print("\n🔗 Combining cluster labels into meta-feature vectors")
meta_features = np.column_stack([cluster_labels_dict[size] for size in selected_patch_sizes])
print(f"Meta-feature shape: {meta_features.shape}")

# --- META-FEATURE DIAGNOSTICS ---

# Correlation Matrix
# NOTE(review): K-means label ids are arbitrary integers, so a Pearson
# coefficient between label columns is only a rough diagnostic.
print("\n📊 Label Correlation Matrix (Pearson):")
corr_matrix = np.corrcoef(meta_features.T)
print(np.round(corr_matrix, 3))

# Entropy of each label column
print("\n🧠 Entropy of Cluster Label Distributions:")
for size in selected_patch_sizes:
    print(f"  - {size}x{size}: Entropy = {entropy_scores[size]:.3f}")

# --- ISOLATION FOREST FOR ANOMALY DETECTION ---
print("\n🌌 Running Isolation Forest on meta-cluster features...")
iso = IsolationForest(contamination=0.05, random_state=42)
anomaly_labels = iso.fit_predict(meta_features)

# fit_predict returns -1 for anomalies and 1 for normal points;
# recode as 1 = anomaly, 0 = normal.
anomaly_scores = (anomaly_labels == -1).astype(int)

# Summary
unique_vals, counts = np.unique(anomaly_scores, return_counts=True)
print(f"🧭 Anomaly score distribution: {dict(zip(unique_vals, counts))} (1 = Anomaly)")

# --- T-SNE FOR ANOMALY VISUALIZATION ---
print("\n🧪 Visualizing anomalies in t-SNE space...")
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
meta_tsne = tsne.fit_transform(meta_features)

plt.figure(figsize=(10, 8))
plt.scatter(meta_tsne[:, 0], meta_tsne[:, 1], c=anomaly_scores, cmap='coolwarm', s=10, alpha=0.7)
plt.title('t-SNE of Meta-Cluster Features (Anomaly Detection)')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.colorbar(label='Anomaly Score (1 = Anomaly)')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy

# --- CHECK: best silhouette score recorded for each selected patch size ---
print("\n📈 Silhouette Scores per Patch Size:")
for size in selected_patch_sizes:
    sil = silhouette_scores_per_size.get(size, None)
    if sil is None:
        print(f"  - {size}x{size}: ❌ Not found")
    else:
        print(f"  - {size}x{size}: Silhouette = {sil:.3f}")

# --- CHECK: entropy of each scale's cluster-size distribution ---
print("\n🧠 Entropy of Cluster Label Distributions:")
for size in selected_patch_sizes:
    labels = cluster_labels_dict[size]
    counts = np.bincount(labels)
    probs = counts / counts.sum()
    ent = entropy(probs)
    print(f"  - {size}x{size}: Entropy = {ent:.3f}")

# --- CHECK: correlation between the label vectors of different scales ---
print("\n🔗 Correlation Matrix Between Cluster Label Sets:")

# One column of labels per selected patch size
label_columns = [cluster_labels_dict[size] for size in selected_patch_sizes]
label_matrix = np.column_stack(label_columns)

# Transposed so that each patch size is treated as one variable
corr_matrix = np.corrcoef(label_matrix.T)

# Text output
print("    Patch Sizes:", selected_patch_sizes)
print(np.round(corr_matrix, 3))

# Heat-map of the same matrix, ticks labelled by patch size
tick_positions = range(len(selected_patch_sizes))
tick_labels = [f'{s}x{s}' for s in selected_patch_sizes]

plt.figure(figsize=(6, 5))
plt.imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(label='Correlation')
plt.xticks(tick_positions, tick_labels)
plt.yticks(tick_positions, tick_labels)
plt.title("Cluster Label Correlation Matrix")
plt.tight_layout()
plt.show()


### Misc.

In [None]:
# Compare to UMAP (often more stable)
from umap import UMAP

# Seeded like the other stochastic estimators in this notebook (random_state=42)
# so the 2-D embedding is the same on every Restart & Run All.
umap_emb = UMAP(n_components=2, random_state=42).fit_transform(patches_pca)

In [None]:
# Overlay clusters on a CMB map
# `cmb_map` is the 1-D HEALPix pixel vector returned by hp.read_map, which
# plt.imshow cannot draw (it needs a 2-D image). Use the 2-D projected image
# `img_cleaned` that the patch analysis works on instead.
# NOTE(review): assumes cluster_map is defined on the same pixel grid as
# img_cleaned — confirm where cluster_map is built.
plt.imshow(cluster_map, cmap='viridis', alpha=0.5)
plt.imshow(img_cleaned, cmap='inferno', alpha=0.5)

In [None]:
# Alternative clustering of the PCA-reduced patches with HDBSCAN.
# No cluster count is passed; the only setting is min_cluster_size=10.
import hdbscan
clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
# NOTE(review): this rebinds the notebook-level name `labels`, which other
# cells also assign — re-run the cell you need before reading `labels`.
labels = clusterer.fit_predict(patches_pca)

In [None]:
# Elbow method: fit K-means for every k in [min_clusters, max_clusters] on the
# PCA-reduced patches and record the inertia of each fit.
elbow_ks = range(min_clusters, max_clusters + 1)

inertias = []
for k in elbow_ks:
    # Seeded like the other estimators in this notebook; without it the
    # curve changes from run to run.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(patches_pca)
    inertias.append(kmeans.inertia_)

plt.plot(elbow_ks, inertias, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

In [None]:
# Tabulate how many patches fall in each cluster
unique, counts = np.unique(cluster_labels, return_counts=True)
cluster_sizes = dict(zip(unique, counts))
print(cluster_sizes)

# Plot the t-SNE positions of a single cluster picked for inspection
problem_cluster = 0  # Example
problem_indices = np.flatnonzero(cluster_labels == problem_cluster)
problem_points = patches_tsne[problem_indices]
plt.scatter(problem_points[:, 0], problem_points[:, 1])
plt.title(f'Problematic Cluster {problem_cluster}')
plt.show()

### 3. Anomaly Detection

Using Isolation Forest to detect anomalies in the CMB data that might correspond to cosmic strings or other interesting features.


### For Singular Clusters

In [None]:
# Apply Isolation Forest for anomaly detection on the PCA-reduced patches.
# fit_predict returns one label per patch: 1 for inliers, -1 for outliers.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
anomaly_scores = iso_forest.fit_predict(patches_pca)

# Flip the sign so that anomalies carry the larger value.
# The result is still binary (+1 / -1), not a continuous score.
anomaly_scores = -1 * anomaly_scores  # -1 becomes +1 (anomaly), 1 becomes -1 (normal)
print(f"Anomaly Score Range: {anomaly_scores.min()} to {anomaly_scores.max()}")

# Visualize anomalies in t-SNE space (one point per patch, coloured by its score)
plt.figure(figsize=(12, 10))
scatter = plt.scatter(patches_tsne[:, 0], patches_tsne[:, 1], c=anomaly_scores, cmap='coolwarm', alpha=0.7, s=10)
plt.colorbar(scatter, label='Anomaly Score')
plt.title('Anomaly Detection in CMB Patches')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

# Map anomaly scores back to the original image:
# anomaly_map accumulates the scores, anomaly_count how many patches cover each pixel.
anomaly_map = np.zeros(img_cleaned.shape)
anomaly_count = np.zeros(img_cleaned.shape)

# Each (i, j) is used as the top-left corner of a patch_size x patch_size window.
# NOTE(review): assumes `positions` and `patch_size` describe the same patches
# that produced patches_pca — confirm against the extraction cell.
for (i, j), score in zip(positions, anomaly_scores):
    anomaly_map[i:i+patch_size, j:j+patch_size] += score
    anomaly_count[i:i+patch_size, j:j+patch_size] += 1

# Average the anomaly scores where patches overlap; uncovered pixels stay 0
mask = anomaly_count > 0
anomaly_map[mask] /= anomaly_count[mask]

plt.figure(figsize=(15, 10))
plt.imshow(anomaly_map, cmap='coolwarm')
plt.title('Anomaly Map of CMB Data (Potential Cosmic String Candidates)')
plt.colorbar(label='Anomaly Score')
plt.show()


In [None]:
# Use 8x8 patch size for mapping (or adjust to your preference)
chosen_patch_size = 8
_, positions = extract_patches(img_cleaned, patch_size=chosen_patch_size, stride=stride)

# Align positions with the anomaly scores computed in an earlier cell.
# A dedicated name is used for the common length: `min_len` is the
# feature-alignment length read by the clustering cells and must not be rebound.
n_mapped = min(len(positions), len(anomaly_scores))
if len(positions) != len(anomaly_scores):
    # A mismatch means the scores were produced for a different patch grid,
    # so the mapping below is only approximate. Say so instead of hiding it.
    print(f"⚠️ {len(positions)} patch positions vs {len(anomaly_scores)} anomaly scores; "
          f"using the first {n_mapped} of each.")
positions = positions[:n_mapped]
anomaly_scores = anomaly_scores[:n_mapped]

# --- Build anomaly map ---
# Explicit float accumulators: they must hold averaged scores regardless of
# the dtype of img_cleaned.
anomaly_map = np.zeros(img_cleaned.shape, dtype=float)
anomaly_count = np.zeros(img_cleaned.shape, dtype=float)

# Each (i, j) is the top-left corner of a chosen_patch_size-square window
for (i, j), score in zip(positions, anomaly_scores):
    anomaly_map[i:i+chosen_patch_size, j:j+chosen_patch_size] += score
    anomaly_count[i:i+chosen_patch_size, j:j+chosen_patch_size] += 1

# Average overlapping patches; pixels covered by no patch stay 0
mask = anomaly_count > 0
anomaly_map[mask] /= anomaly_count[mask]

# Optional: clip the colour range to the 5th-95th percentile for clarity.
# Fall back to matplotlib's automatic range when there is nothing to clip
# (no covered pixels) or the two percentiles coincide.
vmin = vmax = None
if mask.any():
    vmin, vmax = np.percentile(anomaly_map[mask], [5, 95])
    if vmin == vmax:
        vmin = vmax = None

plt.figure(figsize=(15, 10))
plt.imshow(anomaly_map, cmap='coolwarm', vmin=vmin, vmax=vmax)
plt.colorbar(label='Anomaly Score')
plt.title('CMB Anomaly Map (Potential Topological Signatures)')
plt.axis('off')
plt.show()


In [None]:
# Show top N anomalies
N = 10
# argsort is ascending, so take the last N and reverse them:
# "Anomaly 1" is then the highest-scoring patch.
top_indices = np.argsort(anomaly_scores)[-N:][::-1]

plt.figure(figsize=(15, 3))
for idx, i in enumerate(top_indices):
    # scaler.inverse_transform needs a 2-D (n_samples, n_features) array, so
    # the single feature row is reshaped to (1, -1) and unwrapped afterwards.
    # NOTE(review): assumes combined_features rows live in the space of this
    # `pca`/`scaler` pair — confirm against the feature-construction cell.
    feature_row = np.asarray(combined_features[i]).reshape(1, -1)
    restored = scaler.inverse_transform(pca.inverse_transform(feature_row))[0]
    patch = restored[:chosen_patch_size**2]
    patch_img = patch.reshape(chosen_patch_size, chosen_patch_size)
    plt.subplot(1, N, idx + 1)
    plt.imshow(patch_img, cmap='inferno')
    plt.title(f'Anomaly {idx+1}')
    plt.axis('off')
plt.suptitle("Top Anomalous Patches (Multi-scale features)", fontsize=16)
plt.tight_layout()
plt.show()


### For Multiple Clusters

In [None]:
from sklearn.ensemble import IsolationForest

# --- Isolation Forest over the combined multi-scale feature matrix ---
print("\n🌌 Running Isolation Forest for anomaly detection...")
iso_forest = IsolationForest(contamination=0.05, random_state=42)
anomaly_labels = iso_forest.fit_predict(combined_features)

# decision_function is higher for more normal samples; negate it so that a
# larger value means a more anomalous sample.
anomaly_scores = -iso_forest.decision_function(combined_features)
print(f"Anomaly score range: {anomaly_scores.min():.4f} to {anomaly_scores.max():.4f}")

# --- Colour the t-SNE embedding by anomaly score ---
fig, ax = plt.subplots(figsize=(12, 10))
scatter = ax.scatter(
    features_tsne[:, 0],
    features_tsne[:, 1],
    c=anomaly_scores,
    cmap='coolwarm',
    alpha=0.7,
    s=10,
)
fig.colorbar(scatter, ax=ax, label='Anomaly Score')
ax.set_title('t-SNE Projection with Anomaly Scores')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.grid(True)
plt.show()


### 4. Feature Importance Analysis

Let's analyze which features (principal components) are most important for distinguishing clusters and anomalies.


### For Singular Clusters

In [None]:
# Analyze the most important principal components:
# one bar per component, height = fraction of variance it explains.
# NOTE(review): assumes n_components equals len(pca.explained_variance_ratio_) — confirm.
plt.figure(figsize=(12, 6))
plt.bar(range(n_components), pca.explained_variance_ratio_)
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Importance of Principal Components')
plt.grid(True)
plt.show()

# Analyze the first few principal components (at most 5), drawn as patch-shaped images
n_display = min(5, n_components)
plt.figure(figsize=(15, 3*n_display))
for i in range(n_display):
    plt.subplot(n_display, 1, i+1)
    # Reshape the component to a single row, because the scaler works on 2-D input
    component = pca.components_[i].reshape(1, -1)
    # Undo the feature scaling, then fold the row back into a patch_size x patch_size image
    component_image = scaler.inverse_transform(component)[0].reshape(patch_size, patch_size)
    plt.imshow(component_image, cmap='coolwarm')
    plt.title(f'Principal Component {i+1}')
    plt.colorbar()
plt.tight_layout()
plt.show()


### For Multiple Clusters

In [None]:
# --- Bar Plot: Explained Variance per Scale ---
for size in patch_sizes:
    # Loop-local name on purpose: the notebook-level `pca` is the single-scale
    # model that other cells read, so it must not be rebound here.
    scale_pca = pca_models[size]
    variance_ratios = scale_pca.explained_variance_ratio_

    plt.figure(figsize=(10, 4))
    plt.bar(range(len(variance_ratios)), variance_ratios)
    plt.title(f'Explained Variance by PCA Components — {size}x{size} patches')
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# --- Visualize Top N PCA Components ---
n_display = 5

for size in patch_sizes:
    # Loop-local names on purpose: the notebook-level `pca` and `scaler` are
    # the single-scale objects that other cells read, so they are not rebound.
    scale_pca = pca_models[size]
    scale_scaler = scalers[size]
    components = scale_pca.components_
    patch_len = size * size

    print(f"\n🔬 Visualizing top {n_display} PCA components for {size}x{size} patches")

    plt.figure(figsize=(15, 3 * n_display))
    for i in range(min(n_display, components.shape[0])):
        # Each row of components_ already lives in the scaled feature space,
        # so only the scaling is undone. Passing it through
        # pca.inverse_transform would treat it as PCA coordinates and fail
        # whenever n_components != n_features.
        component = components[i].reshape(1, -1)
        restored = scale_scaler.inverse_transform(component)[0]
        # Keep the first size*size values and fold them into a square patch
        patch = restored[:patch_len].reshape(size, size)

        plt.subplot(n_display, 1, i + 1)
        plt.imshow(patch, cmap='coolwarm')
        plt.colorbar()
        plt.title(f'{size}x{size} Patch — Principal Component {i+1}')
        plt.axis('off')
    plt.tight_layout()
    plt.show()


### 5. Correlation with Edge Detection

Let's compare our ML-based anomaly detection with the edge detection performed earlier to see if they identify similar features.


### For Singular Clusters

In [None]:
# Ensure we have the edge detection results
if 'edges' not in locals():
    edges = ndimage.sobel(img_cleaned)


def _rescale_finite(values):
    """Min-max scale the finite entries of `values` to [0, 1].

    Non-finite (NaN/inf) and masked entries are returned as NaN, so they can
    be excluded afterwards instead of poisoning the min and max. A constant
    input maps to 0 everywhere.
    """
    values = np.ma.filled(np.ma.asarray(values).astype(float), np.nan)
    finite = np.isfinite(values)
    scaled = np.full(values.shape, np.nan)
    if finite.any():
        lo = values[finite].min()
        span = values[finite].max() - lo
        scaled[finite] = (values[finite] - lo) / span if span > 0 else 0.0
    return scaled


# Normalize both maps for comparison. Plain np.min/np.max return NaN as soon
# as one pixel is NaN, which would turn both normalized maps entirely NaN.
edges_norm = _rescale_finite(edges)
anomaly_norm = _rescale_finite(anomaly_map)

# Calculate correlation between edge detection and anomaly detection,
# using only pixels that are valid in both maps
valid_mask = ~np.isnan(edges_norm) & ~np.isnan(anomaly_norm)
if valid_mask.sum() >= 2:
    correlation = np.corrcoef(edges_norm[valid_mask], anomaly_norm[valid_mask])[0, 1]
else:
    correlation = np.nan  # not enough valid pixels to correlate
print(f"Correlation between edge detection and anomaly detection: {correlation:.3f}")

# Visualize the comparison
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.imshow(edges_norm, cmap='inferno')
plt.title('Edge Detection')
plt.colorbar()

plt.subplot(1, 3, 2)
plt.imshow(anomaly_norm, cmap='coolwarm')
plt.title('Anomaly Detection')
plt.colorbar()

# Third panel: anomaly map drawn semi-transparently on top of the edge map
plt.subplot(1, 3, 3)
plt.imshow(edges_norm, cmap='inferno', alpha=0.7)
plt.imshow(anomaly_norm, cmap='coolwarm', alpha=0.3)
plt.title('Edge + Anomaly Overlay')
plt.colorbar()

plt.tight_layout()
plt.show()


### For Multiple Clusters

In [None]:
from scipy import ndimage

# --- ENSURE EDGE DETECTION EXISTS ---
# Reuse `edges` when an earlier cell already defined it; otherwise compute a
# Sobel filter of img_cleaned.
# NOTE(review): an existing `edges` may have been computed from a different
# image than img_cleaned — confirm both share the same shape.
if 'edges' not in locals():
    print("🔍 Computing Sobel edge detection...")
    edges = ndimage.sobel(img_cleaned)
# --- NORMALIZE MAPS FOR COMPARISON ---
def normalize_map(x):
    """Min-max scale a map to roughly [0, 1] without letting bad pixels dominate.

    NaN pixels are replaced by 0 before scaling. +inf / -inf pixels are
    replaced by the largest / smallest finite value of the map; a bare
    np.nan_to_num would turn them into ±1.8e308 and overflow the range.
    The 1e-8 term keeps a constant map from dividing by zero.

    Parameters
    ----------
    x : array-like
        Map to normalize.

    Returns
    -------
    numpy.ndarray
        Array of the same shape with finite values between 0 and 1.
    """
    data = np.asarray(x, dtype=float)
    finite = np.isfinite(data)
    if finite.any():
        lo, hi = data[finite].min(), data[finite].max()
    else:
        lo = hi = 0.0  # nothing finite: every pixel ends up as 0
    x = np.nan_to_num(x, nan=0.0, posinf=hi, neginf=lo)
    return (x - np.min(x)) / (np.max(x) - np.min(x) + 1e-8)

# Bring both maps onto the same 0-1 scale
edges_norm, anomaly_norm = normalize_map(edges), normalize_map(anomaly_map)

# --- PIXELS USABLE IN BOTH MAPS ---
valid_mask = ~(np.isnan(edges_norm) | np.isnan(anomaly_norm))
edges_flat = edges_norm[valid_mask].ravel()
anomaly_flat = anomaly_norm[valid_mask].ravel()

# --- PEARSON CORRELATION OVER THE VALID PIXELS ---
if edges_flat.size == 0 or anomaly_flat.size == 0:
    correlation = np.nan
    print("⚠️ No valid pixels for correlation.")
else:
    correlation = np.corrcoef(edges_flat, anomaly_flat)[0, 1]
    print(f"📈 Correlation between edge detection and anomaly detection: {correlation:.3f}")

# --- SIDE-BY-SIDE PANELS: edges, anomalies, and both overlaid ---
fig, (ax_edges, ax_anomaly, ax_overlay) = plt.subplots(1, 3, figsize=(18, 5))

edges_im = ax_edges.imshow(edges_norm, cmap='inferno')
ax_edges.set_title('🟠 Edge Detection (Sobel)')
fig.colorbar(edges_im, ax=ax_edges)

anomaly_im = ax_anomaly.imshow(anomaly_norm, cmap='coolwarm')
ax_anomaly.set_title('🔵 Anomaly Score (IForest)')
fig.colorbar(anomaly_im, ax=ax_anomaly)

# The colorbar of the overlay panel belongs to the layer drawn last (anomaly)
ax_overlay.imshow(edges_norm, cmap='inferno', alpha=0.7)
overlay_im = ax_overlay.imshow(anomaly_norm, cmap='coolwarm', alpha=0.3)
ax_overlay.set_title('🎯 Edge + Anomaly Overlay')
fig.colorbar(overlay_im, ax=ax_overlay)

plt.tight_layout()
plt.show()


In [None]:
# Pixel-wise product of the two normalized maps: large only where both the
# edge response and the anomaly score are large.
# (Alternative measure: np.abs(edges_norm - anomaly_norm).)
overlap_map = np.multiply(edges_norm, anomaly_norm)

fig, ax = plt.subplots(figsize=(10, 6))
overlap_im = ax.imshow(overlap_map, cmap='plasma')
ax.set_title('Overlap Map (Edge × Anomaly)')
fig.colorbar(overlap_im, ax=ax, label='Overlap Intensity')
plt.show()


### 6. Conclusion

We've applied several machine learning techniques to analyze the CMB data:

1. **Feature Extraction**: Extracted patches from the CMB map and reduced dimensionality with PCA
2. **Clustering**: Identified distinct patterns in the CMB data using K-means
3. **Anomaly Detection**: Used Isolation Forest to find unusual patterns that might correspond to cosmic strings
4. **Feature Importance**: Analyzed which principal components are most significant
5. **Correlation Analysis**: Compared ML-based anomaly detection with traditional edge detection

These techniques provide complementary views of the CMB data and can help identify potential cosmic string candidates or other interesting features that might not be apparent through traditional analysis methods.

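The correlation coefficient computed in Section 5 measures how closely the anomaly map follows the Sobel edge map. A clearly positive value suggests that the ML methods identify structures similar to those found by traditional edge detection, while regions flagged by only one of the two methods may point to additional patterns that edge detection alone does not capture.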
