# Galaxies as Point Clouds ✨

I got fascinated by how galaxy morphology classification is fundamentally a point cloud problem — each galaxy is just a collection of stellar sources with positions and fluxes, and the challenge is extracting meaningful observables from these variable-length, noisy point clouds. Coming from a physics background, this felt very similar to how particle physicists analyze jets. So I built a proper observable library for it!

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from galaxyclouds.io import generate_synthetic_galaxies

# Use matplotlib's dark theme for every figure that follows.
plt.style.use('dark_background')

# Generate the synthetic sample; the fixed seed keeps reruns reproducible.
X, y = generate_synthetic_galaxies(seed=42, n_per_class=1000)

# Quick sanity check on the array shapes before doing anything else.
print(f"X shape: {X.shape} -> (N_galaxies, max_stars, features)")
print(f"y shape: {y.shape}")

In [None]:
# Build the padding mask: True where a slot holds a real observation.
# NOTE(review): this treats any slot whose first feature is > 0 as real,
# i.e. it presumes padded slots are zero-filled and real sources always have
# a strictly positive first feature -- confirm against the data generator.
mask = X[..., 0] > 0

n_real = mask.sum()
n_padded = (~mask).sum()
print(f"Total real observations: {n_real}")
print(f"Total padded entries: {n_padded}")

In [None]:
# Per-galaxy multiplicity: count of unmasked (real) entries along the source axis.
multiplicity = mask.sum(axis=1)

# Label -> display name and one colour per class; both are reused by later cells.
class_names = {0: 'Elliptical', 1: 'Spiral', 2: 'Irregular'}
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

plt.figure(figsize=(10, 6))

# One semi-transparent, density-normalised histogram per morphology class.
for (class_id, class_label), class_color in zip(class_names.items(), colors):
    plt.hist(
        multiplicity[y == class_id],
        bins=20,
        alpha=0.6,
        label=class_label,
        color=class_color,
        density=True,
    )

plt.title('Multiplicity by Morphology Class')
plt.xlabel('Stellar Multiplicity')
plt.ylabel('Density')
plt.legend();

In [None]:
# Flux distributions of brightest source
# NOTE(review): X[:, 0, 0] takes feature 0 of the first slot of every galaxy.
# This is only the brightest source if feature 0 is flux and sources are
# stored in descending-flux order -- confirm against the data generator.
leading_flux = X[:, 0, 0]

plt.figure(figsize=(10, 6))

# One semi-transparent, density-normalised histogram per morphology class.
for (class_id, class_label), class_color in zip(class_names.items(), colors):
    plt.hist(
        leading_flux[y == class_id],
        bins=30,
        alpha=0.6,
        label=class_label,
        color=class_color,
        density=True,
    )

plt.xlabel('Leading Flux Counts')
# The histograms are density-normalised, so label the y-axis and title the
# figure the same way the multiplicity plot does.
plt.ylabel('Density')
plt.title('Leading Flux by Morphology Class')
plt.legend();

### Key observations
Here is what the data already tells us before any ML:

Ellipticals are strongly concentrated, with fewer sources but centrally concentrated flux, while irregulars show extended, clumpy distributions with high stellar multiplicity.