# 01 — Pipeline Implementation
Цель: инженерно воспроизвести пайплайн (данные → графы → фичи → модели → метрики), сохранять артефакты в `outputs/pipeline`, графики — в `reports/figures`, и фиксировать метаданные.

## Контекст и план
- Читаем конфиг `configs/research.yaml` для путей, параметров графов и моделей.
- Используем синтетические данные для космологии (ra/dec/z) и квантовой сетки (x/y/potential), чтобы воспроизводимо прогнать цепочку.
- Шаги: загрузка конфигурации → установка сидов → генерация и очистка данных → построение графов → вычисление признаков → обучение базовой модели → метрики и артефакты в `outputs/pipeline` и `reports/figures`.

In [None]:
import json
import math
import os
import random
from pathlib import Path
from typing import Dict, Tuple

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import yaml
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Parse the research config once; later cells read all their settings from CFG.
CONFIG_PATH = Path("../configs/research.yaml").resolve()
with CONFIG_PATH.open("r", encoding="utf-8") as config_file:
    CFG: Dict = yaml.safe_load(config_file)

# Resolve the configured locations to absolute paths.
_paths_cfg = CFG["paths"]
DATA_ROOT = Path(_paths_cfg["data_root"]).resolve()
OUT_PIPELINE = Path(_paths_cfg["outputs_pipeline"]).resolve()
OUT_FIGS = Path(_paths_cfg["reports_figures"]).resolve()

# Make sure both output directories exist before anything is written to them.
for _out_dir in (OUT_PIPELINE, OUT_FIGS):
    _out_dir.mkdir(parents=True, exist_ok=True)

print("Loaded config from", CONFIG_PATH)
CFG

In [None]:
def set_seeds(seed_python: int, seed_numpy: int) -> None:
    """Seed the global stdlib and NumPy random generators.

    Args:
        seed_python: Seed passed to ``random.seed``.
        seed_numpy: Seed passed to ``np.random.seed``.
    """
    seeders = (
        (random.seed, seed_python),
        (np.random.seed, seed_numpy),
    )
    for seed_fn, seed_value in seeders:
        seed_fn(seed_value)


# Apply the seeds configured under CFG["seed"] and echo them for the run log.
seed_python, seed_numpy = CFG["seed"]["python"], CFG["seed"]["numpy"]
set_seeds(seed_python, seed_numpy)
print("Seeds set to", seed_python, seed_numpy)


In [None]:
def generate_cosmology_sample(n: int) -> pd.DataFrame:
    """Draw a synthetic galaxy table from the global NumPy random state.

    Positions and redshift are uniform over the ranges configured under
    ``CFG["cosmology"]``; the two magnitudes are normally distributed.

    Args:
        n: Number of rows to generate.

    Returns:
        DataFrame with columns ``ra``, ``dec``, ``redshift``, ``mag_g``,
        ``mag_r`` and a constant ``system_type`` of ``"cosmology"``.
    """
    cosmo_cfg = CFG["cosmology"]
    ra_lo, ra_hi = cosmo_cfg["ra_range"]
    dec_lo, dec_hi = cosmo_cfg["dec_range"]
    z_lo, z_hi = cosmo_cfg["redshift_range"]

    # The draw order below is fixed so that a given seed reproduces the same table.
    columns = {}
    columns["ra"] = np.random.uniform(ra_lo, ra_hi, size=n)
    columns["dec"] = np.random.uniform(dec_lo, dec_hi, size=n)
    columns["redshift"] = np.random.uniform(z_lo, z_hi, size=n)
    columns["mag_g"] = np.random.normal(22, 1.5, size=n)
    columns["mag_r"] = np.random.normal(21, 1.2, size=n)

    sample = pd.DataFrame(columns)
    return sample.assign(system_type="cosmology")


def generate_quantum_grid(grid: int) -> pd.DataFrame:
    """Build a square lattice on [-1, 1] x [-1, 1] with a sinusoidal potential.

    Args:
        grid: Number of lattice points along each axis.

    Returns:
        DataFrame of ``grid * grid`` rows with columns ``x``, ``y``,
        ``potential`` (``sin(pi * x) * cos(pi * y)``) and a constant
        ``system_type`` of ``"quantum"``.
    """
    axis = np.linspace(-1.0, 1.0, grid)
    # Row-major flattening of the lattice: x cycles fastest, y is constant within a row.
    x_flat = np.tile(axis, grid)
    y_flat = np.repeat(axis, grid)
    lattice = pd.DataFrame(
        {
            "x": x_flat,
            "y": y_flat,
            "potential": np.sin(np.pi * x_flat) * np.cos(np.pi * y_flat),
        }
    )
    return lattice.assign(system_type="quantum")


# Cap the synthetic galaxy sample at 500 rows even if the config allows more.
n_galaxies = min(500, CFG["cosmology"]["max_galaxies"])
cosmo_df = generate_cosmology_sample(n_galaxies)
quantum_df = generate_quantum_grid(grid=CFG["quantum"]["grid_size"])
print("Cosmology rows", len(cosmo_df), "Quantum rows", len(quantum_df))

In [None]:
def clean_cosmology(df: pd.DataFrame) -> pd.DataFrame:
    """Filter the galaxy table and add unit-sphere Cartesian coordinates.

    Rows are kept only when ``ra``, ``dec`` and ``redshift`` are present,
    ``ra`` is in [0, 360], ``dec`` in [-90, 90], ``redshift`` in [0, 1] and
    both magnitudes are in [15, 30]. Exact duplicate rows are dropped and the
    index is renumbered from zero.

    Args:
        df: Table with ``ra``/``dec`` in degrees plus ``redshift``, ``mag_g``
            and ``mag_r`` columns. It is not modified.

    Returns:
        A new DataFrame with the surviving rows and added ``x``, ``y``, ``z``
        columns.
    """
    kept = df.dropna(subset=["ra", "dec", "redshift"])
    valid = (
        kept["ra"].between(0, 360)
        & kept["dec"].between(-90, 90)
        & kept["redshift"].between(0, 1.0)
        & kept["mag_g"].between(15, 30)
        & kept["mag_r"].between(15, 30)
    )
    kept = kept[valid].drop_duplicates().reset_index(drop=True)

    # Spherical (RA, Dec) in degrees -> Cartesian point on the unit sphere.
    ra_rad = np.radians(kept["ra"])
    dec_rad = np.radians(kept["dec"])
    cos_dec = np.cos(dec_rad)
    return kept.assign(
        x=cos_dec * np.cos(ra_rad),
        y=cos_dec * np.sin(ra_rad),
        z=np.sin(dec_rad),
    )


def clean_quantum(df: pd.DataFrame) -> pd.DataFrame:
    """Give the lattice the same feature columns as the galaxy table.

    ``x`` is mapped linearly onto an RA-like range and ``y`` onto a Dec-like
    range; ``redshift``, ``mag_g`` and ``mag_r`` are affine functions of
    ``potential``.

    Args:
        df: Lattice table with ``x``, ``y`` and ``potential`` columns. It is
            not modified.

    Returns:
        A copy of ``df`` with ``ra``, ``dec``, ``redshift``, ``mag_g`` and
        ``mag_r`` columns added.
    """
    potential = df["potential"]
    return df.assign(
        ra=(df["x"] + 1.0) * 180,  # place lattice into RA-like space
        dec=(df["y"] + 1.0) * 45 - 45,
        redshift=0.05 + 0.02 * potential,
        mag_g=20 + 0.5 * potential,
        mag_r=19.5 + 0.4 * potential,
    )


# Clean both samples, then stack them into one table with a fresh 0..n-1 index.
cleaned_frames = [clean_cosmology(cosmo_df), clean_quantum(quantum_df)]
cosmo_clean, quantum_clean = cleaned_frames
combined = pd.concat(cleaned_frames, ignore_index=True)
print("Combined rows after cleaning", len(combined))
combined.head()

In [None]:
def build_knn_graph(df: pd.DataFrame, k: int = 10) -> nx.Graph:
    """Build an undirected k-nearest-neighbour graph over the ``x``/``y``/``z`` columns.

    Every row becomes a node keyed by its DataFrame index label and carrying
    ``ra``, ``dec`` and ``redshift`` attributes. Each node is linked to its
    ``k`` closest other rows by Euclidean distance, with the distance stored
    as the edge ``weight``. Because the graph is undirected, a node can end
    up with more than ``k`` edges.

    Args:
        df: Table with ``x``, ``y``, ``z``, ``ra``, ``dec``, ``redshift`` and
            ``system_type`` columns.
        k: Number of neighbours to connect from each node; values below zero
            are treated as zero.

    Returns:
        The graph, tagged with ``system_type`` taken from the first row. An
        empty ``df`` yields an empty graph without that tag.
    """
    if df.empty:
        # Nothing to connect, and there is no first row to read the type tag from.
        return nx.Graph()

    # Node ids are the index labels; edges below are mapped through the same
    # list so nodes and edges agree even when the index is not 0..n-1.
    labels = list(df.index)
    coords = df[["x", "y", "z"]].to_numpy()
    g = nx.Graph(system_type=df["system_type"].iloc[0])
    node_rows = zip(labels, df["ra"].tolist(), df["dec"].tolist(), df["redshift"].tolist())
    for label, ra, dec, redshift in node_rows:
        g.add_node(label, ra=ra, dec=dec, redshift=redshift)

    n_neighbors = max(int(k), 0)
    for i, origin in enumerate(coords):
        dists = np.linalg.norm(coords - origin, axis=1)
        order = np.argsort(dists)
        # Drop the node itself explicitly: with duplicate coordinates it is not
        # guaranteed to sort first, and slicing [1:] could leave a self-loop.
        for j in order[order != i][:n_neighbors]:
            g.add_edge(labels[i], labels[int(j)], weight=float(dists[j]))
    return g


def build_quantum_graph(df: pd.DataFrame, max_nodes: int = 1500) -> nx.Graph:
    """Build an undirected graph linking each lattice point to its 4 nearest points.

    At most ``max_nodes`` rows are used; the subset is drawn with
    ``DataFrame.sample`` seeded from ``CFG["seed"]["python"]``. Each sampled
    row becomes a node keyed by its original index label and carrying ``x``,
    ``y`` and ``potential`` attributes. Edge ``weight`` is the Euclidean
    distance in the (x, y) plane.

    Args:
        df: Lattice table with ``x``, ``y`` and ``potential`` columns.
        max_nodes: Upper bound on the number of rows kept.

    Returns:
        The graph, tagged with ``system_type="quantum"``.
    """
    # Downsample grid if too large
    df_sampled = df.sample(n=min(max_nodes, len(df)), random_state=CFG["seed"]["python"])

    # sample() keeps the original (now shuffled) index labels. Edges are found
    # by position, so they are mapped back through this list; using positions
    # directly would attach edges to the wrong nodes.
    labels = list(df_sampled.index)
    g = nx.Graph(system_type="quantum")
    node_rows = zip(
        labels,
        df_sampled["x"].tolist(),
        df_sampled["y"].tolist(),
        df_sampled["potential"].tolist(),
    )
    for label, x, y, potential in node_rows:
        g.add_node(label, x=x, y=y, potential=potential)

    # Connect 4-neighborhood on lattice approximation
    arr = df_sampled[["x", "y"]].to_numpy()
    for i, origin in enumerate(arr):
        dists = np.linalg.norm(arr - origin, axis=1)
        order = np.argsort(dists)
        # Exclude the node itself explicitly rather than assuming it sorts first.
        for j in order[order != i][:4]:
            g.add_edge(labels[i], labels[int(j)], weight=float(dists[j]))
    return g


cosmo_graph = build_knn_graph(cosmo_clean, k=CFG["cosmology"]["graph"]["k_neighbors"])
quantum_graph = build_quantum_graph(quantum_clean, max_nodes=CFG["quantum"]["graph"]["max_nodes"])

# nx.info() was removed in NetworkX 3.0, so print the summary explicitly.
for _graph in (cosmo_graph, quantum_graph):
    print(
        f"{_graph.graph.get('system_type', 'graph')} graph: "
        f"{_graph.number_of_nodes()} nodes, {_graph.number_of_edges()} edges"
    )

In [None]:
def graph_features(g: nx.Graph, spectral_k: int = 10) -> Dict[str, float]:
    """Summarise a graph as a flat dictionary of structural and spectral features.

    Args:
        g: Graph to describe.
        spectral_k: How many of the smallest normalized-Laplacian eigenvalues
            to include.

    Returns:
        Mapping with ``nodes``, ``edges``, ``degree_mean``, ``degree_std``,
        ``clustering``, ``assortativity`` and ``eig_0`` ... ``eig_{m-1}``,
        where ``m`` is at most ``spectral_k``.
    """
    degree_values = np.array([node_degree for _node, node_degree in g.degree()])
    avg_clustering = nx.average_clustering(g)

    # Assortativity is only computed when the graph has edges; otherwise report 0.
    if g.number_of_edges() > 0:
        assortativity = nx.degree_assortativity_coefficient(g)
    else:
        assortativity = 0.0

    # Dense eigendecomposition of the normalized Laplacian; keep the low end of the spectrum.
    laplacian = nx.normalized_laplacian_matrix(g).toarray()
    lowest_eigs = np.sort(np.linalg.eigvalsh(laplacian))[:spectral_k]

    features = {
        "nodes": g.number_of_nodes(),
        "edges": g.number_of_edges(),
        "degree_mean": float(degree_values.mean()),
        "degree_std": float(degree_values.std()),
        "clustering": float(avg_clustering),
        "assortativity": float(assortativity),
    }
    for rank, eigenvalue in enumerate(lowest_eigs):
        features[f"eig_{rank}"] = float(eigenvalue)
    return features


# One feature row per graph, labelled with the system it describes.
spectral_k = CFG["features"]["spectral_k"]
cosmo_feats = graph_features(cosmo_graph, spectral_k=spectral_k)
quantum_feats = graph_features(quantum_graph, spectral_k=spectral_k)
feature_rows = [cosmo_feats, quantum_feats]
features_df = pd.DataFrame(feature_rows)
features_df.insert(0, "system_type", ["cosmology", "quantum"])
features_df

In [None]:
def train_baseline_classifier(df: pd.DataFrame) -> Tuple[LogisticRegression, Dict[str, object]]:
    """Fit a logistic-regression baseline that separates cosmology rows from the rest.

    The features are ``ra``, ``dec``, ``redshift``, ``mag_g`` and ``mag_r``.
    The data is split with stratification using ``test_size`` and
    ``random_state`` from ``CFG["models"]["classification"]``. Features are
    standardised with statistics fitted on the training split only.

    Args:
        df: Combined table containing the feature columns and ``system_type``.

    Returns:
        A ``(model, metrics)`` pair. ``metrics`` holds ``"accuracy"`` (float),
        ``"report"`` (the ``classification_report`` dictionary) and
        ``"scaler"`` (the fitted ``StandardScaler``).
    """
    clf_cfg = CFG["models"]["classification"]
    feat_cols = ["ra", "dec", "redshift", "mag_g", "mag_r"]
    X = df[feat_cols].to_numpy()
    # Binary target: 1 for cosmology rows, 0 for everything else.
    y = (df["system_type"] == "cosmology").astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=clf_cfg["test_size"],
        random_state=clf_cfg["random_state"],
        stratify=y,
    )
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model = LogisticRegression(max_iter=200)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    # Coerce to a built-in float so the value is safe to dump as JSON later.
    acc = float(accuracy_score(y_test, y_pred))
    report = classification_report(y_test, y_pred, output_dict=True)
    return model, {"accuracy": acc, "report": report, "scaler": scaler}


# Train on the combined table and show the headline metric plus the report sections.
baseline_model, baseline_metrics = train_baseline_classifier(combined)
baseline_accuracy = baseline_metrics["accuracy"]
print("Baseline accuracy", baseline_accuracy)
list(baseline_metrics["report"])

In [None]:
def plot_projection(df: pd.DataFrame, path: Path) -> Path:
    """Save an RA/Dec scatter plot with one labelled series per ``system_type``.

    Args:
        df: Table with ``ra``, ``dec`` and ``system_type`` columns.
        path: Destination image file.

    Returns:
        ``path``, unchanged.
    """
    figure, axes = plt.subplots(figsize=(6, 4))
    for system_name, members in df.groupby("system_type"):
        axes.scatter(members["ra"], members["dec"], s=10, alpha=0.6, label=system_name)
    axes.set(xlabel="RA", ylabel="Dec")
    axes.legend()
    figure.tight_layout()
    figure.savefig(path, dpi=200)
    # Close explicitly so figures do not accumulate in the notebook session.
    plt.close(figure)
    return path


def plot_degree_hist(g: nx.Graph, path: Path) -> Path:
    """Save a 20-bin histogram of the node degrees of ``g``.

    Args:
        g: Graph whose degree distribution is plotted; its ``system_type``
            graph attribute, when present, is used in the title.
        path: Destination image file.

    Returns:
        ``path``, unchanged.
    """
    node_degrees = [degree for _node, degree in g.degree()]
    figure, axes = plt.subplots(figsize=(5, 3))
    axes.hist(node_degrees, bins=20)
    axes.set(
        title=f"Degree distribution: {g.graph.get('system_type', 'graph')}",
        xlabel="Degree",
        ylabel="Count",
    )
    figure.tight_layout()
    figure.savefig(path, dpi=200)
    # Close explicitly so figures do not accumulate in the notebook session.
    plt.close(figure)
    return path


# Each plot helper returns the path it wrote to, so keep those as the figure handles.
proj_path = plot_projection(combined, OUT_FIGS / "pipeline_projection.png")
deg_cosmo_path = plot_degree_hist(cosmo_graph, OUT_FIGS / "pipeline_degree_cosmo.png")
deg_quantum_path = plot_degree_hist(quantum_graph, OUT_FIGS / "pipeline_degree_quantum.png")

(proj_path, deg_cosmo_path, deg_quantum_path)

In [None]:
import pickle

# Destinations for every artifact produced by this notebook.
clean_path = OUT_PIPELINE / "cleaned_combined.parquet"
features_path = OUT_PIPELINE / "graph_features.parquet"
metrics_path = OUT_PIPELINE / "baseline_metrics.json"
cosmo_graph_path = OUT_PIPELINE / "cosmo_graph.gpickle"
quantum_graph_path = OUT_PIPELINE / "quantum_graph.gpickle"

combined.to_parquet(clean_path, index=False)
features_df.to_parquet(features_path, index=False)
# Only the JSON-friendly metrics are written; the fitted scaler stays in memory.
with metrics_path.open("w", encoding="utf-8") as f:
    json.dump({"accuracy": baseline_metrics["accuracy"], "report": baseline_metrics["report"]}, f, indent=2)

# nx.write_gpickle() was removed in NetworkX 3.0; pickling the graph object
# directly is the replacement. Read it back with pickle.load from a trusted
# source only.
for _graph, _graph_path in ((cosmo_graph, cosmo_graph_path), (quantum_graph, quantum_graph_path)):
    with _graph_path.open("wb") as graph_file:
        pickle.dump(_graph, graph_file, protocol=pickle.HIGHEST_PROTOCOL)

{"data": str(clean_path), "features": str(features_path), "metrics": str(metrics_path)}

In [None]:
# Optional smoke test of the local API: any failure (missing package, server
# not running, timeout) is reported instead of raised.
try:
    import requests

    ping_url = "http://localhost:8000/ping"
    resp = requests.get(ping_url, timeout=2)
    body_preview = resp.text[:200]
    print("API /ping status", resp.status_code, "body", body_preview)
except Exception as exc:  # noqa: BLE001
    print("API check skipped or failed:", exc)


### Что сохранили
- Данные: `outputs/pipeline/cleaned_combined.parquet`
- Графы: `outputs/pipeline/cosmo_graph.gpickle`, `outputs/pipeline/quantum_graph.gpickle`
- Признаки: `outputs/pipeline/graph_features.parquet`
- Метрики модели: `outputs/pipeline/baseline_metrics.json`
- Визуализации: `reports/figures/pipeline_projection.png`, а также гистограммы степеней `reports/figures/pipeline_degree_cosmo.png` и `reports/figures/pipeline_degree_quantum.png`.