Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions infrastructure/compute/talos/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Generated by talhelper genconfig
# These contain sensitive data (certs, keys) and should not be committed
clusterconfig/
252 changes: 252 additions & 0 deletions infrastructure/compute/talos/justfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
# Talos Platform Cluster Management
#
# Prerequisites:
# - talhelper (brew install talhelper)
# - talosctl (brew install siderolabs/tap/talosctl)
# - sops (brew install sops) - for decrypting secrets
#
# Usage:
# just genconfig # Generate machine configs
# just bootstrap # Bootstrap CP-1 (first run only)
# just kubeconfig # Fetch kubeconfig
# just dashboard # Open talosctl dashboard (all nodes)
# just status cp-1 # Check status of a specific node
# just logs kubelet cp-2 # View logs from a service on a node

# Control plane node IP addresses.
# Recipes that take a 'node' parameter accept the aliases cp-1, cp-2, cp-3
# (or 'all'); the _resolve helper maps those aliases onto these addresses.
node_cp1 := '10.10.30.10'
node_cp2 := '10.10.30.11'
node_cp3 := '10.10.30.12'

# Every control plane node, comma-joined for talosctl's --nodes flag
all_nodes := node_cp1 + ',' + node_cp2 + ',' + node_cp3

# Output directory for generated configs, and the talosconfig inside it
config_dir := 'clusterconfig'
talosconfig := config_dir + '/talosconfig'

# Map a node alias (cp-1 / cp1 / 1, ..., or all) to the matching IP address(es)
[private]
_resolve node:
    #!/usr/bin/env bash
    case "{{ node }}" in
        1|cp1|cp-1)
            echo "{{ node_cp1 }}"
            ;;
        2|cp2|cp-2)
            echo "{{ node_cp2 }}"
            ;;
        3|cp3|cp-3)
            echo "{{ node_cp3 }}"
            ;;
        all)
            echo "{{ all_nodes }}"
            ;;
        10.10.30.*)
            # Anything starting with 10.10.30. is echoed back unchanged
            echo "{{ node }}"
            ;;
        *)
            echo "Error: Unknown node '{{ node }}'. Use cp-1, cp-2, cp-3, or all" >&2
            exit 1
            ;;
    esac

# Render machine configurations with talhelper into the config directory
genconfig:
    talhelper genconfig \
        --config-file talconfig.yaml \
        --secret-file talsecret.sops.yaml \
        --out-dir {{ config_dir }}
    @echo "Configs generated in {{ config_dir }}/"

# WARNING: only run this ONCE, when initializing a new cluster.
# (The summary line is kept last because `just --list` shows only the comment
# line directly above the recipe.)
#
# Bootstrap the first control plane node (CP-1) - run ONCE per new cluster
bootstrap: _require-config
    talosctl bootstrap \
        --talosconfig {{ talosconfig }} \
        --nodes {{ node_cp1 }}

# Pull the kubeconfig from CP-1 and merge it into the default location
kubeconfig: _require-config
    talosctl kubeconfig \
        --nodes {{ node_cp1 }} \
        --talosconfig {{ talosconfig }} \
        --force
    @echo "Kubeconfig merged into ~/.kube/config"

# Usage: just kubeconfig-file <file>
# The path is shell-quoted so that paths containing spaces reach talosctl as
# a single argument.
#
# Fetch kubeconfig from CP-1 and write it to a specific file
kubeconfig-file file: _require-config
    talosctl kubeconfig {{ quote(file) }} \
        --talosconfig {{ talosconfig }} \
        --nodes {{ node_cp1 }} \
        --force
    @echo "Kubeconfig written to {{ file }}"

# Usage: just dashboard [node]
# Examples:
#   just dashboard          # All nodes
#   just dashboard cp-1     # Single node
#   just dashboard all      # Explicit all nodes
#
# Open the talosctl dashboard for one node or all nodes (default: all)
dashboard node="all": _require-config
    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the alias with the same just binary and justfile that is running
    # this recipe, instead of relying on PATH and justfile discovery.
    nodes=$("{{ just_executable() }}" --justfile "{{ justfile() }}" _resolve {{ quote(node) }})
    talosctl dashboard \
        --talosconfig {{ talosconfig }} \
        --nodes "$nodes"

# Run the talosctl health check against every control plane node
health: _require-config
    talosctl health \
        --nodes {{ all_nodes }} \
        --talosconfig {{ talosconfig }}

# Usage: just status [node]
#
# Get machine status for one node or all nodes (default: all)
status node="all": _require-config
    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the alias with the same just binary and justfile that is running
    # this recipe, instead of relying on PATH and justfile discovery.
    nodes=$("{{ just_executable() }}" --justfile "{{ justfile() }}" _resolve {{ quote(node) }})
    talosctl get machinestatus \
        --talosconfig {{ talosconfig }} \
        --nodes "$nodes"

# Get the detailed machine config from a node as YAML (default: cp-1)
get-config node="cp-1": _require-config
    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the alias with the same just binary and justfile that is running
    # this recipe, instead of relying on PATH and justfile discovery.
    nodes=$("{{ just_executable() }}" --justfile "{{ justfile() }}" _resolve {{ quote(node) }})
    talosctl get machineconfig \
        --talosconfig {{ talosconfig }} \
        --nodes "$nodes" \
        -o yaml

# Usage: just services [node]
#
# View services on one node or all nodes (default: all)
services node="all": _require-config
    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the alias with the same just binary and justfile that is running
    # this recipe, instead of relying on PATH and justfile discovery.
    nodes=$("{{ just_executable() }}" --justfile "{{ justfile() }}" _resolve {{ quote(node) }})
    talosctl services \
        --talosconfig {{ talosconfig }} \
        --nodes "$nodes"

# Usage: just logs <service> [node]
# Examples:
#   just logs kubelet       # kubelet logs from cp-1
#   just logs etcd cp-2     # etcd logs from cp-2
#   just logs apid all      # apid logs from all nodes
#
# View logs from a service on a node (default node: cp-1)
logs service node="cp-1": _require-config
    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the alias with the same just binary and justfile that is running
    # this recipe, instead of relying on PATH and justfile discovery.
    nodes=$("{{ just_executable() }}" --justfile "{{ justfile() }}" _resolve {{ quote(node) }})
    # The service name is shell-quoted so it always reaches talosctl as one argument
    talosctl logs {{ quote(service) }} \
        --talosconfig {{ talosconfig }} \
        --nodes "$nodes"

# Follow (stream) logs from a service on a node (default node: cp-1)
logs-follow service node="cp-1": _require-config
    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the alias with the same just binary and justfile that is running
    # this recipe, instead of relying on PATH and justfile discovery.
    nodes=$("{{ just_executable() }}" --justfile "{{ justfile() }}" _resolve {{ quote(node) }})
    # The service name is shell-quoted so it always reaches talosctl as one argument
    talosctl logs {{ quote(service) }} \
        --talosconfig {{ talosconfig }} \
        --nodes "$nodes" \
        --follow

# Usage: just apply-config <node>
#   <node> is cp-1, cp-2 or cp-3 (cp1 / 1 style aliases and the node's exact
#   IP are accepted too). 'all' is not supported because each node has its
#   own generated config file.
#
# Apply the generated machine configuration to a single node
apply-config node: _require-config
    #!/usr/bin/env bash
    set -euo pipefail

    # Resolve node name to both its IP and its per-node config file
    case "{{ node }}" in
        cp-1|cp1|1|{{ node_cp1 }})
            node_ip="{{ node_cp1 }}"
            config_file="{{ config_dir }}/platform-cp-1.yaml"
            ;;
        cp-2|cp2|2|{{ node_cp2 }})
            node_ip="{{ node_cp2 }}"
            config_file="{{ config_dir }}/platform-cp-2.yaml"
            ;;
        cp-3|cp3|3|{{ node_cp3 }})
            node_ip="{{ node_cp3 }}"
            config_file="{{ config_dir }}/platform-cp-3.yaml"
            ;;
        *)
            # Diagnostics go to stderr, matching the _resolve helper
            echo "Error: Unknown node '{{ node }}'. Use cp-1, cp-2, or cp-3" >&2
            exit 1
            ;;
    esac

    echo "Applying $config_file to $node_ip..."
    talosctl apply-config \
        --talosconfig {{ talosconfig }} \
        --nodes "$node_ip" \
        --file "$config_file"

# Usage: just upgrade <node> <version>
# Example: just upgrade cp-1 v1.12.0
# NOTE(review): 'all' resolves to every control plane node, so the upgrade
# would be sent to all of them in one call - confirm that is intended.
#
# Upgrade Talos on a node to the given installer version
upgrade node version: _require-config
    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the alias with the same just binary and justfile that is running
    # this recipe, instead of relying on PATH and justfile discovery.
    nodes=$("{{ just_executable() }}" --justfile "{{ justfile() }}" _resolve {{ quote(node) }})
    talosctl upgrade \
        --talosconfig {{ talosconfig }} \
        --nodes "$nodes" \
        --image "ghcr.io/siderolabs/installer:{{ version }}"

# NOTE(review): 'all' is accepted here and resolves to every control plane
# node - confirm whether resetting all nodes in one call should be allowed.
#
# Reset a node (DANGEROUS - removes from cluster)
[confirm("This will RESET the node and remove it from the cluster. Continue?")]
reset node: _require-config
    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the alias with the same just binary and justfile that is running
    # this recipe, instead of relying on PATH and justfile discovery.
    nodes=$("{{ just_executable() }}" --justfile "{{ justfile() }}" _resolve {{ quote(node) }})
    talosctl reset \
        --talosconfig {{ talosconfig }} \
        --nodes "$nodes" \
        --graceful=false \
        --reboot

# Get dmesg from node(s) (default: cp-1)
dmesg node="cp-1": _require-config
    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the alias with the same just binary and justfile that is running
    # this recipe, instead of relying on PATH and justfile discovery.
    nodes=$("{{ just_executable() }}" --justfile "{{ justfile() }}" _resolve {{ quote(node) }})
    talosctl dmesg \
        --talosconfig {{ talosconfig }} \
        --nodes "$nodes"

# Get memory/cpu info from node(s) (default: all)
resources node="all": _require-config
    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the alias with the same just binary and justfile that is running
    # this recipe, instead of relying on PATH and justfile discovery.
    nodes=$("{{ just_executable() }}" --justfile "{{ justfile() }}" _resolve {{ quote(node) }})
    talosctl get cpu,memory \
        --talosconfig {{ talosconfig }} \
        --nodes "$nodes"

# NOTE(review): confirm the `talosctl disks` subcommand exists in the talosctl
# version used with this cluster; newer releases may expect `talosctl get disks`.
#
# List disks on node(s) (default: all)
disks node="all": _require-config
    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the alias with the same just binary and justfile that is running
    # this recipe, instead of relying on PATH and justfile discovery.
    nodes=$("{{ just_executable() }}" --justfile "{{ justfile() }}" _resolve {{ quote(node) }})
    talosctl disks \
        --talosconfig {{ talosconfig }} \
        --nodes "$nodes"

# Show the etcd member list, queried through CP-1
etcd-members: _require-config
    talosctl etcd members \
        --nodes {{ node_cp1 }} \
        --talosconfig {{ talosconfig }}

# Show etcd status for every control plane node
etcd-status: _require-config
    talosctl etcd status \
        --nodes {{ all_nodes }} \
        --talosconfig {{ talosconfig }}

# List all available recipes
help:
    @"{{ just_executable() }}" --justfile "{{ justfile() }}" --list

# Usage: just talosctl <args>
# Example: just talosctl get members --nodes 10.10.30.10
# NOTE: the arguments are joined with spaces and re-split by the shell, so an
# argument that itself contains spaces will not survive as a single argument.
#
# Run any talosctl command with the generated talosconfig
talosctl *args: _require-config
    talosctl --talosconfig {{ talosconfig }} {{ args }}

# Internal: fail with a hint on stderr unless the talosconfig has been generated
[private]
_require-config:
    @test -f "{{ talosconfig }}" || { echo "Error: Run 'just genconfig' first" >&2; exit 1; }
3 changes: 2 additions & 1 deletion infrastructure/compute/talos/talconfig.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,10 @@ nodes:
type: nvme
# UM760 uses hybrid trunk: VLAN 20 native, VLAN 30 tagged
# Configure VLAN 30 sub-interface for platform traffic
# MAC address is for enp2s0 (2.5GbE port connected to VyOS eth2)
networkInterfaces:
- deviceSelector:
hardwareAddr: "*"
hardwareAddr: "38:05:25:34:25:d0"
dhcp: false
vlans:
- vlanId: 30
Expand Down
27 changes: 24 additions & 3 deletions infrastructure/network/vyos/configs/gateway.conf
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,15 @@
* VLAN Architecture:
* 10 - LAB_MGMT (10.10.10.0/24) - Infrastructure management
* 20 - LAB_PROV (10.10.20.0/24) - Provisioning (PXE)
* 30 - LAB_PLATFORM (10.10.30.0/24) - Platform cluster
* 30 - LAB_PLATFORM (10.10.30.0/24) - Platform cluster (via br30 bridge)
* 40 - LAB_CLUSTER (10.10.40.0/24) - Tenant clusters
* 50 - LAB_SERVICE (10.10.50.0/24) - Service VIPs (BGP)
* 60 - LAB_STORAGE (10.10.60.0/24) - Storage replication
*
* Bridge Architecture:
* br30 - Bridges eth1.30 (switch trunk) and eth2.30 (UM760 direct connect)
* This allows the UM760 to participate in VLAN 30 via eth2
* while other devices access VLAN 30 via the switch trunk
*/

firewall {
Expand Down Expand Up @@ -180,6 +185,20 @@ firewall {
}
}
interfaces {
/* Bridge for VLAN 30 - allows UM760 (eth2) to join platform network
* Both eth1.30 (switch trunk) and eth2.30 (UM760) are bridge members
* The gateway IP lives on the bridge, not the individual interfaces
*/
bridge br30 {
address 10.10.30.1/24
description "LAB_PLATFORM - Platform Cluster"
member {
interface eth1.30 {
}
interface eth2.30 {
}
}
}
ethernet eth0 {
address 10.0.0.2/30
description "WAN - Transit to Home (CCR2004)"
Expand All @@ -195,8 +214,7 @@ interfaces {
description "LAB_PROV - Provisioning (PXE)"
}
vif 30 {
address 10.10.30.1/24
description "LAB_PLATFORM - Platform Cluster"
description "LAB_PLATFORM - Bridge member (br30)"
}
vif 40 {
address 10.10.40.1/24
Expand All @@ -213,6 +231,9 @@ interfaces {
}
ethernet eth2 {
description "LAN - UM760 Platform Anchor Node"
vif 30 {
description "LAB_PLATFORM - Bridge member (br30)"
}
}
}
nat {
Expand Down
22 changes: 21 additions & 1 deletion infrastructure/network/vyos/tests/test_operational.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ def test_trunk_interface_up(self, vyos_show, test_topology):
[
("10", "10.10.10.1"),
("20", "10.10.20.1"),
("30", "10.10.30.1"),
("40", "10.10.40.1"),
("50", "10.10.50.1"),
("60", "10.10.60.1"),
Expand All @@ -45,6 +44,27 @@ def test_vlan_interface_up(self, vyos_show, test_topology, vif, gateway_ip):
assert "up" in output.lower(), f"VLAN {vif} interface is not up"
assert gateway_ip in output, f"VLAN {vif} missing IP {gateway_ip}"

def test_vlan30_bridge_interface_up(self, vyos_show, test_topology):
"""VLAN 30 uses a bridge for the platform network.

The platform network (VLAN 30) is bridged to allow the UM760 anchor node
to participate via a direct connection (eth4 in test, eth2 in production).
The gateway IP lives on br30, not on the VLAN interface directly.
"""
# Check that the VLAN interface is up and is a bridge member
vlan_output = vyos_show(
f"show interfaces ethernet {test_topology.trunk_iface} vif 30"
)
assert "up" in vlan_output.lower(), "VLAN 30 interface is not up"
assert "br30" in vlan_output, "VLAN 30 should be a member of br30"

# Check that the bridge has the gateway IP
bridge_output = vyos_show("show interfaces bridge br30")
assert "up" in bridge_output.lower(), "Bridge br30 is not up"
assert test_topology.platform_gateway in bridge_output, (
f"Bridge br30 missing IP {test_topology.platform_gateway}"
)


class TestRoutingState:
"""Test routing table state."""
Expand Down