# Standing up EJFAT LB on a FABRIC U280-equipped node

Your compute nodes can include FPGAs. These devices are made available as FABRIC components and can be added to your nodes like any other component. Your project must have the Component.FPGA permission tag in order to provision them.

This notebook stands up a single VM with an attached U280 and starts up the necessary stacks on it to get the Load Balancer running. There is an optional set of steps in the middle to help build the needed docker containers. Alternatively they can be fetched from a storage VM on EDC (there is another notebook that shows how to stand it up - it is attached to a persistent storage volume that contains pre-built artifacts).

<div>
    <img src="figs/u280-slice.png" width=500>
</div>



## Setup the Experiment

In [None]:
import json
from fabrictestbed_extensions.fablib.fablib import FablibManager as fablib_manager

# single fablib manager instance shared by all the cells below
fablib = fablib_manager()

# show the active fablib configuration
fablib.show_config();


# until fablib fixes this
def get_management_os_interface(node) -> "str | None":
    """
    Gets the name of the management interface used by the node's
    operating system.

    The interface is the device of the default route. The IPv4 routing
    table is consulted first; the IPv6 table is only queried when the
    IPv4 table has no default route.

    :param node: object exposing ``execute(command, quiet=...)`` that
        returns a ``(stdout, stderr)`` tuple
    :return: interface name, or None when neither table has a default route
    :rtype: String
    """
    # same lookup for both address families, IPv4 takes precedence
    for route_cmd in ("sudo ip -j route list", "sudo ip -6 -j route list"):
        stdout, stderr = node.execute(route_cmd, quiet=True)

        # empty output is treated as "no routes" rather than crashing json.loads
        routes = json.loads(stdout) if stdout and stdout.strip() else []

        for route in routes:
            if route["dst"] == "default":
                return route["dev"]

    return None

def execute_single_node(node, commands):
    """
    Run each command on a single node, one after another.

    Every command is announced before it runs. If a command produces
    anything on stderr, that output is reported together with the command
    that caused it. An empty command list is a no-op.

    :param node: object exposing ``get_name()`` and ``execute(command)``
        returning a ``(stdout, stderr)`` tuple
    :param commands: iterable of shell command strings
    """
    for command in commands:
        print(f'\tExecuting "{command}" on node {node.get_name()}')
        stdout, stderr = node.execute(command)
        # checked per command, so an error is attributed to the command that
        # produced it; truthiness also covers stderr being None
        if stderr:
            print(f'Error encountered with "{command}": {stderr}')
        
def execute_commands(node, commands):
    """
    Run the commands sequentially on one node or on each node of a list.

    :param node: a single node, or a list of nodes
    :param commands: list of shell command strings
    """
    # a lone node is handled as a one-element list
    targets = node if isinstance(node, list) else [node]
    for target in targets:
        execute_single_node(target, commands)

def execute_single_node_on_thread(node, commands):
    """
    Launch all commands on the node as one ';'-joined command line via
    ``node.execute_thread``; output goes to '<node name>_thread.log'.

    :param node: object exposing ``get_name()`` and ``execute_thread()``
    :param commands: list of shell command strings
    """
    joined = ';'.join(commands)
    log_name = node.get_name() + '_thread.log'
    node.execute_thread(joined, output_file=log_name)

def execute_commands_on_threads(node, commands):
    """
    Launch the commands via execute_single_node_on_thread on one node or
    on each node of a list.

    :param node: a single node, or a list of nodes
    :param commands: list of shell command strings
    """
    # a lone node is handled as a one-element list
    targets = node if isinstance(node, list) else [node]
    for target in targets:
        execute_single_node_on_thread(target, commands)

## Select a site with E2SAR-assigned FPGA

The cells below help you create a slice that contains a single node with an attached FPGA. 

In [None]:
# check which sites have available FPGAs on hosts (and also list memory, disk and core on those hosts)
# Overall list of sites that are usable with ESnet workflow
# (kept for reference only - it is immediately overwritten by the assignment below)
sites_to_check = ['STAR', 'TACC', 'MICH', 'UTAH', 'NCSA', 'WASH', 'DALL', 'SALT', 'UCSD', 'CLEM', 'LOSA', 'KANS', 'PRIN', 'SRI']
# sites that were cleared for this project
sites_to_check = ['LOSA', 'KANS', 'WASH']

# worker name is <site in lower case>-w[0-9]+.fabric.net
# keep only hosts whose site prefix is in sites_to_check and whose U280 capacity is non-zero.
# The order of `fields` matters: the next cell indexes the resulting columns by position.
hosts = fablib.list_hosts(fields=['name','cores_available','ram_available','disk_available','fpga_u280_available'], 
                          filter_function=lambda s: (s['name'].split('-')[0].upper() in sites_to_check) and 
                          s['fpga_u280_capacity']>0)


In [None]:
# recommend a site with availability
# minimum resources the VM needs; the same values are used when the node is requested
ram = 32
cores = 8
disk = 100

# hosts is a Styler over a DataFrame, we want to get the underlying numpy array
# columns follow the `fields` order given to list_hosts():
#   0: name, 1: cores_available, 2: ram_available, 3: disk_available, 4: fpga_u280_available
recommended_sites = []
for host in hosts.data.to_numpy():
    # >= so that a host with exactly the requested amount free still qualifies
    if host[1] >= cores and host[2] >= ram and host[3] >= disk and host[4] > 0:
        recommended_sites.append(host[0].split('-')[0].upper())

# the list is never None - an empty list is what means "nothing usable"
if recommended_sites:
    print(f'Recommended sites are {recommended_sites}')
else:
    print(f'Unable to find a usable site among {sites_to_check}')

In [None]:
# FPGA site should only be one of these as these are assigned to the project
# Pick one of the outcomes from recommended sites above
site='KANS'

# component model passed to add_component() when the FPGA is attached
FPGA_CHOICE='FPGA_Xilinx_U280'

# name the slice and the node 
slice_name=f'E2SAR U280 LB Slice on {site}'

fpga_node_name='LB-node'
image = 'default_ubuntu_22'
# name of the L3 network service created for this slice: 'fabnetv4_<site>'
net_name = '_'.join(['fabnetv4', site])

# storage VM - update this after you provision it using the other notebook
storage_vm_ip = "10.132.137.2"
# credentials passed to curl (-u) for every transfer to/from the storage VM
# NOTE(review): these are hard-coded in the notebook - consider loading them from outside it
nginx_user = "fpga_tools"
nginx_password = "changemenow123"

# version to use for saving docker images
# (becomes part of every <image>-<version>.tar.gz file name below)
docker_image_version = "12162024"

print(f'Will create slice "{slice_name}" with node "{fpga_node_name}"')

## Create a slice with a node with FPGA at desired site

This slice has one VM with an FPGA and a basic NIC to talk to the storage VM. 

In [None]:
# Create Slice. Note that by default submit() call will poll for 360 seconds every 10-20 seconds
# waiting for slice to come up. Normal expected time is around 2 minutes. 
slice = fablib.new_slice(name=slice_name)
storage_nic_name = 'storage_nic'

# Add the node, sized by the cores/ram/disk values and the OS image chosen in the earlier cells
node = slice.add_node(name=fpga_node_name, site=site, cores=cores, ram=ram, disk=disk, image=image)
# and a basic NIC to talk to storage
iface1 = node.add_component(model='NIC_Basic', name=storage_nic_name).get_interfaces()[0]

# postboot configuration is under 'post-boot' directory
node.add_post_boot_upload_directory('post-boot','.')
node.add_post_boot_execute(f'chmod +x post-boot/lb-node.sh && ./post-boot/lb-node.sh')
# FABNetv4 on shared NIC (to talk to storage)
#node.add_fabnet()

# attach the FPGA and take its first two interfaces; only fpga_p1 is connected to a network below
fpga_comp = node.add_component(model=FPGA_CHOICE, name='fpga1')
fpga_p1 = fpga_comp.get_interfaces()[0]
fpga_p2 = fpga_comp.get_interfaces()[1]

# use FABNetv4 to connect port 1 of FPGA and the basic NIC (yes it is the same network)
net = slice.add_l3network(name=net_name, interfaces=[fpga_p1, iface1], type='IPv4')

# Submit Slice Request
slice.submit();

In [None]:
# Get slice details. You can rerun this as many times as you want
# (this also re-binds `slice` and `node`, which later cells rely on)
slice = fablib.get_slice(name=slice_name)
node = slice.get_node(name=fpga_node_name)

# we will use these IPs on the basic NIC and as a pool of IPv4 addresses for the
# FPGA. Pool of FPGA IPv6 addresses will be fake in this scenario - we cannot attach
# more than one network service to an interface
network = slice.get_network(name=net_name)
# all host addresses of the subnet; a later cell pops addresses off the end of this list
network_available_ips = list(network.get_subnet().hosts())
print(network)

## Setup IOMMU and Hugepages
For DPDK to function properly we need to set up hugepages and IOMMU on the VM.

In [None]:
# configure GRUB
slice = fablib.get_slice(name=slice_name)
node = slice.get_node(name=fpga_node_name)

commands = list()
# alternative that appends to an existing, non-empty GRUB_CMDLINE_LINUX value:
#commands.append("sudo sed -i 's/GRUB_CMDLINE_LINUX=\"\\(.*\\)\"/GRUB_CMDLINE_LINUX=\"\\1 amd_iommu=on iommu=pt default_hugepagesz=1G hugepagesz=1G hugepages=8\"/' /etc/default/grub")
# the active variant only matches an EMPTY GRUB_CMDLINE_LINUX="" line; it enables the IOMMU
# and asks for eight 1G hugepages on the kernel command line
commands.append("sudo sed -i 's/GRUB_CMDLINE_LINUX=\"\"/GRUB_CMDLINE_LINUX=\"amd_iommu=on iommu=pt default_hugepagesz=1G hugepagesz=1G hugepages=8\"/' /etc/default/grub")
# regenerate the GRUB configuration so the new command line is used on the next boot
commands.append("sudo grub-mkconfig -o /boot/grub/grub.cfg")
commands.append("sudo update-grub")

# commands are run one by one; stdout/stderr are captured but not inspected here
for command in commands:
    print(f'Executing {command}')
    stdout, stderr = node.execute(command)
    
print('Done')

Reboot the node (this sometimes generates an EOFError exception - ignore it and continue)

In [None]:
reboot = 'sudo reboot'

print(reboot)
try:
    node.execute(reboot)
except EOFError:
    # the reboot can drop the SSH session before the command returns; that is
    # expected, so carry on and wait for the node instead of aborting the cell
    print('SSH session closed by the reboot (expected) - waiting for the node to come back')

# block until the node accepts SSH connections again
slice.wait_ssh(timeout=360,interval=10,progress=True)

print("Now testing SSH abilites to reconnect...",end="")
slice.update()
slice.test_ssh()
print("Reconnected!")

Check that IOMMU was enabled

In [None]:
# list the kernel log lines that mention IOMMU (case-insensitive)
command = 'sudo dmesg | grep -i IOMMU'

print('Observe that the modifications to boot configuration took place and IOMMU is detected')
stdout, stderr = node.execute(command)

# NOTE(review): presumably re-applies fablib's node configuration after the reboot - confirm
node.config()

Disable IOMMU support in VFIO by enabling its unsafe no-IOMMU mode (IOMMU passthrough doesn't actually work in this setup)

In [None]:
# Enable unsafe_noiommu_mode for the vfio module
# (written through sysfs with tee because the redirect itself needs root;
#  presumably this does not survive a reboot - TODO confirm)
command = "echo 1 | sudo tee /sys/module/vfio/parameters/enable_unsafe_noiommu_mode"

stdout, stderr = node.execute(command)

## Configure the NIC interface for accessing storage and inbound control plane and get a pool of IPs for the FPGA

In [None]:
import ipaddress 

def get_valid_ips(ip_list, gateway, qty=1):
    """
    Take `qty` addresses off the END of `ip_list`, skipping the gateway.

    `ip_list` is modified in place: every returned address (and the gateway,
    if it was encountered) is removed from it, so consecutive calls hand out
    disjoint addresses. Running out of addresses raises IndexError from
    list.pop().

    :param ip_list: mutable list of candidate addresses
    :param gateway: address that must never be handed out
    :param qty: number of addresses wanted
    :return: list of `qty` addresses in the order they were taken
    """
    selected = []
    while len(selected) < qty:
        candidate = ip_list.pop()
        # make sure the returned IP isn't the gateway
        while candidate == gateway:
            candidate = ip_list.pop()
        selected.append(candidate)

    return selected
    
# one address for the basic NIC; get_valid_ips() pops it from network_available_ips,
# so the FPGA pool taken below cannot contain it
storage_nic_ip = get_valid_ips(network_available_ips, network.get_gateway())[0]

# number of fpga IP addresses we want
fpga_ips_qty = 8

fpga_ips = get_valid_ips(network_available_ips, network.get_gateway(), qty=fpga_ips_qty)

print(f'Using this ip to talk to storage: {storage_nic_ip}')
print(f'Using these IPs for FPGA IPv4 pool: {fpga_ips}')

# FABRIC pretends to use /24, but really it is a /10 routable space
# site_subnet is used when addressing the NIC, full_subnet for the route added in the next cell
site_subnet= network.get_subnet()
full_subnet = ipaddress.IPv4Network('10.128.0.0/10')

In [None]:
# note that we can't get an interface by network name as in this example both FPGA and Shared NIC 
# are hanging off the same network service
# the interface is therefore looked up by its name: '<node name>-<NIC component name>-p1'
node_iface = node.get_interface(name='-'.join([fpga_node_name, storage_nic_name, 'p1']))
print(f'Configuring interface {node_iface.get_name()}')
node_iface.ip_addr_add(addr=storage_nic_ip, subnet=site_subnet)
# route the whole /10 through the site gateway
node.ip_route_add(subnet=full_subnet, gateway=network.get_gateway())
# show the resulting addresses and routes for a visual check
commands = [
    "ip a",
    "ip r"
]
execute_commands(node, commands)

In [None]:
# since firewalld is running, lets tell it all interfaces are trusted
# WATCH OUT FOR WHAT IS TYPICALLY enp7s0 (data interface) SPONTANEOUSLY CHANGING ITS NAME
# IF THIS HAPPENS YOU MIGHT SEE REPORTED ROUTING PROBLEMS, BUT  IT IS THE FIREWALL
mgmt_iface_name = get_management_os_interface(node)
data_iface = node.get_interface(network_name=network.get_name())
data_iface_name = data_iface.get_os_interface()

# 1. put the data interface, loopback and the management interface into the trusted zone
# 2. strip every service from the public zone
# 3. reload so the permanent changes take effect, then list the trusted zone for a visual check
commands = [
    f'sudo firewall-cmd --permanent --zone=trusted --add-interface={iface}'
    for iface in (data_iface_name, 'lo', mgmt_iface_name)
] + [
    f'for i in $(sudo firewall-cmd --zone=public --list-services); do sudo firewall-cmd --zone=public --permanent --remove-service=$i; done',
    f'sudo firewall-cmd --reload',
    f'sudo firewall-cmd --list-all --zone=trusted',
]

execute_commands([node], commands)

In [None]:
# test connectivity to storage node
# flood ping (-f) of 100 packets (-c 100), printing only the summary (-q)
commands = [
    f"sudo ping -q -f -c 100 {storage_vm_ip}"
]
execute_commands(node, commands)

## Update Docker daemon configuration to make sure builds work on IPv6 hosts

In [None]:
# upload the local config/daemon.json, install it as /etc/docker/daemon.json owned by root,
# then restart docker and show its status
node.upload_file('config/daemon.json', 'daemon.json')
commands = [
    "sudo mv daemon.json /etc/docker/; sudo chown root:root /etc/docker/daemon.json",
    "sudo systemctl restart docker",
    "sudo systemctl status docker"
]

execute_commands(node, commands)

## Build the docker containers and upload to storage VM (optional)

We have to build 3 container images:
- xilinx-labtools-docker (requires Xilinx labtools)
- smartnic-dpdk-docker 
- esnet-smartnic-fw (requires P4 artifact from ESnet team)

These are `docker compose`d together into a running stack. In addition UDPLBd must be run on top of this stack to provide the control plane functionality.

### Building xilinx-labtools-docker container

Following these instructions:
- https://github.com/esnet/xilinx-labtools-docker/

Overall steps are:
- Checkout the repo https://github.com/esnet/xilinx-labtools-docker/
- Download the missing files from Storage VM
- Run docker build

In [None]:
# this is a map between destination directory names and files. On the storage VM all files are in the same directory
fetch_file_list = {
    "sc-fw-downloads": ["SC_U280_4_3_31.zip", "SC_U55C_7_1_23.zip", "loadsc_v2.3.zip"],
    "vivado-installer": ["Vivado_Lab_Lin_2023.2_1013_2256.tar.gz"]
}
# start from a fresh checkout of the repo
commands = [
    "rm -rf xilinx-labtools-docker",
    f"git clone https://github.com/esnet/xilinx-labtools-docker.git"
]

# common curl prefix: silent (-s), no TLS certificate verification (-k), basic auth (-u);
# the file name is appended to the URL below
curl_command = f"curl -s -k -u {nginx_user}:{nginx_password} https://{storage_vm_ip}/ejfat-data/artifacts/Vivado-Labtools/"

# download each file into the home directory, then move it into its directory inside the repo
for ddir, filelist in fetch_file_list.items():
    for file in filelist:
        commands.append(f"{curl_command}{file} > {file}")
        commands.append(f"mv {file} xilinx-labtools-docker/{ddir}")
          
execute_commands(node, commands)

In [None]:
# build step
# ${USER} is expanded by the shell on the node, so the image is tagged '<remote user>-dev'

commands = [
    "cd xilinx-labtools-docker; docker build --pull -t xilinx-labtools-docker:${USER}-dev .",
    "docker image ls"
]
execute_commands(node, commands)

In [None]:
# save image to file and upload to the storage VM
# (docker save output is gzipped into a versioned tarball, then sent with curl -T)
commands = [
    f"docker save xilinx-labtools-docker | gzip > xilinx-labtools-docker-{docker_image_version}.tar.gz",
    f"curl -s -k -u {nginx_user}:{nginx_password} -T xilinx-labtools-docker-{docker_image_version}.tar.gz https://{storage_vm_ip}/ejfat-data/smartnic-docker-images/"
]

execute_commands(node, commands)

### Building smartnic-dpdk-docker container

Following these instructions
- https://github.com/esnet/smartnic-dpdk-docker

In [None]:
# clone
# (fresh checkout each time, including all nested submodules)
commands = [
    "rm -rf smartnic-dpdk-docker",
    "git clone https://github.com/esnet/smartnic-dpdk-docker.git",
    "cd smartnic-dpdk-docker; git submodule update --init --recursive"
]

execute_commands(node, commands)

In [None]:
# build step
# ${USER} is expanded by the shell on the node, so the image is tagged '<remote user>-dev'

commands = [
    "cd smartnic-dpdk-docker; docker build --pull -t smartnic-dpdk-docker:${USER}-dev .",
    "docker image ls"
]

execute_commands(node, commands)

In [None]:
# save image to file and upload to the storage VM
# (docker save output is gzipped into a versioned tarball, then sent with curl -T)
commands = [
    f"docker save smartnic-dpdk-docker | gzip > smartnic-dpdk-docker-{docker_image_version}.tar.gz",
    f"curl -s -k -u {nginx_user}:{nginx_password} -T smartnic-dpdk-docker-{docker_image_version}.tar.gz https://{storage_vm_ip}/ejfat-data/smartnic-docker-images/"
]

execute_commands(node, commands)

### Building esnet-smartnic-fw container

Following these instructions:
- https://github.com/esnet/esnet-smartnic-fw/

In [None]:
# clone. You need to do this step even if you've built this before, sn-stack/ gets untarred into this.

# optionally use a specific commit
commit_hash = None
#commit_hash = "640b63981413f7fb5daef6e140c1e1896beff75f"

# start from a fresh checkout
commands = [
    "rm -rf esnet-smartnic-fw",
    "git clone https://github.com/esnet/esnet-smartnic-fw.git",
]

# check out the requested commit BEFORE touching submodules, so that the
# submodules are updated to the revisions recorded by that commit
if commit_hash is not None:
    commands.append(f"cd esnet-smartnic-fw; git checkout {commit_hash}")

commands.append("cd esnet-smartnic-fw; git submodule init && git submodule update")

execute_commands(node, commands)

In [None]:
# Download the P4 artifact and install in the right place

# this is the hardware version that is part of the bitfile file name
sn_hw_ver="57684"
p4_artifact = f"artifacts.au280.udplb.{sn_hw_ver}.zip"

# fetch the zip from the storage VM into the home directory, then move it under sn-hw/ in the repo
commands = [
    f"curl -s -k -u {nginx_user}:{nginx_password} https://{storage_vm_ip}/ejfat-data/artifacts/P4/{p4_artifact} > {p4_artifact}",
    f"mv {p4_artifact} esnet-smartnic-fw/sn-hw"
]

execute_commands(node, commands)

In [None]:
# create an env file for the build
# if the P4 bitfile was called artifacts.au280.udplb.57684.zip, then
# SN_HW_VER=57684
# SN_HW_BOARD=au280 
# SN_HW_APP_NAME=udplb

# sn_hw_ver comes from the artifact download cell above
env_file = f"""
SN_HW_VER={sn_hw_ver}
SN_HW_BOARD=au280
SN_HW_APP_NAME=udplb
"""

# reset .env to a copy of example.env, then append the settings above to it
commands = [
    f"cd esnet-smartnic-fw; rm -f .env; cp example.env .env",
    f"echo '{env_file}' >> ~/esnet-smartnic-fw/.env"
]
execute_commands(node, commands)

In [None]:
# run the build
# (runs the repo's build.sh, then lists the images so the result can be checked)

commands = [
    f"cd esnet-smartnic-fw; ./build.sh",
    f"docker image ls"
]
execute_commands(node, commands)

In [None]:
# save image to file and upload to the storage VM
# also tar up the sn-stack/ directory and ship to storage VM
# (the image tarball is created in the home directory, the sn-stack tarball inside the repo,
#  which is why the last upload changes directory first)
commands = [
    f"docker save esnet-smartnic-fw | gzip > esnet-smartnic-fw-{docker_image_version}.tar.gz",
    f"cd esnet-smartnic-fw; tar -zcf sn-stack-{docker_image_version}.tar.gz sn-stack/",
    f"curl -s -k -u {nginx_user}:{nginx_password} -T esnet-smartnic-fw-{docker_image_version}.tar.gz https://{storage_vm_ip}/ejfat-data/smartnic-docker-images/",
    f"cd esnet-smartnic-fw; curl -s -k -u {nginx_user}:{nginx_password} -T sn-stack-{docker_image_version}.tar.gz https://{storage_vm_ip}/ejfat-data/smartnic-docker-images/"
]

execute_commands(node, commands)

## Download containers from storage VM and install

You can use previously built containers here by downloading them from the storage VM and installing into Docker.



In [None]:
# docker image prefixes
docker_image_prefixes = ['smartnic-dpdk-docker', 'xilinx-labtools-docker', 'esnet-smartnic-fw']

# download and install all available docker images with the right version
commands = []

# we add sn-stack here - it's not a docker image, but just a zipped up tree
# we need to download it, but it doesn't get installed as a docker image
# https is used (with -k), as for every other transfer to the storage VM, so that
# the basic-auth credentials are not sent in clear text
for prefix in docker_image_prefixes + ['sn-stack']:
    commands.append(f"curl -s -k -u {nginx_user}:{nginx_password} https://{storage_vm_ip}/ejfat-data/smartnic-docker-images/{prefix}-{docker_image_version}.tar.gz > {prefix}-{docker_image_version}.tar.gz")

execute_commands(node, commands)

In [None]:
# install the images in the docker on the node. 
# Remember to checkout esnet-smartnic-fw repo (no need to build, just check it out)
# the clone is skipped when an esnet-smartnic-fw directory already exists
commands = [ f"if [ ! -e esnet-smartnic-fw ]; then git clone https://github.com/esnet/esnet-smartnic-fw.git; fi" ]
# load each downloaded tarball into docker
for prefix in docker_image_prefixes:
    commands.append(f"docker load --input {prefix}-{docker_image_version}.tar.gz")

commands.append(f"docker image ls")

# untar sn-stack into the previously checked out repo
commands.append(f"tar -zxf sn-stack-{docker_image_version}.tar.gz -C esnet-smartnic-fw/")

execute_commands(node, commands)

## Generate secrets

Secret tokens are used both between the control plane and the FPGA and between clients and the control plane. We generate them here.

In [None]:
import secrets
import string

def generate_token(length=32):
    """
    Return a random token of `length` characters drawn from uppercase
    ASCII letters and digits, using the `secrets` module.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = [secrets.choice(alphabet) for _ in range(length)]
    return ''.join(picked)

# sn_cfg_token and sn_p4_token go into sn-stack/.env (SN_CFG_AUTH_TOKEN / SN_P4_AUTH_TOKEN);
# sn_p4_token and cp_admin_token go into the UDPLBd config
sn_cfg_token = generate_token()
sn_p4_token = generate_token()
cp_admin_token = generate_token()

# NOTE: this prints the secrets into the notebook output
print(f'Generated the following tokens: \n\tsn_cfg={sn_cfg_token}\n\tsn_p4={sn_p4_token}\n\tcp_admin={cp_admin_token}')

## Stand up the stack

Here we bring up the FPGA and UDPLBd on top of it for control plane following these instructions: 
- https://github.com/esnet/esnet-smartnic-fw/blob/main/sn-stack/README.INSTALL.md 

In [None]:
# configure sn-stack/.env
# You can use `openssl rand -base64 24` to generate tokens
# (here the tokens generated in the previous section are substituted in)
env_file = f"""
# block-start Added by the FABRIC notebook 
# relies on default already having SN_INGRESS_PORT=8440
FPGA_PCIE_DEV=0000:1f:00
# enables traefik
COMPOSE_PROFILES=smartnic-mgr-vfio-unlock,smartnic-ingress
SN_HOST=localhost
SN_CFG_AUTH_TOKEN={sn_cfg_token}
SN_P4_AUTH_TOKEN={sn_p4_token}
SMARTNIC_DPDK_IMAGE_URI=smartnic-dpdk-docker:ubuntu-dev
LABTOOLS_IMAGE_URI=xilinx-labtools-docker:ubuntu-dev
SMARTNIC_FW_IMAGE_URI=esnet-smartnic-fw:ubuntu-dev
# block-end 
"""

# upload the sn-cfg setup file to be executed from inside the container once it is up
sn_cfg_file="config/u280_setup.sh"
# scratch is mounted into the container
sn_cfg_install_path="/home/ubuntu/esnet-smartnic-fw/sn-stack/scratch/u280_setup.sh"
result = node.upload_file(sn_cfg_file, sn_cfg_install_path)

# append (>>) the block above to sn-stack/.env and make the uploaded script executable;
# re-running this cell appends the block again
commands = [
    f"echo '{env_file}' >> ~/esnet-smartnic-fw/sn-stack/.env",
    f"chmod a+x {sn_cfg_install_path}"
]
execute_commands(node, commands)

In [None]:
# bring up the stack
# (detached, from the sn-stack directory; the commented-out variant additionally
#  renames traefik/config.d/certs.yml to certs.yml.hidden first)

commands = [
#    "cd esnet-smartnic-fw/sn-stack; mv traefik/config.d/certs.yml{,.hidden}; docker compose up -d"
    "cd esnet-smartnic-fw/sn-stack; docker compose up -d"
]
execute_commands(node, commands)

In [None]:
# wait for some time, check the logs
# (only the logs of the smartnic-hw service are shown)
commands = [
    f"cd esnet-smartnic-fw/sn-stack; docker compose logs smartnic-hw"
]
execute_commands(node, commands)

In [None]:
# run initial configuration
# the command should print out details about the detected card something like 
# ----------------------------------------
# Device ID: 0
# ----------------------------------------
# PCI:
#     Bus ID:    0000:1f:00.0
#     Vendor ID: 0x10ee
#     Device ID: 0x903f
# Build:
#     Number: 0x0000e154
#     Status: 0x09231435
#     DNA[0]: 0x4cc061c5
#     DNA[1]: 0x016ad0a3
#     DNA[2]: 0x40020000
# Card:
#     Name:                  ALVEO U280 PQ
#     Profile:               U280
#     Serial Number:         21770329D004
#     Revision:              1.0
#     SC Version:            4.0
#     Config Mode:           MASTER_SPI_X4
#     Fan Presence:          P
#     Total Power Available: 225W
#     Cage Types:
#     MAC Addresses:
#         0: 00:0A:35:0E:26:36
#         1: 00:0A:35:0E:26:37
#         2: FF:FF:FF:FF:FF:FF
#         3: FF:FF:FF:FF:FF:FF
# Critically it should show 'Link: up' at the bottom for both ports
#
# /scratch/u280_setup.sh is the script uploaded earlier into sn-stack/scratch on the node;
# it is run inside the smartnic-fw service container
commands = [
    f"cd esnet-smartnic-fw/sn-stack; docker compose exec smartnic-fw /scratch/u280_setup.sh"
]

execute_commands(node, commands)

In [None]:
# bring the stack down
# (-v also removes the volumes, --remove-orphans removes containers not defined in the compose file)
commands = [
    f"cd esnet-smartnic-fw/sn-stack;docker compose down -v --remove-orphans"
]
execute_commands(node, commands)

## Stand up UDPLBd Control Plane

In [None]:
# generate a config file based on issued IP addresses

# we need several inputs to this file:
# - secret tokens (to talk to FPGA and to clients)
# - list of IPv4 addresses to use (we provide fake IPv6 addresses for now)
# - IPv4 address to use for CP and Sync (usually the same)
# - list of UDP ports for sync messages
# - path to TLS cert

# we need to use ssh keys here because UDPLBd repo is private
# overwrite with location of your github key
github_key = '/home/fabric/work/fabric_config/github_ecdsa'
vm_key_location = f'/home/ubuntu/.ssh/github_ecdsa'

# which branch of UDPLBd code
udplbd_branch = "main"

# other parameter inputs
cp_ipv4 = storage_nic_ip
cp_listen_port = 18008
start_sync_port = 19010
sn_listen_port = 8440

# generate lists inserted into config
# (YAML list items indented to line up under 'ipv4:' and 'ports:'; one sync port per FPGA IP)
ipv4_list = "    - " + "\n    - ".join(map(str, fpga_ips))
udp_port_list = "    - " + "\n    - ".join(map(str, range(start_sync_port, start_sync_port + fpga_ips_qty)))

# config file template
cp_config_file = f"""
lb:
  id: 0
  ipv4:
{ipv4_list}
  ipv6:
    - "2001:400:a300::10"
    - "2001:400:a300::11"
    - "2001:400:a300::12"
    - "2001:400:a300::13"
    - "2001:400:a300::14"
    - "2001:400:a300::15"
    - "2001:400:a300::16"
    - "2001:400:a300::17"
  mac_unicast: "02:aa:bb:cc:dd:08"
  mac_broadcast: "33:33:ff:00:00:08"
  allow_private: true
event_numbers:
  host: {cp_ipv4}
  ports:
{udp_port_list}
controller:
  duration: 1s
  offset: 800ms
database:
  file: "/tmp/udplbd.db"
  collection:
    enable: true
    interval: "100ms"
    retention: "168h"
server:
  listen:
    - '{cp_ipv4}:{cp_listen_port}'
  auth_token: "{cp_admin_token}"
  tls:
    enable: true
    certFile: "/etc/udplbd/server_cert.pem"
    keyFile: "/etc/udplbd/server_key.pem"
http:
  listen: 127.0.0.1:8080
  dir: /frontend
log:
  level: debug
smartnic:
  - host: "localhost"
    port: {sn_listen_port}
    mock: false
    auth_token: "{sn_p4_token}"
    tls:
      enable: true
      verify: false
prometheus:
  enable: true
  listen: 127.0.0.1:2108
mockclient:
  status_update_interval_ms: 1000
  status_update_to_p4_update_latency_ms: 500
  buffer_slope_per_second: 100
  buf_count: 1000
  set_point_percent: 0.5
"""

# upload the GitHub SSH key onto the VM
result = node.upload_file(github_key, vm_key_location)

# clone the UDPLBd repo
# change permissions on imported key
# generate a private key and cert
# install config file where needed
#
# The config is written with a QUOTED here-document: the template contains both
# single and double quotes, and passing it through echo "..." would let the shell
# strip every double quote from the YAML. cp_config_file starts and ends with a
# newline, which puts the here-document body and its terminator on their own lines.
#
# NOTE(review): the openssl -subj below carries subjectAltName inside the subject
# string; it presumably does not produce a real SAN extension - confirm whether
# -addext "subjectAltName=IP:..." is wanted instead.
commands = [
    f"rm -rf udplbd/",
    f"chmod go-rwx {vm_key_location}",
    f"GIT_SSH_COMMAND='ssh -i {vm_key_location} -o IdentitiesOnly=yes -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no' git clone -b {udplbd_branch} git@github.com:esnet/udplbd.git",
    f'openssl req -x509 -newkey rsa:4096 -keyout udplbd/etc/server_key.pem -out udplbd/etc/server_cert.pem -sha256 -days 365 -nodes -subj "/CN=cpnode/subjectAltName=IP:{cp_ipv4}" -nodes',
    f"cat > ./udplbd/etc/config.yml <<'UDPLBD_CONFIG_EOF'{cp_config_file}UDPLBD_CONFIG_EOF"
]

execute_commands(node, commands)

In [None]:
# start the stack
# (detached, using the compose file in the cloned udplbd repo)
commands = [
    f'cd udplbd; docker compose up -d'
]
execute_commands(node, commands)

In [None]:
# check the logs
# (first list the compose projects on the node, then dump the udplbd logs)
commands = [
    'docker compose ls',
    'cd udplbd; docker compose logs'
]

execute_commands(node, commands)

In [None]:
# if you need to restart it, this is the stop part
# (stops and removes the containers, then removes the udplbd image)
commands = [
    'cd udplbd; docker compose stop; docker compose rm -f; docker image rm udplbd'
]

execute_commands(node, commands)

## Run some tests to make sure things are up
- Run lbadm from a container to do commands like 'version' and 'overview'
- Run lbadm from a container to reserve an instance, make sure it succeeds and that we can ping the IP of the instance

In [None]:
# run version
# the URI carries the admin token and the control plane address/port configured above
commands = [
    f'docker run ibaldin/e2sar:latest lbadm --version -u "ejfats://{cp_admin_token}@{cp_ipv4}:{cp_listen_port}/" -v'
]

execute_commands(node, commands)

In [None]:
# run overview (unlikely to show anything)
# same admin URI as for the version check
commands = [
    f'docker run ibaldin/e2sar:latest lbadm --overview -u "ejfats://{cp_admin_token}@{cp_ipv4}:{cp_listen_port}/" -v'
]

execute_commands(node, commands)

In [None]:
# reserve an LB, observe the returned URI with 'data=' IPv4 address of the allocated LB, attempt to ping it
# lbname and duration are reused by the 'free' cell further down
lbname = 'mylb'
duration = '02:00:00'
commands = [
    f'docker run ibaldin/e2sar:latest lbadm --reserve -u "ejfats://{cp_admin_token}@{cp_ipv4}:{cp_listen_port}/" -v -l {lbname} -d "{duration}" -a 192.168.1.1 -e'
]

execute_commands(node, commands)

In [None]:
# ping the data IP address returned in EJFAT_URI
# (replace lb_address with the 'data=' address printed by the reserve command)
lb_address = "10.138.1.253"
# source the pings (-I) from the control plane address
ping_from = cp_ipv4

commands = [
    f"sudo ping -q -f -I {ping_from} -c 100 {lb_address}"
]

execute_commands(node, commands)
print('Observe 0% packet loss')

In [None]:
# free the load balancer - we are good to go
# COPY INSTANCE TOKEN FROM THE OUTPUT OF RESERVE COMMAND ABOVE
# (the placeholder below must be replaced before running; it is passed to lbadm as the -u URI)
instance_token = 'ejfats://<replaceme>'
commands = [
    f'docker run ibaldin/e2sar:latest lbadm --free -u "{instance_token}" -v -l {lbname} -d "{duration}" -a 192.168.1.1 -e'
]

execute_commands(node, commands)

## Extend the slice (as needed)

If you need to extend this slice, you can just execute the following two cells. They display the slice expiration date and optionally extend it by 2 weeks.

In [None]:
# re-fetch the slice and display it together with its networks and nodes;
# results are assigned to variables so the notebook does not echo them a second time
slice = fablib.get_slice(name=slice_name)
a = slice.show()
nets = slice.list_networks()
nodes = slice.list_nodes()

Renew the slice

In [None]:
from datetime import datetime
from datetime import timezone
from datetime import timedelta

# Set end date to now plus 14 days (UTC), formatted as 'YYYY-MM-DD HH:MM:SS +0000'
end_date = (datetime.now(timezone.utc) + timedelta(days=14)).strftime("%Y-%m-%d %H:%M:%S %z")

# best effort: a failed renewal is only reported, it does not stop the notebook
try:
    slice = fablib.get_slice(name=slice_name)

    slice.renew(end_date)
except Exception as e:
    print(f"Exception: {e}")

## Delete the Slice (as needed)

Please delete your slice when you are done with your experiment.


In [None]:
# look the slice up by name again (so this cell works on its own) and delete it
slice = fablib.get_slice(name=slice_name)
slice.delete()