# Creating a cluster/slice

## Please read all the instructions carefully, especially the comments given in each cell. These comments are essential for running a cluster successfully.

If the FABRIC environment is not set up yet, please read the topic <code>FABRIC Environment Setup</code> in the <code>start_here.ipynb</code> notebook, located in the <code>jupyter-example</code> folder of FABRIC's JupyterHub.

<b> Read the comments in each cell first, and only then run the cell. </b>

## STEP-1

In [1]:
# Run this cell.
# Creates the FABlib manager (kept in `fablib`, which the later cells use)
# and shows the active configuration so it can be checked before continuing.

from fabrictestbed_extensions.fablib.fablib import FablibManager as fablib_manager

try:
    fablib = fablib_manager()
                     
    fablib.show_config()
except Exception as e:
    # NOTE(review): the error is only printed; if the manager could not be
    # created, `fablib` stays undefined and the following cells will fail.
    print(f"Exception: {e}")

0,1
Credential Manager,cm.fabric-testbed.net
Orchestrator,orchestrator.fabric-testbed.net
Token File,/home/fabric/.tokens.json
Project ID,68926660-da26-475d-9c40-50ebf0a5a812
Bastion Username,khawar_shehzad_0000059894
Bastion Private Key File,/home/fabric/work/fabric_config/bastion
Bastion Host,bastion.fabric-testbed.net
Bastion Private Key Passphrase,
Slice Public Key File,/home/fabric/work/fabric_config/sliver.pub
Slice Private Key File,/home/fabric/work/fabric_config/sliver


In [2]:
# Run this cell
# Lists the resources currently available at every site and keeps the result in `resources`.
# Run the next cell for a narrower, easier-to-read view.
try:
    resources = fablib.list_sites()
except Exception as e:
    print(f"Exception: {e}")

Name,Address,Location,Hosts,CPUs,Cores Available,Cores Capacity,Cores Allocated,RAM Available,RAM Capacity,RAM Allocated,Disk Available,Disk Capacity,Disk Allocated,Basic NIC Available,Basic NIC Capacity,Basic NIC Allocated,ConnectX-6 Available,ConnectX-6 Capacity,ConnectX-6 Allocated,ConnectX-5 Available,ConnectX-5 Capacity,ConnectX-5 Allocated,NVMe Available,NVMe Capacity,NVMe Allocated,Tesla T4 Available,Tesla T4 Capacity,Tesla T4 Allocated,RTX6000 Available,RTX6000 Capacity,RTX6000 Allocated
SCM,,"(0, 0)",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
SEAT,,"(0, 0)",0,0,0,0,0,0,0,0,0,0,0,239,254,15,2,3,1,1,3,2,7,8,1,0,0,0,0,0,0
STAR,,"(0, 0)",0,0,0,0,0,0,0,0,0,0,0,675,762,87,1,2,1,0,6,6,20,20,0,6,6,0,6,6,0
RUTG,,"(0, 0)",0,0,0,0,0,0,0,0,0,0,0,606,635,29,2,2,0,4,4,0,16,16,0,0,0,0,0,0,0
TACC,,"(0, 0)",0,0,0,0,0,0,0,0,0,0,0,593,635,42,2,2,0,4,4,0,16,16,0,3,4,1,2,6,4
PSC,,"(0, 0)",0,0,0,0,0,0,0,0,0,0,0,369,381,12,2,2,0,2,2,0,10,10,0,0,0,0,0,0,0
MAX,,"(0, 0)",0,0,0,0,0,0,0,0,0,0,0,608,635,27,1,2,1,2,4,2,14,16,2,4,4,0,4,6,2
UTAH,,"(0, 0)",0,0,0,0,0,0,0,0,0,0,0,589,635,46,2,2,0,4,4,0,16,16,0,4,4,0,6,6,0
WASH,,"(0, 0)",0,0,0,0,0,0,0,0,0,0,0,326,381,55,2,2,0,2,2,0,10,10,0,2,2,0,3,3,0
FIU,,"(0, 0)",0,0,0,0,0,0,0,0,0,0,0,608,635,27,2,2,0,3,4,1,16,16,0,4,4,0,6,6,0


<pandas.io.formats.style.Styler object at 0x7f67c68e32b0>


In [3]:
# Easier-to-read view of the available resources: only the columns needed to pick a site.
# `resources` comes from the previous cell; its `.data` attribute is wrapped in a DataFrame here.
import pandas as pd 
pd.DataFrame(resources.data)[['Hosts','CPUs','Name','Cores Available','RAM Available','Tesla T4 Available','RTX6000 Available']]

Unnamed: 0,Hosts,CPUs,Name,Cores Available,RAM Available,Tesla T4 Available,RTX6000 Available
0,0,0,SCM,0,0,0,0
1,0,0,SEAT,0,0,0,0
2,0,0,STAR,0,0,6,6
3,0,0,RUTG,0,0,0,0
4,0,0,TACC,0,0,3,2
5,0,0,PSC,0,0,0,0
6,0,0,MAX,0,0,4,4
7,0,0,UTAH,0,0,4,6
8,0,0,WASH,0,0,2,3
9,0,0,FIU,0,0,4,6


## Step-2: Creating a cluster from an instance type.

In [4]:
# Run this cell
# Cluster configuration: initialize the variables below appropriately before building the slice.

# Number of nodes in the cluster (the first node is the master, the rest are workers).
num_nodes=8

# Name of the cluster/slice.
slice_name='AVAH'

# Set to True to attach persistent storage to a single node. KEEP IT FALSE FOR THE TIME BEING.
storage=False

# Set to True to attach an NVMe drive to the worker nodes of the cluster [depends on availability].
add_NVMe = True

# Set to True to attach GPUs to the nodes of the cluster [depends on availability].
add_gpu=False

# Set to True to build the master node like a worker. With False (the default) the master
# uses `instance_master` and gets no GPU, i.e. fewer resources than the workers.
master_gpu=False

# Allow Tesla T4 GPUs to be attached (only used when add_gpu is True; T4s are tried first).
add_t4=True

# Allow RTX6000 GPUs to be attached (only used when add_gpu is True).
add_rtx=True

# Site name; pick one site from the list of resources above.
site='UTAH'

# Node resources follow the pattern fabric.c#N.m#N.d#N, where c: cores, m: primary memory, d: disk and #N: capacity.
# e.g. fabric.c4.m8.d50 means cores: 4, memory: 8 GB, disk: 50 GB
# The list of instance types is provided in https://github.com/fabric-testbed/InformationModel/blob/master/fim/slivers/data/instance_sizes.json
instance_worker='fabric.c24.m128.d500'

# Resources of the master node. It is better for these to be smaller than those of the worker nodes.
instance_master='fabric.c4.m8.d500'

# Operating system image (Linux distribution), e.g. default_ubuntu_18, default_ubuntu_20, etc.
image='default_ubuntu_18'

In [7]:
import io

import pandas as pd

# Query the current per-site availability as JSON. A failure is reported and
# re-raised: the rest of this cell cannot work without the data, and carrying on
# would either raise a NameError or silently reuse `json_format` from an earlier run.
try:
    json_format=fablib.list_sites(output='json',quiet=True)
except Exception as e:
    print(f"Exception: {e}")
    raise

# Wrap the JSON text in a file-like object: passing a literal JSON string to
# pd.read_json is deprecated and emits a FutureWarning.
sites_df=pd.read_json(io.StringIO(json_format))
site_df=sites_df[sites_df['name']==site]
if site_df.empty:
    raise ValueError(f"Site '{site}' was not found in the list of sites")

# One-element arrays holding the counts for the chosen site (used again by later cells).
type_t4=site_df['tesla_t4_available'].values
type_rtx=site_df['rtx6000_available'].values
nvme_available=site_df['nvme_available'].values
print("Number of Nvidia t4 available now at",site,"is:", type_t4[0])
print("Number of Nvidia rtx6000 available now at",site,"is:", type_rtx[0])
print("Number of NVMe available now at",site,"is:", nvme_available[0])

# Plain Python ints so the later arithmetic and range() calls do not depend on numpy scalars.
max_type_t4 = int(type_t4[0])
max_type_rtx = int(type_rtx[0])
total_gpus = max_type_t4 + max_type_rtx

Number of Nvidia t4 available now at UTAH is: 4
Number of Nvidia rtx6000 available now at UTAH is: 6
Number of NVMe available now at UTAH is: 16


  sites_df=pd.read_json(json_format)


In [9]:
# The number of GPUs available at the chosen site was printed by the previous cell.
# Set the two variables below to request fewer GPUs than are available; a request
# larger than the availability is capped at what the site can provide.

#custom_number_of_t4 = max_type_t4
#custom_number_of_rtx = max_type_rtx

custom_number_of_t4 = 4
custom_number_of_rtx =4

# Cap each GPU type independently against the availability read in the previous cell.
# (Capping against type_t4/type_rtx rather than the previous max_* values keeps this
# cell idempotent, and one type can no longer be raised above availability just
# because the other request was below its limit.)
max_type_t4 = min(custom_number_of_t4, int(type_t4[0]))
max_type_rtx = min(custom_number_of_rtx, int(type_rtx[0]))
total_gpus = max_type_t4 + max_type_rtx
print(total_gpus)

# One component name per GPU that may be attached: GPU_1 ... GPU_<total_gpus>.
gpu_names=["GPU_{0}".format(i) for i in range(1,total_gpus+1)]

print(gpu_names)

# Remaining-GPU counters consumed by the slice-building cell.
temp_t4=max_type_t4
temp_rtx=max_type_rtx

######################################################
#####                  NVMe                      #####
######################################################

# Assumption: if fewer NVMe drives are available than there are worker nodes
# (cluster nodes minus the master), no NVMe drives are attached at all.
nvme_count = int(nvme_available[0])
if add_NVMe == True and nvme_count < num_nodes-1:
    print(f"Only {nvme_count} NVMe drive(s) available at {site}, {num_nodes-1} needed: NVMe will not be attached.")
    add_NVMe = False
print(add_NVMe)

8
['GPU_1', 'GPU_2', 'GPU_3', 'GPU_4', 'GPU_5', 'GPU_6', 'GPU_7', 'GPU_8']
True


In [10]:
# Run this cell
# Builds the per-node name lists (Node1..NodeN, Nic1..NicN, iface1..ifaceN, nvme1..nvmeN)
# together with the network and storage names used by the following cells.

network_name='cluster_network'
storage_name = 'gaf-storage'

node_names=[f"Node{idx}" for idx in range(1,num_nodes+1)]
nic_names=[f"Nic{idx}" for idx in range(1,num_nodes+1)]
iface_names=[f"iface{idx}" for idx in range(1,num_nodes+1)]
nvme_names=[f"nvme{idx}" for idx in range(1,num_nodes+1)]

# Show the four lists, one per line, in the same order as they were built.
print(node_names, nic_names, iface_names, nvme_names, sep="\n")

['Node1', 'Node2', 'Node3', 'Node4', 'Node5', 'Node6', 'Node7', 'Node8']
['Nic1', 'Nic2', 'Nic3', 'Nic4', 'Nic5', 'Nic6', 'Nic7', 'Nic8']
['iface1', 'iface2', 'iface3', 'iface4', 'iface5', 'iface6', 'iface7', 'iface8']
['nvme1', 'nvme2', 'nvme3', 'nvme4', 'nvme5', 'nvme6', 'nvme7', 'nvme8']


In [11]:
# Only needed when persistent storage is to be attached; with storage == False this cell does nothing.
if storage == True:
    import traceback
    from plugins import Plugins
    try:
        Plugins.load()
    except Exception as e:
        # Print the full traceback instead of failing the cell.
        traceback.print_exc()

In [12]:
# Run this cell. The available instance types are listed at the link given in the configuration cell.
# Builds the slice topology: one node per entry of node_names, each with a basic NIC,
# plus optional GPU and NVMe components on every node that is not the plain master.

try:
    slice=fablib.new_slice(name=slice_name)
    
    for i in range(num_nodes):
        # The first node is the master. Unless master_gpu is set, it uses the
        # (smaller) master instance type and gets neither a GPU nor an NVMe drive.
        is_plain_master = (master_gpu == False and i == 0)

        node=slice.add_node(name=node_names[i],
                            site=site,
                            instance_type=instance_master if is_plain_master else instance_worker,
                            image=image)
        # From here on iface_names[i] holds the NIC's first interface object, not a
        # name string; the network cell passes these objects to add_l2network.
        iface_names[i]=node.add_component(model='NIC_Basic', name=node_names[i]).get_interfaces()[0]

        if is_plain_master:
            continue

        if add_gpu == True:
            # Tesla T4s are handed out first; RTX6000s are used once T4s are not
            # enabled or are used up. The RTX branch no longer requires temp_t4 <= 0:
            # that extra condition made an RTX-only request (add_t4 = False) attach
            # no GPU at all while the T4 counter was still positive.
            if add_t4 == True and temp_t4 > 0:
                node.add_component(model='GPU_TeslaT4', name=gpu_names[i-1])
                temp_t4 = temp_t4 - 1
            elif add_rtx == True and temp_rtx > 0:
                node.add_component(model='GPU_RTX6000',name=gpu_names[i-1])
                temp_rtx = temp_rtx - 1
        if add_NVMe == True:
            node.add_component(model='NVME_P4510', name=nvme_names[i-1])

except Exception as e:
    print(f'Exception: {e}') 

In [13]:
# Run this cell
# Connects the interfaces collected in iface_names to one L2 network named by `network_name`.
try:
    net_cluster=slice.add_l2network(name=network_name, interfaces=iface_names[:])
except Exception as e:
    print(f"Exception: {e}")

In [None]:
# Run this cell
# Submits the slice request. If this cell executes successfully, the slice details and the node table are displayed;
# the management IP addresses shown there can be used to ssh into the respective nodes.
# If there is an error while creating the slice/cluster, repeat from the third code cell block.

try:
    slice.submit()
except Exception as e:
    print(f'Exception: {e}')


Retry: 1, Time: 46 sec


0,1
ID,9ef48c1f-7167-41b4-bc4a-448b361fc5ea
Name,AVAH
Lease Expiration (UTC),2023-11-30 16:24:23 +0000
Lease Start (UTC),2023-11-29 16:24:23 +0000
Project ID,68926660-da26-475d-9c40-50ebf0a5a812
State,Configuring


ID,Name,Cores,RAM,Disk,Image,Image Type,Host,Site,Username,Management IP,State,Error,SSH Command,Public SSH Key File,Private SSH Key File
a227c119-e6d6-4e7b-a2f6-7329629d6d95,Node1,4,8,500,default_ubuntu_18,qcow2,utah-w4.fabric-testbed.net,UTAH,ubuntu,,Ticketed,,ssh -i {{ _self_.private_ssh_key_file }} -F /home/fabric/work/fabric_config/ssh_config {{ _self_.username }}@{{ _self_.management_ip }},/home/fabric/work/fabric_config/sliver.pub,/home/fabric/work/fabric_config/sliver
69d99c04-5ef2-4e0c-8632-8e074ace25db,Node2,24,128,500,default_ubuntu_18,qcow2,utah-w5.fabric-testbed.net,UTAH,ubuntu,,Ticketed,,ssh -i {{ _self_.private_ssh_key_file }} -F /home/fabric/work/fabric_config/ssh_config {{ _self_.username }}@{{ _self_.management_ip }},/home/fabric/work/fabric_config/sliver.pub,/home/fabric/work/fabric_config/sliver
8ce2feab-f027-421e-b7bb-33d62034334e,Node3,24,128,500,default_ubuntu_18,qcow2,utah-w5.fabric-testbed.net,UTAH,ubuntu,,Ticketed,,ssh -i {{ _self_.private_ssh_key_file }} -F /home/fabric/work/fabric_config/ssh_config {{ _self_.username }}@{{ _self_.management_ip }},/home/fabric/work/fabric_config/sliver.pub,/home/fabric/work/fabric_config/sliver
d412aeb9-a6f0-4135-b6b9-3c5418457d92,Node4,24,128,500,default_ubuntu_18,qcow2,utah-w5.fabric-testbed.net,UTAH,ubuntu,,Ticketed,,ssh -i {{ _self_.private_ssh_key_file }} -F /home/fabric/work/fabric_config/ssh_config {{ _self_.username }}@{{ _self_.management_ip }},/home/fabric/work/fabric_config/sliver.pub,/home/fabric/work/fabric_config/sliver
c8addc40-ffdc-4efc-bd39-a18ec67e0e99,Node5,24,128,500,default_ubuntu_18,qcow2,utah-w2.fabric-testbed.net,UTAH,ubuntu,,Ticketed,,ssh -i {{ _self_.private_ssh_key_file }} -F /home/fabric/work/fabric_config/ssh_config {{ _self_.username }}@{{ _self_.management_ip }},/home/fabric/work/fabric_config/sliver.pub,/home/fabric/work/fabric_config/sliver
1ad030db-38ee-46ab-b2ae-e242c8e102fc,Node6,24,128,500,default_ubuntu_18,qcow2,utah-w2.fabric-testbed.net,UTAH,ubuntu,,Ticketed,,ssh -i {{ _self_.private_ssh_key_file }} -F /home/fabric/work/fabric_config/ssh_config {{ _self_.username }}@{{ _self_.management_ip }},/home/fabric/work/fabric_config/sliver.pub,/home/fabric/work/fabric_config/sliver
811c12ea-3c18-4e78-a4e0-814c0ce0939e,Node7,24,128,500,default_ubuntu_18,qcow2,utah-w3.fabric-testbed.net,UTAH,ubuntu,,Ticketed,,ssh -i {{ _self_.private_ssh_key_file }} -F /home/fabric/work/fabric_config/ssh_config {{ _self_.username }}@{{ _self_.management_ip }},/home/fabric/work/fabric_config/sliver.pub,/home/fabric/work/fabric_config/sliver
824d9810-a696-4035-bef8-3aac6ec30f51,Node8,24,128,500,default_ubuntu_18,qcow2,utah-w3.fabric-testbed.net,UTAH,ubuntu,,Ticketed,,ssh -i {{ _self_.private_ssh_key_file }} -F /home/fabric/work/fabric_config/ssh_config {{ _self_.username }}@{{ _self_.management_ip }},/home/fabric/work/fabric_config/sliver.pub,/home/fabric/work/fabric_config/sliver


ID,Name,Layer,Type,Site,Gateway,Subnet,State,Error
4d71d803-7b68-4c78-b688-339eaa880449,cluster_network,L2,L2Bridge,UTAH,,,Ticketed,


## Step-3: Configuring the network and setting up the cluster

In [2]:
try:
    # Re-load the existing cluster by name so that the cells below can configure it.
    slice=fablib.get_slice(name="AVAH") # Put the name of the cluster that you want to configure
    #slice.delete()
except Exception as e:
    print(f"Exception: {e}")

In [57]:
#Run this cell

from ipaddress import IPv4Address, IPv4Network

try:
    # Private subnet used for the cluster network.
    subnet=IPv4Network("192.168.1.0/24")
    # Pool of assignable addresses: every address of the subnet in order,
    # with the first one (the network address itself) dropped.
    available_ips=list(subnet)
    del available_ips[0]
except Exception as e:
    print(f"Exception: {e}")

In [46]:
#try:
#    for node in slice.get_nodes():
#        node_iface=node.get_interface(network_name=network_name)
#        stdout, stderr = node.execute(f'ip addr show {node_iface.get_os_interface()}')
#except Exception as e:
#    print(f"Exception: {e}")

In [58]:
#Run this cell
#%%capture
# Gives every node the next free address of `subnet` on its cluster-network interface,
# installs net-tools on the node and brings the interface up.
try:
    for node in slice.get_nodes():
        node_iface=node.get_interface(network_name=network_name)
        # NOTE: this consumes an address from the shared pool, so re-running the
        # cell hands out new addresses; re-create the pool first if that is not wanted.
        node_IP_addr=available_ips.pop(0)
        node_iface.ip_addr_add(addr=node_IP_addr, subnet=subnet)
        
        os_iface=node_iface.get_os_interface()
        stdout, stderr = node.execute(f'ip addr show {os_iface}')
        _, _ = node.execute('sudo apt-get update')
        # apt-get with -y: the command runs without a terminal, so it must never
        # wait for (or abort on) a confirmation prompt; apt-get is also the stable
        # command-line interface intended for scripted use.
        stdout, stderr = node.execute('sudo apt-get install -y net-tools')
        stdout, stderr = node.execute(f'sudo ifconfig {os_iface} up')
        
        
except Exception as e:
    print(f"Exception: {e}")

3: enp7s0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
    link/ether 8e:ee:c4:45:b7:98 brd ff:ff:ff:ff:ff:ff
    inet 192.168.1.1/24 scope global enp7s0
       valid_lft forever preferred_lft forever
    inet6 fe80::8cee:c4ff:fe45:b798/64 scope link 
       valid_lft forever preferred_lft forever
Reading package lists...
[31m E: List directory /var/lib/apt/lists/partial is missing. - Acquire (2: No such file or directory)
E: flAbsPath on /var/lib/dpkg/status failed - realpath (2: No such file or directory)
E: Could not open file  - open (2: No such file or directory)
E: Problem opening 
E: The package lists or status file could not be parsed or opened.
 [0m[31m 

E: Could not open lock file /var/lib/dpkg/lock-frontend - open (2: No such file or directory)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?
 [0m3: enp7s0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000

In [59]:
#Run this cell
# Connectivity check: sends three pings from Node1 to 192.168.1.4.
# If the ping is successful, the nodes are connected properly; if there is an error, the cluster may need to be recreated or investigated further.
# NOTE(review): the target address is hard-coded; presumably it is the address given to the fourth node by the IP-assignment cell — confirm when using fewer than four nodes.

try:
    node1=slice.get_node(name='Node1')
    
    stdout,stderr=node1.execute(f' ping -c 3 192.168.1.4')
    print(stdout)
    print(stderr)
    
except Exception as e:
    print(f'Exception: {e}')   

PING 192.168.1.4 (192.168.1.4) 56(84) bytes of data.
64 bytes from 192.168.1.4: icmp_seq=1 ttl=64 time=0.140 ms
64 bytes from 192.168.1.4: icmp_seq=2 ttl=64 time=0.108 ms
64 bytes from 192.168.1.4: icmp_seq=3 ttl=64 time=0.129 ms

--- 192.168.1.4 ping statistics ---
3 packets transmitted, 3 received, 0% packet loss, time 2030ms
rtt min/avg/max/mdev = 0.108/0.125/0.140/0.018 ms
PING 192.168.1.4 (192.168.1.4) 56(84) bytes of data.
64 bytes from 192.168.1.4: icmp_seq=1 ttl=64 time=0.140 ms
64 bytes from 192.168.1.4: icmp_seq=2 ttl=64 time=0.108 ms
64 bytes from 192.168.1.4: icmp_seq=3 ttl=64 time=0.129 ms

--- 192.168.1.4 ping statistics ---
3 packets transmitted, 3 received, 0% packet loss, time 2030ms
rtt min/avg/max/mdev = 0.108/0.125/0.140/0.018 ms




In [60]:
# Run this cell
# Function to create a file that contains IPs and hostnames related to it.

def append_line(file_path,text):
    """Append `text` to the file at `file_path`, creating the file if needed.

    A newline is written before `text` only when the file already has
    content, so the file never starts with an empty line and `text` itself
    is written without a trailing newline.
    """
    with open(file_path,"a+") as handle:
        # Append mode starts at the end of the file; rewind to inspect what is there.
        handle.seek(0)
        separator="\n" if handle.read() else ""
        # Writes in append mode always go to the end of the file.
        handle.write(separator+text)

In [61]:
# Run this cell

import os

# Remove the files left over from a previous run so that the next cells start from
# empty files. Each file is removed only if it exists: previously all three were
# removed whenever any one of them existed, which produced errors for the missing ones.
stale_files=['/home/fabric/work/hosts','/home/fabric/work/ips.txt','/home/fabric/work/workers']
existing_files=[path for path in stale_files if os.path.isfile(path)]

if existing_files:
    for path in existing_files:
        os.remove(path)
else:
    print("does not exist")

# The GPU address list is only written when GPU nodes were found, so it is handled separately.
if os.path.isfile('/home/fabric/work/gpu_ips.txt'):
    os.remove('/home/fabric/work/gpu_ips.txt')

In [62]:
# Run this cell

# Capturing the IP addresses, this step needs to be integrated with the IP assigning stage, coded above. Or it may stay independent.
import os

local_host="127.0.0.1 localhost"
path_to_host_file="/home/fabric/work/hosts"
path_to_ip_file="/home/fabric/work/ips.txt"
path_to_worker_ip="/home/fabric/work/workers"
path_to_gpu_ips="/home/fabric/work/gpu_ips.txt"
gpu_name="NVIDIA"
# Defined up front so the final print cannot raise a NameError if the loop fails early.
IP=None

# The hosts file always starts with the localhost entry.
append_line(path_to_host_file,local_host)

try:
    # Nodes are numbered from 1; the first node is the master (node1 / vm0).
    for i, node in enumerate(slice.get_nodes(), start=1):
        stdout, stderr=node.execute("hostname -I")
        # Second address reported by `hostname -I`; in the recorded run this is the
        # 192.168.1.x cluster-network address (the first one is the management address).
        IP=stdout.split()[1]
        node_name="node{0}".format(i)
        vm_names="vm{0}".format(i-1)
        append_line(path_to_ip_file,IP)
        
        stdout, stderr=node.execute("hostname")
        # Strip the trailing newline of the command output; otherwise every entry
        # is followed by an empty line in the hosts file.
        line=IP+" "+node_name+" "+vm_names+" "+stdout.strip()
        append_line(path_to_host_file,line)
        
        # Every node except the first one is listed as a worker.
        if i>1:
            append_line(path_to_worker_ip,vm_names)
        
        print(line)
        print(stderr)
        
        # Nodes that expose an NVIDIA PCI device are additionally recorded in the GPU list.
        stdout_gpu, _=node.execute('lspci | grep NVIDIA')
        if gpu_name in stdout_gpu:
            append_line(path_to_gpu_ips,IP)
except Exception as e:
    print(f"Exception: {e}")
    
print(IP)

10.30.6.122 192.168.1.1 2001:400:a100:3030:f816:3eff:fe75:8623 
5b3d02da-b518-495a-bfe5-5ff140a10a0b-node1
192.168.1.1 node1 vm0 5b3d02da-b518-495a-bfe5-5ff140a10a0b-node1


10.30.6.25 192.168.1.2 2001:400:a100:3030:f816:3eff:fe4f:ee24 
e90deda7-78df-4ccf-85be-fb09c8032038-node2
192.168.1.2 node2 vm1 e90deda7-78df-4ccf-85be-fb09c8032038-node2


10.30.6.154 192.168.1.3 2001:400:a100:3030:f816:3eff:fe7d:32c1 
e62975a1-296b-4f33-98f2-6d64a0888c52-node3
192.168.1.3 node3 vm2 e62975a1-296b-4f33-98f2-6d64a0888c52-node3


10.30.6.38 192.168.1.4 2001:400:a100:3030:f816:3eff:fea2:4e18 
8823cf72-08c9-4891-882d-f304b8237f98-node4
192.168.1.4 node4 vm3 8823cf72-08c9-4891-882d-f304b8237f98-node4


10.30.6.109 192.168.1.5 2001:400:a100:3030:f816:3eff:fe9e:7aa4 
fc5a06f6-5534-4d1f-809d-951b2c36d178-node5
192.168.1.5 node5 vm4 fc5a06f6-5534-4d1f-809d-951b2c36d178-node5


10.30.6.45 192.168.1.6 2001:400:a100:3030:f816:3eff:fe4b:cf3d 
dd426998-3a14-4a5e-a8ae-45517f35ae37-node6
192.168.1.6 node6 vm5 dd42

In [63]:
# Run this cell

try:
    for node in slice.get_nodes():
        # Back up the node's original /etc/hosts only if no backup exists yet, so that
        # re-running this cell does not overwrite the backup with the generated file.
        stdout, stderr=node.execute('sudo test -e /etc/hosts_backup || sudo cp /etc/hosts /etc/hosts_backup')
        output_host_copy=node.upload_file(path_to_host_file,"/home/ubuntu/hosts")
        output_ip_copy=node.upload_file(path_to_ip_file,"/home/ubuntu/ips.txt")
        output_worker_copy=node.upload_file(path_to_worker_ip,"/home/ubuntu/workers")
        # Replace the node's /etc/hosts with the generated cluster hosts file.
        stdout_host_copy,stderr_host_copy=node.execute('sudo cp /home/ubuntu/hosts /etc/hosts')
        # The GPU address list exists only when GPU nodes were found.
        if os.path.isfile('/home/fabric/work/gpu_ips.txt'):
            output_gpu_copy=node.upload_file(path_to_gpu_ips,"/home/ubuntu/gpu_ips.txt")
        
        print(stderr)
        print(stderr_host_copy)
except Exception as e:
    print(f"Exception : {e}")
    



















In [64]:
# Run this cell

import os

# Generate the passphrase-less RSA key pair that is distributed to the nodes.
# An existing key is kept (ssh-keygen would otherwise stop at an overwrite prompt),
# and a failed generation is reported instead of being discarded silently.
if os.path.isfile('/home/fabric/work/id_rsa'):
    output=0
else:
    output=os.system('ssh-keygen -q -t rsa -N "" -f /home/fabric/work/id_rsa > /dev/null 2>&1')
    if output != 0:
        print(f"ssh-keygen failed with status {output}")

In [65]:
# Run this cell

try:
    # Copy the generated key pair to every node and authorise the public key there.
    for node in slice.get_nodes():
        output_private=node.upload_file("/home/fabric/work/id_rsa","/home/ubuntu/.ssh/id_rsa")
        output_public=node.upload_file("/home/fabric/work/id_rsa.pub","/home/ubuntu/.ssh/id_rsa.pub")
        
        # NOTE(review): the key is appended on every run, so re-running this cell
        # adds a duplicate line to authorized_keys.
        stdout, stderr=node.execute(f' cat /home/ubuntu/.ssh/id_rsa.pub >> /home/ubuntu/.ssh/authorized_keys')
        # Restrict both key files to their owner.
        stdout, stderr=node.execute(f' chmod 600 /home/ubuntu/.ssh/id_rsa*')
        
        print(output_private)
        print(output_public)
        print(stderr)
        #print(stdout)
        
except Exception as e:
    print(f"Exception: {e}")

-rw-------   1 1000     1000         2655 21 Nov 22:16 ?
-rw-------   1 1000     1000          609 21 Nov 22:16 ?

-rw-------   1 1000     1000         2655 21 Nov 22:16 ?
-rw-------   1 1000     1000          609 21 Nov 22:16 ?

-rw-------   1 1000     1000         2655 21 Nov 22:16 ?
-rw-------   1 1000     1000          609 21 Nov 22:16 ?

-rw-------   1 1000     1000         2655 21 Nov 22:16 ?
-rw-------   1 1000     1000          609 21 Nov 22:16 ?

-rw-------   1 1000     1000         2655 21 Nov 22:16 ?
-rw-------   1 1000     1000          609 21 Nov 22:16 ?

-rw-------   1 1000     1000         2655 21 Nov 22:16 ?
-rw-------   1 1000     1000          609 21 Nov 22:16 ?

-rw-------   1 1000     1000         2655 21 Nov 22:16 ?
-rw-------   1 1000     1000          609 21 Nov 22:16 ?

-rw-------   1 1000     1000         2655 21 Nov 22:17 ?
-rw-------   1 1000     1000          609 21 Nov 22:17 ?



In [66]:
# Run this cell. This is the last cell to run. Please read the comments in the next few cells to know how to extend the lease time and how to delete a slice/cluster. 

from ipaddress import ip_address, IPv6Address
try:
    for node in slice.get_nodes():
        # Only nodes whose management address is IPv6 get the NAT64 script.
        if not isinstance(ip_address(node.get_management_ip()), IPv6Address):
            continue

        node.upload_file("/home/fabric/work/nat64.sh", "/home/ubuntu/nat64.sh")
        # The script's own output is discarded on the node (redirected to /dev/null).
        stdout, stderr=node.execute(' chmod +x /home/ubuntu/nat64.sh && sudo bash /home/ubuntu/nat64.sh > /dev/null 2>&1')

        print(stdout)
        print(stderr)
except Exception as e:
    print(f"Exception: {e}")



















## Step-4: Deleting and extending lease of a slice

In [36]:
# Run this cell ONLY when you want to delete the slice/cluster.
try:
    slice=fablib.get_slice(name="AVAH") # Put the cluster name that you want to delete
    slice.delete()
except Exception as e:
    print(f"Exception: {e}")

In [23]:
# Run this cell ONLY when you want to extend the lease time of the slice/cluster.

import datetime
slice_name='test2' # Give the cluster/slice name that you want to extend

# New lease end: the current time in UTC plus 7 days. A timezone-aware "now" is
# required here: with a naive datetime the %z field is empty and the resulting
# string carries no UTC offset.
new_end_date = (datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(days=7)).strftime("%Y-%m-%d %H:%M:%S %z")
print(type(new_end_date), new_end_date)
try:
    slice=fablib.get_slice(name=slice_name)
    # Renew up to the date computed above (previously a fixed, hard-coded date was
    # passed and the computed value was ignored).
    slice.renew(new_end_date)
    
    #print(f"Lease End (UTC)        : {slice.get_lease_end()}")
except Exception as e:
    print(f"Exception: {e}")

<class 'str'> 2023-06-01 02:02:16 


In [41]:
# Run this cell ONLY to observe the new lease time.
# Printing the slice shows its name, ID, state and lease end.

slice_name='cluster_exp' # Give the slice/cluster name that you just extended. 
try:
    slice = fablib.get_slice(name=slice_name)
    print(f"{slice}")
except Exception as e:
    print(f"Exception: {e}")

-----------  ------------------------------------
Slice Name   cluster_exp
Slice ID     6f79613e-46f0-4323-8f04-395221080b2e
Slice State  StableOK
Lease End    2023-05-06 18:04:26 +0000
-----------  ------------------------------------


In [17]:
# Run this cell ONLY to list the addresses of the nodes of a slice that expose an NVIDIA device.

slice_name='cluster_gpu' # Give the name of the slice/cluster that you want to inspect.
try:
    slice = fablib.get_slice(name=slice_name)
    gpu_name="NVIDIA"
    for node in slice.get_nodes():
        # A node counts as a GPU node when lspci reports an NVIDIA device.
        stdout_gpu, _=node.execute('lspci | grep NVIDIA')   
        if gpu_name in stdout_gpu:
            stdout_ip, _=node.execute('hostname -I')
            # Second address reported by `hostname -I` — presumably the cluster-network address; confirm for this slice.
            ip=stdout_ip.split(" ")[1]
            print(ip)
except Exception as e:
    print(f"Exception: {e}")

## Do not bother about the code below !!! ;-)

In [37]:
# Do not Run this cell
# Experiment: attach an NVMe component named 'gaf-storage' to Node1. The recorded run failed with
# "TopologyException: Component names must be unique within node."

# NOTE(review): storage_name is reassigned here but is not used by the call below.
storage_name = 'gaf-storage1'

node1=slice.get_node(name='Node1')
node1.add_component(model='NVME_P4510',name='gaf-storage')

TopologyException: Component names must be unique within node.

In [15]:
# Experiment: load another existing slice by name; this overwrites the global slice_name and slice.
slice_name='cluster_gpu_2'

try:
    slice=fablib.get_slice(name=slice_name)
except Exception as e:
    print(f"Exception: {e}")

In [77]:
# Do not run this cell, only for experimentation.
# Shows Node4 and its 'gaf-storage' storage component, including the storage device name.

try:
    node = slice.get_node(name='Node4')
    print(f"{node}")
    # NOTE(review): this rebinds `storage`, which the configuration cell defines as a boolean flag.
    storage = node.get_storage(name='gaf-storage')
    print(f"{storage}")
    print(f"Storage Device Name: {storage.get_device_name()}")
    #print("Mounting the storage volume")
    #stdout, stderr = node.execute(f"sudo mkdir -p /mnt/{storage_name};"
                                 # f"sudo mount {storage.get_device_name()} /mnt/{storage_name}")
    # NOTE(review): with the mount command commented out, stdout/stderr are whatever
    # an earlier cell left in the kernel.
    print(stdout)
    print(stderr)
except Exception as e:
    print(f"Exception: {e}")



-----------------  ------------------------------------------------------------------------------------------------------------------------------------------------
ID                 342cfbfa-8e20-4e69-992a-64364d08db35
Name               Node4
Cores              4
RAM                8
Disk               10
Image              default_ubuntu_20
Image Type         qcow2
Host               star-w2.fabric-testbed.net
Site               STAR
Management IP      2001:400:a100:3030:f816:3eff:fe08:a7d4
Reservation State  Active
Error Message
SSH Command        ssh -i /home/fabric/work/fabric_config/slice_key -J mjdbz4_0000018266@bastion-1.fabric-testbed.net ubuntu@2001:400:a100:3030:f816:3eff:fe08:a7d4
-----------------  ------------------------------------------------------------------------------------------------------------------------------------------------
-----------  --------------------
Name         gaf-storage
Details      Site-local NAS share
Disk (G)     0
Units        1
PCI Addres

In [None]:
import random

# Experiment: build one worker host name per node, each with a distinct random
# worker number between 1 and 9. random.sample draws exactly num_nodes distinct
# numbers (the previous rejection loop always stopped at a hard-coded 8, which
# broke for any larger num_nodes) and raises ValueError if more than 9 are requested.
randomList=random.sample(range(1,10),num_nodes)

hosts=["{0}-w{1}.fabric-testbed.net".format(site.lower(),worker_no) for worker_no in randomList]
    
print(hosts)  

In [74]:
try:
    # Experiment: show the NVMe component of Node3 and configure it.
    node = slice.get_node('Node3') 
    #node.show()

    # nvme_names[1] is 'nvme2', the NVMe component name looked up on Node3.
    nvme1 = node.get_component(nvme_names[1])
    nvme1.show()
    nvme1.configure_nvme()
except Exception as e:
    print(f"Exception: {e}")

0,1
Name,Node3-nvme2
Details,Dell Express Flash NVMe P4510 1TB SFF
Disk,0
Units,1
PCI Address,['0000:00:08.0']
Model,NVME_P4510
Type,NVME




Disk /dev/nvme0n1: 894.3 GiB, 960197124096 bytes, 1875385008 sectors
Units: sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
[31m fdisk: cannot open /dev/nvme0: Illegal seek
 [0mconfig_nvme Fail: Node3-nvme2
Exception: []


In [73]:
# Quick check of the component name used for the Node3 NVMe lookup.
nvme_names[1]

'nvme2'