In [None]:
from fabrictestbed_extensions.fablib.fablib import FablibManager as fablib_manager

# Create the FABRIC library manager and display the configuration it loaded.
# Failures are printed instead of being raised.
try:
    fablib = fablib_manager()
    fablib.show_config()
except Exception as err:
    print(f"Exception: {err}")

In [None]:
################################################################################
###################### Please set cluster properties ###########################
################################################################################

# Number of nodes in the cluster (the first node is the master).
num_nodes = 8

# Name of the cluster (used as the slice name).
slice_name = 'cluster_GATK_3'

# Set to True to attach NVMe drives to the worker nodes. Disabled by default.
add_NVMe = False

# Set to True to add GPUs to nodes in the cluster (depends on availability).
add_gpu = False

# When False, the master node gets no GPU and is created with `instance_master`
# instead of `instance_worker`.
master_gpu = False

# Allow Tesla T4 GPUs to be added.
add_t4 = True

# Allow RTX 6000 GPUs to be added.
add_rtx = True

# Name of the FABRIC site that hosts the cluster.
site = 'FIU'

# Node instance types follow the pattern fabric.c#N.m#N.d#N, where c: cores,
# m: primary memory, d: disk and #N: capacity.
# e.g. fabric.c4.m8.d50 means cores: 4, memory: 8 GB, disk: 50 GB.
# The list of instance types is provided at
# https://github.com/fabric-testbed/InformationModel/blob/master/fim/slivers/data/instance_sizes.json
instance_worker = 'fabric.c24.m128.d500'

# Resources of the master node. It is better for these to be smaller than the workers'.
instance_master = 'fabric.c8.m16.d500'

# Operating system image (Linux distribution), e.g. default_ubuntu_18, default_ubuntu_20, etc.
image = 'default_ubuntu_18'

In [None]:
import pandas as pd

from io import StringIO

# Fetch the per-site resource table from FABRIC as a JSON string.
try:
    json_format = fablib.list_sites(output='json', quiet=True)
except Exception as e:
    print(f"Exception: {e}")
    # Without the site list nothing below can work; re-raise so the cell stops
    # here instead of failing later on an undefined (or stale) `json_format`.
    raise

# Wrap the text in StringIO: passing a literal JSON string to read_json is
# deprecated in recent pandas releases.
sites_df = pd.read_json(StringIO(json_format))

# Keep only the row describing the chosen site.
site_df = sites_df[sites_df['name'] == site]
if site_df.empty:
    raise ValueError(f"Site '{site}' was not found in the list of FABRIC sites")

type_t4 = site_df['tesla_t4_available'].values
type_rtx = site_df['rtx6000_available'].values
nvme_available = site_df['nvme_available'].values
print("Number of Nvidia t4 available now at", site, "is:", type_t4[0])
print("Number of Nvidia rtx6000 available now at", site, "is:", type_rtx[0])
print("Number of NVMe available now at", site, "is:", nvme_available[0])

# Upper bounds used by the following cells when assigning GPUs.
max_type_t4 = type_t4[0]
max_type_rtx = type_rtx[0]
total_gpus = max_type_t4 + max_type_rtx

In [None]:
# The maximum number of GPUs available at the chosen site is printed by the
# previous cell. Change the two variables below to request fewer GPUs; set them
# to `max_type_t4` / `max_type_rtx` to use everything that is available.
custom_number_of_t4 = 3
custom_number_of_rtx = 5

# Clamp each request independently so that neither GPU type can exceed what the
# site currently offers (a request larger than the availability is reduced).
max_type_t4 = min(custom_number_of_t4, max_type_t4)
max_type_rtx = min(custom_number_of_rtx, max_type_rtx)
total_gpus = max_type_t4 + max_type_rtx
print(total_gpus)

# One component name per GPU that may be attached: GPU_1 ... GPU_<total_gpus>.
gpu_names = ["GPU_{0}".format(i) for i in range(1, total_gpus + 1)]
print(gpu_names)

# Remaining budget per GPU type; decremented while nodes are being created.
temp_t4 = max_type_t4
temp_rtx = max_type_rtx

######################################################
#####                  NVMe                      #####
######################################################

# Assumption: if the number of available NVMe drives is less than the number of
# cluster nodes minus the master, NVMe drives will not be added to the workers.
add_NVMe = bool(add_NVMe and nvme_available[0] >= num_nodes - 1)
print(add_NVMe)

In [None]:
# Names of the shared network and of the storage resource.
network_name = 'cluster_network'
storage_name = 'gaf-storage'

# Per-node component names, numbered from 1 to num_nodes.
node_numbers = range(1, num_nodes + 1)
node_names = [f"Node{number}" for number in node_numbers]
nic_names = [f"Nic{number}" for number in node_numbers]
iface_names = [f"iface{number}" for number in node_numbers]
nvme_names = [f"nvme{number}" for number in node_numbers]

for name_list in (node_names, nic_names, iface_names, nvme_names):
    print(name_list)

In [None]:
# Create the slice and add one node per entry in node_names. Node 0 is the
# master unless `master_gpu` is set, in which case it is built like a worker.
try:
    slice = fablib.new_slice(name=slice_name)

    # Index of the next unused entry in gpu_names. A dedicated counter keeps the
    # index valid even when the first node takes the worker branch (i == 0).
    next_gpu = 0

    for i in range(num_nodes):
        if not master_gpu and i == 0:
            node = slice.add_node(name=node_names[i],
                                  site=site,
                                  instance_type=instance_master,
                                  image=image)
            # Replace the placeholder string with the real interface object;
            # the network cell consumes iface_names as a list of interfaces.
            iface_names[i] = node.add_component(model='NIC_Basic', name=node_names[i]).get_interfaces()[0]
            # NOTE(review): the master receives an NVMe drive regardless of
            # add_NVMe -- confirm this is intended.
            node.add_component(model='NVME_P4510', name=nvme_names[i])

        else:
            node = slice.add_node(name=node_names[i],
                                  site=site,
                                  instance_type=instance_worker,
                                  image=image)
            iface_names[i] = node.add_component(model='NIC_Basic', name=node_names[i]).get_interfaces()[0]

            if add_gpu:
                if add_t4 and temp_t4 > 0:
                    node.add_component(model='GPU_TeslaT4', name=gpu_names[next_gpu])
                    temp_t4 = temp_t4 - 1
                    next_gpu = next_gpu + 1
                elif add_rtx and temp_rtx > 0:
                    # Reached when T4s are disabled or used up. The former
                    # `temp_t4 <= 0` guard blocked RTX-only clusters whenever
                    # T4 cards were still counted as available.
                    node.add_component(model='GPU_RTX6000', name=gpu_names[next_gpu])
                    temp_rtx = temp_rtx - 1
                    next_gpu = next_gpu + 1
            if add_NVMe:
                node.add_component(model='NVME_P4510', name=nvme_names[i])

except Exception as e:
    print(f'Exception: {e}')

In [None]:
# Connect the interfaces collected in iface_names through one L2 network.
try:
    net_cluster = slice.add_l2network(name=network_name, interfaces=list(iface_names))
except Exception as err:
    print(f"Exception: {err}")

In [None]:
# Submit the slice request; a failure is printed rather than raised.
try:
    slice.submit()
except Exception as err:
    print(f'Exception: {err}')

In [None]:
from ipaddress import IPv4Address, IPv4Network

# Private subnet used on the cluster network and the pool of addresses that
# can be handed out to the nodes.
try:
    subnet = IPv4Network("192.168.1.0/24")
    # hosts() yields only assignable addresses: it skips both the network
    # address (.0) and the broadcast address (.255).
    available_ips = list(subnet.hosts())
except Exception as e:
    print(f"Exception: {e}")

In [None]:
# Give every node the next free address from available_ips on its cluster
# interface, install net-tools and bring the interface up.
try:
    for node in slice.get_nodes():
        node_iface = node.get_interface(network_name=network_name)
        # pop() consumes the pool, so re-running this cell hands out new addresses.
        node_IP_addr = available_ips.pop(0)
        node_iface.ip_addr_add(addr=node_IP_addr, subnet=subnet)

        os_iface = node_iface.get_os_interface()
        stdout, stderr = node.execute(f'ip addr show {os_iface}')
        _, _ = node.execute('sudo apt-get update')
        # -y answers the confirmation prompt: the command runs without a terminal.
        # apt-get is used because plain `apt` warns that its CLI is not stable for scripts.
        stdout, stderr = node.execute('sudo apt-get install -y net-tools')
        stdout, stderr = node.execute(f'sudo ifconfig {os_iface} up')

except Exception as e:
    print(f"Exception: {e}")

In [None]:
# Connectivity check: a successful ping means the nodes are connected properly.
# If it fails, the cluster may need to be recreated or investigated further.

try:
    node1 = slice.get_node(name='Node2')

    stdout, stderr = node1.execute(' ping -c 3 192.168.1.5')
    for stream in (stdout, stderr):
        print(stream)

except Exception as err:
    print(f'Exception: {err}')

In [None]:
def append_line(file_path, text):
    """Append `text` to the file at `file_path`, creating the file if needed.

    A newline is written before `text` when the file already has content, so
    entries end up on separate lines without a trailing newline at the end.
    """
    with open(file_path, "a+") as handle:
        # "a+" starts at the end of the file; rewind to find out whether it is empty.
        handle.seek(0)
        separator = "\n" if handle.read() else ""
        # In append mode every write goes to the end of the file.
        handle.write(separator + text)

In [None]:
import os

# Remove files left over from a previous run so the cells below start from
# empty files. Each file is checked on its own, so a partially populated work
# directory does not trigger deletion attempts on files that are missing.
stale_candidates = (
    '/home/fabric/work/hosts',
    '/home/fabric/work/ips.txt',
    '/home/fabric/work/workers',
)
stale_files = [path for path in stale_candidates if os.path.isfile(path)]

if stale_files:
    for path in stale_files:
        os.remove(path)
else:
    print("does not exist")

if os.path.isfile('/home/fabric/work/gpu_ips.txt'):
    os.remove('/home/fabric/work/gpu_ips.txt')

In [None]:
import os

# Build the local hosts / ips / workers / gpu_ips files from the running nodes.
local_host = "127.0.0.1 localhost"
path_to_host_file = "/home/fabric/work/hosts"
path_to_ip_file = "/home/fabric/work/ips.txt"
path_to_worker_ip = "/home/fabric/work/workers"
path_to_gpu_ips = "/home/fabric/work/gpu_ips.txt"
gpu_name = "NVIDIA"

append_line(path_to_host_file, local_host)

try:
    # Nodes are numbered from 1 in the order returned by get_nodes().
    for i, node in enumerate(slice.get_nodes(), start=1):
        stdout, stderr = node.execute("hostname -I")
        # The second address reported by `hostname -I` is used as the node's IP.
        # split() without an argument tolerates any whitespace, including a
        # trailing newline directly after the address.
        IP = stdout.split()[1]
        node_name = "node{0}".format(i)
        vm_names = "vm{0}".format(i - 1)
        append_line(path_to_ip_file, IP)

        stdout, stderr = node.execute("hostname")
        # strip() drops the newline that ends the command output; otherwise
        # every entry would be followed by an empty line in the hosts file.
        line = IP + " " + node_name + " " + vm_names + " " + stdout.strip()
        append_line(path_to_host_file, line)

        # Every node except the first one is listed as a worker.
        if i > 1:
            append_line(path_to_worker_ip, vm_names)

        print(line)
        print(stderr)

        # Record the node's IP when lspci reports an NVIDIA device.
        stdout_gpu, _ = node.execute('lspci | grep NVIDIA')
        if gpu_name in stdout_gpu:
            append_line(path_to_gpu_ips, IP)

except Exception as e:
    print(f"Exception: {e}")

print(IP)

In [None]:
# Copy the generated files to every node and install the new /etc/hosts.
try:
    for node in slice.get_nodes():
        # Back up /etc/hosts only if no backup exists yet, so that running this
        # cell a second time does not replace the pristine copy with the
        # already-modified file.
        stdout, stderr = node.execute('sudo test -e /etc/hosts_backup || sudo cp /etc/hosts /etc/hosts_backup')
        output_host_copy = node.upload_file(path_to_host_file, "/home/ubuntu/hosts")
        output_ip_copy = node.upload_file(path_to_ip_file, "/home/ubuntu/ips.txt")
        output_worker_copy = node.upload_file(path_to_worker_ip, "/home/ubuntu/workers")
        stdout_host_copy, stderr_host_copy = node.execute('sudo cp /home/ubuntu/hosts /etc/hosts')
        # The GPU list only exists when at least one node reported an NVIDIA device.
        if os.path.isfile(path_to_gpu_ips):
            output_gpu_copy = node.upload_file(path_to_gpu_ips, "/home/ubuntu/gpu_ips.txt")

        print(stderr)
        print(stderr_host_copy)
except Exception as e:
    print(f"Exception : {e}")

In [None]:
import os

# Generate the RSA key pair shared by all cluster nodes (empty passphrase).
# An existing key is reused: ssh-keygen would otherwise ask whether to
# overwrite it, and with its output discarded that prompt cannot be seen.
if os.path.isfile('/home/fabric/work/id_rsa'):
    output = 0
else:
    output = os.system('ssh-keygen -q -t rsa -N "" -f /home/fabric/work/id_rsa > /dev/null 2>&1')

# The command's output is suppressed, so report a failure explicitly.
if output != 0:
    print(f"ssh-keygen failed with exit status {output}")

In [None]:
# Install the shared key pair on every node and authorize it for logins.
try:
    for node in slice.get_nodes():
        output_private = node.upload_file("/home/fabric/work/id_rsa", "/home/ubuntu/.ssh/id_rsa")
        output_public = node.upload_file("/home/fabric/work/id_rsa.pub", "/home/ubuntu/.ssh/id_rsa.pub")

        # Authorize the public key, then restrict the permissions of both key files.
        for remote_command in (
            ' cat /home/ubuntu/.ssh/id_rsa.pub >> /home/ubuntu/.ssh/authorized_keys',
            ' chmod 600 /home/ubuntu/.ssh/id_rsa*',
        ):
            stdout, stderr = node.execute(remote_command)

        # stderr here belongs to the last command (chmod).
        for report in (output_private, output_public, stderr):
            print(report)

except Exception as err:
    print(f"Exception: {err}")

In [None]:
from ipaddress import ip_address, IPv6Address
# Upload and run nat64.sh on every node whose management address is IPv6.
try:
    for node in slice.get_nodes():
        management_address = ip_address(node.get_management_ip())
        if not isinstance(management_address, IPv6Address):
            continue

        node.upload_file("/home/fabric/work/nat64.sh", "/home/ubuntu/nat64.sh")
        # The script's own output is discarded on the node.
        stdout, stderr = node.execute(' chmod +x /home/ubuntu/nat64.sh && sudo bash /home/ubuntu/nat64.sh > /dev/null 2>&1')

        for stream in (stdout, stderr):
            print(stream)
except Exception as err:
    print(f"Exception: {err}")