# Initial imports and opening files

In [1]:
# imports
import numpy as np
import copy
import pandas
import json
from datetime import datetime
import math

import plotly.graph_objects as go
import plotly.express as px

In [2]:
# Load the three inputs of this notebook:
#   df              - general OST information
#   directory_stats - stat information for the desired files
#   stripe_data     - getstripe information for the same files
df = pandas.read_csv("LustreExampleData/df.csv")
directory_stats = pandas.read_csv("LustreExampleData/ERA5_directory_stats.csv")
# JSON text is UTF-8 by specification; naming the encoding keeps the load
# independent of the platform's locale default.
with open("LustreExampleData/ERA5_getstripe_infos.json", encoding="utf-8") as directory_data:
    stripe_data = json.load(directory_data)

# Overview

In [28]:
# Select the partition for which the overview is drawn.
#   Some systems divide their devices into (non-exclusive) "partitions";
#   the names listed below an assignment are noted as equal to it.

partition = "/work"
# equal to "/pool/data"
# equal to "/scratch"

#partition = "/home"
# equal to "/sw"
# equal to "/lustre/home"

#partition = "/fastdata"

In [29]:
# Keep only the OST rows that belong to the desired partition.
df_pool = df[(df["partition"] == partition) & (df["storage_type"] == "OST")]

# Bar chart of the partition: bar height is the "blocks" column, bar colour the
# "use_percent" column on a fixed 0-100 scale.
# plotly express matches the keys of `labels` against column names when a data
# frame is passed, so the colour bar title has to be keyed by "use_percent"
# (a "color" key is ignored here).
fig = px.bar(
    df_pool,
    x="id",
    y="blocks",
    color="use_percent",
    range_color=[0, 100],
    color_continuous_scale="thermal",
    labels={"id": "OST", "blocks": "Total Gigabytes", "use_percent": "used percent"},
    title="OST utilization in " + partition,
)
fig.show()
#fig.write_html("Lustre_overview.html")

In [30]:
# Same overview as before, but the utilization is shown as stacked
# "used" / "available" bars instead of a colour scale.
partition_osts = (df["partition"] == partition) & (df["storage_type"] == "OST")
df_pool = df[partition_osts]

stacked_labels = {"id": "OST", "value": "Total Gigabytes", "variable": ""}
fig = px.bar(
    df_pool,
    x="id",
    y=["used", "available"],
    labels=stacked_labels,
    title="OST utilization in " + partition,
)
fig.show()
#fig.write_html("Lustre_overview.html")

# Directory

In [37]:
# Partition used by the directory analysis below; set by hand.
partition = "/work"

In [38]:
# One counter per OST row of this partition, all starting at zero.
dff = df[(df["partition"] == partition) & (df["storage_type"] == "OST")]
number_of_files = [0] * len(dff)

# Count how many files from the queried directory have objects on each OST.
# NOTE(review): the counters are addressed directly by OST index, which assumes
#   every index found in stripe_data is smaller than len(dff) - TODO confirm.
for files in stripe_data:
    # A single entry is a non-PFL file (index stored under "obdidx");
    # several entries are a PFL file (index stored under "l_ost_idx").
    index_key = "obdidx" if len(files) == 1 else "l_ost_idx"
    for layout in files:
        if "osts" not in layout:
            continue
        for ost in layout["osts"]:
            number_of_files[int(ost[index_key])] += 1

In [48]:
# Important: the colour only tells how many files have at least some data on an
#   OST; it ignores how much data is stored there.
partition_osts = df.loc[(df["partition"] == partition) & (df["storage_type"] == "OST")]
osts = partition_osts.id.to_list()
use = partition_osts.use_percent.to_list()

# No data frame is passed, so the label keys are the argument names x / y / color.
chart_labels = {"x": "OST", "y": "Use%", "color": "number of files"}
fig = px.bar(
    x=osts,
    y=use,
    color=number_of_files,
    color_continuous_scale="viridis",
    range_color=(0, max(number_of_files)),
    labels=chart_labels,
    title="number of files in " + partition + " on OSTs",
)
fig.show()

# Single File

In [49]:
def get_stripe_information(filename):
    """Return the stripe_data entry whose first element names *filename*.

    Entries are searched in order and the first match is returned as the same
    list object that is stored in stripe_data (no copy is made). If no entry
    matches, None is returned.
    """
    matches = (entry for entry in stripe_data if entry[0]["filename"] == filename)
    return next(matches, None)

In [50]:
# Work on shallow copies so that dropping the PFL header below does not change
# the entry stored in stripe_data.
stripes = copy.copy(get_stripe_information(selected_file))
stripes_size = copy.copy(directory_stats.loc[directory_stats["name"] == selected_file, "size"].values[0])


# striping information: one row per (OST, component) pair
ost_id = []
component = []
bytes_used = []

if len(stripes) == 1:
    # no PFL: the file size is split evenly over the stripe count
    ost_list = stripes[0]["osts"]
    for ost in ost_list:
        ost_id.append(ost["obdidx"])
        component.append("0")
        bytes_used.append(stripes_size/int(stripes[0]["lmm_stripe_count"]))

else:
    # PFL files
    # the header entry carries no stripe information; afterwards keep only the
    # components that are initialised
    stripes.pop(0)
    init_components = [layout for layout in stripes if layout["lcme_flags"] == "init"]

    for sub_component in init_components:
        if sub_component == init_components[-1]:
            # the last component will most likely not reach its end point, so it
            # receives whatever is left of the file size
            component_bytes = stripes_size
        else:
            # every component before the last one is fully utilized from its
            # start point to its end point
            dif = int(sub_component["lcme_extent.e_end"]) - int(sub_component["lcme_extent.e_start"])
            stripes_size -= dif
            component_bytes = dif

        for ost in sub_component["osts"]:
            ost_id.append(ost["l_ost_idx"])
            component.append(sub_component["lcme_id"])
            bytes_used.append(component_bytes/int(sub_component["lmm_stripe_count"]))

# To show the OSTs in ascending order without any gaps in plotly, put them in a
#    table, cast the OST names to integers, sort the rows by them, and then cast
#    the names back into strings.
striping_df = pandas.DataFrame(data={"ost_id": ost_id, "component": component, "bytes_used": bytes_used})
striping_df["ost_id"] = striping_df["ost_id"].astype(int)
sorted_df = striping_df.sort_values(by="ost_id")
sorted_df["ost_id"] = sorted_df["ost_id"].astype("string")

In [51]:
# Stacked bars of the bytes each OST holds for the selected file, coloured by
# layout component. `category_orders` pins the x axis to the sorted OST order.
# plotly express matches the keys of `labels` against column names when a data
# frame is passed, so the labels are keyed by the columns of sorted_df
# ("x" / "y" / "color" keys are ignored here).
fig = px.bar(
    sorted_df,
    x="ost_id",
    y="bytes_used",
    color="component",
    color_discrete_sequence=["blue", "red", "green"],
    category_orders={"ost_id": sorted_df["ost_id"].tolist()},
    labels={"ost_id": "OST", "bytes_used": "Bytes", "component": "component"},
)
fig.show()

# Multiple Files
## ERA-5 Example

In [52]:
# Build a long-format table with one row per (file, OST) pair holding the
# number of bytes of that file attributed to that OST.
data = {'OST': [], 'File': [], 'Bytes': []}


for file in stripe_data:
    # the first entry always names the file (for PFL files it is only the header)
    filename = file[0]["filename"]
    file_size = directory_stats.loc[directory_stats["name"] == filename, "size"].values[0]

    # PFL
    if len(file) > 1:
        remaining_bytes = file_size
        init_components = [layout for layout in file[1:] if layout["lcme_flags"] == "init"]

        for sub_component in init_components:
            if sub_component != init_components[-1]:
                # every component before the last used one is fully utilized
                # from its start point to its end point
                dif = int(sub_component["lcme_extent.e_end"]) - int(sub_component["lcme_extent.e_start"])
                remaining_bytes -= dif
                component_bytes = dif
            else:
                # the last component will most likely not reach its end point,
                # so it receives whatever is left of the file size
                component_bytes = remaining_bytes

            for ost in sub_component["osts"]:
                data["OST"].append(int(ost["l_ost_idx"]))
                data["File"].append(filename)
                data["Bytes"].append(component_bytes/int(sub_component["lmm_stripe_count"]))

    # no PFL
    else:
        for ost in file[0]["osts"]:
            # The other non-PFL branches of this notebook read the index from
            # "obdidx"; prefer that key and fall back to "l_ost_idx" so entries
            # in either spelling are accepted.
            ost_index = ost["obdidx"] if "obdidx" in ost else ost["l_ost_idx"]
            data["OST"].append(int(ost_index))
            data["File"].append(filename)
            data["Bytes"].append(file_size/int(file[0]["lmm_stripe_count"]))

multifile_df = pandas.DataFrame(data)

In [55]:
# Stacked bars of the bytes per OST, one colour segment per file.
# plotly express matches the keys of `labels` against column names when a data
# frame is passed, so the labels are keyed by the columns of multifile_df
# ("x" / "y" / "color" keys are ignored here).
fig = px.bar(
    multifile_df,
    x="OST",
    y="Bytes",
    color="File",
    color_discrete_sequence=px.colors.qualitative.Dark2,
    labels={"OST": "OST", "Bytes": "Bytes used", "File": "Filename"},
    title="file distribution",
)
# the legend is switched off for all file traces
fig.update_traces(showlegend=False)
fig.show()
#fig.write_html("manyfiles.html")

# Heatmap

In [82]:
# Load the JSON data for the input and the output files of the heatmap.
# JSON text is UTF-8 by specification; naming the encoding keeps the load
# independent of the platform's locale default.
with open("LustreExampleData/input1.json", encoding="utf-8") as directory_data:
    input_data = json.load(directory_data)
with open("LustreExampleData/output1.json", encoding="utf-8") as directory_data:
    output_data = json.load(directory_data)

In [83]:
partition = "/work"
dff = df[(df["partition"] == partition) & (df["storage_type"] == "OST")]
# one zero-initialised entry per OST row in this partition
number_of_files = [0] * len(dff)

In [84]:
# derive heatmap matrix
ost_names = []
# NOTE: ost_list is the very same list object as number_of_files (no copy),
#   whereas ost_in / ost_out are independent copies of it.
ost_list = number_of_files

ost_in = list(number_of_files)
ost_out = list(number_of_files)

In [85]:
# in-files: add every input file's bytes to the OSTs that hold its stripes
for file in input_data:
    # the first entry always names the file (for PFL files it is only the header)
    filename = file[0]["filename"]
    file_size = directory_stats.loc[directory_stats["name"] == filename, "size"].values[0]

    # no PFL: the file size is split evenly over the stripe count
    if len(file) <= 1:
        for ost in file[0]["osts"]:
            ost_in[int(ost["obdidx"])] += file_size/int(file[0]["lmm_stripe_count"])
        continue

    # PFL: only the initialised components hold data
    remaining_bytes = file_size
    init_components = [layout for layout in file[1:] if layout["lcme_flags"] == "init"]

    for sub_component in init_components:
        if sub_component == init_components[-1]:
            # the last component will most likely not reach its end point, so it
            # receives whatever is left of the file size
            component_bytes = remaining_bytes
        else:
            # every component before the last used one is fully utilized from
            # its start point to its end point
            component_bytes = int(sub_component["lcme_extent.e_end"]) - int(sub_component["lcme_extent.e_start"])
            remaining_bytes -= component_bytes

        for ost in sub_component["osts"]:
            ost_in[int(ost["l_ost_idx"])] += component_bytes/int(sub_component["lmm_stripe_count"])

In [86]:
# out-files: add every output file's bytes to the OSTs that hold its stripes
for file in output_data:
    # the first entry always names the file (for PFL files it is only the header)
    filename = file[0]["filename"]
    file_size = directory_stats.loc[directory_stats["name"] == filename, "size"].values[0]

    # no PFL: the file size is split evenly over the stripe count
    if len(file) <= 1:
        for ost in file[0]["osts"]:
            ost_out[int(ost["obdidx"])] += file_size/int(file[0]["lmm_stripe_count"])
        continue

    # PFL: only the initialised components hold data
    remaining_bytes = file_size
    init_components = [layout for layout in file[1:] if layout["lcme_flags"] == "init"]

    for sub_component in init_components:
        if sub_component == init_components[-1]:
            # the last component will most likely not reach its end point, so it
            # receives whatever is left of the file size
            component_bytes = remaining_bytes
        else:
            # every component before the last used one is fully utilized from
            # its start point to its end point
            component_bytes = int(sub_component["lcme_extent.e_end"]) - int(sub_component["lcme_extent.e_start"])
            remaining_bytes -= component_bytes

        for ost in sub_component["osts"]:
            ost_out[int(ost["l_ost_idx"])] += component_bytes/int(sub_component["lmm_stripe_count"])

In [87]:
# Per-OST overlap: the smaller of the bytes attributed to input files and to
# output files on that OST.
# Both lists are rebuilt from ost_in / ost_out on every run instead of being
# mutated in place, so re-running this cell neither appends padding twice nor
# changes number_of_files through the ost_list alias.
overlap = [min(bytes_in, bytes_out) for bytes_in, bytes_out in zip(ost_in, ost_out)]

# Smallest square grid that has room for every OST.
width = int(math.ceil(math.sqrt(len(overlap))))
total_cells = width**2
padding = total_cells - len(overlap)

# Cells beyond the last OST get a value of 0 and the name "n/a".
ost_list = overlap + [0] * padding
ost_names = list(range(len(overlap))) + ["n/a"] * padding
#ost_names = list(range(total_cells))

heatmap_names = np.reshape(ost_names, (width, width)).tolist()
heatmap_matrix = np.reshape(ost_list, (width, width)).tolist()

In [88]:
# Heatmap of the overlap between the input and output files, in bytes;
# every cell is annotated with the matching entry of heatmap_names.
overlap_heatmap = go.Heatmap(
    z=heatmap_matrix,
    text=heatmap_names,
    texttemplate="%{text}",
    textfont={"size": 15},
)
fig = go.Figure(data=overlap_heatmap)

fig.show()