In [47]:
import pandas as pd
import json
import typing as tp

from plotly import graph_objects as go
from tqdm import tqdm

# pip install Pillow
from PIL.ImageColor import getcolor

In [48]:
def get_colors(fp, line_start, a_node, a_link, link_color_shift, color_step, cut):
    colors = []
    with open(fp, mode='r') as file:
        for line in file:
            colors.append('#' + line.strip())
    colors_node = []
    colors_link = []
    for i in range(line_start, len(colors)-cut, color_step):

        try:
            r, g, b = getcolor(colors[i+link_color_shift], "RGB")
            colors_link.append('rgba({r},{g},{b}, {a_link})'.format(r=r, g=g, b=b, a_link=a_link))

            r, g, b = getcolor(colors[i], "RGB")
            colors_node.append('rgba({r},{g},{b}, {a_node})'.format(r=r, g=g, b=b, a_node=a_node))
        except IndexError:
            continue

    return colors_node, colors_link

In [49]:
def prepare_data(raw_data):
    '''
    raw_data = json[
        {
            log_datetime: datetime,
            object_id: str | int,
            event_id: str | int,
            event_value: int
        }, ...
    ]

    cond_data = json[
        {
            event_id: str | int,
            event_value_condition: str | int,
            node_name: str
        }, ...
    ]

    output = json[
        {
            object_id: str | int,
            node_name: str
        }, ...
    ]
    '''

    def _drop_duplicates(raw_data_df):
        raw_data_df = raw_data_df.sort_values(by=['object_id','log_datetime', 'node_name']).reset_index(drop=True)
        indexes_to_delete = []
        for i in tqdm(range(1, len(raw_data_df))):
            if raw_data_df.iloc[i]['object_id'] == raw_data_df.iloc[i-1]['object_id']\
                and raw_data_df.iloc[i]['node_name'] == raw_data_df.iloc[i-1]['node_name']:
                indexes_to_delete.append(i)
        raw_data_df = raw_data_df.drop(index=indexes_to_delete)
        return raw_data_df

    raw_data_df = _drop_duplicates(raw_data)
    prepared_data = raw_data_df\
        .sort_values(by=['object_id','log_datetime', 'node_name'])\
            .reset_index(drop=True)[['object_id', 'node_name']]

    return prepared_data

In [50]:
def construct_sankey_data(prepared_data,
                          cond_data,
                          node_color,
                          link_color):
    '''
    input: prepared_data = output from func prepare_data

    output: sankey_data = {
        "data":{\n
            "node": {
                "label": [str, ...],
                "color": ["rgba(0-255, 0-255, 0-255, 0.0 - 1.0)", ...]
                },\n
            "link": {
                "source": [int, ...],
                "target": [int, ...],
                "value": [float | int],
                "color": ["rgba(0-255, 0-255, 0-255, 0.0 - 1.0)", ...],
                "label": [optional[str], ...]
                }
            }
        }
    '''

    def _prepare_lstv(prepared_data, cond_data):
        # label
        def _get_priority(cond_data):
            elem_priority = []
            for elem in cond_data:
                elem_priority.append(elem['node_name'])
            return elem_priority


        def _merge_lists_in_priority(list1, list2, elem_priority):
            result = []
            elem_priority = elem_priority
            i_max = len(list1) - 1
            j_max = len(list2) - 1
            i, j = 0, 0
            while i + j <= i_max + j_max + 1:
                if i > i_max:
                    result.extend(list2[j:])
                    j += len(list2[j:])
                elif j > j_max:
                    result.extend(list1[i:])
                    i += len(list1[i:])
                elif list1[i] == list2[j]:
                    result.append(list1[i])
                    i += 1
                    j += 1
                elif elem_priority.index(list1[i]) > elem_priority.index(list2[j]):
                    result.append(list2[j])
                    j += 1
                elif elem_priority.index(list1[i]) < elem_priority.index(list2[j]):
                    result.append(list1[i])
                    i += 1
            return result

        elem_priority = _get_priority(cond_data)
        label = []
        object_id_prev = None
        object_trace = []
        for elem in tqdm(prepared_data):
            if elem['object_id'] != object_id_prev:
                label = _merge_lists_in_priority(label, object_trace, elem_priority)
                object_trace = [elem['node_name']]
                object_id_prev = elem['object_id']
            else:
                object_trace.append(elem['node_name'])
        label = _merge_lists_in_priority(label, object_trace, elem_priority)

        # source, target, value
        source = []
        target = []
        value = []
        object_id_prev = None
        result_dict = {}
        for elem in tqdm(prepared_data):
            object_id = elem['object_id']
            target_elem = elem['node_name']
            if object_id_prev != object_id:
                position = 0
                object_id_prev = object_id
            else:
                source_indx = position + label[position:].index(source_elem)
                target_indx = source_indx + 1 + label[source_indx + 1:].index(target_elem)
                position = target_indx
                result_dict.setdefault((source_indx, target_indx), 0)
                result_dict[(source_indx, target_indx)] += 1
            source_elem = target_elem

        for (source_elem, target_elem), value_elem in result_dict.items():
            source.append(source_elem)
            target.append(target_elem)
            value.append(value_elem)

        return label, source, target, value


    def _set_colors(node_color, link_color, label, source):
        labels_cnt = len(label)
        node_color_multiply = (round(labels_cnt/len(node_color)) + 1) * node_color
        link_color_multiply = (round(labels_cnt/len(link_color)) + 1) * link_color

        label_sorted_set = []
        for x in label:
            if x not in label_sorted_set:
                label_sorted_set.append(x)
            else:
                continue
        node_color_dict = {x: y for x, y in zip(label_sorted_set, node_color_multiply)}
        node_color_label = [node_color_dict[x] for x in label]

        link_color_dict = {x: y for x, y in zip(label_sorted_set, link_color_multiply)}
        link_color_label = [link_color_dict[x] for x in label]

        link_index_color_dict = {x: y for x, y in zip(list(range(len(label))), link_color_label)}
        link_color_source = [link_index_color_dict[x] for x in source]

        return node_color_label, link_color_source


    label, source, target, value = _prepare_lstv(prepared_data, cond_data)
    node_color_label, link_color_source = _set_colors(node_color, link_color, label, source)
    target_label = [label[x] for x in target]

    sankey_data = {
        "data": {
            "node": {
                "label": label,
                "color": node_color_label
                },
            "link": {
                "source": source,
                "target": target,
                "value": value,
                "color": link_color_source,
                "label": target_label
                }
            }
        }

    return sankey_data

In [56]:
def sankey_fig(sankey_data):
    fig = go.Figure(
        data=[go.Sankey(
            valueformat = 'int',
            valuesuffix = ' чел',
            domain = {"x": [0,1],"y": [0,1]},
            arrangement ='freeform',
            orientation = 'h',
            node = {**sankey_data['data']['node'],
                    'pad': 10,
                    'thickness': 20,
                    'line': {'color': 'black', 'width': 0.5},
                    'hoverlabel': {'align': 'left', 'bgcolor': 'lightgreen', 'bordercolor': 'black',
                                   # 'namelength': 4
                                   },
                    'hovertemplate': '%{label}<extra>%{value}</extra>'
                    },
            link = {**sankey_data['data']['link'],
                    # 'arrowlen': 10,
                    'hoverlabel': {'bgcolor': 'lightgreen', 'bordercolor': 'black', 'align': 'left',
                                   # 'namelength': 4
                                   },
                    'hovertemplate': '%{label}<extra>%{value}</extra>'
                    },
            textfont = {'color': 'black'}
        )]
    )
    fig.update_layout(title_text='Test Diagram',
                      font_size=10)
    return fig

In [57]:
duel_colors_node, duel_colors_link = get_colors(fp='/home/ivan/projects/visual_experiments/duel.txt',
                                      line_start=3,
                                      a_node=1,
                                      a_link=0.5,
                                      link_color_shift=1,
                                      color_step=8,
                                      cut=0)


dewdrop_colors_node, dewdrop_colors_link = get_colors(fp='/home/ivan/projects/visual_experiments/dewdrop-dynasty-40.txt',
                                      line_start=0,
                                      a_node=1,
                                      a_link=1,
                                      link_color_shift=1,
                                      color_step=2,
                                      cut=6)


data = pd.read_json('/home/ivan/projects/visual_experiments/sankey_test_logs.json', lines = True)
data = data[['worker_id', 'timestamp', 'skill_name']]
data.columns = ['object_id', 'log_datetime', 'node_name']
prepared_data = prepare_data(data)
cond_data = prepared_data[['node_name']].drop_duplicates()[:5]
node_names = list(cond_data['node_name'].values)
prepared_data = prepared_data.query('node_name in @node_names')

prepared_data = prepared_data.to_json(orient='records', indent=4, force_ascii=False)
prepared_data = json.loads(prepared_data)

cond_data = cond_data.to_json(orient='records', indent=4, force_ascii=False)
cond_data = json.loads(cond_data)

sankey_data = construct_sankey_data(prepared_data=prepared_data,
                                    cond_data=cond_data,
                                    node_color=duel_colors_node,
                                    link_color=duel_colors_link)


fig = sankey_fig(sankey_data)
# fig.show()

100%|██████████| 138593/138593 [00:31<00:00, 4351.13it/s]
100%|██████████| 1381/1381 [00:00<00:00, 861697.98it/s]
100%|██████████| 1381/1381 [00:00<00:00, 1062228.83it/s]


In [58]:
fig.write_html('test_sankey.html', auto_open=True)