In [1]:
# -*- coding: utf-8 -*-
#
# Copyright (C) 2021-2024  LMAI_team @ TU Dresden:
#     LMAI_team: Zhixu Ni, Maria Fedorova
#
# Licensing:
# This code is licensed under AGPL-3.0 license (Affero General Public License v3.0).
# For more information, please read:
#     AGPL-3.0 License: https://www.gnu.org/licenses/agpl-3.0.en.html
#
# Citation:
# Please cite our publication in an appropriate form.
#
# For more information, please contact:
#     Fedorova Lab (#LMAI_team): https://fedorovalab.net/
#     LMAI on Github: https://github.com/LMAI-TUD
#

In [2]:
import os

import pandas as pd

from lmai.sankey_prep import get_sankey_data
from lmai.sankey_plot import format_sankey_data, plot_sankey, reload_sankey_json

In [3]:
# Path of the input file that is handed to the pre-processing step
input_file_path = 'data/data_sankey.xlsx'
# Output prefix (no file extension needed):
# a set of svg, png, html and json files will be generated with this prefix,
# and the preview tables saved further down reuse it as well
output_file_path = 'output/sankey_plot'

# Temporary files used by the pre-processing / formatting steps
link_flow_json = 'output/link_flow.json'
node_color_config_csv = 'output/node_color_config.csv'

# Make sure every target folder exists before anything is written:
# on a fresh checkout the output folder may be missing, and writing a file
# into a non-existent folder fails.
for _out_path in (output_file_path, link_flow_json, node_color_config_csv):
    _out_dir = os.path.dirname(_out_path)
    if _out_dir:  # a bare file name has no folder to create
        os.makedirs(_out_dir, exist_ok=True)



In [4]:
# ---- Column names used in the input file ----
spb_col_name = 'SPB'
lipid_class_col_name = 'Class'
fa_col_name = 'FA'
val_col_name = 'Value'
lipid_species_col_name = 'Species'

# ---- Thresholds for the "Others" node ----
# Entries whose ratio is below the threshold are merged into an "Others" node.
# Give each value as a float written with ".", in percent:
# e.g. a 2 % threshold is written as 2.0
threshold_config_info = {
    'SPB': 2.0,
    'Class': 0.8,
    'FA': 0.8,
}

# ---- Image size and node box thickness ----
img_width, img_height = 1280, 800
node_box_thickness = 100

In [5]:
# Pre-processing: hand the input file, the column names and the "Others"
# thresholds to get_sankey_data, which returns the two tables used below.
calc_sankey_df, calc_map_df = get_sankey_data(
    input_file=input_file_path,
    start_col_name=spb_col_name,
    mid_col_name=lipid_class_col_name,
    end_col_name=fa_col_name,
    val_col_name=val_col_name,
    lipid_index_col_name=lipid_species_col_name,
    other_class_threshold_info=threshold_config_info,
)

File found: /Users/ni/PycharmProjects/LipidSankey/data/data_sankey.xlsx


In [6]:
# Save the complete table as "<output prefix>_df.csv" (index column included),
# then show its first rows as the cell output.
sankey_df_csv = f'{output_file_path}_df.csv'
calc_sankey_df.to_csv(sankey_df_csv, index=True)
calc_sankey_df.head()

Unnamed: 0,source_label,target_label,value,ratio
0,Others (3.8%),Ceramide (35.8%),3.213036,2.100575
1,Others (3.8%),Dihydro-Ceramide (0.8%),1.226526,0.801862
2,Others (3.8%),Dihydro-Deoxy-Ceramide (3.5%),0.704055,0.460288
3,Others (3.8%),Hex-(n)-Ceramide (43.1%),0.231049,0.151052
4,Others (3.8%),Others (0.8%),0.451926,0.295454


In [7]:
# Save the per-lipid mapping table as "<output prefix>_map_df.csv"
# (index column included), then show its first rows as the cell output.
map_df_csv = f'{output_file_path}_map_df.csv'
calc_map_df.to_csv(map_df_csv, index=True)
calc_map_df.head()

Unnamed: 0,lipid,source_label,target_label,source_node,target_node,value,ratio
0,Cer(18:0;1/16:0),SPB 18:0;1 (3.0%),Dihydro-Deoxy-Ceramide (3.5%),SPB 18:0;1,Dihydro-Deoxy-Ceramide,0.21079,0.137807
1,Cer(18:0;1/20:0),SPB 18:0;1 (3.0%),Dihydro-Deoxy-Ceramide (3.5%),SPB 18:0;1,Dihydro-Deoxy-Ceramide,0.232121,0.151753
2,Cer(18:0;1/22:0),SPB 18:0;1 (3.0%),Dihydro-Deoxy-Ceramide (3.5%),SPB 18:0;1,Dihydro-Deoxy-Ceramide,0.860527,0.562584
3,Cer(18:0;1/23:0),SPB 18:0;1 (3.0%),Dihydro-Deoxy-Ceramide (3.5%),SPB 18:0;1,Dihydro-Deoxy-Ceramide,0.128398,0.083943
4,Cer(18:0;1/24:0),SPB 18:0;1 (3.0%),Dihydro-Deoxy-Ceramide (3.5%),SPB 18:0;1,Dihydro-Deoxy-Ceramide,1.57659,1.030722


In [8]:
# Format the pre-processed table for the sankey plot.
# The two paths are the temporary link-flow JSON and the node color
# configuration CSV defined in the path settings.
calc_link_json, color_df = format_sankey_data(
    calc_sankey_df,
    link_flow_json,
    node_color_config_csv,
)

# Advanced users: the node order can be changed by editing the calc_link_json
# file (presumably the JSON stored at `link_flow_json` -- please confirm).
# If you do so, change the corresponding node order in the links as well.

In [9]:
# The color configuration file was generated automatically:
# tell the user where to find it, then display the color table.
color_config_hint = f"please check the color configuration file to change colors: {node_color_config_csv}"
print(color_config_hint)
color_df

please check the color configuration file to change colors: output/node_color_config.csv


Unnamed: 0,label,color
0,Ceramide (35.8%),#636EFA
1,Deoxy-Ceramide (9.8%),#EF553B
2,Dihydro-Ceramide (0.8%),#00CC96
3,Dihydro-Deoxy-Ceramide (3.5%),#AB63FA
4,FA 16:0 (18.3%),#FFA15A
5,FA 18:0 (3.2%),#19D3F3
6,FA 20:0 (1.9%),#FF6692
7,FA 22:0 (16.9%),#B6E880
8,FA 23:0 (5.0%),#FF97FF
9,FA 24:0 (31.3%),#FECB52


**You can go to the next cell to plot the Sankey diagram with the adjusted colors.**

*Please do NOT change the row order of the color configuration file.*

In [10]:
# Reload the color configuration that will be used for plotting.
# If you saved your edited colors to a different file, set
# updated_node_color_config_csv to the path of that updated file.
updated_node_color_config_csv = node_color_config_csv
updated_color_df = pd.read_csv(filepath_or_buffer=updated_node_color_config_csv)
updated_color_df

Unnamed: 0,label,color
0,Ceramide (35.8%),#636EFA
1,Deoxy-Ceramide (9.8%),#EF553B
2,Dihydro-Ceramide (0.8%),#00CC96
3,Dihydro-Deoxy-Ceramide (3.5%),#AB63FA
4,FA 16:0 (18.3%),#FFA15A
5,FA 18:0 (3.2%),#19D3F3
6,FA 20:0 (1.9%),#FF6692
7,FA 22:0 (16.9%),#B6E880
8,FA 23:0 (5.0%),#FF97FF
9,FA 24:0 (31.3%),#FECB52


In [11]:
# Plot the sankey diagram from the link-flow JSON and the (possibly edited)
# color configuration; the output prefix and image size come from the settings.
plot_sankey(
    link_flow_json,
    updated_node_color_config_csv,
    output_file_path,
    img_width,
    img_height,
    thickness=node_box_thickness,
)

Sankey plot saved to output/sankey_plot.png and output/sankey_plot.svg


**To change the order of the nodes, use the interactive interface in the web browser that pops up automatically, or open the corresponding HTML file.**

**To save the image after rearrangement, just use the screenshot function.**

In [12]:
# Final marker: printed once every cell above has run without raising
print("Finished.")

Finished.
