# Structure Property visualizer

This is the second of two deliverables for the SiSc-Lab2020 project.

Authors: Sijie Luo and Anna Garoufali

Supervisors: Dr. Jens Bröder, Dr. Daniel Wortmann, Johannes Wasmer, Prof. Dr. Stefan Blügel.

**Usage: adjust user constants in code cell 'User constants'.**

In [None]:
# User constants
# Name of the AiiDA profile that is loaded via load_profile() in the imports cell.
aiida_profile_name = "wasmer"
# If True, the next cell loads the IPython autoreload extension.
enable_autoreload = True # disable for timings

# ---

# selected workflow identifier for 'single workflow' analysis sections and interactive plot.
# this identifier specifies [workflow type: TODO] and that type's version.
# notebook will inspect all database workflow nodes of this type and version.
# The identifier is used as a key into helpers.MAP (MAP[single_workflow_identifier]) and must be
# among the workflow versions found in the database, otherwise the notebook raises NotImplementedError.

# # example for database with aiida-fleur workflows
# single_workflow_identifier = 'workflow_0.4.2'

# example for database with aiida-kkr workflows
single_workflow_identifier = 'parser_0.3.2'


# ---

# Columns for the interactive plot of the selected single workflow.
# A later cell checks that xcol and ycol each occur in the projections that helpers'
# `predifined_workflow` registry returns for the selected workflow, and raises otherwise.

# # example for database with aiida-fleur workflows only
# xcol = 'total_energy'
# ycol = 'distance_charge'

# example for database with aiida-kkr workflows only
xcol = 'alat'
ycol = 'emin_minus_efermi'
# xcol = 'fermi_energy'
# ycol = 'dos_at_fermi_energy'

# ---

# Metadata for the timings file; these values are handed to helpers.Timer.
# database_size: in terminal, connect to postgres database via psql and execute '\l+'.
notebook_name = "D2"
database_name = "wasmer_medium_size"
database_size = 431 # MB
database_description = [
    "800 Impurity (defect atoms) embeddings into different elemental host crystals with aiida-kkr."
]

In [None]:
# Optionally load IPython's autoreload extension (user constant enable_autoreload).
if enable_autoreload:
    %load_ext autoreload
    # mode 2: reload all modules before executing a cell
    %autoreload 2
# Select the interactive 'notebook' backend for matplotlib.
%matplotlib notebook

In [None]:
# python imports:
from collections import Counter
import time
import numpy as np
import pandas as pd
#from pprint import pprint

#from aiida_jutools.sisc_lab import helpers
from bokeh.io import output_notebook
# init bokeh: render bokeh plots inline in the notebook
output_notebook()

# aiida imports:
from aiida import load_profile
# Load the AiiDA profile named in the user constants; profile.name is printed in a later cell.
profile = load_profile(aiida_profile_name)

In [None]:
# # project imports prep (for johannes, else comment out)

# # add project module to sys.path
# import sys
# from pathlib import Path

# def add_to_sys_path(path:Path):
#     if str(path) not in sys.path:
#         sys.path.append(str(path))

# # load developer's code: general package
# project_dir = Path("/Users/wasmer/src/aiida-jutools/")
# add_to_sys_path(project_dir)

In [None]:
import helpers

In [None]:
# Create the timer that collects the timings of this notebook.
# All arguments come from the 'User constants' cell.
timer = helpers.Timer(
    notebook_name=notebook_name,
    database_name=database_name,
    database_size=database_size,
)
# The database description is attached as an attribute after construction.
timer.DATABASE_DESCRIPTION = database_description

In [None]:
helpers.print_bold(f"This notebook/dashboard will visualize the contents from the database of profile {profile.name}")

## Check workflows and versions

In [None]:
timing_name = "Workflows info"
timer.start(timing_name)

In [None]:
# Preprocessing: Set formula attributes for all the structure nodes
helpers.set_structure_formula()

# Option 1 (disabled): restrict the query to one workflow type via its process label.
# workflow_name = 'fleur_scf_wc' # Filter workflow
# workflow_filters = {'attributes.process_label' : {'==' : workflow_name}}
# workflowdictlst = helpers.get_structure_workflow_dict(workflow_filters=workflow_filters)
# Option 2 (active): no workflow filter is passed to the helper.
# workflow_name is also handed to the helpers.generate_*_pandas_source calls in later cells.
workflow_name = None # No restriction. Querying by default
# Two values are returned here (presumably because of check_version=True — TODO confirm);
# versionslst is unpacked as pairs in the next cell.
workflowdictlst, versionslst = helpers.get_structure_workflow_dict(timing=True, check_version=True)

print("Number of the workflows: ", len(workflowdictlst), '\n')
print("Workflows: ")
# Cell output: show the first two entries as a sample.
workflowdictlst[:2]

In [None]:
from helpers import MAP, predifined_workflow

# Keep only the first element of each pair in versionslst; these are the
# workflow version identifiers that the following cells iterate over.
versions = [identifier for identifier, _ in versionslst]
versions

In [None]:
# Abort early if the selected identifier is not among the workflow versions
# collected in `versions`; later cells look it up via MAP[single_workflow_identifier].
if single_workflow_identifier not in versions:
    # Trailing space after the first sentence keeps the two message parts from running together.
    raise NotImplementedError(
        f"D2 is not implemented for the specified single workflow identifier {single_workflow_identifier}. "
        f"Please add this identifier to {helpers.__name__} dictionary 'MAP' and list 'predefined_workflows'."
    )

In [None]:
timer.stop(timing_name)

## Structure nodes

In [None]:
#!pip install openpyxl

### Single workflow version

In [None]:
timing_name = "Preprocess structures single workflow version"
timer.start(timing_name)

In [None]:
# Collect uuid and formula of the structure nodes for the selected workflow version.
# A JSON filename derived from MAP[single_workflow_identifier] is passed to the helper.
structure_project_single = ["uuid", "extras.formula"]
structure_nodes = helpers.generate_structure_property_pandas_source(
    version=single_workflow_identifier,
    workflow_name=workflow_name,
    structure_project=structure_project_single,
    filename=f"structure_properties_{MAP[single_workflow_identifier]}.json",
)
# structure_nodes.head()

In [None]:
timer.stop(timing_name)

### Multiple workflow versions

In [None]:
timing_name = "Prepare Structure multiple workflow versions"
timer.start(timing_name)

In [None]:
# Write the structure properties of every workflow version into one Excel
# workbook, one sheet per version (sheet name = MAP[version]).
filename = 'structure_properties_all.xlsx'

# The context manager saves and closes the workbook on exit, also on errors.
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter(filename) as excel_writer:
    for version in versions:
        structure_project_multiple = ['uuid', 'extras.formula']
        structure_nodes = helpers.generate_structure_property_pandas_source(
            version=version,
            workflow_name=workflow_name,
            structure_project=structure_project_multiple)
        print(structure_nodes)
        structure_nodes.to_excel(excel_writer, sheet_name=MAP[version], index=False)

In [None]:
timer.stop(timing_name)

## Dict nodes

### Single workflow version

In [None]:
timing_name = "Prepare Dict single workflow version"
timer.start(timing_name)

In [None]:
# Single workflow version:
# look up the projections registered for the selected workflow, then build the
# Dict-node table for it. A JSON filename derived from MAP[...] is passed to the helper.
dict_project_single = predifined_workflow.get_workflow(
    MAP[single_workflow_identifier]
).projections
dict_nodes = helpers.generate_dict_property_pandas_source(
    workflow_name=workflow_name,
    version=single_workflow_identifier,
    dict_project=dict_project_single,
    filename=f"dict_properties_{MAP[single_workflow_identifier]}.json",
)
# dict_nodes.head()

In [None]:
timer.stop(timing_name)

In [None]:
# check if user constants xcol, ycol are present in selected single workflow's attributes
def is_attr_in_single_workflow_attributes(attr):
    """Return True if ``attr`` is contained in any entry of ``dict_project_single``.

    The test is a substring match (``attr in attr_name``), so ``attr`` may be
    just one part of a projection name.
    """
    return any(attr in attr_name for attr_name in dict_project_single)


# Collect only the columns that are actually missing, so the error names them precisely.
missing_columns = [
    f"{name}='{value}'"
    for name, value in (("xcol", xcol), ("ycol", ycol))
    if not is_attr_in_single_workflow_attributes(value)
]
if missing_columns:
    import json
    missing_text = " and ".join(missing_columns)
    raise NotImplementedError(
        f"Interactive plot column(s) {missing_text} are not listed in the selected single "
        f"workflow identifier '{single_workflow_identifier}''s attributes. If they should be, please adjust "
        f"respective entry in {helpers.__name__} list 'predefined_workflows'. "
        f"Attributes defined there for this workflow are: {json.dumps(dict_project_single, indent=4)}."
    )

### Multiple workflow versions

In [None]:
timing_name = "Prepare Dict multiple workflow versions"
timer.start(timing_name)

In [None]:
# Write the Dict-node properties of every workflow version into one Excel
# workbook, one sheet per version (sheet name = MAP[version]).
filename = 'dict_properties_all.xlsx'

# The context manager saves and closes the workbook on exit, also on errors.
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter(filename) as excel_writer:
    for version in versions:
        # Projections registered for this workflow version.
        dict_project_multiple = predifined_workflow.get_workflow(MAP[version]).projections
        dict_nodes = helpers.generate_dict_property_pandas_source(
            workflow_name=workflow_name,
            version=version,
            dict_project=dict_project_multiple)
        # print(dict_nodes)
        dict_nodes.to_excel(excel_writer, sheet_name=MAP[version], index=False)

In [None]:
timer.stop(timing_name)

## Combine the two kinds of nodes

### Single workflow version

In [None]:
timing_name = "Join data single workflow version"
timer.start(timing_name)

In [None]:
# Build the combined structure/Dict table for the selected workflow version.
# Both projection lists are (re)defined here so this cell does not depend on
# the earlier single-version cells having been run.
structure_project_single = ["uuid", "extras.formula"]
dict_project_single = predifined_workflow.get_workflow(
    MAP[single_workflow_identifier]
).projections
combinednodes = helpers.generate_combined_property_pandas_source(
    workflow_name=workflow_name,
    version=single_workflow_identifier,
    structure_project=structure_project_single,
    dict_project=dict_project_single,
    filename=f"combined_properties_{MAP[single_workflow_identifier]}.json",
)

In [None]:
timer.stop(timing_name)

In [None]:
# df = helpers.read_json_file('combined_properties_wf_0_4_2.json')
df_single = helpers.read_json_file(f"combined_properties_{MAP[single_workflow_identifier]}.json")

In [None]:
df_single[0]

### Multiple workflow versions

In [None]:
timing_name = "Join data multiple workflow versions"
timer.start(timing_name)

In [None]:
# Write the combined structure/Dict properties of every workflow version into
# one Excel workbook, one sheet per version (sheet name = MAP[version]).
filename = 'combined_properties_all.xlsx'

# The context manager saves and closes the workbook on exit, also on errors.
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter(filename) as excel_writer:
    for version in versions:
        structure_project_multiple = ['uuid', 'extras.formula']
        # Projections registered for this workflow version.
        dict_project_multiple = predifined_workflow.get_workflow(MAP[version]).projections
        combined_nodes = helpers.generate_combined_property_pandas_source(
            workflow_name=workflow_name,
            version=version,
            structure_project=structure_project_multiple,
            dict_project=dict_project_multiple)
        # print(combined_nodes)
        combined_nodes.to_excel(excel_writer, sheet_name=MAP[version], index=False)

In [None]:
timer.stop(timing_name)


# Interactive plot

## Check data source before plotting

### Single workflow version

In [None]:
# NOTE(review): this timing name is identical to the one already used earlier for
# joining structure and dict data of the single workflow version, although the cells
# timed here only reload and filter that data. Depending on how helpers.Timer treats
# repeated names, the two measurements may collide — TODO confirm / rename.
timing_name = "Join data single workflow version"
timer.start(timing_name)

In [None]:
# df = helpers.read_json_file('combined_properties_wf_0_4_2.json')
df_single = helpers.read_json_file(f"combined_properties_{MAP[single_workflow_identifier]}.json")

In [None]:
df_single

In [None]:
filtered_df_single, xdata, ydata = helpers.filter_missing_value(df_single, xcol, ycol)

In [None]:
timer.stop(timing_name)

### Multiple workflow versions

In [None]:
# NOTE(review): this timing name is identical to the one already used earlier for
# joining structure and dict data of all workflow versions, although the cells timed
# here only read the Excel file back and filter it. Depending on how helpers.Timer
# treats repeated names, the two measurements may collide — TODO confirm / rename.
timing_name = "Join data multiple workflow versions"
timer.start(timing_name)

In [None]:
dfs = helpers.read_excel_file('combined_properties_all.xlsx')

In [None]:
from helpers import INVMAP

# Per-sheet results, keyed by the sheet name found in `dfs`.
df_all = {}
OPTIONS_all = {}
UNITS_all = {}
# `versions` is rebuilt here: it only keeps the versions whose sheet is non-empty
# after filtering; `mversions` holds the corresponding sheet names.
mversions = []
versions = []
for key, df in dfs.items():
    df = helpers.filter_unavailable_df(df)
    if df.empty:
        # nothing left for this sheet after filtering
        continue
    df_all[key] = df
    OPTIONS_all[key], UNITS_all[key] = helpers.get_attrs_and_units(df)
    mversions.append(key)
    # INVMAP translates the sheet name back to the version identifier.
    versions.append(INVMAP[key])

In [None]:
timer.stop(timing_name)

## Interactive plot by Bokeh

In [None]:
single_workflow_identifier

In [None]:
timing_name = "Interactive plot"
timer.start(timing_name)

In [None]:
# load dataframe from multiple workflow versions de/serialization
# DEVNOTE: wasmer: single workflow version seems broken
df = dfs[MAP[single_workflow_identifier]]

In [None]:
MAP[single_workflow_identifier]

In [None]:
dfs.keys()

In [None]:
df

In [None]:
# Plot ycol against xcol for `df`. `df` is taken from `dfs` (the multi-version Excel data),
# not from the single workflow JSON file.
# NOTE(review): output_filename is hard-coded and does not follow single_workflow_identifier — confirm intended.
helpers.bokeh_struc_prop_vis(df, xcol, ycol, 
                            output_filename="vis_wf042.html", axis_type=['linear', 'linear'], nbins=40)

In [None]:
timer.stop(timing_name)

## Interactive plot using Bokeh server application

In [None]:
# In vscode terminal:
# bokeh serve --show --port 5001 bokehplotting.py

In [None]:
########################
# save timings
timer.save(silent=False)