# Statistical bird's-eye view of the contents of an AiiDA database

This is the first of two deliverables for the SiSc-Lab2020 project.

Authors: Miao Wang (2. - 2.4), Zhipeng Tan (2.5 - 3.)

Supervisors: Dr. Jens Bröder, Dr. Daniel Wortmann, Johannes Wasmer, Prof. Dr. Stefan Blügel.

**Usage: adjust user constants in code cell 'User constants'.**

In [None]:
# User constants
# Name of the AiiDA profile that gets loaded; its database is the one analysed in this notebook.
aiida_profile_name = "wasmer"
# Toggles the IPython autoreload extension in the next cell.
enable_autoreload = True # disable for timings

# Metadata for the timings file (handed to helpers.Timer further down).
# How to obtain database_size:
#   1) in a terminal, run 'verdi profile show' and note down aiidadb_name.
#   2) in a terminal, run 'psql', then '\l+'. Note down the db size from the table. Exit with '\q'.
notebook_name = "D1"
database_name = "wasmer_medium_size"
database_size = 431 # MB
# Free-text description of the database contents; one list entry per line of description.
database_description = [
    "800 Impurity (defect atoms) embeddings into different elemental host crystals with aiida-kkr."
]

In [None]:
# Optionally reload all imported modules before each cell execution (autoreload mode 2),
# so edits to the project modules take effect without restarting the kernel.
if enable_autoreload:
    %load_ext autoreload
    %autoreload 2
# Select matplotlib's interactive 'notebook' backend.
%matplotlib notebook

In [None]:
# python imports:
import time
import json
#from pprint import pprint

#%pylab inline
#figuresize=(18, 4)
from collections import Counter
from math import pi
import numpy as np
import pandas as pd
from pandas import DataFrame
from bokeh.io import output_file,output_notebook, show
from bokeh.layouts import column
from bokeh.palettes import Category20,Category20c,Spectral11
from bokeh.plotting import figure,ColumnDataSource
from bokeh.transform import cumsum
from bokeh.models import Legend,LegendItem,HoverTool,ColumnDataSource
# init bokeh
output_notebook()

# aiida imports:
from aiida import load_profile
profile = load_profile(aiida_profile_name)

# add further imports if needed
from aiida.orm import QueryBuilder as QB
from aiida.orm import QueryBuilder
from aiida.orm import WorkflowNode
from aiida.orm import load_node, Node, Group, Computer,Dict
from aiida.orm import User, CalcJobNode, Code, StructureData, ProcessNode
from aiida.plugins import DataFactory
from aiida.common.constants import elements as PeriodicTableElements

In [None]:
# # project imports prep (for johannes, else comment out)

# # add project module to sys.path
# import sys
# from pathlib import Path

# def add_to_sys_path(path:Path):
#     if str(path) not in sys.path:
#         sys.path.append(str(path))

# # load developer's code: general package
# project_dir = Path("/Users/wasmer/src/aiida-jutools/")
# add_to_sys_path(project_dir)

In [None]:
# project imports:
#import helpers
# if this does not work, do a `pip install -e .` in the aiida-jutools head folder
from aiida_jutools.sisc_lab import helpers


import aiida_jutools.sisc_lab.util.data_visu as DV
import aiida_jutools.sisc_lab.util.serialization as SR
from aiida_jutools.sisc_lab.util.data_visu import AnalyseStructureElements,ShowElements

In [None]:
# init timer for timings:
# init timer for timings:
# One Timer instance is shared by all sections below; each section calls
# timer.start(name) / timer.stop(name), and the last cell calls timer.save().
timer = helpers.Timer(notebook_name=notebook_name, 
                      database_name=database_name,
                      database_size=database_size)
# The description is not a constructor argument here; it is set as an attribute afterwards.
timer.DATABASE_DESCRIPTION = database_description

In [None]:
helpers.print_bold(f"This notebook/dashboard will visualize the contents from the database of profile {profile.name}")

# Database overview:

In [None]:
timing_name = "Database overview"
timer.start(timing_name)

In [None]:
# Query every node (id, ctime, mtime, node_type) together with the e-mail of its user.
# `res` is reused by the following cells, so its row layout
# [id, ctime, mtime, node_type, email] must stay as projected here.
print('Information on nodes in the DB: \n')
now = time.strftime("%c")
print(f'last executed on {now}')

node_query = QB()
node_query.append(Node, tag='node', project=['id', 'ctime', 'mtime', 'node_type'])
node_query.append(User, with_node='node', project='email')

# Measure only the execution of the query itself.
query_start = time.time()
res = node_query.all()
elapsed = time.time() - query_start

totalnodes = len(res)
print(f"Total number of nodes in the database: {totalnodes} (retrieved in {elapsed} s.)")

In [None]:
timer.stop(timing_name)

## User information:

In [None]:
timing_name = "User information"
timer.start(timing_name)

In [None]:
print("Users:")
helpers.print_Count('user',res)

In [None]:
timer.stop(timing_name)

## Node types distribution:

In [None]:
timing_name = "Node types"
timer.start(timing_name)

In [None]:
print("Node types:")
helpers.print_Count('types',res)

In [None]:
# Tally the nodes by their node_type string (index 3 of each row in `res`),
# then draw one pie chart for data nodes and one for process nodes.
types = Counter(row[3] for row in res)

node_count = helpers.get_data_node_count(types, 'data')
p = helpers.draw_pie_chart(node_count, 'Data Nodes:%s')

process_count = helpers.get_process_node_count(types, 'process')
p1 = helpers.draw_pie_chart(process_count, 'Process Nodes:%s')

# Stack both charts vertically in a single output.
charts = column(p, p1)
show(charts)

In [None]:
p = helpers.draw_pie_chart(Counter(helpers.get_dict_link_types()),'Dict Link Types:%s')
show(p)

In [None]:
timer.stop(timing_name)

## Database time evolution:

In [None]:
timing_name = "Database evolution"
timer.start(timing_name)

In [None]:
# Tally the nodes per user e-mail (index 4 of each row in `res`) and
# hand both the tally and the raw rows to the line-plot helper.
users = Counter(row[4] for row in res)
output_notebook()
helpers.draw_line_plot(users, res)

In [None]:
timer.stop(timing_name)

## Codes:

In [None]:
timing_name = "Codes analysis"
timer.start(timing_name)

In [None]:
# For every code, count the CalcJobNodes found among its outgoing links.
codes = Code.objects.all()
result = {
    code.full_label: len(code.get_outgoing(node_class=CalcJobNode).all_nodes())
    for code in codes
}

# Build the table from concrete (label, count) rows instead of raw dict views, so it
# also yields a well-formed empty frame when the database contains no codes.
# The count column header was misspelled 'CalaJobcount' before.
result_df = (
    pd.DataFrame(list(result.items()), columns=['code@computer', 'CalcJobcount'])
    .sort_values(by='CalcJobcount', ascending=False)
    .reset_index(drop=True)
)
result_df

In [None]:
timer.stop(timing_name)

## Groups:

In [None]:
timing_name = "Groups analysis"
timer.start(timing_name)

In [None]:
# Load the group table from its JSON cache; if the cache cannot be read,
# query all groups, write the cache and load it from there.
group_cache = './output/group.json'
try:
    Groups_data = SR.deserialize_from_file(group_cache, Node_type='Group')
except (FileNotFoundError, ValueError):
    group_query = QueryBuilder()
    group_query.append(Group)
    group = group_query.all()

    # TODO: add more columns for this and do also for other nodes
    SR.Serializer(group).to_file(group_cache, Node_type='Group')
    Groups_data = SR.deserialize_from_file(group_cache, Node_type='Group')

In [None]:
# Exploratory cell: look at what a Group entity offers.
# It re-runs the group query and rebinds `group`, which the next cell reads.
qb = QueryBuilder()
qb.append(Group)
group = qb.all()
# NOTE: this expression is not the last one in the cell, so its value is not displayed.
group[0][0].__dict__
# Attribute names of the first group in the result.
# NOTE(review): indexing [0][0] fails if the query returns no groups.
s = dir(group[0][0])
s


In [None]:
group[0][0].count

In [None]:
Groups_data

In [None]:
data = DV.GroupDataHelper(Groups_data)
data.ListGroup(exclude=['export','import'])

In [None]:
timer.stop(timing_name)

## Structure Analysis:

In [None]:
timing_name = "Structures analysis"
timer.start(timing_name)

In [None]:
# Load the structure-formula data from its JSON cache, rebuilding the cache from the
# database when it is missing or unreadable.
filepath = './output/Num_structure.json'
try:
    Newdata = SR.deserialize_from_file(filepath, Node_type='StructureFormula')
except (FileNotFoundError, ValueError):
    # Only a missing/unreadable cache triggers the rebuild (same exceptions as in the
    # groups cell). A bare `except:` would also swallow KeyboardInterrupt and real bugs
    # and silently start the expensive full query.
    qb = QueryBuilder()
    qb.append(StructureData)
    StructDatas = qb.all()

    # Serialize to the cache file, then read it back so that both paths
    # produce `Newdata` through the same deserialization code.
    serializer = SR.Serializer(StructDatas)
    serializer.to_file(filepath, Node_type='StructureFormula')
    Newdata = SR.deserialize_from_file(filepath, Node_type='StructureFormula')


In [None]:
# Load the general structure table from its JSON cache, rebuilding the cache from the
# database when it is missing or unreadable.
filepath = './output/StructDataGeneral.json'
try:
    dataF = SR.deserialize_from_file(filepath, Node_type='StructureGeneral')
except (FileNotFoundError, ValueError):
    # Only a missing/unreadable cache triggers the rebuild (same exceptions as in the
    # groups cell). A bare `except:` would also swallow KeyboardInterrupt and real bugs.
    qb = QueryBuilder()
    qb.append(StructureData)
    StructDatas = qb.all()

    # Serialize to the cache file, then read it back so that both paths
    # produce `dataF` through the same deserialization code.
    serializer = SR.Serializer(StructDatas)
    serializer.to_file(filepath, 'StructureGeneral')
    dataF = SR.deserialize_from_file(filepath, Node_type='StructureGeneral')

In [None]:
dataF.head()

In [None]:
DV.ShowFormula(Newdata)

In [None]:
# Load the per-element structure data from its JSON cache, rebuilding the cache from
# the database when it is missing or unreadable.
filepath = './output/Struct_Element.json'
try:
    x = SR.deserialize_from_file(filepath, 'StructureElement')
except (FileNotFoundError, ValueError):
    # Only a missing/unreadable cache triggers the rebuild (same exceptions as in the
    # groups cell). A bare `except:` would also swallow KeyboardInterrupt and real bugs.
    qb = QueryBuilder()
    qb.append(StructureData)
    StructDatas = qb.all()

    # Serialize to the cache file, then read it back so that both paths
    # produce `x` through the same deserialization code.
    serializer = SR.Serializer(StructDatas)
    serializer.to_file(filepath, 'StructureElement')
    x = SR.deserialize_from_file(filepath, 'StructureElement')

In [None]:
ShowElements(x)

In [None]:
timer.stop(timing_name)

## Processes:

In [None]:
timing_name = "Processes info"
timer.start(timing_name)

In [None]:
# Both process tables follow the same pattern: read the JSON cache, and if it is
# missing or unreadable, query the database, write the cache and read it back.
# Only (FileNotFoundError, ValueError) trigger the rebuild, matching the groups cell;
# a bare `except:` would also swallow KeyboardInterrupt and real bugs.

####### CalcNode
filepath = './output/CalcNode.json'
try:
    calcArray = SR.deserialize_from_file(filepath, Node_type='ProcessNode')
except (FileNotFoundError, ValueError):
    qb = QueryBuilder()
    qb.append(CalcJobNode)
    CalcNode = qb.all()

    serializer = SR.Serializer(CalcNode)
    serializer.to_file(filepath, 'ProcessNode')
    calcArray = SR.deserialize_from_file(filepath, Node_type='ProcessNode')

######## WorkflowNode
filepath2 = './output/WorkflowNode.json'
try:
    WorkflowArray = SR.deserialize_from_file(filepath2, Node_type='ProcessNode')
except (FileNotFoundError, ValueError):
    qb = QueryBuilder()
    qb.append(WorkflowNode)
    WorkflowNodes = qb.all()

    serializer = SR.Serializer(WorkflowNodes)
    serializer.to_file(filepath2, 'ProcessNode')
    WorkflowArray = SR.deserialize_from_file(filepath2, Node_type='ProcessNode')

In [None]:
qb = QueryBuilder()
qb.append(CalcJobNode)
CalcNode = qb.all()
dir(CalcNode[0][0])

In [None]:
calcArray.head()

In [None]:
WorkflowArray.head()

In [None]:
# Convert both process tables with DV.GetWorkflowDict and plot each result.
# NOTE(review): GetWorkflowDict is applied to the CalcJob table as well, despite its
# name; presumably it works on any 'ProcessNode' table - confirm in data_visu.
Newdict1 = DV.GetWorkflowDict(WorkflowArray)
Newdict2 = DV.GetWorkflowDict(calcArray)
DV.ShowWorkflow(Newdict1,'Work Flow Node Information')
DV.ShowWorkflow(Newdict2,'Calculate Job Node Information')

In [None]:
timer.stop(timing_name)

# Data provenance health indicators:

In [None]:
timing_name = "Provenance analysis"
timer.start(timing_name)

In [None]:
# Load the provenance data from its JSON cache, rebuilding the cache from the database
# when it is missing or unreadable.
# NOTE: the rebuild queries every node and will take some time; later runs use the cache.
filepath = './output/provenance.json'
try:
    provenance = SR.deserialize_from_file(filepath, 'Provenance')
except (FileNotFoundError, ValueError):
    # Only a missing/unreadable cache triggers the rebuild (same exceptions as in the
    # groups cell). A bare `except:` would also swallow KeyboardInterrupt, so interrupting
    # a slow load would silently start the even slower full-database query.
    qb = QueryBuilder()
    qb.append(Node)
    Nodes = qb.all()

    # Serialize to the cache file, then read it back so that both paths
    # produce `provenance` through the same deserialization code.
    provenance_serializer = SR.Serializer(Nodes)
    provenance_serializer.to_file(filepath, 'Provenance')
    provenance = SR.deserialize_from_file(filepath, 'Provenance')

In [None]:
#### deserialization from filepath
provenance

In [None]:

No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict = DV.Count_In_Out(provenance)
print(No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict)

In [None]:
DV.Show_In_Out(No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict)
### split and think about bar plot
# reduce complexity

In [None]:
timer.stop(timing_name)

In [None]:
########################
# save timings
timer.save(silent=False)