# Statistical bird's-eye view of the contents in an AiiDAdb

This is the first of two deliverables for the SiSc-Lab2020 project.

Authors = Miao Wang(a - e), Zhipeng Tan(f - i)

Supervisors: Dr. Jens Bröder, Dr. Daniel Wortmann, Johannes Wasmer, Prof. Dr. Stefan Blügel.

In [1]:
# Notebook configuration: adjust these values before running the notebook.
aiida_profile_name = "wasmer"  # name of the AiiDA profile handed to load_profile()
enable_autoreload = False # disable for timings
timings_filename = "all_times_D1_wasmer.txt"  # file (inside "timings/") that the last cell writes the section timings to

________________

In [2]:
# Optionally let IPython re-import edited modules automatically (switched off for timing runs).
if enable_autoreload:
    %load_ext autoreload
    %autoreload 2
# Select the interactive notebook backend for matplotlib figures.
%matplotlib notebook

In [3]:
# python imports:
import os
import time
import json
#from pprint import pprint

#%pylab inline
#figuresize=(18, 4)
from collections import Counter
from math import pi
import numpy as np
import pandas as pd
from pandas import DataFrame
from bokeh.io import output_file,output_notebook, show
from bokeh.layouts import column
from bokeh.palettes import Category20,Category20c
from bokeh.plotting import figure,ColumnDataSource
from bokeh.transform import cumsum
from bokeh.models import Legend,LegendItem,HoverTool

In [4]:
# aiida imports:
from aiida import load_profile
profile = load_profile(aiida_profile_name)

# add further imports here if needed
from aiida.orm import QueryBuilder as QB
from aiida.orm import QueryBuilder
from aiida.orm import WorkflowNode
from aiida.orm import load_node, Node, Group, Computer,Dict
from aiida.orm import User, CalcJobNode, Code, StructureData, ProcessNode
from aiida.plugins import DataFactory
from aiida.common.constants import elements as PeriodicTableElements

In [5]:
# project imports

# now check if outsourced version produces same result
# add project module to sys.path
import sys
from pathlib import Path

def add_to_sys_path(path:Path):
    """Append ``path`` to ``sys.path`` unless it is already listed there.

    The path is compared and stored in its string form, so calling this
    repeatedly with the same location does not create duplicate entries.
    """
    entry = str(path)
    if entry in sys.path:
        return
    sys.path.append(entry)

# load developer's code: general package
# The checkout location differs per machine, so it can be overridden with the
# AIIDA_JUTOOLS_DIR environment variable; the original developer path stays the fallback.
project_dir = Path(os.environ.get("AIIDA_JUTOOLS_DIR", "/Users/wasmer/src/aiida-jutools/"))
add_to_sys_path(project_dir)

# project imports:
#import helpers
# if this does not work, do a `pip install -e .` in the aiida-jutools head folder
from aiida_jutools.sisc_lab import helpers


import aiida_jutools.sisc_lab.util.data_visu as DV
import aiida_jutools.sisc_lab.util.serialization as SR
from aiida_jutools.sisc_lab.util.data_visu import AnalyseStructureElements,ShowElements

output_notebook()

In [6]:
# Announce in bold which AiiDA profile's database this notebook is going to analyze.
helpers.print_bold(f"This notebook/dashboard will visualize the contents from the database of profile '{profile.name}'")

[1mThis notebook/dashboard will visualize the contents from the database of profile 'wasmer'[1m


In [7]:
# Wall-clock duration (seconds) of each notebook section, appended in execution order;
# the last cell writes this list to the timings file.
all_times = []

# Database overview:

In [8]:
# Start the section timer; the elapsed time is stored in `all_times` at the end of this cell.
t1 = time.time()

print('Information on nodes in the DB: \n')
now = time.strftime("%c")
print('last executed on {}'.format(now))

# Fetch id, creation time, modification time and type of every node,
# together with the e-mail address of the user the node belongs to.
q = QB()
q.append(Node, tag='node', project=['id', 'ctime', 'mtime', 'node_type'])
q.append(User, project='email', with_node='node')

# Run the query and time the database round trip on its own.
query_start = time.time()
res = q.all()
elapsed = time.time() - query_start
totalnodes = len(res)
print("Total number of nodes in the database: {} (retrieved in {} s.)".format(totalnodes, elapsed))

all_times.append(time.time() - t1)

Information on nodes in the DB: 

last executed on Fri Jan 29 10:02:07 2021
Total number of nodes in the database: 6105 (retrieved in 0.15147042274475098 s.)


## User information:

In [9]:
# Start the timer for the "User information" section.
t1 = time.time()

print("Users:")
# Print one line per user with the number of nodes that user created.
helpers.print_Count('user',res)

# Record the duration of this section.
all_times.append(time.time()-t1)

Users:
- johannes.wasmer@gmail.com created 6105 nodes


## Node types distribution:

In [10]:
# Start the timer for the "Node types" section. It is not stopped in this cell:
# the matching all_times.append() follows after the pie charts two cells further down.
t1 = time.time()

print("Node types:")
# Print one line per node type with the number of nodes of that type.
helpers.print_Count('types',res)

Node types:
- data.dict.Dict. created 3299 nodes
- process.calculation.calcfunction.CalcFunctionNode. created 751 nodes
- process.calculation.calcjob.CalcJobNode. created 347 nodes
- data.remote.RemoteData. created 345 nodes
- data.folder.FolderData. created 345 nodes
- data.bool.Bool. created 282 nodes
- process.workflow.workchain.WorkChainNode. created 218 nodes
- data.cif.CifData. created 142 nodes
- data.singlefile.SinglefileData. created 137 nodes
- data.structure.StructureData. created 106 nodes
- data.float.Float. created 91 nodes
- process.workflow.workfunction.WorkFunctionNode. created 37 nodes
- data.code.Code. created 5 nodes


In [11]:
# Split the node-type histogram into data nodes and process nodes and
# draw one pie chart for each of the two families.
# Index 3 of every result row is the projected 'node_type'.
types = Counter(row[3] for row in res)

node_count = helpers.get_data_node_count(types,'data')
p = helpers.draw_pie_chart(node_count,'Data Nodes')

process_count = helpers.get_process_node_count(types,'process')
p1 = helpers.draw_pie_chart(process_count,'Process Nodes')

# Stack both charts vertically in a single output.
charts = column(p,p1)
show(charts)

In [12]:
# Histogram of whatever helpers.get_dict_link_types() yields, shown as a pie chart.
dict_link_types = Counter(helpers.get_dict_link_types())
p = helpers.draw_pie_chart(dict_link_types,'Dict Link Types')
show(p)

# Stops the timer that was started in the "Node types" listing cell.
all_times.append(time.time() - t1)

## Database time evolution:

In [13]:
# line plot by ctime & mtime
t1 = time.time()

# Histogram of node owners; index 4 of every result row is the projected user e-mail.
users = Counter(row[4] for row in res)
output_notebook()
helpers.draw_line_plot(users,res)

all_times.append(time.time() - t1)

## Codes:

In [14]:
t1 = time.time()

# For every code in the database, count the CalcJobNodes found on its outgoing links.
codes = Code.objects.all()
result = {}
for code in codes:
    calcjobs = code.get_outgoing(node_class=CalcJobNode).all_nodes()
    result[code.full_label] = len(calcjobs)

# Table sorted by usage, most used code first.
# NOTE(review): the column label 'CalaJobcount' looks like a typo for 'CalcJobCount';
# it is kept unchanged because it is part of the displayed output.
result_df = (
    pd.DataFrame({'code@computer':result.keys(),'CalaJobcount':result.values()})
    .sort_values(by='CalaJobcount',ascending=False)
    .reset_index(drop=True)
)

all_times.append(time.time() - t1)
result_df


Unnamed: 0,code@computer,CalaJobcount
0,kkrhost@claix18,203
1,voronoi@localhost,76
2,kkrimp@claix18,68
3,kkrimp@localhost,0
4,kkrhost@localhost,0


## Groups:

In [15]:
# Start the timer for the "Groups" section (stopped in the next cell).
t1 = time.time()

# Fetch every group stored in the database.
qb = QueryBuilder()
qb.append(Group)
group = qb.all()

### add more columns for this and do also for other nodes
serializer = SR.Serializer(group)
# Create the output directory if it is missing; exist_ok avoids the
# check-then-create race of a separate isdir() test.
os.makedirs('./output', exist_ok=True)
# Round trip through JSON: write the group table, then read it back for display.
serializer.to_file('./output/group.json',Node_type='Group')
x = SR.deserialize_from_file('./output/group.json',Node_type='Group')
x

Unnamed: 0,User,Group_Name,Node,type_string
0,johannes.wasmer@gmail.com,aiida_kkr_tutorial,0,core
1,johannes.wasmer@gmail.com,aiida_kkr_tutorial/calculations,0,core
2,johannes.wasmer@gmail.com,aiida_kkr_tutorial/calculations/voronoi,0,core
3,johannes.wasmer@gmail.com,aiida_kkr_tutorial/calculations/kkrhost,0,core
4,johannes.wasmer@gmail.com,aiida_kkr_tutorial/calculations/kkrimp,0,core
5,johannes.wasmer@gmail.com,CIFs_from_deltaCodesDFT/primCIFs/structures_v01,72,core
6,johannes.wasmer@gmail.com,CIFs_from_deltaCodesDFT,0,core
7,johannes.wasmer@gmail.com,imp_embeddings_for_ml,0,core
8,johannes.wasmer@gmail.com,aiida_kkr_tutorial/workflows,0,core
9,johannes.wasmer@gmail.com,CIFs_from_deltaCodesDFT/CIFs,71,core


In [16]:
# Wrap the deserialized group table and print the group names next to their sizes.
data = DV.GroupDataHelper(x)
# NOTE(review): `exclude` presumably hides groups whose names contain these words
# (e.g. AiiDA import/export groups) -- confirm against GroupDataHelper.ListGroup.
data.ListGroup(exclude=['export','import'])

# Stops the "Groups" timer started in the previous cell.
all_times.append(time.time()-t1)

Group names:                                        sizes:
aiida_kkr_tutorial                                |    0
aiida_kkr_tutorial/calculations                   |    0
aiida_kkr_tutorial/calculations/voronoi           |    0
aiida_kkr_tutorial/calculations/kkrhost           |    0
aiida_kkr_tutorial/calculations/kkrimp            |    0
CIFs_from_deltaCodesDFT/primCIFs/structures_v01   |   72
CIFs_from_deltaCodesDFT                           |    0
imp_embeddings_for_ml                             |    0
aiida_kkr_tutorial/workflows                      |    0
CIFs_from_deltaCodesDFT/CIFs                      |   71
CIFs_from_deltaCodesDFT/primCIFs                  |   71
imp_embeddings_for_ml/host_scf                    |   38
imp_embeddings_for_ml/host_gf                     |   35
imp_embeddings_for_ml/imp                         |   35


## Structure Analysis:

In [17]:
# Start the timer for the "Structure Analysis" section (stopped after the element plot).
t1 = time.time()

################### serialization
# Query all StructureData nodes.
qb = QueryBuilder()
qb.append(StructureData)
StructDatas = qb.all()

#print(dic.keys())

# Serialize them in 'StructureFormula' mode. According to the helper's own progress
# message this counts the atoms per structure and the number of nodes per atom count.
# The './output' directory is expected to exist already (it is created in the groups cell).
serializer = SR.Serializer(StructDatas)
serializer.to_file('./output/Num_structure.json' ,Node_type='StructureFormula')


Counting 1.number of atoms and 2.number of nodes containing this atom number...
This process will take some time...


0

In [18]:
# Read the serialized formula statistics back and hand them to the visualization helper.
filepath = './output/Num_structure.json'
Newdata = SR.deserialize_from_file(filepath,Node_type = 'StructureFormula')

DV.ShowFormula(Newdata)

In [19]:
# Query all StructureData nodes for the element statistics.
# NOTE(review): this repeats the query already run in the formula-serialization cell;
# the `StructDatas` result from there could be reused instead.
qb = QueryBuilder()
qb.append(StructureData)
StructDatas = qb.all()

In [20]:
# Serialize the structures in 'StructureElement' mode; per the helper's progress
# message this counts the occurrences of all elements.
serializer = SR.Serializer(StructDatas)
filepath = './output/Struct_Element.json'
serializer.to_file(filepath,'StructureElement')

Counting the number of all elements...
This process will take some time...


0

In [21]:
# Load the element table back for display; it has one column per chemical element.
# Note that `x` is reused here and no longer refers to the group table from the groups cell.
filepath = './output/Struct_Element.json'
x = SR.deserialize_from_file(filepath,'StructureElement')
x

Unnamed: 0,Be,Mg,Sc,Ti,V,Fe,Ni,Sr,Cu,Nb,...,S,Se,Ba,Si,Sn,Tc,P,Te,Po,Tl
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
102,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Pass the element table to the visualization helper.
ShowElements(x)
## sort in other ways

# Stops the "Structure Analysis" timer started in the formula-serialization cell.
all_times.append(time.time()-t1)

## Processes:

In [23]:
# Start the timer for the "Processes" section (stopped after the process plots).
t1 = time.time()

####### CalcNode 
# Query all CalcJobNodes and serialize them in 'ProcessNode' mode.
qb = QueryBuilder()
qb.append(CalcJobNode)
CalcNode = qb.all()

serializer = SR.Serializer(CalcNode)
filepath = './output/CalcNode.json'  # read back in the next cell
serializer.to_file(filepath,'ProcessNode')

######## WorkflowNode
# Same procedure for all WorkflowNodes, written to a second file.
qb = QueryBuilder()
qb.append(WorkflowNode)
WorkflowNodes = qb.all()

serializer = SR.Serializer(WorkflowNodes)
filepath2 = './output/WorkflowNode.json'  # read back in the next cell
serializer.to_file(filepath2,'ProcessNode')

0

In [24]:
# Load both serialized process tables back into memory.
calcArray = SR.deserialize_from_file(filepath,Node_type = 'ProcessNode')
WorkflowArray = SR.deserialize_from_file(filepath2,Node_type = 'ProcessNode')

# Only the last expression of a cell is rendered automatically, so the CalcJob
# preview has to be displayed explicitly; otherwise its result is silently discarded.
display(calcArray.head())
WorkflowArray.head()

Unnamed: 0,Node_Pk,Process_State,Exit_Message,node_type
0,17911,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
1,18286,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
2,17763,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
3,18269,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
4,18001,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.


In [25]:
# Preview of the workflow table.
# NOTE(review): this repeats the deserialization and preview already done at the end
# of the previous cell.
WorkflowArray = SR.deserialize_from_file(filepath2,Node_type = 'ProcessNode')
WorkflowArray.head()

Unnamed: 0,Node_Pk,Process_State,Exit_Message,node_type
0,17911,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
1,18286,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
2,17763,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
3,18269,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
4,18001,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.


In [26]:
# Here better create a table: exit_code | exit message | ntimes | relative (to all processes of same type)

In [27]:
# Condense both process tables into dictionaries and plot one chart per table.
Newdict1 = DV.GetWorkflowDict(WorkflowArray)
# NOTE(review): despite its name, GetWorkflowDict is applied to the CalcJob table as well.
Newdict2 = DV.GetWorkflowDict(calcArray)
DV.ShowWorkflow(Newdict1,'Work Flow Node Information')
DV.ShowWorkflow(Newdict2,'Calculate Job Node Information')

# Stops the "Processes" timer started in the process-serialization cell.
all_times.append(time.time()-t1)

# Data provenance health indicators:

In [28]:
# Start the timer for the provenance section; it is stopped in a separate cell after the plot.
t1 = time.time()

########## this cell will take some time,but after the preprocessing everything should be fine
# Query every node in the database.
qb = QueryBuilder()
qb.append(Node)
Nodes = qb.all()

#### serialization to filepath
# 'Provenance' mode: per the helper's progress message it looks up the incoming and
# outgoing nodes of each node, which is why this step is slow.
provenance_serializer = SR.Serializer(Nodes)
filepath = './output/provenance.json'
provenance_serializer.to_file(filepath,'Provenance')

Begin looking for incoming and outgoing nodes of each node...
The preprocessing is slow because we will dig the incoming and outgoing nodes of each node, please wait for a moment...
Approximate running time for smaller dataset with 5000+ Nodes is about 2 min...
The preprocessing took 60.81062316894531 seconds


0

In [29]:
#### deserialization from filepath
# Read back the provenance table written by the previous cell; the displayed columns
# are Node_Type, PK, FirstInput and FirstOutput.
filepath = './output/provenance.json'
provenance = SR.deserialize_from_file(filepath,'Provenance')
provenance

Unnamed: 0,Node_Type,PK,FirstInput,FirstOutput
0,data.code.Code.,50,,"[eddcf30c-0d0f-4aac-93ac-d4bc4dfbd975, {'name'..."
1,data.dict.Dict.,10502,,
2,data.dict.Dict.,12786,"[cd8241a3-b30c-4c5a-b085-3e799ce2b40f, {'name'...",
3,data.dict.Dict.,10503,,
4,data.code.Code.,42,,"[eddcf30c-0d0f-4aac-93ac-d4bc4dfbd975, {'name'..."
...,...,...,...,...
6100,data.float.Float.,20015,,"[dac97232-92ed-4ea2-9b8b-5e7df42de12c, {'name'..."
6101,data.singlefile.SinglefileData.,20017,"[dac97232-92ed-4ea2-9b8b-5e7df42de12c, {'name'...","[a14bd078-8335-4d67-8a66-3b957cb53dd9, {'name'..."
6102,process.calculation.calcfunction.CalcFunctionN...,20016,"[7b38a16c-fe09-477e-bc63-1d08bdf721f3, {'name'...","[2d019387-4d76-4f6f-872b-36961b177564, {'name'..."
6103,process.workflow.workchain.WorkChainNode.,20013,"[5df5439b-fbe1-443e-8d69-31779ca3adac, {'name'...","[4c4f373f-ba6f-46d3-8529-3cc355a93f7c, {'name'..."


In [30]:
# Summarize the provenance table into three {node_type: count} dictionaries and print them.
# NOTE(review): judging by the variable names these count nodes without incoming links,
# without outgoing links, and without either -- confirm in DV.Count_In_Out.
No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict = DV.Count_In_Out(provenance)
print(No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict)

{'data.code.Code.': 5, 'data.dict.Dict.': 2410, 'data.bool.Bool.': 282, 'data.cif.CifData.': 142, 'data.float.Float.': 91} {'data.dict.Dict.': 1603, 'data.code.Code.': 2, 'data.folder.FolderData.': 278, 'data.cif.CifData.': 71, 'data.float.Float.': 56, 'data.singlefile.SinglefileData.': 35, 'data.structure.StructureData.': 33, 'data.bool.Bool.': 79, 'data.remote.RemoteData.': 5, 'process.calculation.calcjob.CalcJobNode.': 2} {'data.dict.Dict.': 1332, 'data.code.Code.': 2, 'data.cif.CifData.': 71, 'data.float.Float.': 56, 'data.bool.Bool.': 79}


In [31]:
# Visualize the three count dictionaries computed in the previous cell.
DV.Show_In_Out(No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict)
### split and think about bar plot
# reduce complexity



In [32]:
# Stops the provenance timer started in the provenance-serialization cell.
all_times.append(time.time()-t1)

In [33]:
# Totals for the header line. `node_count` holds the data-node counts and
# `process_count` the process-node counts (both built in the pie-chart cell),
# so they must feed `ndata` and `npro` respectively -- they used to be swapped.
ndata = sum(node_count.values())
npro  = sum(process_count.values())
size = 34  # database size in MB; hard-coded, has to be updated by hand for another database
header = ('# Timings of D1 in seconds\n# Database info: {} nodes; {} processes, {} data, {} MB size \n'
          '# Database overview, user info, node type, database evolution, code analysis, group analysis, structure analysis, process info, provenance analysis\n'
           ''.format(totalnodes, npro, ndata, size))
# One entry per timed section, each followed by two spaces (same layout as before).
timestring = ''.join('{}  '.format(times) for times in all_times)
# open(..., 'w') does not create missing directories, so make sure the target exists.
os.makedirs("timings", exist_ok=True)
with open("timings/" + timings_filename, 'w') as file1:
    file1.write(header)
    file1.write(timestring)