# Statistical bird's-eye view of the contents in an AiiDA database

This is the first of two deliverables for the SiSc-Lab2020 project.

Authors: Miao Wang (a - e), Zhipeng Tan (f - i)

Supervisors: Jens Bröder, Dr. Daniel Wortmann, Johannes Wasmer, Prof. Dr. Stefan Blügel.

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Select the interactive "notebook" matplotlib backend.
# NOTE(review): the plots in this notebook appear to be drawn with bokeh - confirm this is still needed.
%matplotlib notebook

In [2]:
# python imports:
import os
import time
import json
#from pprint import pprint

#%pylab inline
#figuresize=(18, 4)
from collections import Counter
from math import pi
import numpy as np
import pandas as pd
from pandas import DataFrame
from bokeh.io import output_file,output_notebook, show
from bokeh.layouts import column
from bokeh.palettes import Category20,Category20c
from bokeh.plotting import figure,ColumnDataSource
from bokeh.transform import cumsum
from bokeh.models import Legend,LegendItem,HoverTool

# aiida imports:
from aiida import load_profile
# Load an AiiDA profile before any query is built. No name is passed, so
# presumably the configured default profile is used - confirm. The returned
# object is kept because its `.name` is printed in the next cell.
profile = load_profile()

# add further imports here if needed
from aiida.orm import QueryBuilder as QB
from aiida.orm import QueryBuilder
from aiida.orm import WorkflowNode
from aiida.orm import load_node, Node, Group, Computer,Dict
from aiida.orm import User, CalcJobNode, Code, StructureData, ProcessNode
from aiida.plugins import DataFactory
from aiida.common.constants import elements as PeriodicTableElements

# project imports:
#import helpers
# if this does not work, do a `pip install -e .` in the aiida-jutools head folder
from aiida_jutools.sisc_lab import helpers


import aiida_jutools.sisc_lab.util.data_visu as DV
import aiida_jutools.sisc_lab.util.serialization as SR
from aiida_jutools.sisc_lab.util.data_visu import AnalyseStructureElements,ShowElements

# Send bokeh output to the notebook so that `show(...)` renders plots inline.
output_notebook()

In [3]:
# Tell the reader which AiiDA profile (and therefore which database) is analysed.
banner = f"This notebook/dashboard will visualize the contents from the database of profile '{profile.name}'"
helpers.print_bold(banner)

[1mThis notebook/dashboard will visualize the contents from the database of profile 'default'[1m


In [4]:
# Wall-clock duration (in seconds) of each notebook section, appended in run order
# and written to a text file by the last cell.
all_times = list()

# Database overview:

In [5]:
# Start the wall-clock timer for the "Database overview" section.
t1 = time.time()

# Announce the section and stamp the output with the local date and time.
print('Information on nodes in the DB: \n')
now = time.strftime("%c")
print('last executed on {}'.format(now))

# One result row per node: [id, ctime, mtime, node_type, email of the associated user].
q = QB()
q.append(Node, tag='node', project=['id', 'ctime', 'mtime', 'node_type'])
q.append(User, project='email', with_node='node')

# Execute the query and time only the retrieval itself.
t = time.time()
res = q.all()
elapsed = time.time() - t

totalnodes = len(res)
print("Total number of nodes in the database: {} (retrieved in {} s.)".format(totalnodes, elapsed))

# Record the duration of the whole cell.
all_times.append(time.time() - t1)

Information on nodes in the DB: 

last executed on Wed Jan 27 10:19:08 2021
Total number of nodes in the database: 5150 (retrieved in 0.2110753059387207 s.)


## User information:

In [6]:
# Time the user summary.
t1 = time.time()

# Print the user summary computed from the rows fetched by the node query.
print("Users:")
helpers.print_Count('user', res)

all_times.append(time.time() - t1)

Users:
- aiida@localhost created 3959 nodes
- j.broeder@fz-juelich.de created 1191 nodes


## Node types distribution:

In [7]:
# Start the timer for the node-type section. The elapsed time is only appended
# after the pie charts in the following cells.
t1 = time.time()

# Print the node-type summary computed from the rows fetched by the node query.
print("Node types:")
helpers.print_Count('types', res)

Node types:
- data.dict.Dict. created 1715 nodes
- process.calculation.calcfunction.CalcFunctionNode. created 691 nodes
- process.workflow.workchain.WorkChainNode. created 538 nodes
- data.bool.Bool. created 351 nodes
- data.structure.StructureData. created 288 nodes
- process.calculation.calcjob.CalcJobNode. created 261 nodes
- data.remote.RemoteData. created 261 nodes
- data.folder.FolderData. created 246 nodes
- data.float.Float. created 239 nodes
- data.int.Int. created 228 nodes
- data.fleur.fleurinp.FleurinpData. created 213 nodes
- data.array.ArrayData. created 68 nodes
- data.code.Code. created 15 nodes
- data.singlefile.SinglefileData. created 12 nodes
- data.array.xy.XyData. created 6 nodes
- data.array.kpoints.KpointsData. created 5 nodes
- data.str.Str. created 4 nodes
- process.workflow.workfunction.WorkFunctionNode. created 2 nodes
- data.upf.UpfData. created 2 nodes
- data.array.trajectory.TrajectoryData. created 2 nodes
- data.array.bands.BandsData. created 2 nodes
- da

In [8]:
# Tally the node_type strings (fourth projected column of every result row).
types = Counter(row[3] for row in res)

# Data nodes: extract their tally and draw the first pie chart.
node_count = helpers.get_data_node_count(types, 'data')
p = helpers.draw_pie_chart(node_count, 'Data Nodes')

# Process nodes: same procedure for the second pie chart.
process_count = helpers.get_process_node_count(types, 'process')
p1 = helpers.draw_pie_chart(process_count, 'Process Nodes')

# Stack the two charts vertically in one output.
stacked_charts = column(p, p1)
show(stacked_charts)

In [9]:
# Pie chart of the link types reported by the helper for Dict nodes.
link_type_counts = Counter(helpers.get_dict_link_types())
p = helpers.draw_pie_chart(link_type_counts, 'Dict Link Types')
show(p)

# Close the timer that was started in the "Node types" cell.
all_times.append(time.time() - t1)

## Database time evolution:

In [10]:
# Time the line plot over ctime and mtime.
t1 = time.time()

# Tally the user emails (fifth projected column of every result row).
users = Counter(row[4] for row in res)

output_notebook()
helpers.draw_line_plot(users, res)

all_times.append(time.time() - t1)

## Codes:

In [11]:
# Time the code-usage analysis.
t1 = time.time()

# For every Code, count the CalcJobNodes reachable through its outgoing links.
codes = Code.objects.all()
result = {}
for code in codes:
    result[code.full_label] = len(code.get_outgoing(node_class=CalcJobNode).all_nodes())

# Turn the tally into a table, most-used code first, with a fresh 0..n-1 index.
result_df = (
    pd.DataFrame({'code@computer': result.keys(), 'CalaJobcount': result.values()})
    .sort_values(by='CalaJobcount', ascending=False)
    .reset_index(drop=True)
)

all_times.append(time.time() - t1)
result_df


Unnamed: 0,code@computer,CalaJobcount
0,fleur_MaXR4_th1@iffslurm,82
1,inpgen_MaXR4_th1@iffslurm,75
2,fleur_serial_m4@iffslurm_oscar,23
3,inpgen_m4@iffslurm_oscar,21
4,inpgen_MaXR5_th1@iffslurm,17
5,kkrhost_3.5_intel@iffslurm,15
6,fleur_MaXR5_th1@iffslurm,15
7,kkrimp_3.5_intel@iffslurm,6
8,voronoi_3.5_intel@iffslurm,5
9,qe-6.5-pw@localhost (Imported #0),2


## Groups:

In [12]:
# Start the timer for the group section; the elapsed time is appended in the next cell.
t1 = time.time()

# Fetch every Group stored in the database.
qb = QueryBuilder()
qb.append(Group)
group = qb.all()

### add more columns for this and do also for other nodes
serializer = SR.Serializer(group)

# Create the output folder if it is missing. `exist_ok=True` replaces the separate
# isdir check, so an already existing folder is not an error. Later cells write
# their JSON files into this folder as well.
os.makedirs('./output', exist_ok=True)

# Round-trip through JSON: write the group data, then read it back as a table.
serializer.to_file('./output/group.json', Node_type='Group')
x = SR.deserialize_from_file('./output/group.json', Node_type='Group')
x

Unnamed: 0,User,Group_Name,Node,type_string
0,aiida@localhost,20210120-192642,5,core.import
1,j.broeder@fz-juelich.de,delta_structures_gustav,71,core
2,j.broeder@fz-juelich.de,delta_parameters_gutstav_soc,71,core
3,aiida@localhost,20210120-192655,142,core.import
4,aiida@localhost,20210120-192658,6,core.import
5,aiida@localhost,20210120-192700,6,core.import
6,aiida@localhost,iffslurm_options,8,core
7,aiida@localhost,20210120-192702,8,core.import
8,aiida@localhost,20210120-192716,351,core.import
9,aiida@localhost,20210120-194227,351,core.import


In [13]:
# List the groups from the deserialized table, passing 'export' and 'import'
# as the exclusion list.
excluded = ['export', 'import']
data = DV.GroupDataHelper(x)
data.ListGroup(exclude=excluded)

# Close the timer started in the previous (group serialization) cell.
all_times.append(time.time() - t1)

Group names:                                        sizes:
delta_structures_gustav                           |   71
delta_parameters_gutstav_soc                      |   71
iffslurm_options                                  |    8
imp_dos_nodes                                     |    2


## Structure Analysis:

In [14]:
# Start the timer for the structure analysis; the elapsed time is appended after
# the element plot further down.
t1 = time.time()

# Collect every StructureData node in the database.
qb = QueryBuilder()
qb.append(StructureData)
StructDatas = qb.all()

# Write the 'StructureFormula' serialization to JSON. As the last expression of
# the cell, the return value of to_file is echoed as output.
formula_json = './output/Num_structure.json'
serializer = SR.Serializer(StructDatas)
serializer.to_file(formula_json, Node_type='StructureFormula')


Counting 1.number of atoms and 2.number of nodes containing this atom number...
This process will take some time...


0

In [15]:
# Read the 'StructureFormula' data back from disk and plot it.
filepath = './output/Num_structure.json'
Newdata = SR.deserialize_from_file(filepath, Node_type='StructureFormula')
DV.ShowFormula(Newdata)

In [16]:
# Query all StructureData nodes again for the element statistics below.
# `append` returns the builder itself, so the query can be set up in one expression.
qb = QueryBuilder().append(StructureData)
StructDatas = qb.all()

In [17]:
# Write the 'StructureElement' serialization of all structures to JSON. As the
# last expression of the cell, the return value of to_file is echoed as output.
filepath = './output/Struct_Element.json'
serializer = SR.Serializer(StructDatas)
serializer.to_file(filepath, 'StructureElement')

Counting the number of all elements...
This process will take some time...


0

In [18]:
# Load the element data back from JSON and display the resulting table.
filepath = './output/Struct_Element.json'
x = SR.deserialize_from_file(
    filepath,
    'StructureElement',
)
x

Unnamed: 0,P,Tc,Li,Ge,K,Ba,Cl,Ag,I,Sb,...,Na,Sn,Fe,Te,Zn,Mg,Re,Ir,Au,F
0,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
284,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Plot the element data loaded in the previous cell.
# TODO: also offer other sort orders.
ShowElements(x)

# Close the timer started at the top of the structure analysis.
structure_seconds = time.time() - t1
all_times.append(structure_seconds)

## Processes:

In [20]:
# Start the timer for the process section; the elapsed time is appended after the plots.
t1 = time.time()

# --- CalcJobNode: query all of them and serialize as 'ProcessNode' ---
filepath = './output/CalcNode.json'
qb = QueryBuilder()
qb.append(CalcJobNode)
CalcNode = qb.all()

serializer = SR.Serializer(CalcNode)
serializer.to_file(filepath, 'ProcessNode')

# --- WorkflowNode: same procedure, written to a second file ---
filepath2 = './output/WorkflowNode.json'
qb = QueryBuilder()
qb.append(WorkflowNode)
WorkflowNodes = qb.all()

serializer = SR.Serializer(WorkflowNodes)
# Last expression of the cell: the return value of to_file is echoed as output.
serializer.to_file(filepath2, 'ProcessNode')

0

In [31]:
# Pin both paths explicitly, as the other deserialization cells do. `filepath` is
# reassigned by later cells, so re-running this cell out of order would otherwise
# read a different JSON file.
filepath = './output/CalcNode.json'
filepath2 = './output/WorkflowNode.json'

# Load the serialized CalcJobNode and WorkflowNode tables.
calcArray = SR.deserialize_from_file(filepath, Node_type='ProcessNode')
WorkflowArray = SR.deserialize_from_file(filepath2, Node_type='ProcessNode')

# Only the last expression of a cell is echoed automatically, so the first table
# is shown explicitly with the notebook's built-in `display`.
display(calcArray.head())
WorkflowArray.head()

Unnamed: 0,Node_Pk,Process_State,Exit_Message,node_type
0,518,ProcessState.FINISHED,Inpgen calculation failed.,process.workflow.workchain.WorkChainNode.
1,893,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
2,1074,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
3,521,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
4,522,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.


In [22]:
# Load the serialized WorkflowNode table and preview its first rows.
WorkflowArray = SR.deserialize_from_file(
    filepath2,
    Node_type='ProcessNode',
)
WorkflowArray.head()

Unnamed: 0,Node_Pk,Process_State,Exit_Message,node_type
0,518,ProcessState.FINISHED,Inpgen calculation failed.,process.workflow.workchain.WorkChainNode.
1,893,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
2,1074,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
3,521,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
4,522,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.


In [23]:
# TODO: create a table here instead: exit_code | exit message | ntimes | relative (to all processes of same type)

In [24]:
# Build the summary dicts for workflow nodes and for calculation jobs.
Newdict1 = DV.GetWorkflowDict(WorkflowArray)
Newdict2 = DV.GetWorkflowDict(calcArray)

# Plot both summaries, workflows first, each under its own title.
titled_summaries = (
    (Newdict1, 'Work Flow Node Information'),
    (Newdict2, 'Calculate Job Node Information'),
)
for summary, title in titled_summaries:
    DV.ShowWorkflow(summary, title)

# Close the timer started at the top of the process section.
all_times.append(time.time() - t1)

# Data provenance health indicators:

In [25]:
# Start the timer for the provenance section; the elapsed time is appended a few cells below.
t1 = time.time()

# Fetch every node in the database (the slow part of this section).
qb = QueryBuilder()
qb.append(Node)
Nodes = qb.all()

# Write the 'Provenance' serialization to JSON. As the last expression of the
# cell, the return value of to_file is echoed as output.
filepath = './output/provenance.json'
provenance_serializer = SR.Serializer(Nodes)
provenance_serializer.to_file(filepath, 'Provenance')

Begin looking for incoming and outgoing nodes of each node...
The preprocessing is slow because we will dig the incoming and outgoing nodes of each node, please wait for a moment...
Approximate running time for smaller dataset with 5000+ Nodes is about 2 min...
The preprocessing took 66.58062982559204 seconds


0

In [26]:
#### deserialization from filepath
filepath = './output/provenance.json'
provenance = SR.deserialize_from_file(filepath,'Provenance')
provenance

Unnamed: 0,Node_Type,PK,FirstInput,FirstOutput
0,data.dict.Dict.,1,,
1,data.dict.Dict.,3,,
2,data.dict.Dict.,4,,
3,data.dict.Dict.,5,,
4,data.dict.Dict.,6,,
...,...,...,...,...
5145,process.calculation.calcfunction.CalcFunctionN...,4649,"[a1ba78e8-841f-43dc-a99f-31a0001859b9, {'name'...","[a0e7c4d5-e81a-4c5d-8393-4681485074cd, {'name'..."
5146,data.dict.Dict.,4672,,"[df706be5-a904-44b3-b97f-9132e53d57f3, {'name'..."
5147,data.dict.Dict.,4674,"[df706be5-a904-44b3-b97f-9132e53d57f3, {'name'...","[acf88ba1-d9f8-46a3-9458-6b31d31d2b98, {'name'..."
5148,process.calculation.calcfunction.CalcFunctionN...,4673,"[cfcead1e-1f05-40d4-a704-585e8ac8404f, {'name'...","[11099919-c1c8-4a9c-8299-b64cfecc922d, {'name'..."


In [27]:
# Derive the three per-node-type tallies (no incoming, no outgoing, neither)
# from the provenance table and print them on one line.
(
    No_Incoming_Mydict,
    No_Outgoing_Mydict,
    No_InOut_Mydict,
) = DV.Count_In_Out(provenance)
print(No_Incoming_Mydict, No_Outgoing_Mydict, No_InOut_Mydict)

{'data.dict.Dict.': 1296, 'data.structure.StructureData.': 123, 'data.bool.Bool.': 351, 'data.int.Int.': 228, 'data.float.Float.': 84, 'data.code.Code.': 15, 'data.upf.UpfData.': 2, 'data.str.Str.': 4, 'data.list.List.': 1, 'data.array.kpoints.KpointsData.': 3} {'data.dict.Dict.': 223, 'data.structure.StructureData.': 177, 'data.folder.FolderData.': 155, 'data.float.Float.': 115, 'data.remote.RemoteData.': 224, 'data.code.Code.': 5, 'data.array.ArrayData.': 66, 'data.array.bands.BandsData.': 2, 'process.workflow.workchain.WorkChainNode.': 1, 'data.singlefile.SinglefileData.': 4, 'data.fleur.fleurinp.FleurinpData.': 1} {'data.dict.Dict.': 86, 'data.structure.StructureData.': 69, 'data.code.Code.': 5}


In [28]:
# Plot the three tallies computed in the previous cell.
# TODO: split the figure, consider a bar plot, and reduce its complexity.
DV.Show_In_Out(
    No_Incoming_Mydict,
    No_Outgoing_Mydict,
    No_InOut_Mydict,
)



In [29]:
# Close the timer started at the top of the provenance section.
provenance_seconds = time.time() - t1
all_times.append(provenance_seconds)

In [30]:
# Summarise the database for the header of the timings file.
# `node_count` is the data-node tally and `process_count` the process-node tally
# (both built in the pie-chart cell), so they feed ndata and npro respectively.
# The original assignments had the two swapped.
ndata = sum(node_count.values())
npro = sum(process_count.values())
size = 34  # database size in MB, hard-coded; adjust manually for other databases
header = ('# Timings of D1 in seconds\n# Database info: {} nodes; {} processes, {} data, {} MB size \n'
          '# Database overview, user info, node type, database evolution, code analysis, group analysis, structure analysis, process info, provenance analysis\n'
           ''.format(totalnodes, npro, ndata, size))

# One entry per section, each followed by two spaces, in the order of the header's last line.
timestring = ''.join('{}  '.format(times) for times in all_times)

with open('all_times_D1_iffaiida.txt', 'w') as file1:
    file1.write(header)
    file1.write(timestring)