# Statistical bird's-eye view of the contents of an AiiDA database

This is the first of two deliverables for the SiSc-Lab2020 project.

Authors: Miao Wang (a - e), Zhipeng Tan (f - i)

Supervisors: Jens Bröder, Dr. Daniel Wortmann, Johannes Wasmer, Prof. Dr. Stefan Blügel.

In [1]:
# Auto-reload imported modules before each execution (autoreload mode 2), so edits to the
# project helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Select matplotlib's "notebook" backend for this session.
%matplotlib notebook

In [2]:
# python imports:
import time
import json
#from pprint import pprint

#%pylab inline
#figuresize=(18, 4)
from collections import Counter
from math import pi
import numpy as np
import pandas as pd
from pandas import DataFrame
from bokeh.io import output_file,output_notebook, show
from bokeh.layouts import column
from bokeh.palettes import Category20,Category20c
from bokeh.plotting import figure,ColumnDataSource
from bokeh.transform import cumsum
from bokeh.models import Legend,LegendItem,HoverTool

# aiida imports:
from aiida import load_profile
# Load an AiiDA profile (no name is passed, so presumably the configured default - confirm).
# `profile.name` is printed in the banner cell below.
profile = load_profile()

# add further imports here if needed
from aiida.orm import QueryBuilder as QB
from aiida.orm import QueryBuilder
from aiida.orm import WorkflowNode
from aiida.orm import load_node, Node, Group, Computer,Dict
from aiida.orm import User, CalcJobNode, Code, StructureData, ProcessNode
from aiida.plugins import DataFactory
from aiida.common.constants import elements as PeriodicTableElements

# project imports:
#import helpers
# if this does not work, do a `pip install -e .` in the aiida-jutools head folder
from aiida_jutools.sisc_lab import helpers


import aiida_jutools.sisc_lab.util.data_visu as DV
import aiida_jutools.sisc_lab.util.serialization as SR
from aiida_jutools.sisc_lab.util.data_visu import AnalyseStructureElements,ShowElements

# Direct bokeh output to the notebook so that show() renders the plots inline.
output_notebook()

In [3]:
# Tell the reader which AiiDA profile (and therefore which database) is analysed below.
helpers.print_bold(
    "This notebook/dashboard will visualize the contents from the database of profile "
    f"{profile.name}"
)

[1mThis notebook/dashboard will visualize the contents from the database of profile aiida-test3[1m


In [4]:
# Durations in seconds of the notebook sections; each section appends `time.time() - t1`
# and the last cell writes the collected values to a text file.
all_times = []

# Database overview:

In [5]:
# Section timer: the duration of this whole cell is appended to all_times at the end.
t1 = time.time()

print('Information on nodes in the DB: \n')
print(f'last executed on {time.strftime("%c")}')

# One result row per node: [id, ctime, mtime, node_type, e-mail of the associated user].
node_query = QB()
node_query.append(Node, tag='node', project=['id', 'ctime', 'mtime', 'node_type'])
node_query.append(User, project='email', with_node='node')

# Time the query execution on its own, separately from the section timer.
query_start = time.time()
res = node_query.all()
elapsed = time.time() - query_start
totalnodes = len(res)
print(f"Total number of nodes in the database: {totalnodes} (retrieved in {elapsed} s.)")

all_times.append(time.time() - t1)

Information on nodes in the DB: 

last executed on Wed Jan 27 20:24:57 2021
Total number of nodes in the database: 98411 (retrieved in 2.200373888015747 s.)


## User information:

In [6]:
# Section timer for the user statistics.
t1 = time.time()

print("Users:")
# Print the number of nodes per user; `res` is the node/user query result of the overview cell.
helpers.print_Count('user',res)

all_times.append(time.time()-t1)

Users:
- j.broeder@fz-juelich.de created 97678 nodes
- tests@aiida.mail created 733 nodes


## Node types distribution:

In [7]:
# Section timer for the node-type statistics. It is not stopped in this cell:
# the matching all_times.append(...) follows the "Dict Link Types" pie chart two cells below.
t1 = time.time()

print("Node types:")
# Print the number of nodes per node type found in `res`.
helpers.print_Count('types',res)

Node types:
- data.structure.StructureData. created 33129 nodes
- process.calculation.calcjob.CalcJobNode. created 10887 nodes
- data.remote.RemoteData. created 10875 nodes
- data.folder.FolderData. created 10822 nodes
- data.dict.Dict. created 10631 nodes
- data.fleur.fleurinp.FleurinpData. created 9844 nodes
- process.calculation.calcfunction.CalcFunctionNode. created 3679 nodes
- process.workflow.workchain.WorkChainNode. created 3651 nodes
- data.bool.Bool. created 2321 nodes
- data.int.Int. created 1479 nodes
- data.float.Float. created 813 nodes
- data.array.ArrayData. created 238 nodes
- data.code.Code. created 37 nodes
- data.list.List. created 5 nodes


In [8]:
# Tally the node_type column (index 3 of every query row).
types = Counter(row[3] for row in res)

# Data nodes and process nodes each get their own pie chart.
node_count = helpers.get_data_node_count(types, 'data')
p = helpers.draw_pie_chart(node_count, 'Data Nodes')

process_count = helpers.get_process_node_count(types, 'process')
p1 = helpers.draw_pie_chart(process_count, 'Process Nodes')

# Stack the two charts vertically in a single output.
stacked_charts = column(p, p1)
show(stacked_charts)

In [9]:
# Pie chart of the link types returned by helpers.get_dict_link_types().
dict_link_types = Counter(helpers.get_dict_link_types())
p = helpers.draw_pie_chart(dict_link_types, 'Dict Link Types')
show(p)

# Stops the section timer that was started in the "Node types" cell.
all_times.append(time.time() - t1)

## Database time evolution:

In [10]:
# Database time evolution: line plot by ctime & mtime.
t1 = time.time()

# Number of nodes per user, keyed by the e-mail in column 4 of every query row.
users = Counter(row[4] for row in res)
output_notebook()
helpers.draw_line_plot(users, res)

all_times.append(time.time() - t1)

## Codes:

In [11]:
# Section timer for the code analysis.
t1 = time.time()

# For every code, count the CalcJobNodes reachable through its outgoing links.
codes = Code.objects.all()
result = {
    code.full_label: len(code.get_outgoing(node_class=CalcJobNode).all_nodes())
    for code in codes
}

# One row per code, most-used code first. The count column was previously
# mislabelled 'CalaJobcount'; it is now spelled 'CalcJobCount'.
result_df = (
    pd.DataFrame({'code@computer': list(result.keys()),
                  'CalcJobCount': list(result.values())})
    .sort_values(by='CalcJobCount', ascending=False)
    .reset_index(drop=True)
)

all_times.append(time.time()-t1)
result_df


Unnamed: 0,code@computer,CalaJobcount
0,inpgen2@local_iff,6517
1,inpgen@local_iff,2696
2,fleur_serial_m4@iffslurm_oscar,873
3,inpgen_m4@iffslurm_oscar,603
4,fleur_serial@local_iff,112
5,inpgen_MaXR5_th1@iffslurm,10
6,fleur@localhost-test (Imported #3),10
7,inpgen@localhost-test (Imported #3),10
8,fleur@localhost-test (Imported #4),7
9,inpgen@localhost-test (Imported #4),7


## Groups:

In [12]:
# Section timer for the group analysis; stopped at the end of the next cell.
t1 = time.time()

# Fetch all groups stored in the database.
qb = QueryBuilder()
qb.append(Group)
group = qb.all()

# Write the groups to a JSON file and read them straight back as the table shown below.
# TODO: add more columns for this and do the same for other nodes.
group_json = './output/group.json'
serializer = SR.Serializer(group)
serializer.to_file(group_json, Node_type='Group')
x = SR.deserialize_from_file(group_json, Node_type='Group')
x

Unnamed: 0,User,Group_Name,Node,type_string
0,j.broeder@fz-juelich.de,20200520-130520,44,core.import
1,j.broeder@fz-juelich.de,20200520-130929,21,core.import
2,j.broeder@fz-juelich.de,20200520-130940,20,core.import
3,j.broeder@fz-juelich.de,20200520-131009,19,core.import
4,j.broeder@fz-juelich.de,20200520-131155,193,core.import
5,j.broeder@fz-juelich.de,20200520-131156,149,core.import
6,j.broeder@fz-juelich.de,20200520-131156_1,52,core.import
7,j.broeder@fz-juelich.de,20200520-131156_2,21,core.import
8,j.broeder@fz-juelich.de,20200520-131157,21,core.import
9,j.broeder@fz-juelich.de,20200520-131157_1,26,core.import


In [13]:
# Wrap the deserialized group table `x` for listing.
data = DV.GroupDataHelper(x)
# List group names with their sizes; 'export' and 'import' are passed as exclusion terms
# (presumably matched against the group type string - confirm in GroupDataHelper.ListGroup).
data.ListGroup(exclude=['export','import'])

# Stop the "Groups" section timer started in the previous cell.
all_times.append(time.time()-t1)

Group names:                                        sizes:
Element_structures_from_ICSD                      | 1271
Binary_structures_from_ICSD                       |30448
delta_structures_gustav                           |   71
delta_parameters_gutstav_soc                      |   71
Full Database                                     |81470


## Structure Analysis:

In [14]:
# Section timer for the whole structure analysis; it is stopped after ShowElements(x) further down.
t1 = time.time()

# Serialization: query all StructureData nodes from the database.
qb = QueryBuilder()
qb.append(StructureData)
StructDatas = qb.all()

# Write the 'StructureFormula' view of the structures to JSON; the next cell reads the file back.
# to_file(...) is the last expression of the cell, so its return value is shown as the cell output.
serializer = SR.Serializer(StructDatas)
serializer.to_file('./output/Num_structure.json' ,Node_type='StructureFormula')


Counting 1.number of atoms and 2.number of nodes containing this atom number...
This process will take some time...


0

In [15]:
# Read the 'StructureFormula' data written by the previous cell back from disk ...
filepath = './output/Num_structure.json'
Newdata = SR.deserialize_from_file(filepath,Node_type = 'StructureFormula')

# ... and hand it to DV.ShowFormula for display.
DV.ShowFormula(Newdata)



In [16]:
# Query all StructureData nodes for the element analysis in the following cells.
# NOTE(review): this repeats the query of the formula-serialization cell above unchanged.
qb = QueryBuilder()
qb.append(StructureData)
StructDatas = qb.all()

In [17]:
# Write the 'StructureElement' view of the structures to JSON; the next cell reads the file back.
# to_file(...) is the last expression of the cell, so its return value is shown as the cell output.
serializer = SR.Serializer(StructDatas)
filepath = './output/Struct_Element.json'
serializer.to_file(filepath,'StructureElement')

Counting the number of all elements...
This process will take some time...


0

In [18]:
# Load the element table back from disk and display it.
# NOTE: this rebinds `x`, which held the group table in the "Groups" section.
filepath = './output/Struct_Element.json'
x = SR.deserialize_from_file(filepath,'StructureElement')
x

Unnamed: 0,Si,Pt,Fe,Ga,Ti,B,Tc,Cf,S,K,...,Os,Np,Ar,Xe,Kr,Ra,Es,Pm,He,Rn
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33124,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33125,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33126,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Display the element table loaded in the previous cell.
ShowElements(x)
# TODO: offer other sort orders

# Stop the "Structure Analysis" section timer.
all_times.append(time.time()-t1)

## Processes:

In [20]:
# Section timer for the process analysis; stopped after the ShowWorkflow plots below.
t1 = time.time()

# Target files; the two preview cells that follow read them back.
filepath = './output/CalcNode.json'
filepath2 = './output/WorkflowNode.json'

# --- CalcJobNode ---
qb = QueryBuilder()
qb.append(CalcJobNode)
CalcNode = qb.all()
serializer = SR.Serializer(CalcNode)
serializer.to_file(filepath, 'ProcessNode')

# --- WorkflowNode ---
qb = QueryBuilder()
qb.append(WorkflowNode)
WorkflowNodes = qb.all()
serializer = SR.Serializer(WorkflowNodes)
# Last expression of the cell: its return value is shown as the cell output.
serializer.to_file(filepath2, 'ProcessNode')

0

In [21]:
# Re-declare the path instead of relying on the global `filepath`: that name is rebound to
# other JSON files by the structure and provenance cells, so re-running this cell out of
# order would otherwise load the wrong file.
filepath = './output/CalcNode.json'
calcArray = SR.deserialize_from_file(filepath, Node_type='ProcessNode')
# Preview the first rows only.
calcArray.head()

Unnamed: 0,Node_Pk,Process_State,Exit_Message,node_type
0,55,ProcessState.FINISHED,,process.calculation.calcjob.CalcJobNode.
1,57,ProcessState.FINISHED,,process.calculation.calcjob.CalcJobNode.
2,62,ProcessState.FINISHED,,process.calculation.calcjob.CalcJobNode.
3,67,ProcessState.FINISHED,,process.calculation.calcjob.CalcJobNode.
4,14,ProcessState.FINISHED,,process.calculation.calcjob.CalcJobNode.


In [22]:
# Load the serialized workflow nodes (`filepath2` is set in the process serialization cell above)
# and preview the first rows only.
WorkflowArray = SR.deserialize_from_file(filepath2,Node_type = 'ProcessNode')
WorkflowArray.head()

Unnamed: 0,Node_Pk,Process_State,Exit_Message,node_type
0,56,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
1,60,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
2,63,ProcessState.FINISHED,,process.workflow.workchain.WorkChainNode.
3,64,ProcessState.FINISHED,Force theorem calculation failed.,process.workflow.workchain.WorkChainNode.
4,65,ProcessState.EXCEPTED,,process.workflow.workchain.WorkChainNode.


In [23]:
# Summarise the workflow table and the calc job table with the same helper ...
Newdict1 = DV.GetWorkflowDict(WorkflowArray)
Newdict2 = DV.GetWorkflowDict(calcArray)

# ... and show one plot per summary, workflows first.
for summary, title in (
    (Newdict1, 'Work Flow Node Information'),
    (Newdict2, 'Calculate Job Node Information'),
):
    DV.ShowWorkflow(summary, title)

# Stop the "Processes" section timer.
all_times.append(time.time() - t1)

# Data provenance health indicators:

In [24]:
# Section timer for the provenance analysis; stopped in the cell after the Show_In_Out plot.
t1 = time.time()

# This cell takes some time (the recorded run reports about 1171 s of preprocessing),
# but the cells after it only read the result from disk.
qb = QueryBuilder()
qb.append(Node)
Nodes = qb.all()

# Serialization to filepath. to_file(...) is the last expression of the cell,
# so its return value is shown as the cell output.
provenance_serializer = SR.Serializer(Nodes)
filepath = './output/provenance.json'
provenance_serializer.to_file(filepath,'Provenance')

Begin looking for incoming and outgoing nodes of each node...
The preprocessing is slow because we will dig the incoming and outgoing nodes of each node, please wait for a moment...
Approximate running time for smaller dataset with 5000+ Nodes is about 2 min...
The preprocessing took 1171.036108493805 seconds


0

In [25]:
#### deserialization from filepath
filepath = './output/provenance.json'
provenance = SR.deserialize_from_file(filepath,'Provenance')
provenance

Unnamed: 0,Node_Type,PK,FirstInput,FirstOutput
0,data.fleur.fleurinp.FleurinpData.,35,"[f1a4faa4-0167-42e1-884a-7a0b6509e631, {'name'...","[b7a2515b-8e97-4eaf-9e72-043b4df60f65, {'name'..."
1,data.dict.Dict.,36,,"[b7a2515b-8e97-4eaf-9e72-043b4df60f65, {'name'..."
2,data.fleur.fleurinp.FleurinpData.,37,"[b7a2515b-8e97-4eaf-9e72-043b4df60f65, {'name'...","[2267bb3c-48d3-4698-8e67-e16dd4de849e, {'name'..."
3,data.code.Code.,1,,"[58a7abc5-dd60-4d0b-89fd-989e324d4664, {'name'..."
4,data.fleur.fleurinp.FleurinpData.,169,"[7eaadf31-43aa-4ee7-b2fe-1767a703af46, {'name'...","[9f7d8059-a372-424f-9aad-9feef32f26e2, {'name'..."
...,...,...,...,...
98406,data.dict.Dict.,97241,,"[630e9f76-1059-4e20-abc6-409645f36974, {'name'..."
98407,data.dict.Dict.,97243,"[3171a436-5b6b-42c9-a4dc-aab0777ed4b2, {'name'...",
98408,process.calculation.calcfunction.CalcFunctionN...,97242,"[56839a08-21e5-4da3-bc17-39c87a430f9f, {'name'...","[8688598e-299e-4449-a8b7-72ce26d6f307, {'name'..."
98409,process.workflow.workchain.WorkChainNode.,97182,"[19b33c5e-2ae7-4b19-b5a6-f64f9303c4c4, {'name'...","[cab884c2-47d0-41dc-8076-ad292f551665, {'name'..."


In [26]:
# Three dicts keyed by node type. Judging by the names they count nodes without incoming links,
# without outgoing links, and without either - confirm in DV.Count_In_Out.
No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict = DV.Count_In_Out(provenance)
# Print the raw counts for reference; the next cell visualises them.
print(No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict)

{'data.dict.Dict.': 7609, 'data.code.Code.': 37, 'data.structure.StructureData.': 32355, 'data.int.Int.': 1479, 'data.bool.Bool.': 2321, 'data.fleur.fleurinp.FleurinpData.': 193, 'data.list.List.': 5, 'data.float.Float.': 434} {'data.folder.FolderData.': 10156, 'data.remote.RemoteData.': 10545, 'data.dict.Dict.': 1557, 'data.fleur.fleurinp.FleurinpData.': 8094, 'data.structure.StructureData.': 27313, 'data.float.Float.': 339, 'data.array.ArrayData.': 238, 'process.calculation.calcjob.CalcJobNode.': 12, 'process.calculation.calcfunction.CalcFunctionNode.': 3, 'process.workflow.workchain.WorkChainNode.': 5} {'data.structure.StructureData.': 26793, 'data.dict.Dict.': 72}


In [27]:
# Visualise the three count dicts computed in the previous cell.
DV.Show_In_Out(No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict)
# TODO: split the figure and consider a bar plot instead
# TODO: reduce complexity



In [28]:
# Stop the provenance section timer started in the provenance serialization cell.
all_times.append(time.time()-t1)

In [29]:
# Totals for the header line. `node_count` holds the data-node counts and `process_count`
# the process-node counts (both built in the pie-chart cell). The two assignments used to be
# swapped, so the header reported the data total as "processes" and vice versa.
ndata = sum(node_count.values())
npro = sum(process_count.values())

# Database size in MB. Hard-coded - NOTE(review): presumably measured by hand for the
# database this was run against; update it when profiling another database.
size = 34

# The three adjacent literals form one string, so .format() fills all four placeholders.
header = ('# Timings of D1 in seconds\n# Database info: {} nodes; {} processes, {} data, {} MB size \n'
          '# Database overview, user info, node type, database evolution, code analysis, group analysis, structure analysis, process info, provenance analysis\n'
          ''.format(totalnodes, npro, ndata, size))

# One value per section, each followed by two spaces, in the order the sections were run.
timestring = ''.join('{}  '.format(section_time) for section_time in all_times)

with open('all_times_D1_iffaiida.txt', 'w') as file1:
    file1.write(header)
    file1.write(timestring)