# Statistical bird's-eye view of the contents in an AiiDAdb

This is the first of two deliverables for the SiSc-Lab2020 project.

Authors: Miao Wang (a - e), Zhipeng Tan (f - i)

Supervisors: Jens Bröder, Dr. Daniel Wortmann, Johannes Wasmer, Prof. Dr. Stefan Blügel.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook

In [None]:
# python imports:
import time
import json
#from pprint import pprint

#%pylab inline
#figuresize=(18, 4)
from collections import Counter
from math import pi
import numpy as np
import pandas as pd
from pandas import DataFrame
from bokeh.io import output_file,output_notebook, show
from bokeh.layouts import column
from bokeh.palettes import Category20,Category20c
from bokeh.plotting import figure,ColumnDataSource
from bokeh.transform import cumsum
from bokeh.models import Legend,LegendItem,HoverTool

# aiida imports:
from aiida import load_profile
profile = load_profile()

# add further imports here if needed
from aiida.orm import QueryBuilder as QB
from aiida.orm import QueryBuilder
from aiida.orm import WorkflowNode
from aiida.orm import load_node, Node, Group, Computer,Dict
from aiida.orm import User, CalcJobNode, Code, StructureData, ProcessNode
from aiida.plugins import DataFactory
from aiida.common.constants import elements as PeriodicTableElements

# project imports:
#import helpers
# if this does not work, do a `pip install -e .` in the aiida-jutools head folder
from aiida_jutools.sisc_lab import helpers


import aiida_jutools.sisc_lab.util.data_visu as DV
import aiida_jutools.sisc_lab.util.serialization as SR
from aiida_jutools.sisc_lab.util.data_visu import AnalyseStructureElements,ShowElements

output_notebook()

In [None]:
# Print a banner naming the AiiDA profile (loaded above via load_profile())
# whose database is visualized in the rest of this notebook.
helpers.print_bold(f"This notebook/dashboard will visualize the contents from the database of profile {profile.name}")

In [None]:
# Wall-clock durations in seconds (time.time() differences), one entry appended
# per timed section of the notebook; written to a text file in the last cell.
all_times = []

# Database overview:

In [None]:
# Start of the timed "database overview" section.
t1 = time.time()

# query for all nodes
print('Information on nodes in the DB: \n')
now = time.strftime("%c")
print('last executed on {}'.format(now))
# Project id, ctime, mtime and node_type of every node, plus the e-mail of the
# user joined to that node. Later cells index into the rows of `res` by position.
q = QB()
q.append(Node, project=['id', 'ctime', 'mtime', 'node_type'], tag='node')
q.append(User, with_node='node', project='email')
# Run the query; `elapsed` covers only the retrieval itself.
t = time.time()
res = q.all()
elapsed = time.time() - t
# Number of result rows (presumably one row per node — confirm against the join).
totalnodes = len(res)
print("Total number of nodes in the database: {} (retrieved in {} s.)".format(totalnodes, elapsed))

# Record the duration of this whole cell.
all_times.append(time.time()-t1)

## User information:

In [None]:
# Start of the timed "user info" section.
t1 = time.time()

print("Users:")
# NOTE(review): presumably tallies the rows of `res` by the projected user
# e-mail and prints the counts — confirm in helpers.print_Count.
helpers.print_Count('user',res)

# Record the duration of this cell.
all_times.append(time.time()-t1)

## Node types distribution:

In [None]:
# Start of the timed "node type" section; the matching all_times.append happens
# two cells further down, after the pie charts.
t1 = time.time()

print("Node types:")
# NOTE(review): presumably tallies the rows of `res` by node_type and prints
# the counts — confirm in helpers.print_Count.
helpers.print_Count('types',res)

In [None]:
# Tally the rows by node_type (index 3 of the columns projected in the node
# query), then chart the data-node and the process-node subsets separately.
types = Counter(row[3] for row in res)

node_count = helpers.get_data_node_count(types, 'data')
p = helpers.draw_pie_chart(node_count, 'Data Nodes')

process_count = helpers.get_process_node_count(types, 'process')
p1 = helpers.draw_pie_chart(process_count, 'Process Nodes')

# Stack both charts vertically in a single notebook output.
stacked_charts = column(p, p1)
show(stacked_charts)

In [None]:
# Pie chart built from a Counter over whatever helpers.get_dict_link_types()
# returns. Note that this rebinds `p` from the previous cell.
p = helpers.draw_pie_chart(Counter(helpers.get_dict_link_types()),'Dict Link Types')
show(p)

# Close the timing window opened with t1 at the start of the node-type section.
all_times.append(time.time()-t1)

## Database time evolution:

In [None]:
# Line plot of the database evolution, based on node ctime & mtime.
t1 = time.time()

# Tally the rows by index 4, the last projected column of the node query
# (presumably the user e-mail — confirm the row layout of `res`).
users = Counter(row[4] for row in res)

output_notebook()
helpers.draw_line_plot(users, res)

# Record the duration of this cell.
all_times.append(time.time() - t1)

## Codes:

In [None]:
t1 = time.time()

# For every code in the database, count the CalcJobNodes reached through its
# outgoing links, keyed by the code's full label.
codes = Code.objects.all()
result = {
    code.full_label: len(code.get_outgoing(node_class=CalcJobNode).all_nodes())
    for code in codes
}

# Two-column table, most-used code first, re-indexed from 0.
# NOTE(review): 'CalaJobcount' looks like a typo for 'CalcJobCount'; it is a
# runtime column label and is therefore left unchanged here.
result_df = pd.DataFrame(
    {
        'code@computer': list(result.keys()),
        'CalaJobcount': list(result.values()),
    }
)
result_df = result_df.sort_values(by='CalaJobcount', ascending=False)
result_df = result_df.reset_index(drop=True)

all_times.append(time.time() - t1)
# Bare expression so the notebook renders the table.
result_df


## Groups:

In [None]:
# Start of the timed "group analysis" section (closed in the next cell).
t1 = time.time()

# Fetch every Group in the database.
qb = QueryBuilder()
qb.append(Group)
group = qb.all()

### add more columns for this and do also for other nodes
# Round-trip through a JSON file: write the query result, then read it back.
# NOTE(review): assumes ./output exists or is created by the serializer — confirm.
serializer = SR.Serializer(group)
serializer.to_file('./output/group.json',Node_type='Group')
x = SR.deserialize_from_file('./output/group.json',Node_type='Group')
# Bare expression so the notebook displays the deserialized result.
x

In [None]:
# Wrap the deserialized group data from the previous cell and list it, passing
# 'export' and 'import' as exclusions.
# NOTE(review): what `exclude` matches against is defined in DV.GroupDataHelper — confirm.
data = DV.GroupDataHelper(x)
data.ListGroup(exclude=['export','import'])

# Close the timing window of the group-analysis section.
all_times.append(time.time()-t1)

## Structure Analysis:

In [None]:
# Start of the timed "structure analysis" section; it is closed several cells
# further down, after ShowElements.
t1 = time.time()

################### serialization
# Fetch every StructureData node in the database.
qb = QueryBuilder()
qb.append(StructureData)
StructDatas = qb.all()

#print(dic.keys())

# Write the query result to JSON with the 'StructureFormula' node type.
serializer = SR.Serializer(StructDatas)
serializer.to_file('./output/Num_structure.json' ,Node_type='StructureFormula')


In [None]:
# Read back the file written by the previous cell and hand it to DV.ShowFormula.
filepath = './output/Num_structure.json'
Newdata = SR.deserialize_from_file(filepath,Node_type = 'StructureFormula')

DV.ShowFormula(Newdata)

In [None]:
# Fetch every StructureData node again.
# NOTE(review): this repeats the query from the formula-serialization cell and
# rebinds StructDatas; it looks redundant unless the database changed in between.
qb = QueryBuilder()
qb.append(StructureData)
StructDatas = qb.all()

In [None]:
# Write the StructureData query result to JSON with the 'StructureElement' node
# type (passed positionally here, by keyword `Node_type` in other cells).
serializer = SR.Serializer(StructDatas)
filepath = './output/Struct_Element.json'
serializer.to_file(filepath,'StructureElement')

In [None]:
# Read back the element data written by the previous cell. This rebinds `x`,
# which earlier held the deserialized group data.
filepath = './output/Struct_Element.json'
x = SR.deserialize_from_file(filepath,'StructureElement')
# Bare expression so the notebook displays the deserialized result.
x

In [None]:
# Hand the deserialized element data to ShowElements
# (imported from aiida_jutools.sisc_lab.util.data_visu).
ShowElements(x)
## sort in other ways

# Close the timing window of the structure-analysis section.
all_times.append(time.time()-t1)

## Processes:

In [None]:
# Start of the timed "process info" section (closed after the workflow plots).
t1 = time.time()

####### CalcNode 
# Fetch every CalcJobNode and write the result to JSON as 'ProcessNode'.
qb = QueryBuilder()
qb.append(CalcJobNode)
CalcNode = qb.all()

serializer = SR.Serializer(CalcNode)
filepath = './output/CalcNode.json'
serializer.to_file(filepath,'ProcessNode')

######## WorkflowNode
# Same for every WorkflowNode, written to a second file. `filepath` and
# `filepath2` are read again by the following two cells.
qb = QueryBuilder()
qb.append(WorkflowNode)
WorkflowNodes = qb.all()

serializer = SR.Serializer(WorkflowNodes)
filepath2 = './output/WorkflowNode.json'
serializer.to_file(filepath2,'ProcessNode')

In [None]:
# Read back the CalcJobNode file and preview it with .head().
# NOTE(review): relies on `filepath` still being './output/CalcNode.json' from the
# previous cell; the provenance cells rebind `filepath`, so re-running this cell
# out of order would read a different file.
calcArray = SR.deserialize_from_file(filepath,Node_type = 'ProcessNode')
calcArray.head()

In [None]:
# Read back the WorkflowNode file (path held in `filepath2`) and preview it
# with .head().
WorkflowArray = SR.deserialize_from_file(filepath2,Node_type = 'ProcessNode')
WorkflowArray.head()

In [None]:
# Build one dict per process kind with the same helper and plot each under its
# own title: Newdict1 comes from the workflow nodes, Newdict2 from the calc jobs.
Newdict1 = DV.GetWorkflowDict(WorkflowArray)
Newdict2 = DV.GetWorkflowDict(calcArray)
DV.ShowWorkflow(Newdict1,'Work Flow Node Information')
DV.ShowWorkflow(Newdict2,'Calculate Job Node Information')

# Close the timing window of the process-info section.
all_times.append(time.time()-t1)

# Data provenance health indicators:

In [None]:
# Start of the timed "provenance analysis" section; it is closed in a separate
# cell after the in/out plots.
t1 = time.time()

########## this cell will take some time,but after the preprocessing everything should be fine
# Fetch every node in the database (no projection).
qb = QueryBuilder()
qb.append(Node)
Nodes = qb.all()

#### serialization to filepath
# Note: this rebinds `filepath`, which earlier pointed at the CalcNode file.
provenance_serializer = SR.Serializer(Nodes)
filepath = './output/provenance.json'
provenance_serializer.to_file(filepath,'Provenance')

In [None]:
#### deserialization from filepath
filepath = './output/provenance.json'
provenance = SR.deserialize_from_file(filepath,'Provenance')
provenance

In [None]:
# DV.Count_In_Out returns three values, unpacked in this order.
# NOTE(review): judging by the names these are tallies of nodes without incoming
# links, without outgoing links, and without either — confirm in DV.Count_In_Out.
No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict = DV.Count_In_Out(provenance)
print(No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict)

In [None]:
# Plot the three results computed in the previous cell.
DV.Show_In_Out(No_Incoming_Mydict,No_Outgoing_Mydict,No_InOut_Mydict)
### split and think about bar plot
# reduce complexity



In [None]:
# Close the timing window of the provenance section (t1 was set before the
# provenance query a few cells above).
all_times.append(time.time()-t1)

In [None]:
# Write the per-section timings collected in `all_times` to a text file,
# preceded by a short header that describes the database.
#
# `process_count` is the process-node tally and `node_count` the data-node
# tally (both built in the node-type pie-chart cell), so they feed the
# "processes" and "data" header fields respectively. The original code had
# the two swapped.
npro = sum(process_count.values())
ndata = sum(node_count.values())
# NOTE(review): the database size in MB is hard-coded, not measured; update it
# by hand when running against another database.
size = 34
header = (
    '# Timings of D1 in seconds\n'
    '# Database info: {} nodes; {} processes, {} data, {} MB size \n'
    '# Database overview, user info, node type, database evolution, code analysis, group analysis, structure analysis, process info, provenance analysis\n'
).format(totalnodes, npro, ndata, size)
# One entry per timed section, each followed by two spaces, in the order the
# sections are listed in the header.
timestring = ''.join('{}  '.format(times) for times in all_times)
# Explicit encoding so the output does not depend on the platform default.
with open('all_times_D1_iffaiida.txt', 'w', encoding='utf-8') as file1:
    file1.write(header)
    file1.write(timestring)