#### Data curation P-CHEM
##### Notebook that checks for errors and omissions in P-CHEM data terminology

In [None]:
from pynanomapper import aa
import ipywidgets as widgets
import textdistance
from ipywidgets import interact, interactive, fixed, interact_manual, Dropdown
import requests
from importlib import reload 
from pynanomapper import client_solr
from pynanomapper import client_ambit
from pynanomapper import annotation
import pandas as pd
import numpy as np
import json
from itertools import chain
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Endpoint categories offered for each template name.
# Templates mapped to an empty list have no endpoint category listed here.
templates_endpoints = {
    'BET': ['SPECIFIC_SURFACE_AREA_SECTION'],
    'DLS': ['PC_GRANULOMETRY_SECTION', 'ZETA_POTENTIAL_SECTION'],
    'XRD': ['CRYSTALLITE_AND_GRAIN_SIZE_SECTION', 'CRYSTALLINE_PHASE_SECTION'],
    'ZETA_POTENTIAL': ['ZETA_POTENTIAL_SECTION'],
    'TEM': ['PC_GRANULOMETRY_SECTION', 'ASPECT_RATIO_SHAPE_SECTION', 'ENM_0000081_SECTION'],
    'CFS': [],
    'XRF': [],
    'DCFH2_DA': [],
    'SEQUENTIAL_GIT': [],
    'WST': [],
    'CONTACT_ANGLE': [],
    'TGA_MS': [],
    'SEARS_TITRATION': [],
    'IEP': [],
    'ES_DMA': [],
    'EFFECTIVE_DENSITY': [],
    'AUC': [],
    'DUSTINESS_SMALL_DRUM': [],
    'HE_PYCNOMETRY': [],
}

# Template selector, plus a category selector seeded with the categories of
# whichever template the first dropdown selects initially.
templates_name = Dropdown(options=templates_endpoints.keys())
initial_categories = templates_endpoints[templates_name.value]
endp_category = Dropdown(options=initial_categories)

@interact(templates=templates_name, endpoint_category=endp_category)
def print_category(templates, endpoint_category):
    """Keep the endpoint-category dropdown in sync with the chosen template.

    Bound through ``interact`` to the two dropdown widgets. The options of
    ``endp_category`` are replaced with the categories registered for
    ``templates`` in ``templates_endpoints``. ``endpoint_category`` is
    accepted so the widget is rendered, but its value is not used here.
    """
    selected_categories = templates_endpoints[templates]
    endp_category.options = selected_categories

##### Expected terms from the different Template Wizard templates: physchem

In [None]:
# Download the P-CHEM template description (template terms and field terms).
query = "https://search.data.enanomapper.net/api/templates/pchem.json"
response = requests.get(query)
if response.status_code != 200:
    # Fail here with the real cause. Otherwise `data_templ` stays undefined
    # (or stale from a previous run) and a later cell breaks with a
    # misleading error.
    raise RuntimeError(
        f"Something went wrong. Status code {response.status_code}"
    )
data_templ = response.json()
# Bare expression so the notebook renders the downloaded structure.
data_templ

In [None]:
# Group the expected terms by template: `templ` maps each template name to
# the list of its terms, `templates` keeps the template names in order.
templ = {
    name: list(terms)
    for name, terms in data_templ["templates"].items()
}
templates = list(templ)
print(templates)

In [None]:
# Collect the terms that can be skipped: everything listed under "fields",
# grouped by field name in `fields_dict`, with the field names in `fields`.
fields_dict = {
    name: list(terms)
    for name, terms in data_templ["fields"].items()
}
fields = list(fields_dict)

# Flatten all groups into one list and drop underscores.
# NOTE(review): these terms are not upper-cased here, whereas the lists they
# are compared with later are upper-cased - confirm the service returns them
# in upper case.
fields_terms = [
    term.replace('_', '')
    for term in chain.from_iterable(fields_dict.values())
]
fields_terms

#### Aggregated search over eNanoMapper database instances using the Solr API

In [None]:
print('Select enanoMapper aggregated search service:')


def search_service_open(url):
    """Return the selected service URL (callback for services without an API key)."""
    return url


def search_service_protected(url, apikey):
    """Return the selected service URL together with the entered API key."""
    return (url, apikey)


style = {'description_width': 'initial'}
config, config_servers, config_security, auth_object, msg = aa.parseOpenAPI3()
service_widget = widgets.Dropdown(
    options=config_servers['url'],
    description='Service:',
    disabled=False,
    style=style)
if config_security is None:
    # No security scheme reported: only the service URL has to be chosen.
    service = interactive(search_service_open, url=service_widget)
else:
    # A security scheme is reported: also ask for the API key.
    print(msg)
    apikey_widget = widgets.Text(
        placeholder='',
        description=config_security,
        disabled=False,
        style=style
    )
    service = interactive(search_service_protected, url=service_widget, apikey=apikey_widget)

display(service)

In [None]:
# Read the service chosen in the dropdown above.
service_uri = service_widget.value
print(f"Sending queries to {service_uri}")
# Pass the entered API key to the auth object when one is in use.
if auth_object is not None:
    auth_object.setKey(apikey_widget.value)

In [None]:
# Show the template selector followed by the endpoint-category selector.
for selector in (templates_name, endp_category):
    display(selector)

In [None]:
# Re-import client_solr (presumably to pick up local edits to the module).
reload(client_solr)
study = client_solr.StudyDocuments()
# Restrict the search to P-CHEM studies of the selected endpoint category.
# Named `study_filter` so the builtin `filter` is not shadowed.
study_filter = {'topcategory_s': 'P-CHEM', 'endpointcategory_s': endp_category.value}
study.setStudyFilter(study_filter)
# This is important to retrieve the params & conditions fields!
study.getSettings()["fields"] = "*"
print(study.getSettings())
query = study.getQuery(rows=10000)
print(query)
r = client_solr.post(service_uri, query=query, auth=auth_object)

In [None]:
#parse the data
if r.status_code==200:
    study = client_solr.StudyDocuments()
    docs = r.json()['response']['docs']
    rows = study.parse(docs)
    df = study.rows2frame(rows)
    rows=None
    uuids = ['uuid.substance']
    df.sort_values(by=uuids)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    display(df.head(50))
else:
    print(r.status_code)

#### Collect the data terms and normalize them for comparison

In [None]:
# Normalise every 'x.params.*' column name for comparison: the substrings
# below are removed in this order and the result is upper-cased.
# NOTE(review): '_d' is removed wherever it occurs, not only as a trailing
# suffix - confirm that this is intended.
df_removed_tokens = ('x.params.', '.', '_d', ' ', '_')

df_xparams_collect = []
for raw_name in df.columns:
    if not raw_name.startswith('x.params.'):
        continue
    cleaned_name = raw_name
    for token in df_removed_tokens:
        cleaned_name = cleaned_name.replace(token, '')
    df_xparams_collect.append(cleaned_name.upper())

print(df_xparams_collect)

In [None]:
# Take the terms of the chosen template and normalise its 'PARAMS_*' terms:
# the substrings below are removed in this order, then the term is upper-cased.
template = templ[templates_name.value]
template_removed_tokens = ('PARAMS_', '_s', '_')

template_params_collect = []
for raw_term in template:
    if not raw_term.startswith('PARAMS_'):
        continue
    cleaned_term = raw_term
    for token in template_removed_tokens:
        cleaned_term = cleaned_term.replace(token, '')
    template_params_collect.append(cleaned_term.upper())
print(template_params_collect)

#### Fully matching, similar, and potentially erroneous terms
###### https://en.wikipedia.org/wiki/Levenshtein_distance

In [None]:
# Compare every data term with every template term and sort the pairs into
# three buckets by their normalised Levenshtein similarity (2 decimals):
#   == 1          identical terms
#   [0.9, 1)      near-identical, reported as potential errors
#   [0.5, 0.9)    merely similar, stored together with the score
potencial_error_terms = []
similar_terms = []
equal_terms = []

levenshtein_similarity = textdistance.levenshtein.normalized_similarity

for df_par in df_xparams_collect:
    for tpl_par in template_params_collect:
        score = round(levenshtein_similarity(df_par, tpl_par), 2)

        if score == 1:
            equal_terms.append([tpl_par, df_par])
        elif 0.9 <= score < 1:
            potencial_error_terms.append([tpl_par, df_par])
        elif 0.5 <= score < 0.9:
            similar_terms.append([tpl_par, df_par, score])

In [None]:
# One [template term, data term] pair per line.
print(*equal_terms, sep='\n')

In [None]:
# One [template term, data term] pair per line.
print(*potencial_error_terms, sep='\n')

In [None]:
# One [template term, data term, similarity] triple per line.
print(*similar_terms, sep='\n')

#### Check for data completeness

In [None]:
# Skip the terms listed under the template 'fields'.
# Filtering against a set removes every occurrence of a skipped term;
# list.remove() would drop only the first one and let duplicates leak into
# the completeness check.
skip_terms = set(fields_terms)
template_params_collect = [
    term for term in template_params_collect if term not in skip_terms
]
df_xparams_collect = [
    term for term in df_xparams_collect if term not in skip_terms
]

# Remaining terms after skipping: template terms first, then data terms.
print('\n'.join(str(x) for x in template_params_collect))
print()
print('\n'.join(str(x) for x in df_xparams_collect))

In [None]:
# Drop the data terms already reported as potential errors from the
# collected data terms.
for flagged_pair in potencial_error_terms:
    flagged_term = flagged_pair[1]
    if flagged_term not in df_xparams_collect:
        continue
    df_xparams_collect.remove(flagged_term)

In [None]:
# Template terms absent from the data, and data terms absent from the template.
template_term_set = set(template_params_collect)
data_term_set = set(df_xparams_collect)
mandatory_terms = template_term_set - data_term_set
recomended_terms = data_term_set - template_term_set

mandatory_listing = ', '.join(map(str, mandatory_terms))
recomended_listing = ', '.join(map(str, recomended_terms))

print(f"Terms that be mandatory to be added in database: \n{mandatory_listing}\n")
print(f"Terms that be recommended to be added in template parameters: \n{recomended_listing}")