# Import toolboxes used by the script and its functions

In [1]:
# Set display to width of screen
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Import toolboxes
import os
import numpy as np
import pandas as pd
import re
from SPARQLWrapper import SPARQLWrapper, JSON
from unidecode import unidecode
import urllib
import json
from pandas.io.json import json_normalize
import datetime

# Set list to capture summary information for reporting at then end of the run
summary = []

# Set time run started
start = datetime.datetime.now()

# Add start time and holding space for end time to summary information
summary.append(["Processing started:" , (start.strftime('%Y-%m-%d %H:%M:%S'))]) 
summary.append(["Processing finished:" , ""])
summary.append(["" , ""])

# Add filepaths for input file here

In [2]:
inputfile = os.path.join(os.getcwd(),os.path.normpath('ICES2P01_test_dset.xlsx'))

# Add filepaths for mapping file here

In [3]:
mapfile = os.path.join(os.getcwd(),'mappings','unmapped_substances.xlsx')
biotamap = os.path.join(os.getcwd(),'mappings','biota_synonym_mapping.xlsx')
p02_file = os.path.join(os.getcwd(),'mappings','ICES2P02_mapping.txt')

# Generate output file name

In [4]:
fileout = os.path.join(os.getcwd(),os.path.normpath(inputfile[:-5]+'_mapped.xlsx'))

# Define functions used by the script

In [5]:
def xstr(s):
    """Function to remove non-ASCII characters from NVS results"""
    if s is None:
        return ''
    return str(unidecode(s))

def sparql_nvs_json(s):
    """Function to input a SPARQL query (s) into the NVS SPARQL endpoint"""
    sparql = SPARQLWrapper("http://vocab.nerc.ac.uk/sparql/sparql")
    sparql.setQuery(s)
    sparql.setReturnFormat(JSON)
    r = sparql.query().convert()
    return r

def S27_map():
    """Function to get NVS:S27 chemical substrances which have CAS numbers as a published mapping"""
    s =  """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>
                
            select ?codval ?prefLabel ?casurl
            where {
            <http://vocab.nerc.ac.uk/collection/S27/current/> skos:member ?url .
            ?url skos:notation ?codval .
            ?url skos:prefLabel ?prefLabel .
            ?url owl:deprecated 'false' .
            ?url owl:sameAs ?casurl .
            FILTER(regex(str(?casurl), "https://chem.nlm.nih.gov/chemidplus/rn/", "i"))
            }"""                
    r = sparql_nvs_json(s)    
    list = []
    for i in range(0,len(r['results']['bindings'])):
        a = xstr(r['results']['bindings'][i]['codval']['value'].replace('SDN:S27::',''))
        b = xstr(r['results']['bindings'][i]['prefLabel']['value'])
        c = xstr(r['results']['bindings'][i]['casurl']['value'].replace('http://chem.sis.nlm.nih.gov/chemidplus/rn/',''))
        list.append([a,b,c])
    return list

def taxon_map(spcs,aphia):
    """Function to get NVS:S25 TAXON from an AphiaID and species"""
    s =  """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>
                
            select ?prefLabel
            where {
                    <http://vocab.nerc.ac.uk/collection/S25/current/> skos:member ?url .
                    ?url skos:notation ?codval .
                    ?url skos:prefLabel ?prefLabel .
                    ?url owl:deprecated 'false' .
                    FILTER(CONTAINS(?prefLabel,'%s')).
                    FILTER(CONTAINS(?prefLabel,'WoRMS %s)')).
                }""" % (spcs, aphia)
    r = sparql_nvs_json(s)
    list = []
    if len(r['results']['bindings']) == 0:
        a = 'Not available'
    else:
        label_list = []
        for i in range(0,len(r['results']['bindings'])):
            h = re.sub('\s\[.*?\]' ,'',xstr(r['results']['bindings'][i]['prefLabel']['value']))
            if h not in label_list:
                label_list.append(h)
                a = h
    list.append([a, aphia])
    return list

def S25_lookup(spcs,aphia,label):
    """Function to get NVS:S25 codval from the generated preflabel, if it exists"""
    s =  """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>
                
            select ?codval ?prefLabel
            where {
            <http://vocab.nerc.ac.uk/collection/S25/current/> skos:member ?url .
            ?url skos:notation ?codval .
            ?url skos:prefLabel ?prefLabel .
            ?url owl:deprecated 'false' .
            FILTER(CONTAINS(?prefLabel,'%s (')).
            FILTER(CONTAINS(?prefLabel,'WoRMS %s')).
            FILTER(STRENDS(?prefLabel,'%s')).
            }""" % (spcs, aphia, label)
    r = sparql_nvs_json(s)
    list = []
    if len(r['results']['bindings']) == 0:
        a = 'No S25 term. Needs adding to NVS'
        b = "%s (ITIS: ?????: WoRMS %s) %s" % (spcs, aphia, label)
    else:
        for i in range(0,len(r['results']['bindings'])):
            a = xstr(r['results']['bindings'][i]['codval']['value'].replace('SDN:S25::',''))
            b = xstr(r['results']['bindings'][i]['prefLabel']['value'])
    list.append([a,b])
    return list

## Load P01 terms and semantic model vocabularies from the NERC Vocabulary Server

In [6]:
#%% Get the latest semantic model vocabulary contents from the NVS Sparql endpoint

a1 = "http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0A++++PREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A++++%0D%0A++++select+%3F"
a2 = "+%3F"
a3 = "%0D%0A++++where+%7B%0D%0A++++%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2F"
a4 = "%2Fcurrent%2F%3E+skos%3Amember+%3Furl+.%0D%0A++++%3Furl+skos%3AprefLabel+%3F"
a5 = "+.%0D%0A++++%3Furl+skos%3Anotation+%3Fc+.%0D%0A++++%3Furl+owl%3Adeprecated+%27false%27+.%0D%0A++++BIND%28replace%28str%28%3Fc%29%2C%27SDN%3A"
a6 = "%3A%3A%27%2C%27%27%2C%27i%27%29+AS+%3F"
a7 = "%29%0D%0A++++%7D&output=csv&stylesheet="

S06 = pd.read_csv(a1+'S06'+a2+'S06_label'+a3+'S06'+a4+'S06_label'+a5+'S06'+a6+'S06'+a7)
S07 = pd.read_csv(a1+'S07'+a2+'S07_label'+a3+'S07'+a4+'S07_label'+a5+'S07'+a6+'S07'+a7)
S02 = pd.read_csv(a1+'S02'+a2+'S02_label'+a3+'S02'+a4+'S02_label'+a5+'S02'+a6+'S02'+a7)
S26 = pd.read_csv(a1+'S26'+a2+'S26_label'+a3+'S26'+a4+'S26_label'+a5+'S26'+a6+'S26'+a7)
P01 = pd.read_csv(a1+'P01'+a2+'P01_label'+a3+'P01'+a4+'P01_label'+a5+'P01'+a6+'P01'+a7)

# Download semantic component mapping

urlS06 = "http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0APREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A%0D%0Aselect+distinct+%3FS06+%3FP01%0D%0Awhere+%7B%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FS06%2Fcurrent%2F%3E+skos%3Amember+%3Furla+.%0D%0A%3Furla+owl%3Adeprecated+%27false%27+.%0D%0A%3Furla+skos%3Anotation+%3Fn2+.%0D%0A%3Furla+skos%3Anarrower+%3Furlb+.%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FP01%2Fcurrent%2F%3E+skos%3Amember+%3Furlb+.%0D%0A%3Furlb+owl%3Adeprecated+%27false%27+.%0D%0A%3Furlb+skos%3Anotation+%3Fn1+.%0D%0ABIND%28replace%28%3Fn1%2C+%22SDN%3AP01%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FP01%29+.%0D%0ABIND%28replace%28%3Fn2%2C+%22SDN%3AS06%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FS06%29+.%0D%0A%7D&output=csv&stylesheet="
S06_P01 = pd.read_csv(urlS06)
print("P01-S06 mapping downloaded.")

urlS07 = "http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0APREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A%0D%0Aselect+distinct+%3FS07+%3FP01%0D%0Awhere+%7B%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FS07%2Fcurrent%2F%3E+skos%3Amember+%3Furla+.%0D%0A%3Furla+owl%3Adeprecated+%27false%27+.%0D%0A%3Furla+skos%3Anotation+%3Fn2+.%0D%0A%3Furla+skos%3Anarrower+%3Furlb+.%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FP01%2Fcurrent%2F%3E+skos%3Amember+%3Furlb+.%0D%0A%3Furlb+owl%3Adeprecated+%27false%27+.%0D%0A%3Furlb+skos%3Anotation+%3Fn1+.%0D%0ABIND%28replace%28%3Fn1%2C+%22SDN%3AP01%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FP01%29+.%0D%0ABIND%28replace%28%3Fn2%2C+%22SDN%3AS07%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FS07%29+.%0D%0A%7D&output=csv&stylesheet="
S07_P01 = pd.read_csv(urlS07)
print("P01-S07 mapping downloaded.")

urlS27 = "http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0APREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A%0D%0Aselect+distinct+%3FS27+%3FP01%0D%0Awhere+%7B%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FS27%2Fcurrent%2F%3E+skos%3Amember+%3Furla+.%0D%0A%3Furla+owl%3Adeprecated+%27false%27+.%0D%0A%3Furla+skos%3Anotation+%3Fn2+.%0D%0A%3Furla+skos%3Anarrower+%3Furlb+.%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FP01%2Fcurrent%2F%3E+skos%3Amember+%3Furlb+.%0D%0A%3Furlb+owl%3Adeprecated+%27false%27+.%0D%0A%3Furlb+skos%3Anotation+%3Fn1+.%0D%0ABIND%28replace%28%3Fn1%2C+%22SDN%3AP01%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FP01%29+.%0D%0ABIND%28replace%28%3Fn2%2C+%22SDN%3AS27%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FS27%29+.%0D%0A%7D&output=csv&stylesheet="
S27_P01 = pd.read_csv(urlS27)
print("P01-S27 mapping downloaded.")

urlS02 = "http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0APREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A%0D%0Aselect+distinct+%3FS02+%3FP01%0D%0Awhere+%7B%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FS02%2Fcurrent%2F%3E+skos%3Amember+%3Furla+.%0D%0A%3Furla+owl%3Adeprecated+%27false%27+.%0D%0A%3Furla+skos%3Anotation+%3Fn2+.%0D%0A%3Furla+skos%3Arelated+%3Furlb+.%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FP01%2Fcurrent%2F%3E+skos%3Amember+%3Furlb+.%0D%0A%3Furlb+owl%3Adeprecated+%27false%27+.%0D%0A%3Furlb+skos%3Anotation+%3Fn1+.%0D%0ABIND%28replace%28%3Fn1%2C+%22SDN%3AP01%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FP01%29+.%0D%0ABIND%28replace%28%3Fn2%2C+%22SDN%3AS02%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FS02%29+.%0D%0A%7D&output=csv&stylesheet="
S02_P01 = pd.read_csv(urlS02)
print("P01-S02 mapping downloaded.")

urlS26 = "http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0APREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A%0D%0Aselect+distinct+%3FS26+%3FP01%0D%0Awhere+%7B%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FS26%2Fcurrent%2F%3E+skos%3Amember+%3Furla+.%0D%0A%3Furla+owl%3Adeprecated+%27false%27+.%0D%0A%3Furla+skos%3Anotation+%3Fn2+.%0D%0A%3Furla+skos%3Anarrower+%3Furlb+.%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FP01%2Fcurrent%2F%3E+skos%3Amember+%3Furlb+.%0D%0A%3Furlb+owl%3Adeprecated+%27false%27+.%0D%0A%3Furlb+skos%3Anotation+%3Fn1+.%0D%0ABIND%28replace%28%3Fn1%2C+%22SDN%3AP01%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FP01%29+.%0D%0ABIND%28replace%28%3Fn2%2C+%22SDN%3AS26%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FS26%29+.%0D%0A%7D&output=csv&stylesheet="
S26_P01 = pd.read_csv(urlS26)
print("P01-S26 mapping downloaded.")

urlS25 = "http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0APREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A%0D%0Aselect+distinct+%3FS25+%3FP01%0D%0Awhere+%7B%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FS25%2Fcurrent%2F%3E+skos%3Amember+%3Furla+.%0D%0A%3Furla+owl%3Adeprecated+%27false%27+.%0D%0A%3Furla+skos%3Anotation+%3Fn2+.%0D%0A%3Furla+skos%3Anarrower+%3Furlb+.%0D%0A%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FP01%2Fcurrent%2F%3E+skos%3Amember+%3Furlb+.%0D%0A%3Furlb+owl%3Adeprecated+%27false%27+.%0D%0A%3Furlb+skos%3Anotation+%3Fn1+.%0D%0ABIND%28replace%28%3Fn1%2C+%22SDN%3AP01%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FP01%29+.%0D%0ABIND%28replace%28%3Fn2%2C+%22SDN%3AS25%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FS25%29+.%0D%0A%7D&output=csv&stylesheet="
S25_P01 = pd.read_csv(urlS25)
print("P01-S25 mapping downloaded.")

# Build P01 semantic model dataframe
P01 = pd.merge(P01, S06_P01, how='left', on='P01')
P01 = pd.merge(P01, S07_P01, how='left', on='P01')
P01 = pd.merge(P01, S27_P01, how='left', on='P01')
P01 = pd.merge(P01, S02_P01, how='left', on='P01')
P01 = pd.merge(P01, S26_P01, how='left', on='P01')
P01 = pd.merge(P01, S25_P01, how='left', on='P01')

P01 = P01.fillna(value={'S25': 'BE007736', 'S07': 'S0700006'})

print("P01 semantic model dataframe constructed")

display(P01.head(10))

P01-S06 mapping downloaded.
P01-S07 mapping downloaded.
P01-S27 mapping downloaded.
P01-S02 mapping downloaded.
P01-S26 mapping downloaded.
P01-S25 mapping downloaded.
P01 semantic model dataframe constructed


Unnamed: 0,P01,P01_label,S06,S07,S27,S02,S26,S25
0,PRESPR01,Pressure (spatial coordinate) exerted by the w...,S0600168,S0700006,,S023,MAT00640,BE007736
1,DOXMZZXX,Concentration of oxygen {O2 CAS 7782-44-7} per...,S0600045,S0700006,CS002779,S057,MAT00633,BE007736
2,DOXYMMOP,Concentration recalculated to zero salinity of...,S0600056,S0700006,CS002779,S053,MAT00633,BE007736
3,DOXYAAOP,Concentration of oxygen {O2 CAS 7782-44-7} per...,S0600045,S0700006,CS002779,S053,MAT00633,BE007736
4,OXYTAAOP,Temperature of oxygen determination by optode,S0600082,S0700006,,,,BE007736
5,DWIRRXUD,Downwelling vector irradiance as energy of ele...,S0600131,S0700006,,S028,MAT00640,BE007736
6,PSALST01,Practical salinity of the water body by CTD an...,S0600083,S0700006,,S032,MAT00640,BE007736
7,TEMPST01,Temperature of the water body by CTD or STD,S0600082,S0700006,,S032,MAT00640,BE007736
8,SIGTPR01,Sigma-theta of the water body by CTD and compu...,,S0700006,,S032,MAT00640,BE007736
9,CPHLPR01,Concentration of chlorophyll-a {chl-a CAS 479-...,S0600045,S0700006,CS002896,S053,MAT01053,BE007736


## Load ICES semantic model components for mapping to P01 semantic model from file into a Pandas DataFrame

In [7]:
inputs = pd.read_excel(inputfile)
param_combo = inputs.copy(deep=True)

# Insert number of rows in the input file in to the summary information
print("Rows input: %s" % len(inputs))
summary.append(["Rows input:", len(inputs)])

# Make a working copy of the parameter combinations for mapping and remove duplicate combinations
param_combo = inputs.copy(deep=True).drop_duplicates(keep=False)
input_duplicates = len(inputs) - len(param_combo)
print("Rows duplicated: %s" % input_duplicates)
summary.append(["Rows duplicated:", input_duplicates])
summary.append(["" , ""])
print("Rows for mapping: %s" % len(param_combo))
summary.append(["Rows for mapping:", len(param_combo)])

# Add columns needed for P01 semantic model
# In the working copy set NaNs to '-9' and add columns for mapped NVS semantic model elements
param_combo = param_combo.fillna('-9')
param_combo = param_combo.assign(S06_label='',              # Measurement Property
                                 S07_label='not specified', # Measurement Property Statistic
                                 S02_label='',              # Measurement - Matrix relationship
                                 )
param_combo['PARAM'] = param_combo['PARAM'].str.upper()
param_combo['AphiaID'] = param_combo['AphiaID'].astype('int32')
# Remove leading or trailing spaces from the text columns
columns = param_combo.columns.tolist()
columns.remove('AphiaID')
for column in columns:
    param_combo[column] = param_combo[column].str.strip()

Rows input: 5439
Rows duplicated: 44
Rows for mapping: 5395


# Mapping of chemical PARAMs and NVS S27 vocabulary entries
### Determine where direct mappings already published on the NVS or available in a local file

In [8]:
# First get S27 terms that have a mapping to ICES PARAM vocabulary published from the NVS
q = """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
                    
    select ?PARAM ?S27 ?S27_label 
    where {
           <http://vocab.nerc.ac.uk/collection/S27/current/> skos:member ?url .
           ?url skos:notation ?a .
           ?url skos:prefLabel ?S27_label .
           ?url owl:deprecated 'false' .
           ?url skos:related ?c .
           FILTER(regex(str(?c), "http://vocab.ices.dk/services/rdf/collection/PARAM/", "i")) .
           BIND(substr(?a,10,8) as ?S27) .
           BIND(replace(str(?c), "http://vocab.ices.dk/services/rdf/collection/PARAM/", "", "i") AS ?PARAM) .
          }"""
# URL for the above query is:
url = """http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0A++++PREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A++++++++++++++++++++%0D%0A++++select+%3FPARAM+%3FS27+%3FS27_label+%0D%0A++++where+%7B%0D%0A+++++++++++%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FS27%2Fcurrent%2F%3E+skos%3Amember+%3Furl+.%0D%0A+++++++++++%3Furl+skos%3Anotation+%3Fa+.%0D%0A+++++++++++%3Furl+skos%3AprefLabel+%3FS27_label+.%0D%0A+++++++++++%3Furl+owl%3Adeprecated+%27false%27+.%0D%0A+++++++++++%3Furl+skos%3Arelated+%3Fc+.%0D%0A+++++++++++FILTER%28regex%28str%28%3Fc%29%2C+%22http%3A%2F%2Fvocab.ices.dk%2Fservices%2Frdf%2Fcollection%2FPARAM%2F%22%2C+%22i%22%29%29+.%0D%0A+++++++++++BIND%28substr%28%3Fa%2C10%2C8%29+as+%3FS27%29+.%0D%0A+++++++++++BIND%28replace%28str%28%3Fc%29%2C+%22http%3A%2F%2Fvocab.ices.dk%2Fservices%2Frdf%2Fcollection%2FPARAM%2F%22%2C+%22%22%2C+%22i%22%29+AS+%3FPARAM%29+.%0D%0A++++++++++%7D&output=CSV&stylesheet="""

# More efficient to ingest SPARQL response as a CSV directly into a Pandas DataFrame
mapped_chems = pd.read_csv(url)
mapped_chems['SOURCE'] = 'NVS'

# Get local ICES PARAM to NVS S27 substance mapping from mapping file location provided earlier
local_map = pd.read_excel(mapfile)
local_map['SOURCE'] = mapfile

# Combine NVS mappings with those from the local file to generate a complete list of known mappings
full_chem_map = pd.concat([mapped_chems, local_map[['PARAM','S27','S27_label','SOURCE']]]).drop_duplicates(subset='PARAM').reset_index(drop=True)

# Identify any mappings in the local file already published from the NVS
duplicate_map = pd.merge(mapped_chems, local_map, how='inner', on='PARAM')


print("Number of ICES PARAM terms directly mapped to S27 chemical substance terms from NVS: %s" % (len(mapped_chems)))
summary.append(["Number of ICES PARAM terms directly mapped to S27 chemical substance terms from NVS:", (len(mapped_chems))])

print("Number of local mappings for chemical substances from file: %s" % (len(local_map)))
summary.append(["Number of local mappings for chemical substances from file:" , (len(local_map))])

display(local_map)

print("Number of chemical substance mappings in both local file and NVS: %s" % (len(duplicate_map)))
summary.append(["Number of chemical substance mappings in both local file and NVS:" , (len(duplicate_map))])

print("Review local mappings with BODC vocab team (vocab.services@bodc.ac.uk) for upload to NVS.")

Number of ICES PARAM terms directly mapped to S27 chemical substance terms from NVS: 0
Number of local mappings for chemical substances from file: 43


Unnamed: 0,PARAM,PRNAM,ICES_CASRN,S27_label,S27,NVS_CASRN,SOURCE
0,24D,"2,4-dichlorophenoxyacetic acid",94-75-7,"2,4-dichlorophenoxyacetic acid",CS001551,94-75-7,C:\Users\rthomas\Documents\GitHub\ICES2NVS_sem...
1,ATRZ,atrazine,1912-24-9,atrazine,CS001579,1912-24-9,C:\Users\rthomas\Documents\GitHub\ICES2NVS_sem...
2,BD190,"2,3,3',4,4',5,6-heptabromodiphenyl ether (BDE190)",189084-68-2,"2,3,3',4,4',5,6-heptabromodiphenyl ether",CS002181,189084-68-2,C:\Users\rthomas\Documents\GitHub\ICES2NVS_sem...
3,BENZ,benzene,71-43-2,benzene,CS003272,71-43-2,C:\Users\rthomas\Documents\GitHub\ICES2NVS_sem...
4,CACO3,calcium carbonate,471-34-1,calcium carbonate,CS003129,471-34-1,C:\Users\rthomas\Documents\GitHub\ICES2NVS_sem...
5,CB137,"2,2',3,4,4',5-hexachlorobiphenyl (CB137)",35694-06-5,"2,2',3,4,4',5-hexachlorobiphenyl",CS001740,35694-06-5,C:\Users\rthomas\Documents\GitHub\ICES2NVS_sem...
6,CB56,"2,3,3',4'-tetrachlorobiphenyl (CB56)",41464-43-1,"2,3,3',4'-tetrachlorobiphenyl",CS001852,41464-43-1,C:\Users\rthomas\Documents\GitHub\ICES2NVS_sem...
7,CB85,"2,2',3,4,4'-pentachlorobiphenyl (CB85)",65510-45-4,"2,2',3,4,4'-pentachlorobiphenyl",CS001747,65510-45-4,C:\Users\rthomas\Documents\GitHub\ICES2NVS_sem...
8,CB97,"2,2',3',4,5-pentachlorobiphenyl (CB97)",41464-51-1,"2,2',3,4',5'-pentachlorobiphenyl",CS001691,41464-51-1,C:\Users\rthomas\Documents\GitHub\ICES2NVS_sem...
9,CO,cobalt,7440-48-4,cobalt,CS002447,7440-48-4,C:\Users\rthomas\Documents\GitHub\ICES2NVS_sem...


Number of chemical substance mappings in both local file and NVS: 0
Review local mappings with BODC vocab team (vocab.services@bodc.ac.uk) for upload to NVS.


### For PARAMs with CAS numbers check if the chemical substance exists within S27 and can be mapped via CAS number

In [9]:
# SPARQL query for all NVS substances with CAS numbers from the SPARQL endpoint
q =  """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
            
    select ?nvs_codval ?nvs_prefLabel ?nvs_casrn
    where {
           <http://vocab.nerc.ac.uk/collection/S27/current/> skos:member ?url .
           ?url skos:notation ?a .
           ?url skos:prefLabel ?nvs_prefLabel .
           ?url owl:deprecated 'false' .
           ?url owl:sameAs ?c .
           FILTER(regex(str(?c), "https://chem.nlm.nih.gov/chemidplus/rn/", "i")) .
           BIND(replace(str(?a),'SDN:S27::','','i') AS ?nvs_codval) .
           BIND(replace(str(?c),'https://chem.nlm.nih.gov/chemidplus/rn/','','i') AS ?nvs_casrn) .
          }"""                

# URL for the above query is:
url = "http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX%20skos%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0APREFIX%20owl%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0A%0Aselect%20%3Fnvs_codval%20%3Fnvs_prefLabel%20%3Fnvs_casrn%0Awhere%20%7B%0A%20%20%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FS27%2Fcurrent%2F%3E%20skos%3Amember%20%3Furl%20.%0A%20%20%3Furl%20skos%3Anotation%20%3Fa%20.%0A%20%20%3Furl%20skos%3AprefLabel%20%3Fnvs_prefLabel%20.%0A%20%20%3Furl%20owl%3Adeprecated%20'false'%20.%0A%20%20%3Furl%20owl%3AsameAs%20%3Fc%20.%0A%20%20FILTER(regex(str(%3Fc)%2C'https%3A%2F%2Fchem.nlm.nih.gov%2Fchemidplus%2Frn%2F'%2C'i'))%20.%0A%20%20BIND(replace(str(%3Fa)%2C'SDN%3AS27%3A%3A'%2C''%2C'i')%20AS%20%3Fnvs_codval)%20.%0A%20%20BIND(replace(str(%3Fc)%2C'https%3A%2F%2Fchem.nlm.nih.gov%2Fchemidplus%2Frn%2F'%2C''%2C'i')%20AS%20%3Fnvs_casrn)%20.%0A%7D&output=csv&stylesheet="   

# More efficient to ingest SPARQL response as a CSV directly into a Pandas DataFrame
nvs_cas = pd.read_csv(url)

display(nvs_cas)

Unnamed: 0,nvs_codval,nvs_prefLabel,nvs_casrn
0,CS002531,hexachlorobenzene,118-74-1
1,CS002244,"4,4'-dichlorodiphenyldichloroethane",72-54-8
2,CS026903,silicate,17181-37-2
3,CS002447,cobalt,7440-48-4
4,CS000151,total phosphorus,7723-14-0
5,CS001117,"2,4'-dichlorodiphenyltrichloroethane",789-02-6
6,CS001103,"2,4'-dichlorodiphenyldichloroethane",53-19-0
7,CS002877,nitrite,14797-65-0
8,CS026904,phosphate,14265-44-2
9,CS002896,chlorophyll-a,479-61-8


### For the contents of the input file determine if any new PARAM to S27 mappings are required

In [10]:
# Get subset of chemical PARAMS in the input list
input_PARAMS = param_combo[['PARAM','PRNAM','CAS']][param_combo['CAS']!='-9'].drop_duplicates().reset_index(drop=True)

print("Number of PARAM chemical substances from input file for P01 mapping: %s" % (len(input_PARAMS)))
summary.append(["Number of PARAM chemical substances from input file for P01 mapping:" , (len(input_PARAMS))])

# Merge information to determine which mappings are missing from the NVS
compare = pd.merge(input_PARAMS, full_chem_map, how='outer', on='PARAM')
S27_missing = compare[['PARAM','CAS']][compare['S27'].isnull()].copy(deep=True)

print("Number of PARAM chemical substances in file not directly mapped: %s" % (len(S27_missing)))
summary.append(["Number of PARAM chemical substances in file unable to be mapped:" , (len(S27_missing))])

# Merge information between unmapped PARAMs and NVS S27 based on CAS number
S27_casmap_review = pd.merge(S27_missing, nvs_cas, how='inner', left_on='CAS', right_on='nvs_casrn')

print("Number of potential PARAM-S27 mappings via CAS linkage: %s for %s PARAMs" % (len(S27_casmap_review), len(S27_casmap_review['PARAM'].drop_duplicates())))
summary.append(["    Number of potential PARAM-S27 mappings for review based on CAS linkage:", "%s for %s PARAMs" % (len(S27_casmap_review), len(S27_casmap_review['PARAM'].drop_duplicates()))])

print("Number of PARAMs with no S27 term identified via CAS number: %s" % (len(pd.concat([S27_missing['PARAM'], S27_casmap_review['PARAM']]).drop_duplicates(keep=False))))
summary.append(["    Number of PARAMs with no S27 term identified via CAS number:" , "%s" % len(pd.concat([S27_missing['PARAM'], S27_casmap_review['PARAM']]).drop_duplicates(keep=False))])
display(pd.concat([S27_missing['PARAM'], S27_casmap_review['PARAM']]).drop_duplicates(keep=False))

Number of PARAM chemical substances from input file for P01 mapping: 260
Number of PARAM chemical substances in file not directly mapped: 219
Number of potential PARAM-S27 mappings via CAS linkage: 215 for 198 PARAMs
Number of PARAMs with no S27 term identified via CAS number: 21


41         TBTIN
45         TPTIN
52         BBJKF
59         BDE66
86         CHRTR
105        OCDAN
118         BBKF
246    CB138+163
247      CB56+60
248        CDFDX
249         CDFO
250        PEBDE
251         SCB7
252        DDTEP
253        HPBDE
254        HXBDE
255         NBDE
256         OBDE
257        TEBDE
258        TRBDE
259        PFOSA
Name: PARAM, dtype: object

### Where mappings don't exist for chemical PARAMs or can't be made via CAS number as listed above:
### Either:
### - Add mapping to local file and re-run previous steps
### - Continue process and these rows will not be able to be mapped

In [11]:
# Add holding text for PARAM to S27 mappings from CAS for review
if len(S27_casmap_review)>0:
    warn = pd.DataFrame()
    warn['PARAM'] = S27_casmap_review['PARAM'].drop_duplicates().copy(deep=True).reset_index(drop=True)
    warn['S27'] = 'Potential S27 term exists.'
    warn['S27_label'] = 'Mapping needs to be reviewed and added to NVS or unmapped_substance.xlsx.'
    warn['SOURCE'] = 'CAS linkage exists.'
    
    full_chem_map = pd.concat([full_chem_map, warn])
    
# Add holding text for PARAM where no potential S27 exists through CAS
if len(S27_missing)>0:
    warn = pd.DataFrame()
    warn['PARAM'] = pd.concat([S27_missing['PARAM'], S27_casmap_review['PARAM']]).drop_duplicates(keep=False).copy(deep=True).reset_index(drop=True)
    warn['S27'] = 'No S27 term identified from CAS.'
    warn['S27_label'] = 'Potential new term to be added to S27 and mapped to PARAM.'
    warn['SOURCE'] = 'No CAS linkage found.'
    
    full_chem_map = pd.concat([full_chem_map, warn])

# Add S27 mappings to the main table    
param_combo = pd.merge(param_combo, full_chem_map[['PARAM','S27','S27_label']], how='left', on='PARAM')

# Mark rows that do not require mapping as "Not applicable."    
param_combo = param_combo.fillna(value={'S27': 'not applicable', 'S27_label': 'not applicable'})

print("Total combinations = %s" % (len(param_combo)))
PARAMs2map = pd.DataFrame()
PARAMs2map = param_combo[['PARAM','PRNAM']][param_combo['S27']=='not applicable'].drop_duplicates()

print("Review PARAMs with no mapping: %s" % len(PARAMs2map))

display(PARAMs2map)

Total combinations = 5395
Review PARAMs with no mapping: 31


Unnamed: 0,PARAM,PRNAM
38,CORG,organic carbon
122,CS137,cesium-137
140,GSMF1000,Grain Size Mass Fraction <1000 micron
141,GSMF125,Grain Size Mass Fraction <125 micron
142,GSMF2000,Grain Size Mass Fraction <2000 micron
143,GSMF63,Grain Size Mass Fraction <63 micron (silt/clay)
156,LOIGN,loss on ignition
157,MOCON%,moisture content percent
161,NORG,organic nitrogen (N)
704,SUSP,suspended solids


## Mapping of DTYPE, MATRX and METPT combinations to S26

In [12]:
matrix_check = param_combo[['DTYPE','MATRX','METPT']].drop_duplicates().copy(deep=True).reset_index(drop=True)

# Set to default of 'Check MATRX. Not mapped.'
matrix_check = matrix_check.assign(S26_label = 'Check MATRX. Not mapped.')

print("Number of MATRX for P01 mapping: %s" % len(matrix_check))
summary.append(["Number of MATRX for P01 mapping:" , len(matrix_check)])

for index, row in matrix_check.iterrows():
    if row['DTYPE'] == 'CF':
        # Set S26 label to 'biota'
        row['S26_label'] = 'biota'

    elif row['DTYPE'] == 'CS':
        if row['MATRX'] == 'SEDTOT':
            row['S26_label'] = 'sediment'
        elif row['MATRX'][3:len(row['MATRX'])] != 'TOT':
            row['S26_label'] = 'sediment <'+row['MATRX'][3:len(row['MATRX'])] +'um'
        else:
            row['S26_label'] = 'Check MATRX'
            
    elif row['DTYPE'] == 'CW':
        if row['MATRX'] == 'WT':
            if row['METPT'] == '-9':
                row['S26_label'] = 'water body [dissolved plus reactive particulate <unknown phase]'
            else:
                metpt_list = row['METPT'].split('~')               
                for metpt in metpt_list:
                    if metpt in ('NF','NONE','NA','CP'):
                        row['S26_label'] = 'water body [dissolved plus reactive particulate phase]'
                        continue
                    elif metpt in('GFF','GF/F','FF-GF-0.7'):
                        row['S26_label'] = 'water body [dissolved plus reactive particulate <GF/F phase]'
                        continue
                    elif metpt in('GFC','GF/C','FF-GF-1.2','FF-PP-1.2'):
                        row['S26_label'] = 'water body [dissolved plus reactive particulate <GF/C phase]'
                        continue
                    elif metpt in('FM-PC-0.4','FM-PC-0.45','FM-PES-0.45','FM-CN-0.45','FM-CA-0.45','PCF40','PCF45','PCF'):
                        row['S26_label'] = 'water body [dissolved plus reactive particulate <0.4/0.45um phase]'
                        continue
                    elif metpt in('F'):
                        row['S26_label'] = 'water body [dissolved plus reactive particulate <unknown phase]'
                        continue
                    elif metpt in('FM-CA-0.2'):
                        row['S26_label'] = 'water body [dissolved plus reactive particulate <0.2um phase]'
                        continue
                    else:
                        if row['S26_label'] == '':
                            row['S26_label'] = 'Check METPT'
    #print("Row %s of %s matrix combinations mapped." % (index+1,len(matrix_check))) # Commented out used in debugging
#display(matrix_check)    

# Subset potential S26 new entries
S26new = matrix_check[matrix_check['S26_label']=='Check MATRX. Not mapped.']

print("Number of potential new S26 terms: %s" % len(S26new))
summary.append(["Number of potential new S26 terms:" , len(S26new)])
display(S26new)

# Add S26 semantic model mapping to the main table based on the combinations provided   
param_combo = pd.merge(param_combo, matrix_check, how='left', on=['DTYPE','MATRX','METPT'])

print("Total combinations = %s" % (len(param_combo)))

Number of MATRX for P01 mapping: 64
Number of potential new S26 terms: 5


Unnamed: 0,DTYPE,MATRX,METPT,S26_label
8,CW,SPM,NONE,Check MATRX. Not mapped.
9,CW,SPM,FM-PC-0.4,Check MATRX. Not mapped.
10,CW,SPM,NF,Check MATRX. Not mapped.
11,CW,SPM,FF-GF-1.2,Check MATRX. Not mapped.
12,CW,SPM,GFC1.2,Check MATRX. Not mapped.


Total combinations = 5395


# Taxon, WoRMS AphiaID, ITIS TSN combination check

In [13]:
# Get all existing TAXONs from S25 and simplify text labels to show distinct TAXON values from the S25 semantic model
url = """http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0A++++PREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A++++++++++++++++++++%0D%0A++++select+distinct+%3FAphiaID+%3FTAXON%0D%0A++++where+%7B%0D%0A+++++++++++%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FS25%2Fcurrent%2F%3E+skos%3Amember+%3Furl+.%0D%0A+++++++++++%3Furl+skos%3AprefLabel+%3FprefLabel+.%0D%0A+++++++++++%3Furl+owl%3Adeprecated+%27false%27+.%0D%0A+++++++++++FILTER%28regex%28str%28%3FprefLabel%29%2C+%22WoRMS%22%2C+%22i%22%29%29+.%0D%0A+++++++++++BIND%28replace%28str%28%3FprefLabel%29%2C+%22%5C%5C+%5C%5C%5B.*%3F%5C%5C%5D%22%2C%22%22%2C+%22i%22%29+AS+%3FTAXON%29+.%0D%0A+++++++++++BIND%28replace%28replace%28replace%28str%28%3FTAXON%29%2C+%22%5C%5C%29%22%2C%22%22%2C+%22i%22%29%2C+%22.*%28%3F%3DWoRMS+%29%22%2C+%22%22%2C+%22i%22%29%2C+%22WoRMS+%22%2C+%22%22%2C+%22i%22%29+AS+%3FAphiaID%29+.%0D%0A++++++++++%7D%0D%0A++++order+by+%3FAphiaID%0D%0A&output=CSV&stylesheet=CSV"""

S25taxon = pd.read_csv(url)

# Identify multiple TAXONs per AphiaID within S25
S25taxon_duplicates = S25taxon[S25taxon.duplicated(['AphiaID'], keep=False)].copy(deep=True)
S25taxon_duplicates.replace(u'\xc2\xa0',u' ', regex=True, inplace=True)
S25taxon_duplicates.replace(u'\u2019',u"'", regex=True, inplace=True)

# Remove duplicate TAXON records from S25taxon dataframe
S25taxon_clean = pd.concat([S25taxon, S25taxon_duplicates]).drop_duplicates(keep=False).copy(deep=True).reset_index(drop=True)

# Create a Pandas DataFrame and populate with unique combinations of Species and AphiaID from the input file
input_taxa_check = pd.DataFrame()
input_taxa_check = param_combo[['Species','AphiaID']][param_combo['Species']!='-9'].drop_duplicates().reset_index(drop=True)
input_taxa_check = input_taxa_check.astype({"AphiaID": int})

print("Number of Species for P01 mapping: %s" % len(input_taxa_check))
summary.append(["Number of Species for P01 mapping:" , len(input_taxa_check)])
#%%
# Function to call WoRMS web service
def worms_check(url):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    if response.code==204:
        e = 'No AphiaID found.'
    elif response.code==206:
        e = 'Multiple AphiaID found.'
    elif response.code==200:
        e = response.read()
    return e

# If AphiaID is absent then lookup using the WoRMS web service
for index, row in input_taxa_check.iterrows():
    if row['AphiaID'] == -9:
        if '&' not in row['Species']:
            url = 'http://marinespecies.org/rest/AphiaIDByName/%s?marine_only=true' % row['Species'].replace(" ","%20")
            input_taxa_check.loc[index, 'AphiaID'] = worms_check(url)
        else:
            input_taxa_check.loc[index, 'AphiaID'] = 'Combination of taxa'

inputs_aphia = pd.merge(inputs, input_taxa_check, on='Species')


#%% 
# Get WoRMS scientific names from AphiaID provided using WoRMS web service
def worms_check(url):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    if response.code==204:
        list.append('No response.')
    elif response.code==200:
        e = response.read()
    return json.loads(e)
       
worms = pd.DataFrame()        

aphia_list = input_taxa_check['AphiaID'].tolist()
y = len(aphia_list)
if y<50:
    ids = ''
    for i in range(0,50):
        ids = ids + 'aphiaids%5B%5D=' + str(aphia_list[i]) + '&'
    url = 'http://www.marinespecies.org/rest/AphiaRecordsByAphiaIDs?%s' % ids[0:-1]
    worms = pd.DataFrame(worms_check(url), ignore_index=True)
elif y>50:
    for j in range(0,int(y/50)):
        ids = ''
        for i in range(j*50,min((j+1)*50,y)):
            ids = ids + 'aphiaids%5B%5D=' + str(aphia_list[i]) + '&'
        url = 'http://www.marinespecies.org/rest/AphiaRecordsByAphiaIDs?%s' % ids[0:-1]
        worms = pd.concat([worms, pd.DataFrame(worms_check(url))], ignore_index=True)
    ids = ''
    for i in range((j+1)*50,min((j+2)*50,y)):
        ids = ids + 'aphiaids%5B%5D=' + str(aphia_list[i]) + '&'
    url = 'http://www.marinespecies.org/rest/AphiaRecordsByAphiaIDs?%s' % ids[0:-1]
    worms = pd.concat([worms, pd.DataFrame(worms_check(url))], ignore_index=True)

input_taxa_check = pd.merge(input_taxa_check, worms[['AphiaID','scientificname']], how='left', on='AphiaID')

input_taxa_check = input_taxa_check.rename(index=str, columns={'scientificname': 'name_from_AphiaID'})

# Set column to indicate if a discrepancy to be resolved exists based on Scientific names not matching
a = input_taxa_check.Species == input_taxa_check.name_from_AphiaID
input_taxa_check['proceed'] = np.where(a, 'Yes', 'No')

# Subset those taxa where naming discrepancy exists
taxa_discrepancy = input_taxa_check[input_taxa_check['proceed']=='No'].reset_index(drop=True)

print("Number of Species with name discrepancy between taxon-AphiaID combination in file and WoRMS: %s" % len(taxa_discrepancy))
summary.append(["Number of Species with name discrepancy between taxon-AphiaID combination in file and WoRMS:" , len(taxa_discrepancy)])
display(taxa_discrepancy)

# Map AphiaID to S25 component TAXON for the non-duplicate AphiaID results in S25
taxa_map = pd.merge(input_taxa_check[['AphiaID','name_from_AphiaID']],
                    S25taxon_clean, 
                    how='left', 
                    on='AphiaID')
taxa_map = taxa_map.fillna(value={'TAXON': 'New TAXON required.'}).drop_duplicates()


# Add TAXON mapping to the main table based on the AphiaID provided    
param_combo = pd.merge(param_combo, taxa_map, how='left', on='AphiaID')

# Mark rows that do not require mapping as "Not applicable."    
param_combo = param_combo.fillna(value={'TAXON': 'not specified', 'name_from_AphiaID': '-9'})

print("Total combinations = %s" % (len(param_combo)))

Number of Species for P01 mapping: 116
Number of Species with name discrepancy between taxon-AphiaID combination in file and WoRMS: 9


Unnamed: 0,Species,AphiaID,name_from_AphiaID,proceed
0,Venerupis philippinarum,231750,Ruditapes philippinarum,No
1,Psetta maxima,127149,Scophthalmus maximus,No
2,Phoca hispida,159021,Pusa hispida,No
3,Mysidacea,149668,Mysida,No
4,Raja radiata,105865,Amblyraja radiata,No
5,Clupea harengus membras,126417,Clupea harengus,No
6,Clupea harengus membras,126417,Clupea harengus,No
7,Sebastes marinus,151324,Sebastes norvegicus,No
8,Macoma baltica,880017,Limecola balthica,No


Total combinations = 5395


# Iterate through file for ICES combinations where a mapping does not already exist.

In [14]:
local_map_terms = local_map['PARAM'].tolist()

nvs = nvs_cas

clean_taxa = S25taxon_clean
query_taxa = taxa_discrepancy['Species'].tolist()

param_combo['S25'] = pd.Series()
param_combo['S25_label'] = pd.Series()
param_combo['TAXON'] = pd.Series()
param_combo['SUBCOMPONENT'] = pd.Series()
param_combo['STAGE'] = pd.Series()
param_combo['COLOUR'] = pd.Series()
param_combo['SUBGROUP'] = pd.Series()
param_combo['SIZE'] = pd.Series()
param_combo['GENDER'] = pd.Series()
param_combo['MORPHOLOGY'] = pd.Series()

for row in range(0,len(param_combo)):
    if param_combo.loc[row,['P01_Code']][0]=='-9':

        # Logic for contaminants in sediment mappings
        if param_combo.loc[row,['DTYPE']][0] == 'CS':
            param_combo.loc[row,['S25']] = 'BE007736'
            param_combo.loc[row,['S25_label']] = 'not applicable'
            if param_combo.loc[row,['MUNIT']][0] == '%':
                param_combo.loc[row,['S06_label']] = "Proportion"
            elif param_combo.loc[row,['MUNIT']][0][len(param_combo.loc[row,['MUNIT']][0])-1] == 'g':
                param_combo.loc[row,['S06_label']] = 'Concentration'
            else:
                param_combo.loc[row,['S06_label']] = 'Check MUNIT'
                
            if param_combo.loc[row,['BASIS']][0] == 'D':
                param_combo.loc[row,['S02_label']] = 'per unit dry weight of'
            elif param_combo.loc[row,['BASIS']][0] == 'W':
                param_combo.loc[row,['S02_label']] = 'per unit wet weight of'
            else:
                param_combo.loc[row,['S02_label']] = 'Check BASIS'
                
            if param_combo.loc[row,['MATRX']][0] == 'SEDTOT':
                param_combo.loc[row,['S26_label']] = 'sediment'
            elif param_combo.loc[row,['MATRX']][0][3:len(param_combo.loc[row,['MATRX']])] != 'TOT':
                param_combo.loc[row,['S26_label']] = 'sediment <'+param_combo.loc[row,['MATRX']][0][3:len(param_combo.loc[row]['MATRX'])] +'um'
            else:
                param_combo.loc[row,['S26_label']] = 'Check MATRX'

        # Logic for contaminants in water mappings       
        if param_combo.loc[row,['DTYPE']][0] == 'CW':
            param_combo.loc[row,['S25']] = 'BE007736'
            param_combo.loc[row,['S25_label']] = 'not applicable'
            if param_combo.loc[row,['MUNIT']][0] =='ntu':
                if param_combo.loc[row,['PARAM']][0]=='TURB':
                    param_combo.loc[row,['S06_label']] = 'Turbidity'
                    param_combo.loc[row,['S02_label']] = 'of the'        
            elif param_combo.loc[row,['MUNIT']][0] == '%':
                param_combo.loc[row,['S06_label']] = "Proportion"
            elif param_combo.loc[row,['MUNIT']][0] == 'mBq/l':
                param_combo.loc[row,['S06_label']] = "Activity"        
                param_combo.loc[row,['S02_label']] = 'per unit volume of the'
            elif param_combo.loc[row,['MUNIT']][0][len(param_combo.loc[row,['MUNIT']][0])-1] == 'g':
                param_combo.loc[row,['S06_label']] = 'Concentration'
                param_combo.loc[row,['S02_label']] = 'per unit mass of the'
            elif param_combo.loc[row,['MUNIT']][0][len(param_combo.loc[row,['MUNIT']][0])-1] == 'l':
                param_combo.loc[row,['S06_label']] = 'Concentration'
                param_combo.loc[row,['S02_label']] = 'per unit volume of the'
            else:
                param_combo.loc[row,['S06_label']] = 'Check MUNIT'
                param_combo.loc[row,['S02_label']] = 'Check MUNIT'
                        
            if param_combo.loc[row,['MATRX']][0] == 'WT':
                if param_combo.loc[row,['METPT']][0]=='-9':
                    param_combo.loc[row,['S26_label']]='water body [dissolved plus reactive particulate <unknown phase]'
                else:
                    metpt_list = param_combo.loc[row,['METPT']][0].split('~')               
                    for metpt in metpt_list:
                        if metpt in ('NF','NONE','NA','CP'):
                            param_combo.loc[row,['S26_label']]='water body [dissolved plus reactive particulate phase]'
                            continue
                        elif metpt in('GFF','GF/F','FF-GF-0.7'):
                            param_combo.loc[row,['S26_label']]='water body [dissolved plus reactive particulate <GF/F phase]'
                            continue
                        elif metpt in('GFC','GF/C','FF-GF-1.2','FF-PP-1.2'):
                            param_combo.loc[row,['S26_label']]='water body [dissolved plus reactive particulate <GF/C phase]'
                            continue
                        elif metpt in('FM-PC-0.4','FM-PC-0.45','FM-PES-0.45','FM-CN-0.45','FM-CA-0.45','PCF40','PCF45','PCF'):
                            param_combo.loc[row,['S26_label']] = 'water body [dissolved plus reactive particulate <0.4/0.45um phase]'
                            continue
                        elif metpt in('F'):
                            param_combo.loc[row,['S26_label']]='water body [dissolved plus reactive particulate <unknown phase]'
                            continue
                        elif metpt in('FM-CA-0.2'):
                            param_combo.loc[row,['S26_label']]='water body [dissolved plus reactive particulate <0.2um phase]'
                            continue
                        else:
                            if param_combo.loc[row,['S26_label']][0] == '':
                                param_combo.loc[row,['S26_label']]='Check METPT'

        # Logic for contaminants in biota mappings
        if param_combo.loc[row,['DTYPE']][0] == 'CF':
            # Set S26 label to 'biota'
            param_combo.loc[row,['S26_label']] = 'biota'
            
            if param_combo.loc[row,['CAS']][0] in ('-9','NA'):
                if param_combo.loc[row,['PARAM']][0]!='CS137':
                    param_combo.loc[row,['S06_label']] = 'Generate mapping'
                else:
                    param_combo.loc[row,['S06_label']] = 'Concentration'
            else:
                param_combo.loc[row,['S06_label']] = 'Concentration'

            # Map BASIS column for dry weight, wet weight and lipid normalisaed concentrations. Anything else requires checking.  
            if param_combo.loc[row,['BASIS']][0] == 'D':
                param_combo.loc[row,['S02_label']] = 'per unit dry weight of'
            elif param_combo.loc[row,['BASIS']][0] == 'W':
                param_combo.loc[row,['S02_label']] = 'per unit wet weight of'
            elif param_combo.loc[row,['BASIS']][0] == 'L':
                param_combo.loc[row,['S02_label']] = 'in'
                param_combo.loc[row,['S06_label']] = 'Lipid-normalised concentration'
            else:
                param_combo.loc[row,['S02_label']] = 'Check BASIS'
            
            # Map matrix of the biota to appropriate NVS S25 SUBCOMPONENT and/or STAGE. Not some constraints based on taxa type applied in the code.
            if param_combo.loc[row,['MATRX']][0] == 'WO':
                param_combo.loc[row,['SUBCOMPONENT']] = 'not specified'
            elif param_combo.loc[row,['MATRX']][0] == 'TM':
                param_combo.loc[row,['SUBCOMPONENT']] = 'muscle tissue'
            elif param_combo.loc[row,['MATRX']][0] == 'SI':
                param_combo.loc[row,['SUBCOMPONENT']] = 'not specified'
            elif param_combo.loc[row,['MATRX']][0] == 'SH':
                param_combo.loc[row,['SUBCOMPONENT']] = 'shell'
            elif param_combo.loc[row,['MATRX']][0] == 'SB':
                if param_combo.loc[row,['Species']][0] not in ('Gobius','Crangon crangon','Mysidacea'):
                    param_combo.loc[row,['SUBCOMPONENT']] = 'flesh'
                else:
                    param_combo.loc[row,['SUBCOMPONENT']] = 'Checking species-matrx combo validity with ICES.'
            elif param_combo.loc[row,['MATRX']][0] == 'RO':
                param_combo.loc[row,['STAGE']] = 'eggs'
                param_combo.loc[row,['SUBCOMPONENT']] = 'not specified'
            elif param_combo.loc[row,['MATRX']][0] == 'MU&EP':
                param_combo.loc[row,['SUBCOMPONENT']] = 'muscle tissues and skin'
            elif param_combo.loc[row,['MATRX']][0] == 'MU':
                if param_combo.loc[row,['Species']][0] == 'Loligo vulgaris':
                    param_combo.loc[row,['SUBCOMPONENT']] = 'flesh'
                else:
                    param_combo.loc[row,['SUBCOMPONENT']] = 'muscle tissue'
            elif param_combo.loc[row,['MATRX']][0] == 'LI':
                param_combo.loc[row,['SUBCOMPONENT']] = 'liver'
            elif param_combo.loc[row,['MATRX']][0] == 'KI':
                param_combo.loc[row,['SUBCOMPONENT']] = 'kidney'
            elif param_combo.loc[row,['MATRX']][0] == 'GO':
                param_combo.loc[row,['SUBCOMPONENT']] = 'gonads'
            elif param_combo.loc[row,['MATRX']][0] == 'GI':
                param_combo.loc[row,['SUBCOMPONENT']] = 'gill'
            elif param_combo.loc[row,['MATRX']][0] == 'FE':
                param_combo.loc[row,['SUBCOMPONENT']] = 'feathers'
            elif param_combo.loc[row,['MATRX']][0] == 'FA':
                param_combo.loc[row,['SUBCOMPONENT']] = 'body fat'
            elif param_combo.loc[row,['MATRX']][0] == 'EX':
                if param_combo.loc[row,['Species']][0] == 'Mytilus edulis':
                    param_combo.loc[row,['SUBCOMPONENT']] = 'shell'
                else:
                    param_combo.loc[row,['SUBCOMPONENT']] = 'Checking species-matrx combo validity with ICES.'
            elif param_combo.loc[row,['MATRX']][0] == 'EP':
                param_combo.loc[row,['SUBCOMPONENT']] = 'skin'
            elif param_combo.loc[row,['MATRX']][0] == 'EH':
                param_combo.loc[row,['STAGE']] = 'eggs'
                param_combo.loc[row,['SUBCOMPONENT']] = 'egg yolk and albumen homogenate'
            elif param_combo.loc[row,['MATRX']][0] == 'EG':
                param_combo.loc[row,['STAGE']] = 'eggs'
            elif param_combo.loc[row,['MATRX']][0] == 'BS':
                param_combo.loc[row,['SUBCOMPONENT']] = 'blood serum'
            elif param_combo.loc[row,['MATRX']][0] == 'BR':
                param_combo.loc[row,['SUBCOMPONENT']] = 'brain'
            elif param_combo.loc[row,['MATRX']][0] == 'BL':
                param_combo.loc[row,['SUBCOMPONENT']] = 'blood'
            elif param_combo.loc[row,['MATRX']][0] == 'BC':
                param_combo.loc[row,['SUBCOMPONENT']] = 'blood cells'
            elif param_combo.loc[row,['MATRX']][0] == 'BB':
                if param_combo.loc[row,['Note']][0] != 'Fish':
                    param_combo.loc[row,['SUBCOMPONENT']] = 'blubber'
                else:
                    param_combo.loc[row,['SUBCOMPONENT']] = 'Checking species-matrx combo validity with ICES.'

            # Where there is no Species - AphiaID discrepancy check the NVS to see if taxa is already published.
            if param_combo.loc[row,['Species']][0] in clean_taxa:
                spcs = param_combo.loc[row,['Species']][0]
                aphia = int(param_combo.loc[row,['AphiaID']][0])
                
                if len(taxon_code[taxon_code['AphiaID']==aphia])==0:
                    taxon_list = taxon_map(spcs, aphia)
                    taxon_code = pd.concat([taxon_code,pd.DataFrame.from_records(taxon_list,columns=['taxon_preflabel','AphiaID'])])
                    param_combo.loc[row,['TAXON']] = taxon_list[0][0]
                else:
                    param_combo.loc[row,['TAXON']] = taxon_code[taxon_code['AphiaID']==aphia].iloc[0]['taxon_preflabel']
                
                #Build S25 preflabel components for text matching
                label=''
                txn = param_combo.loc[row,['TAXON']][0]
                scp = param_combo.loc[row,['SUBCOMPONENT']][0]
                stg = param_combo.loc[row,['STAGE']][0]
                if np.any([txn == 'Not available', scp in ('Checking species-matrx combo validity with ICES.','New term needed.'), stg in ('Checking species-matrx combo validity with ICES.','New term needed.')]):
                    label = 'No term for at least one S25 model list. Needs adding to NVS'
                elif np.all([txn != 'Not available', stg == 'not specified', scp == 'not specified']):
                    label = ')'
                elif np.all([txn != 'Not available', stg == 'not specified', scp != 'not specified']):
                    label = '[Subcomponent: %s]' % (scp)
                elif np.all([txn != 'Not available', stg != 'not specified', scp == 'not specified']):
                    label = '[Stage: %s]' % (stg)
                else:
                    label = '[Stage: %s Subcomponent: %s]' % (stg, scp)
                param_combo.loc[row,['S25_label']] = (txn + " " + label).replace(") )", ")")
                
                # Look up S25 preflabel to identify appropriate S25 codval
                if label !='No term for at least one S25 model list. Needs adding to NVS':
                    if len(S25_code[S25_code['S25_preflabel']==label])==0:
                        S25_list = S25_lookup(spcs,aphia,label)
                        S25_code = pd.concat([S25_code,pd.DataFrame.from_records(S25_list,columns=['S25_codval','S25_preflabel'])])
                        param_combo.loc[row,['S25']] = S25_list[0][0]
                    else:
                        param_combo.loc[row,['S25']] = S25_code[S25_code['S25_preflabel']==label].iloc[0]['S25_codval']
                else:
                    param_combo.loc[row,['S25']] = label
            # Where there is a Species - AphiaID discrepancy continue with the next step.    
            elif param_combo.loc[row,['Species']][0] in query_taxa:
                param_combo.loc[row,['TAXON']] = 'To be resolved.'
        
        # Logic for mapping to ICES PARAM based on CAS registry number or unmapped file        
        cas = param_combo.loc[row,['CAS']][0]
        param = param_combo.loc[row,['PRNAM']][0]
        if param in local_map_terms:
            codval = local_map['NVS_CODVAL'][local_map['PARAM']==param]
            preflabel = local_map['NVS_PREFLABEL'][local_map['PARAM']==param]
            param_combo.loc[row,['S27']] = codval.iloc[0]
            param_combo.loc[row,['S27_label']] = preflabel.iloc[0]            
        elif cas in ('-9','NA'):
            param_combo.loc[row,['S27']] = 'not applicable'
            param_combo.loc[row,['S27_label']] = 'not applicable' 
        else:
            codval = nvs['nvs_codval'][nvs['nvs_casrn']==cas]
            preflabel = nvs['nvs_prefLabel'][nvs['nvs_casrn']==cas]
            if len(nvs[nvs['nvs_casrn']==cas]) == 0:
                param_combo.loc[row,['S27']] = 'No S27 term. Needs adding to NVS'   
            elif len(nvs[nvs['nvs_casrn']==cas]) == 1:
                param_combo.loc[row,['S27']] = codval.iloc[0]
                param_combo.loc[row,['S27_label']] = preflabel.iloc[0]
            elif len(nvs[nvs['nvs_casrn']==cas]) > 1:
                if len(nvs[nvs['nvs_prefLabel']==param]) == 1:
                    codval = nvs['nvs_codval'][nvs['nvs_prefLabel']==param]
                    preflabel = nvs['nvs_prefLabel'][nvs['nvs_prefLabel']==param]
                    param_combo.loc[row,['S27']] = codval.iloc[0]
                    param_combo.loc[row,['S27_label']] = preflabel.iloc[0]
                elif len(nvs[nvs['nvs_prefLabel']==param]) == 0:
                    if param == 'mercury':
                        param = 'total mercury'
                        codval = nvs['nvs_codval'][nvs['nvs_prefLabel']==param]
                        preflabel = nvs['nvs_prefLabel'][nvs['nvs_prefLabel']==param]
                        param_combo.loc[row,['S27']] = codval.iloc[0]
                        param_combo.loc[row,['S27_label']] = preflabel.iloc[0]
    print("Row %s of %s complete." % (row+1,len(param_combo)))

# Replaces any problematic text characters from the NVS imported into the DataFrame
# that will cause issues when writing the output to file.
param_combo.replace(u'\xa0',u' ', regex=True, inplace=True)
param_combo.replace(u'\u2019',u"'", regex=True, inplace=True)

# Reorder columns for output
param_combo = param_combo[['PARGROUP','PRNAM','CAS','DTYPE','PARAM','MUNIT','MATRX','BASIS','METPT','METOA','Note','AphiaID','Species',
                          'P01_Code','S06_label','S07_label','S27_label','S27','S02_label','S26_label','S25','S25_label',
                          'TAXON','STAGE','GENDER','SIZE','SUBCOMPONENT','MORPHOLOGY','COLOUR','SUBGROUP']]

print("Total combinations in file = %s" % (len(param_combo)))
         

Row 1 of 5395 complete.
Row 2 of 5395 complete.
Row 3 of 5395 complete.
Row 4 of 5395 complete.
Row 5 of 5395 complete.
Row 6 of 5395 complete.
Row 7 of 5395 complete.
Row 8 of 5395 complete.
Row 9 of 5395 complete.
Row 10 of 5395 complete.
Row 11 of 5395 complete.
Row 12 of 5395 complete.
Row 13 of 5395 complete.
Row 14 of 5395 complete.
Row 15 of 5395 complete.
Row 16 of 5395 complete.
Row 17 of 5395 complete.
Row 18 of 5395 complete.
Row 19 of 5395 complete.
Row 20 of 5395 complete.
Row 21 of 5395 complete.
Row 22 of 5395 complete.
Row 23 of 5395 complete.
Row 24 of 5395 complete.
Row 25 of 5395 complete.
Row 26 of 5395 complete.
Row 27 of 5395 complete.
Row 28 of 5395 complete.
Row 29 of 5395 complete.
Row 30 of 5395 complete.
Row 31 of 5395 complete.
Row 32 of 5395 complete.
Row 33 of 5395 complete.
Row 34 of 5395 complete.
Row 35 of 5395 complete.
Row 36 of 5395 complete.
Row 37 of 5395 complete.
Row 38 of 5395 complete.
Row 39 of 5395 complete.
Row 40 of 5395 complete.
Row 41 of

Row 331 of 5395 complete.
Row 332 of 5395 complete.
Row 333 of 5395 complete.
Row 334 of 5395 complete.
Row 335 of 5395 complete.
Row 336 of 5395 complete.
Row 337 of 5395 complete.
Row 338 of 5395 complete.
Row 339 of 5395 complete.
Row 340 of 5395 complete.
Row 341 of 5395 complete.
Row 342 of 5395 complete.
Row 343 of 5395 complete.
Row 344 of 5395 complete.
Row 345 of 5395 complete.
Row 346 of 5395 complete.
Row 347 of 5395 complete.
Row 348 of 5395 complete.
Row 349 of 5395 complete.
Row 350 of 5395 complete.
Row 351 of 5395 complete.
Row 352 of 5395 complete.
Row 353 of 5395 complete.
Row 354 of 5395 complete.
Row 355 of 5395 complete.
Row 356 of 5395 complete.
Row 357 of 5395 complete.
Row 358 of 5395 complete.
Row 359 of 5395 complete.
Row 360 of 5395 complete.
Row 361 of 5395 complete.
Row 362 of 5395 complete.
Row 363 of 5395 complete.
Row 364 of 5395 complete.
Row 365 of 5395 complete.
Row 366 of 5395 complete.
Row 367 of 5395 complete.
Row 368 of 5395 complete.
Row 369 of 5

Row 655 of 5395 complete.
Row 656 of 5395 complete.
Row 657 of 5395 complete.
Row 658 of 5395 complete.
Row 659 of 5395 complete.
Row 660 of 5395 complete.
Row 661 of 5395 complete.
Row 662 of 5395 complete.
Row 663 of 5395 complete.
Row 664 of 5395 complete.
Row 665 of 5395 complete.
Row 666 of 5395 complete.
Row 667 of 5395 complete.
Row 668 of 5395 complete.
Row 669 of 5395 complete.
Row 670 of 5395 complete.
Row 671 of 5395 complete.
Row 672 of 5395 complete.
Row 673 of 5395 complete.
Row 674 of 5395 complete.
Row 675 of 5395 complete.
Row 676 of 5395 complete.
Row 677 of 5395 complete.
Row 678 of 5395 complete.
Row 679 of 5395 complete.
Row 680 of 5395 complete.
Row 681 of 5395 complete.
Row 682 of 5395 complete.
Row 683 of 5395 complete.
Row 684 of 5395 complete.
Row 685 of 5395 complete.
Row 686 of 5395 complete.
Row 687 of 5395 complete.
Row 688 of 5395 complete.
Row 689 of 5395 complete.
Row 690 of 5395 complete.
Row 691 of 5395 complete.
Row 692 of 5395 complete.
Row 693 of 5

Row 974 of 5395 complete.
Row 975 of 5395 complete.
Row 976 of 5395 complete.
Row 977 of 5395 complete.
Row 978 of 5395 complete.
Row 979 of 5395 complete.
Row 980 of 5395 complete.
Row 981 of 5395 complete.
Row 982 of 5395 complete.
Row 983 of 5395 complete.
Row 984 of 5395 complete.
Row 985 of 5395 complete.
Row 986 of 5395 complete.
Row 987 of 5395 complete.
Row 988 of 5395 complete.
Row 989 of 5395 complete.
Row 990 of 5395 complete.
Row 991 of 5395 complete.
Row 992 of 5395 complete.
Row 993 of 5395 complete.
Row 994 of 5395 complete.
Row 995 of 5395 complete.
Row 996 of 5395 complete.
Row 997 of 5395 complete.
Row 998 of 5395 complete.
Row 999 of 5395 complete.
Row 1000 of 5395 complete.
Row 1001 of 5395 complete.
Row 1002 of 5395 complete.
Row 1003 of 5395 complete.
Row 1004 of 5395 complete.
Row 1005 of 5395 complete.
Row 1006 of 5395 complete.
Row 1007 of 5395 complete.
Row 1008 of 5395 complete.
Row 1009 of 5395 complete.
Row 1010 of 5395 complete.
Row 1011 of 5395 complete.


Row 1282 of 5395 complete.
Row 1283 of 5395 complete.
Row 1284 of 5395 complete.
Row 1285 of 5395 complete.
Row 1286 of 5395 complete.
Row 1287 of 5395 complete.
Row 1288 of 5395 complete.
Row 1289 of 5395 complete.
Row 1290 of 5395 complete.
Row 1291 of 5395 complete.
Row 1292 of 5395 complete.
Row 1293 of 5395 complete.
Row 1294 of 5395 complete.
Row 1295 of 5395 complete.
Row 1296 of 5395 complete.
Row 1297 of 5395 complete.
Row 1298 of 5395 complete.
Row 1299 of 5395 complete.
Row 1300 of 5395 complete.
Row 1301 of 5395 complete.
Row 1302 of 5395 complete.
Row 1303 of 5395 complete.
Row 1304 of 5395 complete.
Row 1305 of 5395 complete.
Row 1306 of 5395 complete.
Row 1307 of 5395 complete.
Row 1308 of 5395 complete.
Row 1309 of 5395 complete.
Row 1310 of 5395 complete.
Row 1311 of 5395 complete.
Row 1312 of 5395 complete.
Row 1313 of 5395 complete.
Row 1314 of 5395 complete.
Row 1315 of 5395 complete.
Row 1316 of 5395 complete.
Row 1317 of 5395 complete.
Row 1318 of 5395 complete.
R

Row 1591 of 5395 complete.
Row 1592 of 5395 complete.
Row 1593 of 5395 complete.
Row 1594 of 5395 complete.
Row 1595 of 5395 complete.
Row 1596 of 5395 complete.
Row 1597 of 5395 complete.
Row 1598 of 5395 complete.
Row 1599 of 5395 complete.
Row 1600 of 5395 complete.
Row 1601 of 5395 complete.
Row 1602 of 5395 complete.
Row 1603 of 5395 complete.
Row 1604 of 5395 complete.
Row 1605 of 5395 complete.
Row 1606 of 5395 complete.
Row 1607 of 5395 complete.
Row 1608 of 5395 complete.
Row 1609 of 5395 complete.
Row 1610 of 5395 complete.
Row 1611 of 5395 complete.
Row 1612 of 5395 complete.
Row 1613 of 5395 complete.
Row 1614 of 5395 complete.
Row 1615 of 5395 complete.
Row 1616 of 5395 complete.
Row 1617 of 5395 complete.
Row 1618 of 5395 complete.
Row 1619 of 5395 complete.
Row 1620 of 5395 complete.
Row 1621 of 5395 complete.
Row 1622 of 5395 complete.
Row 1623 of 5395 complete.
Row 1624 of 5395 complete.
Row 1625 of 5395 complete.
Row 1626 of 5395 complete.
Row 1627 of 5395 complete.
R

Row 1896 of 5395 complete.
Row 1897 of 5395 complete.
Row 1898 of 5395 complete.
Row 1899 of 5395 complete.
Row 1900 of 5395 complete.
Row 1901 of 5395 complete.
Row 1902 of 5395 complete.
Row 1903 of 5395 complete.
Row 1904 of 5395 complete.
Row 1905 of 5395 complete.
Row 1906 of 5395 complete.
Row 1907 of 5395 complete.
Row 1908 of 5395 complete.
Row 1909 of 5395 complete.
Row 1910 of 5395 complete.
Row 1911 of 5395 complete.
Row 1912 of 5395 complete.
Row 1913 of 5395 complete.
Row 1914 of 5395 complete.
Row 1915 of 5395 complete.
Row 1916 of 5395 complete.
Row 1917 of 5395 complete.
Row 1918 of 5395 complete.
Row 1919 of 5395 complete.
Row 1920 of 5395 complete.
Row 1921 of 5395 complete.
Row 1922 of 5395 complete.
Row 1923 of 5395 complete.
Row 1924 of 5395 complete.
Row 1925 of 5395 complete.
Row 1926 of 5395 complete.
Row 1927 of 5395 complete.
Row 1928 of 5395 complete.
Row 1929 of 5395 complete.
Row 1930 of 5395 complete.
Row 1931 of 5395 complete.
Row 1932 of 5395 complete.
R

Row 2200 of 5395 complete.
Row 2201 of 5395 complete.
Row 2202 of 5395 complete.
Row 2203 of 5395 complete.
Row 2204 of 5395 complete.
Row 2205 of 5395 complete.
Row 2206 of 5395 complete.
Row 2207 of 5395 complete.
Row 2208 of 5395 complete.
Row 2209 of 5395 complete.
Row 2210 of 5395 complete.
Row 2211 of 5395 complete.
Row 2212 of 5395 complete.
Row 2213 of 5395 complete.
Row 2214 of 5395 complete.
Row 2215 of 5395 complete.
Row 2216 of 5395 complete.
Row 2217 of 5395 complete.
Row 2218 of 5395 complete.
Row 2219 of 5395 complete.
Row 2220 of 5395 complete.
Row 2221 of 5395 complete.
Row 2222 of 5395 complete.
Row 2223 of 5395 complete.
Row 2224 of 5395 complete.
Row 2225 of 5395 complete.
Row 2226 of 5395 complete.
Row 2227 of 5395 complete.
Row 2228 of 5395 complete.
Row 2229 of 5395 complete.
Row 2230 of 5395 complete.
Row 2231 of 5395 complete.
Row 2232 of 5395 complete.
Row 2233 of 5395 complete.
Row 2234 of 5395 complete.
Row 2235 of 5395 complete.
Row 2236 of 5395 complete.
R

Row 2507 of 5395 complete.
Row 2508 of 5395 complete.
Row 2509 of 5395 complete.
Row 2510 of 5395 complete.
Row 2511 of 5395 complete.
Row 2512 of 5395 complete.
Row 2513 of 5395 complete.
Row 2514 of 5395 complete.
Row 2515 of 5395 complete.
Row 2516 of 5395 complete.
Row 2517 of 5395 complete.
Row 2518 of 5395 complete.
Row 2519 of 5395 complete.
Row 2520 of 5395 complete.
Row 2521 of 5395 complete.
Row 2522 of 5395 complete.
Row 2523 of 5395 complete.
Row 2524 of 5395 complete.
Row 2525 of 5395 complete.
Row 2526 of 5395 complete.
Row 2527 of 5395 complete.
Row 2528 of 5395 complete.
Row 2529 of 5395 complete.
Row 2530 of 5395 complete.
Row 2531 of 5395 complete.
Row 2532 of 5395 complete.
Row 2533 of 5395 complete.
Row 2534 of 5395 complete.
Row 2535 of 5395 complete.
Row 2536 of 5395 complete.
Row 2537 of 5395 complete.
Row 2538 of 5395 complete.
Row 2539 of 5395 complete.
Row 2540 of 5395 complete.
Row 2541 of 5395 complete.
Row 2542 of 5395 complete.
Row 2543 of 5395 complete.
R

Row 2811 of 5395 complete.
Row 2812 of 5395 complete.
Row 2813 of 5395 complete.
Row 2814 of 5395 complete.
Row 2815 of 5395 complete.
Row 2816 of 5395 complete.
Row 2817 of 5395 complete.
Row 2818 of 5395 complete.
Row 2819 of 5395 complete.
Row 2820 of 5395 complete.
Row 2821 of 5395 complete.
Row 2822 of 5395 complete.
Row 2823 of 5395 complete.
Row 2824 of 5395 complete.
Row 2825 of 5395 complete.
Row 2826 of 5395 complete.
Row 2827 of 5395 complete.
Row 2828 of 5395 complete.
Row 2829 of 5395 complete.
Row 2830 of 5395 complete.
Row 2831 of 5395 complete.
Row 2832 of 5395 complete.
Row 2833 of 5395 complete.
Row 2834 of 5395 complete.
Row 2835 of 5395 complete.
Row 2836 of 5395 complete.
Row 2837 of 5395 complete.
Row 2838 of 5395 complete.
Row 2839 of 5395 complete.
Row 2840 of 5395 complete.
Row 2841 of 5395 complete.
Row 2842 of 5395 complete.
Row 2843 of 5395 complete.
Row 2844 of 5395 complete.
Row 2845 of 5395 complete.
Row 2846 of 5395 complete.
Row 2847 of 5395 complete.
R

Row 3117 of 5395 complete.
Row 3118 of 5395 complete.
Row 3119 of 5395 complete.
Row 3120 of 5395 complete.
Row 3121 of 5395 complete.
Row 3122 of 5395 complete.
Row 3123 of 5395 complete.
Row 3124 of 5395 complete.
Row 3125 of 5395 complete.
Row 3126 of 5395 complete.
Row 3127 of 5395 complete.
Row 3128 of 5395 complete.
Row 3129 of 5395 complete.
Row 3130 of 5395 complete.
Row 3131 of 5395 complete.
Row 3132 of 5395 complete.
Row 3133 of 5395 complete.
Row 3134 of 5395 complete.
Row 3135 of 5395 complete.
Row 3136 of 5395 complete.
Row 3137 of 5395 complete.
Row 3138 of 5395 complete.
Row 3139 of 5395 complete.
Row 3140 of 5395 complete.
Row 3141 of 5395 complete.
Row 3142 of 5395 complete.
Row 3143 of 5395 complete.
Row 3144 of 5395 complete.
Row 3145 of 5395 complete.
Row 3146 of 5395 complete.
Row 3147 of 5395 complete.
Row 3148 of 5395 complete.
Row 3149 of 5395 complete.
Row 3150 of 5395 complete.
Row 3151 of 5395 complete.
Row 3152 of 5395 complete.
Row 3153 of 5395 complete.
R

Row 3423 of 5395 complete.
Row 3424 of 5395 complete.
Row 3425 of 5395 complete.
Row 3426 of 5395 complete.
Row 3427 of 5395 complete.
Row 3428 of 5395 complete.
Row 3429 of 5395 complete.
Row 3430 of 5395 complete.
Row 3431 of 5395 complete.
Row 3432 of 5395 complete.
Row 3433 of 5395 complete.
Row 3434 of 5395 complete.
Row 3435 of 5395 complete.
Row 3436 of 5395 complete.
Row 3437 of 5395 complete.
Row 3438 of 5395 complete.
Row 3439 of 5395 complete.
Row 3440 of 5395 complete.
Row 3441 of 5395 complete.
Row 3442 of 5395 complete.
Row 3443 of 5395 complete.
Row 3444 of 5395 complete.
Row 3445 of 5395 complete.
Row 3446 of 5395 complete.
Row 3447 of 5395 complete.
Row 3448 of 5395 complete.
Row 3449 of 5395 complete.
Row 3450 of 5395 complete.
Row 3451 of 5395 complete.
Row 3452 of 5395 complete.
Row 3453 of 5395 complete.
Row 3454 of 5395 complete.
Row 3455 of 5395 complete.
Row 3456 of 5395 complete.
Row 3457 of 5395 complete.
Row 3458 of 5395 complete.
Row 3459 of 5395 complete.
R

Row 3729 of 5395 complete.
Row 3730 of 5395 complete.
Row 3731 of 5395 complete.
Row 3732 of 5395 complete.
Row 3733 of 5395 complete.
Row 3734 of 5395 complete.
Row 3735 of 5395 complete.
Row 3736 of 5395 complete.
Row 3737 of 5395 complete.
Row 3738 of 5395 complete.
Row 3739 of 5395 complete.
Row 3740 of 5395 complete.
Row 3741 of 5395 complete.
Row 3742 of 5395 complete.
Row 3743 of 5395 complete.
Row 3744 of 5395 complete.
Row 3745 of 5395 complete.
Row 3746 of 5395 complete.
Row 3747 of 5395 complete.
Row 3748 of 5395 complete.
Row 3749 of 5395 complete.
Row 3750 of 5395 complete.
Row 3751 of 5395 complete.
Row 3752 of 5395 complete.
Row 3753 of 5395 complete.
Row 3754 of 5395 complete.
Row 3755 of 5395 complete.
Row 3756 of 5395 complete.
Row 3757 of 5395 complete.
Row 3758 of 5395 complete.
Row 3759 of 5395 complete.
Row 3760 of 5395 complete.
Row 3761 of 5395 complete.
Row 3762 of 5395 complete.
Row 3763 of 5395 complete.
Row 3764 of 5395 complete.
Row 3765 of 5395 complete.
R

Row 4037 of 5395 complete.
Row 4038 of 5395 complete.
Row 4039 of 5395 complete.
Row 4040 of 5395 complete.
Row 4041 of 5395 complete.
Row 4042 of 5395 complete.
Row 4043 of 5395 complete.
Row 4044 of 5395 complete.
Row 4045 of 5395 complete.
Row 4046 of 5395 complete.
Row 4047 of 5395 complete.
Row 4048 of 5395 complete.
Row 4049 of 5395 complete.
Row 4050 of 5395 complete.
Row 4051 of 5395 complete.
Row 4052 of 5395 complete.
Row 4053 of 5395 complete.
Row 4054 of 5395 complete.
Row 4055 of 5395 complete.
Row 4056 of 5395 complete.
Row 4057 of 5395 complete.
Row 4058 of 5395 complete.
Row 4059 of 5395 complete.
Row 4060 of 5395 complete.
Row 4061 of 5395 complete.
Row 4062 of 5395 complete.
Row 4063 of 5395 complete.
Row 4064 of 5395 complete.
Row 4065 of 5395 complete.
Row 4066 of 5395 complete.
Row 4067 of 5395 complete.
Row 4068 of 5395 complete.
Row 4069 of 5395 complete.
Row 4070 of 5395 complete.
Row 4071 of 5395 complete.
Row 4072 of 5395 complete.
Row 4073 of 5395 complete.
R

Row 4346 of 5395 complete.
Row 4347 of 5395 complete.
Row 4348 of 5395 complete.
Row 4349 of 5395 complete.
Row 4350 of 5395 complete.
Row 4351 of 5395 complete.
Row 4352 of 5395 complete.
Row 4353 of 5395 complete.
Row 4354 of 5395 complete.
Row 4355 of 5395 complete.
Row 4356 of 5395 complete.
Row 4357 of 5395 complete.
Row 4358 of 5395 complete.
Row 4359 of 5395 complete.
Row 4360 of 5395 complete.
Row 4361 of 5395 complete.
Row 4362 of 5395 complete.
Row 4363 of 5395 complete.
Row 4364 of 5395 complete.
Row 4365 of 5395 complete.
Row 4366 of 5395 complete.
Row 4367 of 5395 complete.
Row 4368 of 5395 complete.
Row 4369 of 5395 complete.
Row 4370 of 5395 complete.
Row 4371 of 5395 complete.
Row 4372 of 5395 complete.
Row 4373 of 5395 complete.
Row 4374 of 5395 complete.
Row 4375 of 5395 complete.
Row 4376 of 5395 complete.
Row 4377 of 5395 complete.
Row 4378 of 5395 complete.
Row 4379 of 5395 complete.
Row 4380 of 5395 complete.
Row 4381 of 5395 complete.
Row 4382 of 5395 complete.
R

Row 4651 of 5395 complete.
Row 4652 of 5395 complete.
Row 4653 of 5395 complete.
Row 4654 of 5395 complete.
Row 4655 of 5395 complete.
Row 4656 of 5395 complete.
Row 4657 of 5395 complete.
Row 4658 of 5395 complete.
Row 4659 of 5395 complete.
Row 4660 of 5395 complete.
Row 4661 of 5395 complete.
Row 4662 of 5395 complete.
Row 4663 of 5395 complete.
Row 4664 of 5395 complete.
Row 4665 of 5395 complete.
Row 4666 of 5395 complete.
Row 4667 of 5395 complete.
Row 4668 of 5395 complete.
Row 4669 of 5395 complete.
Row 4670 of 5395 complete.
Row 4671 of 5395 complete.
Row 4672 of 5395 complete.
Row 4673 of 5395 complete.
Row 4674 of 5395 complete.
Row 4675 of 5395 complete.
Row 4676 of 5395 complete.
Row 4677 of 5395 complete.
Row 4678 of 5395 complete.
Row 4679 of 5395 complete.
Row 4680 of 5395 complete.
Row 4681 of 5395 complete.
Row 4682 of 5395 complete.
Row 4683 of 5395 complete.
Row 4684 of 5395 complete.
Row 4685 of 5395 complete.
Row 4686 of 5395 complete.
Row 4687 of 5395 complete.
R

Row 4955 of 5395 complete.
Row 4956 of 5395 complete.
Row 4957 of 5395 complete.
Row 4958 of 5395 complete.
Row 4959 of 5395 complete.
Row 4960 of 5395 complete.
Row 4961 of 5395 complete.
Row 4962 of 5395 complete.
Row 4963 of 5395 complete.
Row 4964 of 5395 complete.
Row 4965 of 5395 complete.
Row 4966 of 5395 complete.
Row 4967 of 5395 complete.
Row 4968 of 5395 complete.
Row 4969 of 5395 complete.
Row 4970 of 5395 complete.
Row 4971 of 5395 complete.
Row 4972 of 5395 complete.
Row 4973 of 5395 complete.
Row 4974 of 5395 complete.
Row 4975 of 5395 complete.
Row 4976 of 5395 complete.
Row 4977 of 5395 complete.
Row 4978 of 5395 complete.
Row 4979 of 5395 complete.
Row 4980 of 5395 complete.
Row 4981 of 5395 complete.
Row 4982 of 5395 complete.
Row 4983 of 5395 complete.
Row 4984 of 5395 complete.
Row 4985 of 5395 complete.
Row 4986 of 5395 complete.
Row 4987 of 5395 complete.
Row 4988 of 5395 complete.
Row 4989 of 5395 complete.
Row 4990 of 5395 complete.
Row 4991 of 5395 complete.
R

Row 5263 of 5395 complete.
Row 5264 of 5395 complete.
Row 5265 of 5395 complete.
Row 5266 of 5395 complete.
Row 5267 of 5395 complete.
Row 5268 of 5395 complete.
Row 5269 of 5395 complete.
Row 5270 of 5395 complete.
Row 5271 of 5395 complete.
Row 5272 of 5395 complete.
Row 5273 of 5395 complete.
Row 5274 of 5395 complete.
Row 5275 of 5395 complete.
Row 5276 of 5395 complete.
Row 5277 of 5395 complete.
Row 5278 of 5395 complete.
Row 5279 of 5395 complete.
Row 5280 of 5395 complete.
Row 5281 of 5395 complete.
Row 5282 of 5395 complete.
Row 5283 of 5395 complete.
Row 5284 of 5395 complete.
Row 5285 of 5395 complete.
Row 5286 of 5395 complete.
Row 5287 of 5395 complete.
Row 5288 of 5395 complete.
Row 5289 of 5395 complete.
Row 5290 of 5395 complete.
Row 5291 of 5395 complete.
Row 5292 of 5395 complete.
Row 5293 of 5395 complete.
Row 5294 of 5395 complete.
Row 5295 of 5395 complete.
Row 5296 of 5395 complete.
Row 5297 of 5395 complete.
Row 5298 of 5395 complete.
Row 5299 of 5395 complete.
R

# Save the results of the ICES to NVS semantic model mapping to file.

In [15]:
# Split out those combinations that have already been mapped to P01 in the parameter set
outputP01 = param_combo[param_combo['P01_Code']!='-9']

# Retain those combinations that have not yet been mapped to P01
output = param_combo[param_combo['P01_Code']=='-9']

# Split out SPM combinations to file where more worked needed at ICES to allow accurate mapping.
outputSPM = output[output['MATRX']=='SPM']

# Split out combinations that exclude SPM measurements 
output = output[output['MATRX']!='SPM']

# Generate list of new substances to be added to S27
S27_cols = ['PARGROUP','PARAM','PRNAM','CAS','S27_label','S27']
output_S27 = output[S27_cols][output['S27']=='No S27 term. Needs adding to NVS'].drop_duplicates().reset_index(drop=True)

# Generate list of new biological entities to be added to S25
S25_cols = ['Note','AphiaID','Species','MATRX','S25','S25_label','TAXON','STAGE','GENDER','SIZE','SUBCOMPONENT','MORPHOLOGY','COLOUR','SUBGROUP']
output_S25a = output[S25_cols][output['S25'] == 'No S25 term. Needs adding to NVS'].drop_duplicates().reset_index(drop=True)
output_S25b = output[S25_cols][output['S25'] == 'No term for at least one S25 model list. Needs adding to NVS'].drop_duplicates().reset_index(drop=True)
    
# Generate list of paramters with full set of semantic model terms for P01 matching script
output_complete = output[output['S25']!='No S25 term. Needs adding to NVS']
output_complete = output_complete[output_complete['TAXON']!='To be resolved.']
output_complete = output_complete[output_complete['S25']!='No term for at least one S25 model list. Needs adding to NVS']
output_complete = output_complete[output_complete['S27']!='No S27 term. Needs adding to NVS']
output_complete = output_complete[output_complete['S27']!='not applicable']

output_nonchem = output[output['S27']=='not applicable'].reset_index(drop=True)

taxa_discrepancy = input_taxa_check[input_taxa_check['proceed']=='No'].reset_index(drop=True)

# Create summary information
summary = pd.DataFrame([["Processing started:" , (start.strftime('%Y-%m-%d %H:%M:%S'))],
["Processing finished:" , datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
["" , ""],
["Rows input:", len(inputs)],
["" , ""],
["Rows with P01 provided:" , len(outputP01)],
["Rows successfully mapped:" , len(output_complete)],
["Rows unable to be mapped:" , ""],
["Because:" , ""],
["    Rows for SPM:" , len(outputSPM)],
["    Rows for non-chemical codes:" , len(output_nonchem)],
["    New chemical substances for mapping:" , len(output_S27)],
["    New biological entities for creation:" , len(output_S25a)],
["    New biological entitity semantic components:" , len(output_S25b)],
["    Taxa discrepancies:" , len(taxa_discrepancy)]])


# Save outputs as worksheets within Excel file
with pd.ExcelWriter(fileout) as writer:
    summary.to_excel(writer, sheet_name='summary', header=False, index=False)
    inputs.to_excel(writer, sheet_name='input', index=False)
    outputP01.to_excel(writer, sheet_name='known_matched', index=False)
    output_complete.to_excel(writer, sheet_name='mapped', index=False)
    outputSPM.to_excel(writer, sheet_name='SPM_codes', index=False)
    output_nonchem.to_excel(writer, sheet_name='nonchemical_codes', index=False)
    output_S27.to_excel(writer, sheet_name='new_S27', index=False)
    output_S25a.to_excel(writer, sheet_name='new_S25', index=False)
    output_S25b.to_excel(writer, sheet_name='new_S25_component', index=False)
    taxa_discrepancy.to_excel(writer, sheet_name='taxa_discrepancies', index=False)

print("Process results save to file: %s" % fileout)
display(summary)

Process results save to file: C:\Users\rthomas\Documents\GitHub\ICES2NVS_semantic_map_ext\example\ICES2P01_test_dset_mapped.xlsx


Unnamed: 0,0,1
0,Processing started:,2021-06-12 10:37:44
1,Processing finished:,2021-06-12 10:41:03
2,,
3,Rows input:,5439
4,,
5,Rows with P01 provided:,546
6,Rows successfully mapped:,4135
7,Rows unable to be mapped:,
8,Because:,
9,Rows for SPM:,27
