# Import toolboxes

In [None]:
# Inject a CSS rule that stretches the notebook's `.container` element to
# 100% of the available width.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Import toolboxes
import os
import numpy as np
import pandas as pd
import re
from SPARQLWrapper import SPARQLWrapper, JSON
from unidecode import unidecode
import urllib
import json
from pandas.io.json import json_normalize
import datetime

# Show the full text of every cell when a DataFrame is displayed.
# `None` is the documented "no limit" value for display.max_colwidth; the
# legacy value -1 is deprecated and rejected by current pandas releases.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas only accepts an integer here; -1 meant "no limit".
    pd.set_option('display.max_colwidth', -1)

# Define functions used by the script

In [None]:
def xstr(s):
    """Return an ASCII-only string for an NVS result value.

    ``None`` maps to the empty string; any other value is passed through
    ``unidecode`` and returned as ``str``.
    """
    return '' if s is None else str(unidecode(s))

def sparql_nvs_json(s):
    """Send the SPARQL query text ``s`` to the NVS endpoint and return the converted JSON result."""
    endpoint = SPARQLWrapper("http://vocab.nerc.ac.uk/sparql/sparql")
    endpoint.setQuery(s)
    endpoint.setReturnFormat(JSON)
    return endpoint.query().convert()

def S27_map():
    """Return NVS S27 chemical substances that have a CAS number as a published mapping.

    Queries the NVS SPARQL endpoint for non-deprecated S27 members whose
    ``owl:sameAs`` target is a ChemIDplus registry-number URL.

    Returns:
        list: one ``[code, prefLabel, cas_number]`` list per result row, with
        the ``SDN:S27::`` prefix removed from the code and the ChemIDplus URL
        prefix removed from the CAS number.
    """
    s =  """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>

            select ?codval ?prefLabel ?casurl
            where {
            <http://vocab.nerc.ac.uk/collection/S27/current/> skos:member ?url .
            ?url skos:notation ?codval .
            ?url skos:prefLabel ?prefLabel .
            ?url owl:deprecated 'false' .
            ?url owl:sameAs ?casurl .
            FILTER(regex(str(?casurl), "https://chem.nlm.nih.gov/chemidplus/rn/", "i"))
            }"""
    # The query filters on the https://chem.nlm.nih.gov form, so that is the
    # prefix that has to be stripped; the legacy http://chem.sis.nlm.nih.gov
    # form (the only one the previous code removed) is accepted as well.
    cas_prefix = re.compile(
        r'https?://chem\.(?:sis\.)?nlm\.nih\.gov/chemidplus/rn/', re.IGNORECASE)

    rows = []
    for binding in sparql_nvs_json(s)['results']['bindings']:
        code = xstr(binding['codval']['value'].replace('SDN:S27::', ''))
        label = xstr(binding['prefLabel']['value'])
        cas = xstr(cas_prefix.sub('', binding['casurl']['value']))
        rows.append([code, label, cas])
    return rows

def taxon_map(spcs,aphia):
    """Get the NVS S25 TAXON label for a species name and AphiaID.

    Args:
        spcs: species name that must occur in the S25 prefLabel.
        aphia: AphiaID; the prefLabel must contain ``WoRMS <aphia>)``.

    Returns:
        list: ``[[taxon, aphia]]`` where ``taxon`` is the prefLabel with any
        `` [...]`` section removed, or ``'Not available'`` when the query
        returns no rows. When several distinct labels come back, the last
        distinct one is used.
    """
    # Escape backslashes and single quotes so a name containing an apostrophe
    # cannot end the SPARQL string literal early.
    safe_spcs = str(spcs).replace('\\', '\\\\').replace("'", "\\'")
    s =  """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>

            select ?prefLabel
            where {
                    <http://vocab.nerc.ac.uk/collection/S25/current/> skos:member ?url .
                    ?url skos:notation ?codval .
                    ?url skos:prefLabel ?prefLabel .
                    ?url owl:deprecated 'false' .
                    FILTER(CONTAINS(?prefLabel,'%s')).
                    FILTER(CONTAINS(?prefLabel,'WoRMS %s)')).
                }""" % (safe_spcs, aphia)
    bindings = sparql_nvs_json(s)['results']['bindings']

    taxon = 'Not available'
    seen = set()
    for binding in bindings:
        # Drop the bracketed part of the label, e.g. " [Subcomponent: ...]".
        label = re.sub(r'\s\[.*?\]', '', xstr(binding['prefLabel']['value']))
        if label not in seen:
            seen.add(label)
            taxon = label
    return [[taxon, aphia]]

def S25_lookup(spcs,aphia,label):
    """Get the NVS S25 code for a generated prefLabel, if it exists.

    Args:
        spcs: species name; the prefLabel must contain ``<spcs> (``.
        aphia: AphiaID; the prefLabel must contain ``WoRMS <aphia>``.
        label: text the prefLabel must end with.

    Returns:
        list: ``[[code, prefLabel]]``. When no S25 term matches, ``code`` is
        ``'No S25 term. Needs adding to NVS'`` and ``prefLabel`` is a proposed
        label with the ITIS number left as ``?????``. When several terms
        match, the last row returned by the endpoint is used.
    """
    def _sparql_escape(text):
        # Escape backslashes and single quotes for use inside '...' literals.
        return str(text).replace('\\', '\\\\').replace("'", "\\'")

    s =  """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>

            select ?codval ?prefLabel
            where {
            <http://vocab.nerc.ac.uk/collection/S25/current/> skos:member ?url .
            ?url skos:notation ?codval .
            ?url skos:prefLabel ?prefLabel .
            ?url owl:deprecated 'false' .
            FILTER(CONTAINS(?prefLabel,'%s (')).
            FILTER(CONTAINS(?prefLabel,'WoRMS %s')).
            FILTER(STRENDS(?prefLabel,'%s')).
            }""" % (_sparql_escape(spcs), aphia, _sparql_escape(label))
    bindings = sparql_nvs_json(s)['results']['bindings']

    if not bindings:
        code = 'No S25 term. Needs adding to NVS'
        pref_label = "%s (ITIS: ?????: WoRMS %s) %s" % (spcs, aphia, label)
    else:
        last = bindings[-1]
        code = xstr(last['codval']['value'].replace('SDN:S25::',''))
        pref_label = xstr(last['prefLabel']['value'])
    return [[code, pref_label]]

##  Set input files, local mapping files and results directory for output

In [None]:
# Location of input file
inputfile = os.path.join(os.getcwd(),os.path.normpath('ICES2P01_test_dset.csv'))

# Filepaths for mapping files
mapfile = os.path.join(os.getcwd(),'mappings','unmapped_substances.csv')
biotamap = os.path.join(os.getcwd(),'mappings','biota_synonym_mapping.csv')
p02_file = os.path.join(os.getcwd(),'mappings','ICES2P02_mapping.csv')

# Set output file directory
results = os.path.join(os.getcwd(),'results')
# Later cells write CSV reports into this directory; create it up front so
# those writes do not fail when the directory does not exist yet.
os.makedirs(results, exist_ok=True)

# Set list to capture summary information for reporting at the end of the run
summary = []

# Set time run started
start = datetime.datetime.now()

# Add start time and holding space for end time to summary information
summary.append(["Processing started:" , (start.strftime('%Y-%m-%d %H:%M:%S'))])
summary.append(["Processing finished:" , ""])
summary.append(["" , ""])

## Load ICES semantic model components for mapping to P01 semantic model from file into a Pandas DataFrame

### Example of expected input structure and headings

<img align="left" style="padding-right:10px;" src="img/input_example.png"><br>

In [None]:
# Read the input file and keep each row's original position as 'rowID'
inputs = pd.read_csv(inputfile).reset_index().rename(columns={'index': 'rowID'})

# Insert number of rows in the input file in to the summary information
n_input_rows = len(inputs)
print("Rows input: %s" % n_input_rows)
summary.append(["Rows input:", n_input_rows])

# Make a working copy of the parameter combinations for mapping
param_combo = inputs.copy(deep=True)
n_mapping_rows = len(param_combo)
print("Rows for mapping: %s" % n_mapping_rows)
summary.append(["Rows for mapping:", n_mapping_rows])

# In the working copy set NaNs to '-9' and add the columns needed for the
# P01 semantic model:
#   S06_label - Measurement Property
#   S07_label - Measurement Property Statistic
#   S02_label - Measurement - Matrix relationship
param_combo = param_combo.fillna('-9').assign(
    S06_label='',
    S07_label='not specified',
    S02_label='',
)
param_combo['PARAM'] = param_combo['PARAM'].str.upper()
param_combo['AphiaID'] = param_combo['AphiaID'].astype('int32')

# Remove leading or trailing spaces from the text columns
columns = [name for name in param_combo.columns.tolist()
           if name not in ('AphiaID', 'rowID')]
for column in columns:
    param_combo[column] = param_combo[column].str.strip()

### Display sample of sediment ICES parameter combinations

In [None]:
# First ten sediment (DTYPE 'CS') rows, restricted to the ICES input columns
sediment_cols = ['rowID','PRNAM','CAS','DTYPE','PARAM','MUNIT','MATRX','BASIS','METPT','METOA','Species','AphiaID','Note']
display(param_combo.loc[param_combo['DTYPE']=='CS', sediment_cols].head(10))

### Display sample of water ICES parameter combinations

In [None]:
# First ten water (DTYPE 'CW') rows, restricted to the ICES input columns
water_cols = ['rowID','PRNAM','CAS','DTYPE','PARAM','MUNIT','MATRX','BASIS','METPT','METOA','Species','AphiaID','Note']
display(param_combo.loc[param_combo['DTYPE']=='CW', water_cols].head(10))

### Display sample of biota ICES parameter combinations

In [None]:
# First ten biota (DTYPE 'CF') rows, restricted to the ICES input columns
biota_cols = ['rowID','PRNAM','CAS','DTYPE','PARAM','MUNIT','MATRX','BASIS','METPT','METOA','Species','AphiaID','Note']
display(param_combo.loc[param_combo['DTYPE']=='CF', biota_cols].head(10))

### Display sample of P01 parameter codes and semantic model components

## Load P01 terms and semantic model vocabularies from the NERC Vocabulary Server

### For more details of the P01 Parameter Usage Vocabulary and the underlying semantic model please see the IMDIS 2018 presentation: 
#### Slides: https://www.bodc.ac.uk/about/outputs/presentations_and_papers/documents/imdis2018gmon_alexk.pdf
#### Video: https://www.youtube.com/watch?v=ePFqUSsteQs

In [None]:
#%% Get the latest semantic model vocabulary contents from the NVS Sparql endpoint

# URL-encoded fragments of one SPARQL query; the vocabulary code and its
# '<code>_label' column name are spliced in between them.
a1 = "http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0A++++PREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A++++%0D%0A++++select+%3F"
a2 = "+%3F"
a3 = "%0D%0A++++where+%7B%0D%0A++++%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2F"
a4 = "%2Fcurrent%2F%3E+skos%3Amember+%3Furl+.%0D%0A++++%3Furl+skos%3AprefLabel+%3F"
a5 = "+.%0D%0A++++%3Furl+skos%3Anotation+%3Fc+.%0D%0A++++%3Furl+owl%3Adeprecated+%27false%27+.%0D%0A++++BIND%28replace%28str%28%3Fc%29%2C%27SDN%3A"
a6 = "%3A%3A%27%2C%27%27%2C%27i%27%29+AS+%3F"
a7 = "%29%0D%0A++++%7D&output=csv&stylesheet="


def _vocab_csv_url(code):
    """Assemble the CSV download URL listing codes and labels of vocabulary `code`."""
    label = code + '_label'
    return "".join([a1, code, a2, label, a3, code, a4, label, a5, code, a6, code, a7])


# One request per vocabulary, issued in the order listed
S06, S07, S02, S26, S03, S04, S05, P01 = (
    pd.read_csv(_vocab_csv_url(code))
    for code in ('S06', 'S07', 'S02', 'S26', 'S03', 'S04', 'S05', 'P01')
)

# Download semantic component mapping

def _p01_mapping_url(vocab, predicate):
    """Build the NVS SPARQL CSV-download URL pairing `vocab` codes with P01 codes.

    Args:
        vocab: NVS collection code, e.g. 'S06'. Also used as the name of the
            result column holding that collection's codes.
        predicate: SKOS relation from the `vocab` concept to the P01 concept,
            'narrower' or 'related'.

    Returns:
        str: request URL whose CSV response has the columns `vocab` and 'P01',
        restricted to non-deprecated concepts on both sides.
    """
    from urllib.parse import urlencode

    # CRLF line ends keep the encoded query the same as the hand-encoded URLs
    # this helper replaces.
    query = "\r\n".join([
        "PREFIX skos: <http://www.w3.org/2004/02/skos/core#>",
        "PREFIX owl: <http://www.w3.org/2002/07/owl#>",
        "",
        "select distinct ?%s ?P01" % vocab,
        "where {",
        "<http://vocab.nerc.ac.uk/collection/%s/current/> skos:member ?urla ." % vocab,
        "?urla owl:deprecated 'false' .",
        "?urla skos:notation ?n2 .",
        "?urla skos:%s ?urlb ." % predicate,
        "<http://vocab.nerc.ac.uk/collection/P01/current/> skos:member ?urlb .",
        "?urlb owl:deprecated 'false' .",
        "?urlb skos:notation ?n1 .",
        'BIND(replace(?n1, "SDN:P01::", "", "i") AS ?P01) .',
        'BIND(replace(?n2, "SDN:%s::", "", "i") AS ?%s) .' % (vocab, vocab),
        "}",
    ])
    return "http://vocab.nerc.ac.uk/sparql/sparql?" + urlencode(
        [("query", query), ("output", "csv"), ("stylesheet", "")])


urlS06 = _p01_mapping_url('S06', 'narrower')
S06_P01 = pd.read_csv(urlS06)
print("P01-S06 mapping downloaded.")

urlS07 = _p01_mapping_url('S07', 'narrower')
S07_P01 = pd.read_csv(urlS07)
print("P01-S07 mapping downloaded.")

urlS27 = _p01_mapping_url('S27', 'narrower')
S27_P01 = pd.read_csv(urlS27)
print("P01-S27 mapping downloaded.")

urlS02 = _p01_mapping_url('S02', 'related')
S02_P01 = pd.read_csv(urlS02)
print("P01-S02 mapping downloaded.")

urlS26 = _p01_mapping_url('S26', 'narrower')
S26_P01 = pd.read_csv(urlS26)
print("P01-S26 mapping downloaded.")

urlS25 = _p01_mapping_url('S25', 'narrower')
S25_P01 = pd.read_csv(urlS25)
print("P01-S25 mapping downloaded.")

urlS03 = _p01_mapping_url('S03', 'related')
S03_P01 = pd.read_csv(urlS03)
print("P01-S03 mapping downloaded.")

urlS04 = _p01_mapping_url('S04', 'related')
S04_P01 = pd.read_csv(urlS04)
print("P01-S04 mapping downloaded.")

urlS05 = _p01_mapping_url('S05', 'related')
S05_P01 = pd.read_csv(urlS05)
print("P01-S05 mapping downloaded.")

# Build P01 semantic model dataframe: left-join each component mapping onto
# the P01 table, one after another, on the shared 'P01' code column
for component_map in (S06_P01, S07_P01, S27_P01, S02_P01, S26_P01,
                      S25_P01, S03_P01, S04_P01, S05_P01):
    P01 = P01.merge(component_map, how='left', on='P01')

# Codes substituted where a P01 term has no mapping for that component
default_codes = {
    'S25': 'BE007736',
    'S07': 'S0700006',
    'S03': 'S0316',
    'S04': 'S0421',
    'S05': 'S050003',
}
P01 = P01.fillna(value=default_codes)

print("P01 semantic model dataframe constructed")

In [None]:
# First ten P01 terms whose label contains 'copper'
mentions_copper = P01['P01_label'].str.contains('copper')
display(P01.loc[mentions_copper].head(10))

# Mapping of chemical PARAMs and NVS S27 vocabulary entries
### Determine where direct mappings already published on the NVS

In [None]:
# First get S27 terms that have a mapping to ICES PARAM vocabulary published from the NVS
from urllib.parse import urlencode

q = """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
                    
    select ?PARAM ?S27 ?S27_label 
    where {
           <http://vocab.nerc.ac.uk/collection/S27/current/> skos:member ?url .
           ?url skos:notation ?a .
           ?url skos:prefLabel ?S27_label .
           ?url owl:deprecated 'false' .
           ?url skos:related ?c .
           FILTER(regex(str(?c), "http://vocab.ices.dk/services/rdf/collection/PARAM/", "i")) .
           BIND(substr(?a,10,8) as ?S27) .
           BIND(replace(str(?c), "http://vocab.ices.dk/services/rdf/collection/PARAM/", "", "i") AS ?PARAM) .
          }"""
# Encode the query above into the request URL, so the readable query and the
# request that is actually sent cannot drift apart. CRLF line ends match the
# encoding used by the other NVS requests in this notebook.
url = "http://vocab.nerc.ac.uk/sparql/sparql?" + urlencode(
    [("query", q.replace("\n", "\r\n")), ("output", "CSV"), ("stylesheet", "")])

# More efficient to ingest SPARQL response as a CSV directly into a Pandas DataFrame
mapped_chems = pd.read_csv(url)
mapped_chems['SOURCE'] = 'NVS'

print("Number of ICES PARAM terms directly mapped to S27 chemical substance terms from NVS: %s" % (len(mapped_chems)))
summary.append(["Number of ICES PARAM terms directly mapped to S27 chemical substance terms from NVS:", (len(mapped_chems))])

display(mapped_chems)

### Get local ICES PARAM to NVS S27 substance mapping from mapping file location provided earlier

In [None]:
# Read the local PARAM-to-S27 mapping file and record the file path as the
# source of every row
local_map = pd.read_csv(mapfile).assign(SOURCE=mapfile)

n_local_mappings = len(local_map)
print("Number of local mappings for chemical substances from file: %s" % (n_local_mappings))
summary.append(["Number of local mappings for chemical substances from file:" , (n_local_mappings)])

display(local_map)

### Identify any mappings in the local file also published from the NVS

In [None]:
# PARAMs that appear in both the NVS mapping and the local mapping file
duplicate_map = mapped_chems.merge(local_map, how='inner', on='PARAM')

n_duplicate_mappings = len(duplicate_map)
print("Number of chemical substance mappings in both local file and NVS: %s" % (n_duplicate_mappings))
summary.append(["Number of chemical substance mappings in both local file and NVS:" , (n_duplicate_mappings)])

display(duplicate_map)

### Display local mappings where the CAS RN differs between NVS and ICES, or is absent, for review with the BODC vocab team (vocab.services@bodc.ac.uk) before upload to NVS

In [None]:
# Local mappings whose ICES and NVS CAS registry numbers are not equal
casrn_differs = local_map['ICES_CASRN'] != local_map['NVS_CASRN']
display(local_map.loc[casrn_differs])

### For unmapped chemical PARAMs with CAS numbers, check if the chemical substance exists within S27 and then map via CAS number

In [None]:
# SPARQL query for all NVS substances with CAS numbers from the SPARQL endpoint
from urllib.parse import urlencode

q =  """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
            
    select ?nvs_codval ?nvs_prefLabel ?nvs_casrn
    where {
           <http://vocab.nerc.ac.uk/collection/S27/current/> skos:member ?url .
           ?url skos:notation ?a .
           ?url skos:prefLabel ?nvs_prefLabel .
           ?url owl:deprecated 'false' .
           ?url owl:sameAs ?c .
           FILTER(regex(str(?c), "https://chem.nlm.nih.gov/chemidplus/rn/", "i")) .
           BIND(replace(str(?a),'SDN:S27::','','i') AS ?nvs_codval) .
           BIND(replace(str(?c),'https://chem.nlm.nih.gov/chemidplus/rn/','','i') AS ?nvs_casrn) .
          }"""

# Encode the query above into the request URL instead of keeping a separate
# hand-encoded copy, so the readable query is the one that is executed.
url = "http://vocab.nerc.ac.uk/sparql/sparql?" + urlencode(
    [("query", q.replace("\n", "\r\n")), ("output", "csv"), ("stylesheet", "")])

# More efficient to ingest SPARQL response as a CSV directly into a Pandas DataFrame
nvs_cas = pd.read_csv(url)

print("Number of chemical substances in S27 with a CAS number: %s" % len(nvs_cas))

### Combine NVS mappings and local file then apply mapping to input file

In [None]:
# CAS numbers that identify exactly one S27 substance; any CAS number shared
# by several substances is dropped entirely (keep=False).
cas_map = nvs_cas.drop_duplicates(subset=['nvs_casrn'], keep=False).rename(
    columns={"nvs_codval": "S27", "nvs_prefLabel": "S27_label", "nvs_casrn": "CAS"})
cas_map['SOURCE']='CASRN'

# Missing CAS values were filled with the string '-9' when the input was
# loaded, so the comparison must be against the string, not the integer -9.
has_cas = param_combo['CAS'] != '-9'
cas_matches = pd.merge(param_combo.loc[has_cas, ['PARAM','CAS']],
                       cas_map, how='inner', on='CAS').drop_duplicates()

# Concatenate all mappings and keep the first occurrence of each PARAM.
# NOTE(review): with this order and keep='first', NVS mappings win over local
# ones and both win over CAS matches. The original comment said local mappings
# take precedence - confirm which is intended.
full_chem_map = pd.concat([mapped_chems,
                           local_map[['PARAM','S27','S27_label','SOURCE']],
                           cas_matches],
                          sort=True
                         ).drop_duplicates(subset='PARAM', keep='first').reset_index(drop=True)

# Add S27 semantic model mapping to the main table based on the combinations provided
param_combo = pd.merge(param_combo, full_chem_map.drop(columns=['CAS']), how='left', on=['PARAM'])

print("Rows mapped from NVS and local mapping of chemical substance: %s" % len(param_combo[param_combo['S27'].notnull()]))

### Display mapping progress

In [None]:
# Input columns plus the semantic-model labels assigned so far
s27_progress_cols = ['rowID','PRNAM','CAS','DTYPE','PARAM','MUNIT','MATRX','BASIS','METPT','METOA','Species','AphiaID','Note','S06_label','S07_label','S27_label']
display(param_combo[s27_progress_cols])

### For the contents of the input file determine if any new local PARAM to S27 mappings are required for chemical PARAMs

In [None]:
# Rows with a CAS number but no S27 code, de-duplicated and ordered by CAS
unmapped_with_cas = (param_combo['S27'].isnull()) & (param_combo['CAS']!='-9')
display(
    param_combo.loc[unmapped_with_cas, ['PRNAM','CAS','PARAM','S27','S27_label','SOURCE']]
    .drop_duplicates()
    .sort_values('CAS')
)

### Check no chemical codes present among those rows without a CAS entry

In [None]:
# Rows with neither a CAS number nor an S27 code, de-duplicated
unmapped_without_cas = (param_combo['S27'].isnull()) & (param_combo['CAS']=='-9')
display(
    param_combo.loc[unmapped_without_cas, ['PRNAM','CAS','PARAM','S27','S27_label','SOURCE']]
    .drop_duplicates()
    .sort_values('CAS')
)

#### Where mappings don't exist for chemical PARAMs or can't be made via CAS number as listed above:
#### Either:
#### - Add mapping to local file and re-run previous steps
#### - Continue process and these rows will not be able to be mapped to a P01 term

In [None]:
# Rows without an S27 code: those with a CAS number need a manual S27
# mapping; those without a CAS number do not need an S27 mapping.
s27_missing = param_combo['S27'].isnull()
cas_present = param_combo['CAS'] != '-9'
param_combo.loc[s27_missing & cas_present, 'S27_label'] = 'manual mapping required'
param_combo.loc[s27_missing & ~cas_present, 'S27_label'] = 'not applicable'

# Generate list of new substances to be added to S27
S27_cols = ['PARGROUP','PARAM','PRNAM','CAS','S27_label','S27']
needs_manual_mapping = param_combo['S27_label'] == 'manual mapping required'
S27new = param_combo.loc[needs_manual_mapping, S27_cols].drop_duplicates().reset_index(drop=True)

S27new_file = os.path.join(results,'new_S27.csv')
S27new.to_csv(S27new_file, index=False)
print("Subset of new S27 terms for creation or manual mapping saved to: %s" % S27new_file)

# Generate list of PARAMs to be mapped to another element of the semantic model
altmap_cols = ['PARGROUP','PARAM','PRNAM','CAS']
s27_not_applicable = param_combo['S27_label'] == 'not applicable'
alt_mapping = param_combo.loc[s27_not_applicable, altmap_cols].drop_duplicates().reset_index(drop=True)

altmap_file = os.path.join(results,'alt_mappings.csv')
alt_mapping.to_csv(altmap_file, index=False)
print("Subset of PARAMs for creation or manual mapping to another part of the semantic model saved to: %s" % altmap_file)

print("Total combinations = %s" % (len(param_combo)))

# Distinct PARAM/PRNAM pairs among the rows where S27 is not applicable
PARAMs2map = param_combo.loc[s27_not_applicable, ['PARAM','PRNAM']].drop_duplicates()


## Mapping of ICES DTYPE, MATRX and METPT combinations to P01 semantic component S26

<img align="left" style="padding-right:10px;" src="img/matrix_mapping.png"><br>

In [None]:
# Label given to combinations that no rule below recognises
_S26_UNMAPPED = 'Check DTYPE/MATRX/METPT combination. Not mapped.'

# METPT code -> S26 label for water samples (DTYPE 'CW', MATRX 'WT')
_WATER_METPT_LABELS = {}
for _codes, _label in (
    (('NF', 'NONE', 'NA', 'CP'),
     'water body [dissolved plus reactive particulate phase]'),
    (('GFF', 'GF/F', 'FF-GF-0.7'),
     'water body [dissolved plus reactive particulate <GF/F phase]'),
    (('GFC', 'GF/C', 'FF-GF-1.2', 'FF-PP-1.2'),
     'water body [dissolved plus reactive particulate <GF/C phase]'),
    (('FM-PC-0.4', 'FM-PC-0.45', 'FM-PES-0.45', 'FM-CN-0.45', 'FM-CA-0.45',
      'PCF40', 'PCF45', 'PCF'),
     'water body [dissolved plus reactive particulate <0.4/0.45um phase]'),
    # One-element tuples: the previous `in('F')` / `in('FM-CA-0.2')` tests
    # were substring checks against a plain string, so '' or 'FM' matched too.
    (('F',),
     'water body [dissolved plus reactive particulate <unknown phase]'),
    (('FM-CA-0.2',),
     'water body [dissolved plus reactive particulate <0.2um phase]'),
):
    for _code in _codes:
        _WATER_METPT_LABELS[_code] = _label


def _s26_label(dtype, matrx, metpt):
    """Return the S26 label for one DTYPE/MATRX/METPT combination, or None if no rule applies."""
    if dtype == 'CF':
        return 'biota'
    if dtype == 'CS':
        if matrx == 'SEDTOT':
            return 'sediment'
        # Everything after the first three characters is used as the size limit
        size_fraction = matrx[3:]
        if size_fraction != 'TOT':
            return 'sediment <' + size_fraction + 'um'
        return None
    if dtype == 'CW' and matrx == 'WT':
        if metpt == '-9':
            return 'water body [dissolved plus reactive particulate <unknown phase]'
        label = None
        # METPT may hold several '~'-separated codes; the last recognised one wins
        for code in metpt.split('~'):
            label = _WATER_METPT_LABELS.get(code, label)
        return label
    return None


matrix_check = param_combo[['DTYPE','MATRX','METPT']].drop_duplicates().copy(deep=True).reset_index(drop=True)

print("Number of MATRX for P01 mapping: %s" % len(matrix_check))
summary.append(["Number of MATRX for P01 mapping:" , len(matrix_check)])

# Assign the whole column in one step. Writing to the row objects yielded by
# iterrows() is not guaranteed to change the DataFrame (pandas may hand out
# copies), which would silently leave every combination unmapped.
matrix_check['S26_label'] = [
    _s26_label(dtype, matrx, metpt) or _S26_UNMAPPED
    for dtype, matrx, metpt in zip(matrix_check['DTYPE'],
                                   matrix_check['MATRX'],
                                   matrix_check['METPT'])
]

# Subset potential S26 new entries
S26new = matrix_check[matrix_check['S26_label']==_S26_UNMAPPED]

print("Number of potential new S26 terms: %s" % len(S26new))
summary.append(["Number of potential new S26 terms:" , len(S26new)])
display(S26new)

# Save the combinations that could not be mapped to an S26 term
S26new_file = os.path.join(results,'new_S26.csv')
S26new.to_csv(S26new_file, index=False)
print("Subset of new S26 terms for creation saved to: %s" % S26new_file)

# Add S26 semantic model mapping to the main table based on the combinations provided
param_combo = pd.merge(param_combo, matrix_check, how='left', on=['DTYPE','MATRX','METPT'])

print("Total combinations = %s" % (len(param_combo)))

### Display mapping progress

In [None]:
# Input columns plus the semantic-model labels assigned so far, now including S26
s26_progress_cols = ['PRNAM','CAS','DTYPE','PARAM','MUNIT','MATRX','BASIS','METPT','METOA','Species','AphiaID','Note','S06_label','S07_label','S27_label','S26_label']
display(param_combo[s26_progress_cols])

# Taxon, WoRMS AphiaID, ITIS TSN combination check

In [None]:
# Download every distinct (AphiaID, TAXON) pair from S25; the SPARQL query
# derives both columns from prefLabels that mention "WoRMS"
url = """http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0A++++PREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A++++++++++++++++++++%0D%0A++++select+distinct+%3FAphiaID+%3FTAXON%0D%0A++++where+%7B%0D%0A+++++++++++%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FS25%2Fcurrent%2F%3E+skos%3Amember+%3Furl+.%0D%0A+++++++++++%3Furl+skos%3AprefLabel+%3FprefLabel+.%0D%0A+++++++++++%3Furl+owl%3Adeprecated+%27false%27+.%0D%0A+++++++++++FILTER%28regex%28str%28%3FprefLabel%29%2C+%22WoRMS%22%2C+%22i%22%29%29+.%0D%0A+++++++++++BIND%28replace%28str%28%3FprefLabel%29%2C+%22%5C%5C+%5C%5C%5B.*%3F%5C%5C%5D%22%2C%22%22%2C+%22i%22%29+AS+%3FTAXON%29+.%0D%0A+++++++++++BIND%28replace%28replace%28replace%28str%28%3FTAXON%29%2C+%22%5C%5C%29%22%2C%22%22%2C+%22i%22%29%2C+%22.*%28%3F%3DWoRMS+%29%22%2C+%22%22%2C+%22i%22%29%2C+%22WoRMS+%22%2C+%22%22%2C+%22i%22%29+AS+%3FAphiaID%29+.%0D%0A++++++++++%7D%0D%0A++++order+by+%3FAphiaID%0D%0A&output=CSV&stylesheet=CSV"""

S25taxon = pd.read_csv(url)

# Rows whose AphiaID occurs more than once in S25, with two character
# sequences in the text replaced by a plain space / apostrophe
shares_aphia = S25taxon.duplicated(['AphiaID'], keep=False)
S25taxon_duplicates = (
    S25taxon[shares_aphia]
    .copy(deep=True)
    .replace(u'\xc2\xa0',u' ', regex=True)
    .replace(u'\u2019',u"'", regex=True)
)

# Stack the full table on the duplicate rows and drop every row that then
# occurs more than once
S25taxon_clean = (
    pd.concat([S25taxon, S25taxon_duplicates])
    .drop_duplicates(keep=False)
    .copy(deep=True)
    .reset_index(drop=True)
)

# Unique combinations of Species and AphiaID from the input file, skipping
# rows with no species ('-9')
has_species = param_combo['Species'] != '-9'
input_taxa_check = (
    param_combo.loc[has_species, ['Species','AphiaID']]
    .drop_duplicates()
    .reset_index(drop=True)
    .astype({"AphiaID": int})
)

n_species = len(input_taxa_check)
print("Number of Species for P01 mapping: %s" % n_species)
summary.append(["Number of Species for P01 mapping:" , n_species])
#%%
# Function to call WoRMS web service
def worms_check(url):
    """Call the WoRMS AphiaIDByName web service and interpret the response.

    Parameters
    ----------
    url : str
        Fully formed request URL.

    Returns
    -------
    int or str
        For HTTP 200, the response body converted to an integer AphiaID (or the
        decoded text if it is not an integer). For HTTP 204 and 206, the
        messages 'No AphiaID found.' and 'Multiple AphiaID found.'. For any
        other status, an 'Unexpected response code' message, so the caller
        always receives a value to record against the taxon.
    """
    # `import urllib` alone does not guarantee that the request submodule is loaded.
    import urllib.request

    request = urllib.request.Request(url)
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        status = response.code
        if status == 204:
            return 'No AphiaID found.'
        if status == 206:
            return 'Multiple AphiaID found.'
        if status == 200:
            # Decode the raw bytes: str() of a bytes object would give "b'123'",
            # which corrupts the request URLs built from this value later on.
            body = response.read().decode('utf-8').strip()
            try:
                return int(body)
            except ValueError:
                return body
        return 'Unexpected response code: %s' % status

# If AphiaID is absent (-9) then look it up by name using the WoRMS web service
import urllib.parse  # quote() percent-encodes the whole name, not only spaces

for index, row in input_taxa_check.iterrows():
    if row['AphiaID'] != -9:
        continue
    if '&' in row['Species']:
        # A name containing '&' is treated as several taxa; no single lookup is made
        input_taxa_check.loc[index, 'AphiaID'] = 'Combination of taxa'
    else:
        url = 'http://marinespecies.org/rest/AphiaIDByName/%s?marine_only=true' % urllib.parse.quote(row['Species'])
        input_taxa_check.loc[index, 'AphiaID'] = worms_check(url)

# Attach the checked AphiaID to every input row with a matching Species
inputs_aphia = pd.merge(inputs, input_taxa_check, on='Species')


#%% 
# Get WoRMS scientific names from AphiaID provided using WoRMS web service
def worms_check(url):
    """Fetch JSON records from the WoRMS web service.

    Parameters
    ----------
    url : str
        Fully formed request URL.

    Returns
    -------
    list or other JSON value
        The decoded JSON body. An empty list is returned when the service
        answers HTTP 204 or sends an empty body, so the result can always be
        passed to ``pd.DataFrame``.
    """
    # `import urllib` alone does not guarantee that the request submodule is loaded.
    import urllib.request

    request = urllib.request.Request(url)
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        if response.code == 204:
            return []
        payload = response.read()
    if not payload.strip():
        return []
    return json.loads(payload.decode('utf-8'))
       
worms = pd.DataFrame()

# Request the WoRMS records in batches of 50 AphiaIDs per call.
aphia_list = input_taxa_check['AphiaID'].tolist()
y = len(aphia_list)
worms_batch_size = 50
worms_batches = []
# Stepping through the list in slices handles every length uniformly: fewer
# than 50 entries, exactly 50, and exact multiples of 50 (no empty trailing
# request is sent).
for batch_start in range(0, y, worms_batch_size):
    ids = '&'.join('aphiaids%5B%5D=' + str(aphia_id)
                   for aphia_id in aphia_list[batch_start:batch_start + worms_batch_size])
    url = 'http://www.marinespecies.org/rest/AphiaRecordsByAphiaIDs?%s' % ids
    worms_batches.append(pd.DataFrame(worms_check(url)))

if worms_batches:
    worms = pd.concat(worms_batches, ignore_index=True)

# Attach the WoRMS scientific name for each AphiaID and expose it as
# 'name_from_AphiaID'
input_taxa_check = (
    input_taxa_check
    .merge(worms[['AphiaID','scientificname']], how='left', on='AphiaID')
    .rename(index=str, columns={'scientificname': 'name_from_AphiaID'})
)

# Flag each taxon: 'Yes' when the supplied Species equals the WoRMS name,
# otherwise 'No' (a discrepancy to be resolved)
a = input_taxa_check['Species'] == input_taxa_check['name_from_AphiaID']
input_taxa_check['proceed'] = np.where(a, 'Yes', 'No')

# Keep only the taxa whose names do not match
taxa_discrepancy = input_taxa_check.loc[input_taxa_check['proceed'] == 'No'].reset_index(drop=True)

# Report the count on screen and record it in the running summary
print("Number of Species with name discrepancy between taxon-AphiaID combination in file and WoRMS: %s" % len(taxa_discrepancy))
summary.append(["Number of Species with name discrepancy between taxon-AphiaID combination in file and WoRMS:" , len(taxa_discrepancy)])
display(taxa_discrepancy)

# Write the discrepancies to the results directory
taxadis_file = os.path.join(results,'taxa_discrepancy.csv')
taxa_discrepancy.to_csv(taxadis_file, index=False)
print("Taxa discrepancies saved to: %s" % taxadis_file)

# Look up the S25 TAXON for each AphiaID, using only the AphiaIDs that occur
# once in S25; AphiaIDs without a match are flagged as needing a new TAXON
taxa_map = (
    input_taxa_check[['AphiaID','name_from_AphiaID']]
    .merge(S25taxon_clean, how='left', on='AphiaID')
    .fillna(value={'TAXON': 'New TAXON required.'})
    .drop_duplicates()
)

# Carry the TAXON mapping over to the main table via the AphiaID provided
param_combo = param_combo.merge(taxa_map, how='left', on='AphiaID')

# Rows that received no taxon mapping get 'not specified' as TAXON and '-9' as
# the name derived from the AphiaID
param_combo = param_combo.fillna(value={'TAXON': 'not specified', 'name_from_AphiaID': '-9'})

print("Total combinations = %s" % (len(param_combo)))

### Display mapping progress

In [None]:
# Show the mapping columns populated so far for the rows with DTYPE 'CF'
display(param_combo.loc[
    param_combo['DTYPE'] == 'CF',
    ['PRNAM','CAS','DTYPE','PARAM','MUNIT','MATRX','BASIS','METPT','METOA',
     'Species','AphiaID','Note','S06_label','S07_label','S27_label','S26_label','TAXON'],
])

#### New entries required in S25 semantic model TAXON vocabulary

In [None]:
# List the AphiaIDs for which no S25 TAXON was found
display(taxa_map.loc[taxa_map['TAXON'] == 'New TAXON required.'])

## Map remaining terms of the P01 semantic model

In [None]:
# Start with an empty (all-NaN) S02_label column
param_combo['S02_label'] = pd.Series(dtype='object')

# Conditions reused by the rules below. They depend only on input columns,
# which the rules never modify.
_munit = param_combo['MUNIT']
_basis = param_combo['BASIS']
_unit_suffix = _munit.str[-1]
_has_bq = _munit.str.contains('Bq')
_is_turbidity = (_munit == 'ntu') & (param_combo['PARAM'] == 'TURB')

# The rules are applied in the order listed; where several match the same row
# the last one wins.
_S06_rules = [
    (_basis == 'L', 'Lipid-normalised concentration'),
    (_munit == '%', 'Proportion'),
    (_has_bq, 'Activity'),
    (_is_turbidity, 'Turbidity'),
    ((_has_bq == False) & (_unit_suffix.isin(['l','g'])), 'Concentration'),
]
for _mask, _label in _S06_rules:
    param_combo.loc[_mask, 'S06_label'] = _label

_S02_rules = [
    (_unit_suffix == 'l', 'per unit volume of the'),
    (_unit_suffix == 'g', 'per unit mass of the'),
    (_basis == 'D', 'per unit dry weight of'),
    (_basis == 'W', 'per unit wet weight of'),
    (_basis == 'L', 'in'),
    (_is_turbidity, 'of the'),
]
for _mask, _label in _S02_rules:
    param_combo.loc[_mask, 'S02_label'] = _label

# 'CF' rows with a CAS number but no unit, and whose S06_label is still the
# empty string, default to 'Concentration'. This is evaluated after the rules
# above because it reads the S06_label they may have set.
_unitless_chemical = (
    (_munit == '-9')
    & (param_combo['CAS'] != '-9')
    & (param_combo['DTYPE'] == 'CF')
    & (param_combo['S06_label'] == '')
)
param_combo.loc[_unitless_chemical, 'S06_label'] = 'Concentration'

# These three components are set to 'not specified' on every row
for _column in ('S03_label', 'S04_label', 'S05_label'):
    param_combo[_column] = 'not specified'

print("Number of rows in DataFrame: %s" % len(param_combo))

display(param_combo.loc[
    param_combo['DTYPE'] == 'CF',
    ['PRNAM','CAS','DTYPE','PARAM','MUNIT','MATRX','BASIS','METPT','METOA',
     'Species','AphiaID','Note','S06_label','S07_label','S27_label','S02_label','S26_label','TAXON'],
])



## Map the matrix of the biota to the appropriate NVS S25 SUBCOMPONENT and/or STAGE. Note that some constraints based on taxa type are applied in the code.

In [None]:
# Add the biological-entity descriptor columns, initially empty (all NaN)
for _column in ('SIZE', 'SEX', 'STAGE', 'SUBCOMPONENT', 'SUBGROUP', 'MORPHOLOGY', 'COLOUR'):
    param_combo[_column] = pd.Series(dtype='object')

_matrx = param_combo['MATRX']
_species = param_combo['Species']

# Direct 1-2-1 mappings from MATRX code to SUBCOMPONENT
_direct_subcomponent = {
    'WO': 'not specified',
    'TM': 'muscle tissue',
    'SI': 'not specified',
    'SH': 'shell',
    'MU&EP': 'muscle tissues and skin',
    'LI': 'liver',
    'KI': 'kidney',
    'GO': 'gonads',
    'GI': 'gill',
    'FE': 'feathers',
    'FA': 'body fat',
    'EP': 'skin',
    'BS': 'blood serum',
    'BR': 'brain',
    'BL': 'blood',
    'BC': 'blood cells',
}
for _code, _subcomponent in _direct_subcomponent.items():
    param_combo.loc[_matrx == _code, 'SUBCOMPONENT'] = _subcomponent

# These matrices set the STAGE; 'EH' additionally has its own SUBCOMPONENT
param_combo.loc[_matrx.isin(['EG','EH','RO']), 'STAGE'] = 'eggs'
param_combo.loc[_matrx == 'EH', 'SUBCOMPONENT'] = 'egg yolk and albumen homogenate'

# Mappings that depend on the taxon (or the Note) as well as the matrix.
# Combinations without an agreed mapping receive a placeholder text.
_pending = 'Checking species-matrx combo validity with ICES.'
_is_loligo = _species == 'Loligo vulgaris'
_is_mytilus = _species == 'Mytilus edulis'
_is_fish = param_combo['Note'] == 'Fish'
_sb_exceptions = _species.isin(['Gobius','Crangon crangon','Mysidacea'])

_conditional_subcomponent = [
    ((_matrx == 'MU') & _is_loligo, 'flesh'),
    ((_matrx == 'MU') & ~_is_loligo, 'muscle tissue'),
    ((_matrx == 'EX') & _is_mytilus, 'shell'),
    ((_matrx == 'EX') & ~_is_mytilus, _pending),
    ((_matrx == 'BB') & ~_is_fish, 'blubber'),
    ((_matrx == 'BB') & _is_fish, _pending),
    ((_matrx == 'SB') & ~_sb_exceptions, 'flesh'),
    ((_matrx == 'SB') & _sb_exceptions, _pending),
]
for _mask, _subcomponent in _conditional_subcomponent:
    param_combo.loc[_mask, 'SUBCOMPONENT'] = _subcomponent

# Any descriptor still empty becomes 'not specified'
param_combo = param_combo.fillna(
    value=dict.fromkeys(
        ['SUBCOMPONENT', 'STAGE', 'COLOUR', 'SUBGROUP', 'SIZE', 'SEX', 'MORPHOLOGY'],
        'not specified',
    )
)

print("Number of rows in DataFrame: %s" % len(param_combo))

# Preview the first 20 distinct descriptor combinations for the 'CF' rows
display(
    param_combo.loc[
        param_combo['DTYPE'] == 'CF',
        ['DTYPE','MATRX','Species','AphiaID','Note','TAXON','SUBCOMPONENT','STAGE',
         'COLOUR','SUBGROUP','SIZE','MORPHOLOGY','SEX'],
    ]
    .drop_duplicates()
    .head(20)
)

### Download Biological Entity (S25) semantic model from the NVS

In [None]:
#%% Get the latest semantic model vocabulary contents from the NVS Sparql endpoint

# Fragments of a URL-encoded SPARQL request to the NVS endpoint (CSV output).
# Joined around a collection code they select, for every non-deprecated member
# of that collection, its notation with the 'SDN:<code>::' prefix removed
# (column <code>) and its preferred label (column <code>_label).
a1 = "http://vocab.nerc.ac.uk/sparql/sparql?query=PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0A++++PREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A++++%0D%0A++++select+%3F"
a2 = "+%3F"
a3 = "%0D%0A++++where+%7B%0D%0A++++%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2F"
a4 = "%2Fcurrent%2F%3E+skos%3Amember+%3Furl+.%0D%0A++++%3Furl+skos%3AprefLabel+%3F"
a5 = "+.%0D%0A++++%3Furl+skos%3Anotation+%3Fc+.%0D%0A++++%3Furl+owl%3Adeprecated+%27false%27+.%0D%0A++++BIND%28replace%28str%28%3Fc%29%2C%27SDN%3A"
a6 = "%3A%3A%27%2C%27%27%2C%27i%27%29+AS+%3F"
a7 = "%29%0D%0A++++%7D&output=csv&stylesheet="


def _read_nvs_vocab(code):
    """Download one NVS collection as a DataFrame with columns <code> and <code>_label."""
    label = code + '_label'
    return pd.read_csv(a1 + code + a2 + label + a3 + code + a4 + label + a5 + code + a6 + code + a7)


S09 = _read_nvs_vocab('S09')
S10 = _read_nvs_vocab('S10')
S11 = _read_nvs_vocab('S11')
S12 = _read_nvs_vocab('S12')
S13 = _read_nvs_vocab('S13')
S14 = _read_nvs_vocab('S14')
S15 = _read_nvs_vocab('S15')
S25 = _read_nvs_vocab('S25')

# Download semantic component mapping

# Template of the URL-encoded SPARQL request used for every component mapping.
# {code} is the component collection (S09..S15) and {predicate} the SKOS
# relation followed from a component term to an S25 term. The query returns the
# distinct pairs of component code and S25 code, both non-deprecated, with the
# 'SDN:<collection>::' prefixes removed.
_S25_MAPPING_URL = (
    "http://vocab.nerc.ac.uk/sparql/sparql?query="
    "PREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0A"
    "PREFIX+owl%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23%3E%0D%0A%0D%0A"
    "select+distinct+%3F{code}+%3FS25%0D%0A"
    "where+%7B%0D%0A"
    "%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2F{code}%2Fcurrent%2F%3E+skos%3Amember+%3Furla+.%0D%0A"
    "%3Furla+owl%3Adeprecated+%27false%27+.%0D%0A"
    "%3Furla+skos%3Anotation+%3Fn2+.%0D%0A"
    "%3Furla+skos%3A{predicate}+%3Furlb+.%0D%0A"
    "%3Chttp%3A%2F%2Fvocab.nerc.ac.uk%2Fcollection%2FS25%2Fcurrent%2F%3E+skos%3Amember+%3Furlb+.%0D%0A"
    "%3Furlb+owl%3Adeprecated+%27false%27+.%0D%0A"
    "%3Furlb+skos%3Anotation+%3Fn1+.%0D%0A"
    "BIND%28replace%28%3Fn1%2C+%22SDN%3AS25%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3FS25%29+.%0D%0A"
    "BIND%28replace%28%3Fn2%2C+%22SDN%3A{code}%3A%3A%22%2C+%22%22%2C+%22i%22%29+AS+%3F{code}%29+.%0D%0A"
    "%7D&output=csv&stylesheet="
)

urlS09 = _S25_MAPPING_URL.format(code='S09', predicate='related')
S09_S25 = pd.read_csv(urlS09)
print("S09_S25 mapping downloaded.")

urlS10 = _S25_MAPPING_URL.format(code='S10', predicate='related')
S10_S25 = pd.read_csv(urlS10)
print("S10_S25 mapping downloaded.")

# S11 is the only collection queried through skos:narrower
urlS11 = _S25_MAPPING_URL.format(code='S11', predicate='narrower')
S11_S25 = pd.read_csv(urlS11)
print("S11_S25 mapping downloaded.")

urlS12 = _S25_MAPPING_URL.format(code='S12', predicate='related')
S12_S25 = pd.read_csv(urlS12)
print("S12_S25 mapping downloaded.")

urlS13 = _S25_MAPPING_URL.format(code='S13', predicate='related')
S13_S25 = pd.read_csv(urlS13)
print("S13_S25 mapping downloaded.")

urlS14 = _S25_MAPPING_URL.format(code='S14', predicate='related')
S14_S25 = pd.read_csv(urlS14)
print("S14_S25 mapping downloaded.")

urlS15 = _S25_MAPPING_URL.format(code='S15', predicate='related')
S15_S25 = pd.read_csv(urlS15)
print("S15_S25 mapping downloaded.")

# Build S25 semantic model dataframe
# Attach each component code (S09..S15) to its S25 term; an S25 term without a
# mapping for a component is kept with NaN in that column
for _component_map in (S09_S25, S10_S25, S11_S25, S12_S25, S13_S25, S14_S25, S15_S25):
    S25 = S25.merge(_component_map, how='left', on='S25')

# Default code used for each component when no mapping exists
# (presumably the 'not specified' term of each collection - TODO confirm)
_default_component_codes = {
    'S09': 'S09133',
    'S10': 'S104',
    'S11': 'S1131',
    'S12': 'S1219',
    'S13': 'S1319',
    'S14': 'S1430',
    'S15': 'S152',
}
S25 = S25.fillna(value=_default_component_codes)

# TAXON is the S25 label with the first ' [' through the last ']' removed
S25['TAXON'] = S25['S25_label'].str.replace(r" \[.*\]","", regex=True)

print("S25 semantic model dataframe constructed")

In [None]:
# Map biological entity labels to vocabulary codes
# Replace each descriptor label column with the matching vocabulary code and
# label columns (e.g. SIZE -> S09 / S09_label); labels with no vocabulary match
# are left as NaN
param_combo = pd.merge(param_combo, S09, how='left', left_on='SIZE', right_on='S09_label').drop(columns=['SIZE'])
param_combo = pd.merge(param_combo, S10, how='left', left_on='SEX', right_on='S10_label').drop(columns=['SEX'])
param_combo = pd.merge(param_combo, S11, how='left', left_on='STAGE', right_on='S11_label').drop(columns=['STAGE'])
param_combo = pd.merge(param_combo, S12, how='left', left_on='SUBCOMPONENT', right_on='S12_label').drop(columns=['SUBCOMPONENT'])
param_combo = pd.merge(param_combo, S13, how='left', left_on='SUBGROUP', right_on='S13_label').drop(columns=['SUBGROUP'])
param_combo = pd.merge(param_combo, S14, how='left', left_on='MORPHOLOGY', right_on='S14_label').drop(columns=['MORPHOLOGY'])
param_combo = pd.merge(param_combo, S15, how='left', left_on='COLOUR', right_on='S15_label').drop(columns=['COLOUR'])

# Map Biological entity semantic model combinations to table combinations:
# a row receives an S25 code only when TAXON and all seven component codes match
param_combo = pd.merge(param_combo, S25, how='left', on=['TAXON','S09','S10','S11','S12','S13','S14','S15'])

# Reset water and sediment defaults for biological entity
param_combo.loc[param_combo['DTYPE'].isin(['CW','CS']),'S25'] = 'BE007736'
param_combo.loc[param_combo['DTYPE'].isin(['CW','CS']),'S25_label'] = 'not applicable'

# Distinct 'CF' combinations for which no existing S25 term was found
S25new = param_combo[(param_combo['DTYPE']=='CF') & (param_combo['S25'].isnull())][['Species','AphiaID','MATRX','S25','S25_label','TAXON','S09_label','S10_label','S11_label','S12_label','S13_label','S14_label','S15_label','S09','S10','S11','S12','S13','S14','S15']].drop_duplicates().sort_values(['Species','MATRX'])

# Save new biological entity combinations
S25new_file = os.path.join(results,'new_S25.csv')
S25new.to_csv(S25new_file, index=False)
# Report the file that was just written (previously the taxa discrepancy path was printed)
print("New biological entity combinations saved to: %s" % S25new_file)
print("New biological entity combinations: %s" % len(S25new))

print("\nNumber of rows in DataFrame: %s" % len(param_combo))
# More output rows than input rows means a merge above matched some rows more than once
if len(inputs)<len(param_combo):
    print("\nPotential duplicate rows introduced into the DataFrame. Please check output below:")
    display(param_combo[(param_combo['rowID'].duplicated(keep=False))][['rowID','PARAM','MATRX','Species','AphiaID','S25','S25_label','S09_label','S10_label','S11_label','S12_label','S13_label','S14_label','S15_label']])

### Display new biological entity combinations for S25 creation

In [None]:
# Distinct 'CF' combinations that still lack an S25 code, ordered by Species
# and MATRX
display(
    param_combo.loc[
        (param_combo['DTYPE'] == 'CF') & param_combo['S25'].isnull(),
        ['Species','Note','AphiaID','MATRX','S25','S25_label','TAXON','S11_label','S12_label'],
    ]
    .drop_duplicates()
    .sort_values(['Species','MATRX'])
)

In [None]:
# Map P01 semantic component labels to vocabulary codes
# Translate each P01 semantic component label into its vocabulary code by
# left-joining the corresponding vocabulary on the label column
for _vocab, _label_column in (
    (S06, 'S06_label'),
    (S07, 'S07_label'),
    (S02, 'S02_label'),
    (S26, 'S26_label'),
    (S03, 'S03_label'),
    (S04, 'S04_label'),
    (S05, 'S05_label'),
):
    param_combo = param_combo.merge(_vocab, how='left', on=_label_column)

# Find the P01 term, if any, whose full set of component codes matches the row
param_combo = param_combo.merge(
    P01,
    how='left',
    on=['S06','S07','S27','S02','S26','S25','S03','S04','S05'],
)

# Swap characters imported from the NVS that cause problems when the output is
# written to file: non-breaking space -> space, right single quote -> apostrophe
param_combo = (
    param_combo
    .replace(u'\xa0',u' ', regex=True)
    .replace(u'\u2019',u"'", regex=True)
)

# Output column order: input columns, P01 result, component labels,
# biological entity labels, then the vocabulary codes
_output_columns = (
    ['rowID','PARGROUP','PRNAM','CAS','DTYPE','PARAM','MUNIT','MATRX','BASIS','METPT','METOA','Note','AphiaID','Species']
    + ['P01_Code','P01','P01_label','S06_label','S07_label','S27_label','S02_label','S26_label','S25_label']
    + ['TAXON','S09_label','S10_label','S11_label','S12_label','S13_label','S14_label','S15_label']
    + ['S06','S07','S27','S02','S26','S25','S09','S10','S11','S12','S13','S14','S15']
)
param_combo = param_combo[_output_columns]

print("Total combinations in file = %s" % (len(param_combo)))

### Determine if any input row has been mapped to more than one P01 code where mapping was possible

In [None]:
# Rows that were mapped to a P01 code and whose rowID occurs more than once
P01_duplicates = param_combo.loc[
    param_combo['P01'].notnull() & param_combo['rowID'].duplicated(keep=False),
    ['rowID','PARAM','MUNIT','MATRX','BASIS','METPT','Species','AphiaID','Note','P01','P01_label'],
]

if P01_duplicates.empty:
    print("No P01 duplicates for each input file row where a mapping was possible.")
else:
    print("\nPotential duplicate P01 mappings in the DataFrame. Please check output below:")
    display(P01_duplicates)

# Save the results of the ICES to NVS semantic model mapping to file.

In [None]:
# Save complete DataFrame
# Save complete DataFrame
full_results = os.path.join(results,'complete_output.csv')
param_combo.to_csv(full_results, index=False)
print("Full results set saved to: %s" % full_results)

# Split the rows into three mutually exclusive groups:
#   - a P01 code was already supplied in the input (P01_Code is not '-9')
#   - no code supplied, and a P01 term was found by this mapping
#   - no code supplied, and no P01 term was found
_P01_supplied = param_combo['P01_Code']!='-9'
_P01_newly_mapped = (param_combo['P01_Code']=='-9') & (param_combo['P01'].notnull())
_P01_unmapped = (param_combo['P01_Code']=='-9') & (param_combo['P01'].isnull())

x=len(param_combo[_P01_supplied])
y=len(param_combo[_P01_newly_mapped])
z=len(param_combo[_P01_unmapped])

# Split out those combinations that have already been mapped to P01 in the parameter set
previous_P01 = os.path.join(results,'previous_P01.csv')
param_combo[_P01_supplied].to_csv(previous_P01, index=False)
# Report the file that was just written (previously the full results path was printed)
print("Subset of previously mapped P01 terms saved to: %s" % previous_P01)

# Retain those combinations that have been newly mapped to an existing P01 term
mapped_P01 = os.path.join(results,'mapped_P01.csv')
param_combo[_P01_newly_mapped].to_csv(mapped_P01, index=False)
print("Subset of newly mapped P01 terms saved to: %s" % mapped_P01)

# Retain those combinations with no matching P01 term (candidates for creation)
new_P01 = os.path.join(results,'new_P01.csv')
param_combo[_P01_unmapped].to_csv(new_P01, index=False)
print("Subset of possible P01 terms for creation saved to: %s" % new_P01)

# Create summary information
summary = pd.DataFrame([["Processing started:" , (start.strftime('%Y-%m-%d %H:%M:%S'))],
["Processing finished:" , datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
["" , ""],
["Rows input:", len(inputs)],
["Rows output:" , len(param_combo)],
["Rows with P01 provided:" , x],
["Rows successfully mapped:" , y],
["Rows still to be mapped:" , z],
["Because:", ""],
["    Non-chemical PARAMs to be mapped:" , len(alt_mapping)],
["    New chemical substances (S27) for mapping:" , len(S27new)],
["    New matrix terms (S26) for creation:" , len(S26new)],
["    New biological entities (S25) for creation:" , len(S25new)],
["    Taxa discrepancies to be resolved:" , len(taxa_discrepancy)]])

print("\nProcess results all saved to directory: %s" % results)
display(summary)

# A row-count mismatch means a merge duplicated or dropped input rows
if len(param_combo)!= len(inputs):
    print("Check for duplicated results as number of rows out do not match number of rows in original file")