# IHCC matching pipeline

The following notebook describes a workflow that:

    1. Loads a template
    2. For each term in the template, determines a (set of) matches
    3. Merges the suggestions back into the template and saves it

In [1]:
# Notebook parameters: edit these paths to run the pipeline on a different template.

# Configuration file read by load_ihcc_config (holds the service endpoint URLs).
config_file = '/Users/matentzn/knocean/data-harmonization/src/mapping-suggest/mapping-suggest-config.yml'

# Input template (tab-separated) whose labels are matched.
template = '/Users/matentzn/knocean/data-harmonization/templates/koges.tsv'

# Where the template with the merged suggestions is written.
template_out = '/Users/matentzn/knocean/data-harmonization/templates/_koges_simple.tsv'

In [2]:
import pandas as pd
from argparse import ArgumentParser
from lib import load_ihcc_config, map_term

# Command-line options: config file, input template and output template.
parser = ArgumentParser()
parser.add_argument("-c", "--config", dest="config_file",
                    help="Config file", metavar="FILE")
parser.add_argument("-t", "--template", dest="tsv_path",
                    help="Template file", metavar="FILE")
parser.add_argument("-o", "--output", dest="tsv_out_path",
                    help="Output file", metavar="FILE")
# parse_known_args() collects arguments this parser does not declare instead of
# exiting. A Jupyter kernel is started with its own `-f <connection file>`
# argument, which would make a plain parse_args() terminate the cell.
args, _unknown_args = parser.parse_known_args()

In [3]:
class YClass(object):
    """Empty attribute container used in place of the argparse namespace."""


# Rebind `args` so the rest of the notebook reads the notebook-level
# parameters through the same attribute names the command-line parser uses.
args = YClass()
args.config_file = config_file
args.tsv_path = template
args.tsv_out_path = template_out

## Loading config

In [4]:
# Load the pipeline configuration and pull out the four service URL prefixes.
config = load_ihcc_config(args.config_file)
zooma_annotate, oxo_mapping, ols_term, ols_oboid = (
    config[key]
    for key in ("zooma_annotate", "oxo_mapping", "ols_term", "ols_oboid")
)

# These are the default confidence levels from Zooma.
confidence_map = ["HIGH", "GOOD", "MEDIUM", "LOW"]
print(config)

{'zooma_annotate': 'http://localhost:8009/zooma/v2/api/services/annotate?propertyValue=', 'oxo_mapping': 'http://localhost:8008/api/mappings?fromId=', 'ols_term': 'http://localhost:8080/api/terms?iri=', 'ols_oboid': 'http://localhost:8080/api/terms?obo_id='}


## Loading Data

In [5]:
# Read the tab-separated template.
tsv = pd.read_csv(args.tsv_path, sep="\t")

# Discard suggestions left over from a previous run; the merge further down
# adds a fresh 'Suggested Categories' column. errors='ignore' lets templates
# that do not have the column yet pass through instead of raising KeyError.
tsv = tsv.drop(columns=['Suggested Categories'], errors='ignore')

# Skip the first two rows: in the rendered template they hold header
# directives ("LABEL", "is-required;") rather than terms to be matched.
tsv_terms = tsv['Label'].values[2:]

## Generating Matches

In [6]:
# Collect whatever map_term returns for each template label and flatten the
# per-term results into one list of records.
matches = [
    match
    for term in tsv_terms
    for match in map_term(term, zooma_annotate, ols_term, ols_oboid)
]

# One row per (term, match) pair.
df = pd.DataFrame(matches, columns=['term', 'match', 'match_label', 'confidence'])
df

Unnamed: 0,term,match,match_label,confidence
0,Education level,GECKO:0000065,education,HIGH
1,Marital status,PATO:0001995,socio-demographic and economic characteristics,HIGH
2,Medical history,MONDO:0000001,diseases,HIGH
3,Relationship,PATO:0001995,socio-demographic and economic characteristics,GOOD
4,Lifestyle,GECKO:0000067,lifestyle and behaviours,GOOD
5,Physical activity,GECKO:0000104,physical activity,HIGH
6,Exercises,OGMS:0000020,signs and symptoms,GOOD
7,Reproductive history,GECKO:0000114,reproduction,HIGH
8,Number of pregnancies,GECKO:0000114,reproduction,GOOD
9,Breastfeeding,GECKO:0000067,lifestyle and behaviours,HIGH


## Transform matches into the right format and merge into template

In [7]:
# Keep only matches whose identifier is not in the IHCC PURL namespace.
dfs = df[~df['match'].str.startswith("https://purl.ihccglobal.org/")].copy()

# Render each match as "<confidence> <match> <match_label>".
dfs['Suggested Categories'] = dfs[['confidence', 'match', 'match_label']].agg(' '.join, axis=1)
dfs = dfs[['term', 'Suggested Categories']]

# Combine all distinct suggestions of a term into one ' | '-separated string.
# Series.unique() keeps first-seen order, so the saved template is identical
# from run to run; iterating a set of strings is not order-stable across
# interpreter sessions.
dfsagg = dfs.groupby('term', as_index=False).agg(lambda x: ' | '.join(x.dropna().unique()))

# Left merge keeps every template row; rows without a match get NaN.
dfx = pd.merge(tsv, dfsagg, how='left', left_on=['Label'], right_on=['term'])
# 'term' duplicates 'Label' after the merge and is not part of the template.
dfx = dfx.drop(columns=['term'])

dfx

Unnamed: 0,Term ID,Label,Parent Term,Definition,GECKO Category,Comment,Suggested Categories
0,ID,LABEL,C % SPLIT=|,A definition,,,
1,,is-required;,,,,,
2,KoGES:0000001,Core Variables,,,,,
3,KoGES:0000002,Core Questionnaires,KoGES:0000001,,questionnaire/survey data,,
4,KoGES:0000003,Socio-demographic data,KoGES:0000002,,socio-demographic and economic characteristics,,
...,...,...,...,...,...,...,...
202,KoGES:0000204,Calcium (Urine),KoGES:0000187,,urine,,
203,KoGES:0000205,Sodium (Urine),KoGES:0000187,,urine,,
204,KoGES:0000206,Potassium (Urine),KoGES:0000187,,urine,,
205,KoGES:0000207,Vitamin C,KoGES:0000187,,urine,no CMO term for vit C levels in urine,HIGH GECKO:0000072 nutrition


## Save template

In [8]:
# Let pandas write straight to the output path. It opens the file itself
# (UTF-8 by default), so the result no longer depends on the platform's
# default text encoding or on newline translation of a text-mode handle, and
# the whole file is not first built as one string in memory.
dfx.to_csv(args.tsv_out_path, sep='\t', index=False)
