In [1]:
import pandas as pd
import re
import requests as requests
import sqlite3
import string
import urllib
import yaml

import scoped_mapping

from datetime import datetime
from pkg_resources import get_distribution, DistributionNotFound
from strsimpy.cosine import Cosine
from xml.etree import ElementTree
from tdda import rexpy

# Settings required from user:

See repo README for notes on setting up SQLite databases of OBO ontologies with semantic-sql, relation-graph and rdftab

### How should the mapping effort be scoped?
Additional taxonomic filters may be applied below
- soil
- sediment
- plant-associated

In [2]:
scoping_col = "env_package_normalization.EnvPackage"
scoping_value = "soil"

### What Biosample field should be mapped to ontology classes?
- env_broad_scale
- env_local_scale
- env_medium

In [3]:
biosample_col_to_map = "env_broad_scale"
# biosample_col_to_map = "env_medium"

In [4]:
# where do we require a single ontology and where can we use multiple?
primary_onto_prefix = "ENVO"
# 'gaz',
# need to rank results
# additional_preferred_ontologies = ["ncbitaxon", "obi", "pato", "efo"]
additional_preferred_ontologies = ["ncbitaxon", "po"]

In [5]:
# from https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml
# see also https://www.ncbi.nlm.nih.gov/biosample/docs/packages/
biosample_packages_file = "../../target/biosample_packages.xml"

# from ftp://ftp.ncbi.nlm.nih.gov//biosample/biosample_set.xml.gz
# via harmonized_table.db.gz
# in https://drive.google.com/drive/u/0/folders/1eL0v0stoduahjDpoDJIk3z2pJBAU4b2Y
biosample_sqlite_file = "../../target/harmonized-table.db"
ncbitaxon_sqlite_file = "../../../scoped-mapping/semantic-sql/db/ncbitaxon.db"

# /Users/MAM/Documents/gitrepos/biosample-analysis/target/harmonized-table.db

chars_to_whiteout = "._-"
my_query_fields = ""  # OLS weighted default
my_row_req = 3
my_string_dist_arg = 2
my_max_string_dist = 0.1
salvage_max_dist = 0.2

### Settings for manual review of OLS search-based results
Results based on merging Biosample annotations to ontology classes by label or embedded ID are not exported for review at this time

In [6]:
# ols_review_file = "../local/ols_review.tsv"
ols_review_seperator = "\t"

strategy_col = "strategy"
include_col = "include"

first_pass_include_val = True
# first_pass_strategy_val = "env_braod_scale vs envo and gaz @ 0.1"
salvage_include_val = False
# salvage_strategy_val    = "env_braod_scale vs all of ols @ 0.2"

### Settings for SSSOM output

In [7]:
sssom_subject_prefix = biosample_col_to_map + "_" + scoping_col + "_" + scoping_value
sssom_file = "../" + sssom_subject_prefix + "_SSSOM.tsv"

In [8]:
# per_biosample_mapping_results_file = "per_biosample_scoped_mapping_results.tsv"

----

In [9]:
biosample_cnx = sqlite3.connect(biosample_sqlite_file)
ncbitaxon_cnx = sqlite3.connect(ncbitaxon_sqlite_file)
primary_ontology_sqlite_file = (
    "../../../scoped-mapping/semantic-sql/db/" + primary_onto_prefix.lower() + ".db"
)
primary_ontology_cnx = sqlite3.connect(primary_ontology_sqlite_file)
additional_preferred_ontologies.insert(0, primary_onto_prefix.lower())
additional_preferred_ontologies_str = ",".join(additional_preferred_ontologies)

In [10]:
q = "select * from id_patterns"
[id_patterns_frame, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)
id_patterns_frame

Unnamed: 0,ontology,id_pattern
0,BFO,BFO:\d{7}
1,CARO,CARO:\d{7}
2,CHEBI,"CHEBI:\d{4,6}"
3,ENVO,"ENVO:\d{7,8}"
4,FAO,FAO:0000001
5,FOODON,FOODON:\d{8}
6,GO,GO:\d{7}
7,IAO,IAO:\d{7}
8,NCBITaxon,NCBITaxon:\d+
9,OBI,OBI:\d{7}


In [11]:
id_patterns = dict(zip(id_patterns_frame.ontology, id_patterns_frame.id_pattern))
# id_patterns

- env_package = env_package annotation from NCBI Biosample file XXX
- count = number of biosamples using that env_package annotation
- lhs = checklist info
- rhs = potential package info
- extract = potential OBO ID from rhs column (currently harcoded and only looking for ENVO IDs)
- remaining_string = rhs/string, with potential OBO IDs removed
- remaining_tidied = remaining_string with case, whitespace and punctuation normailzastion
- rt_override = some remaining_tidied values can be replaced according to env_package_overrides
- EnvPackage = corresponding de-normalized value from package_dictionary
- is_canonical = false when EnvPackage is NaN

In [12]:
cursor = biosample_cnx.cursor()

cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('non_bsattribute_columns',), ('harmonized_attrib_pivot',), ('biosample',), ('biosample_sample_taxon_summary',), ('all_tax_labels',), ('ids_labs_selected_ontolgies',), ('id_patterns',), ('package_dictionary',), ('env_package_normalization',), ('repaired_env_package',)]


In [13]:
q = "select * from env_package_normalization"
[env_package_normalization, query_duration] = scoped_mapping.timed_query(
    q, biosample_cnx
)

# env_package_normalization

## Get a table of MIxS annotations to be mapped to ontology classes.

Explicitly scope based on normalized package data. These values were set at the top of this notebook.


In [14]:
print(biosample_col_to_map)
print(scoping_col)
print(scoping_value)

env_medium
env_package_normalization.EnvPackage
soil


**In this case, the scoping includes an inner join requirement for 'unclassified entities'**

In [15]:
q = (
    "select "
    + biosample_col_to_map
    + """, count(*) as count
from
    biosample b
join env_package_normalization on
    b.env_package = env_package_normalization.env_package
inner join biosample_sample_taxon_summary stic on
    b.taxonomy_id = stic.biosample_taxid
where """
    + scoping_col
    + " = '"
    + scoping_value
    + "' group by "
    + biosample_col_to_map
    + """
order by
    count(*) desc"""
)

[mapping_candidates, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

mapping_candidates

Unnamed: 0,env_medium,count
0,soil,12596
1,Soil,439
2,peat soil,242
3,rhizosphere,238
4,ENVO:soil,214
...,...,...
105,rhizosphere?soil?,1
106,soil sample,1
107,"soil, water",1
108,"surface sediment, water",1


----

## The Biosample format allows for pipe-delimited environmental package lists. 

Separate those out into their components.

----


In [16]:
multi_frames = []
for row in mapping_candidates.itertuples(index=True, name="PandaZ"):
    split_check = row[1]
    if split_check is None:
        split_check = ""
    splitted = pd.Series(split_check.split("|"))
    splitted_count = len(splitted)
    repeated = [split_check] * splitted_count
    repeated = pd.Series(repeated)
    as_frame = pd.DataFrame(dict(repeated=repeated, splitted=splitted)).reset_index()
    seq_list = list(range(1, splitted_count + 1))
    as_frame["part_count"] = splitted_count
    as_frame["seq"] = seq_list
    multi_frames.append(as_frame)
concat_frame = pd.concat(multi_frames)
concat_frame = concat_frame[["repeated", "splitted", "part_count", "seq"]]
concat_frame

Unnamed: 0,repeated,splitted,part_count,seq
0,soil,soil,1,1
0,Soil,Soil,1,1
0,peat soil,peat soil,1,1
0,rhizosphere,rhizosphere,1,1
0,ENVO:soil,ENVO:soil,1,1
...,...,...,...,...
0,rhizosphere?soil?,rhizosphere?soil?,1,1
0,soil sample,soil sample,1,1
0,"soil, water","soil, water",1,1
0,"surface sediment, water","surface sediment, water",1,1


In [17]:
mapping_candidates = mapping_candidates.merge(
    concat_frame, left_on=biosample_col_to_map, right_on="repeated", how="left"
)

# mapping_candidates

## Normalize a few different ways `OBO` IDs have been entered
In the Biosample metadata

In [18]:
# # this helps by standardizing what could be an ID
# but if the RHS is text, then envo should just be removed
mapping_candidates["term_tidy"] = mapping_candidates.splitted.str.replace(
    primary_onto_prefix.lower() + "[:_ ]",
    primary_onto_prefix + ":",
    regex=True,
    case=False,
)

# mapping_candidates

# Now try to extract ontology terms that are already present

In [19]:
candidate_series_decomposition = scoped_mapping.decompose_series(
    mapping_candidates["term_tidy"], id_patterns[primary_onto_prefix]
)

# candidate_series_decomposition

mapping_candidates = pd.concat(
    [mapping_candidates, candidate_series_decomposition], axis=1
)

mapping_candidates[
    "remaining_tidied"
] = mapping_candidates.remaining_tidied.str.replace(
    primary_onto_prefix.lower() + "[:_ ]", "", regex=True, case=False
)

mapping_candidates

Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq,term_tidy,string,extract,remaining_string,remaining_tidied
0,soil,12596,soil,soil,1,1,soil,soil,,soil,soil
1,Soil,439,Soil,Soil,1,1,Soil,Soil,,Soil,soil
2,peat soil,242,peat soil,peat soil,1,1,peat soil,peat soil,,peat soil,peat soil
3,rhizosphere,238,rhizosphere,rhizosphere,1,1,rhizosphere,rhizosphere,,rhizosphere,rhizosphere
4,ENVO:soil,214,ENVO:soil,ENVO:soil,1,1,ENVO:soil,ENVO:soil,,ENVO:soil,soil
...,...,...,...,...,...,...,...,...,...,...,...
105,rhizosphere?soil?,1,rhizosphere?soil?,rhizosphere?soil?,1,1,rhizosphere?soil?,rhizosphere?soil?,,rhizosphere?soil?,rhizosphere soil
106,soil sample,1,soil sample,soil sample,1,1,soil sample,soil sample,,soil sample,soil sample
107,"soil, water",1,"soil, water","soil, water",1,1,"soil, water","soil, water",,"soil, water",soil water
108,"surface sediment, water",1,"surface sediment, water","surface sediment, water",1,1,"surface sediment, water","surface sediment, water",,"surface sediment, water",surface sediment water


## Join the extracted IDs with their labels

_Start by connecting to the rdftab database from which the terms and label-like annotations will be obtained?_

## extracting the labels

In [20]:
q = "select stanza as subject, value from ids_labs_selected_ontolgies"

[onto_labels, query_duration] = scoped_mapping.timed_query(q, biosample_cnx)

onto_labels

Unnamed: 0,subject,value
0,BFO:0000001,entity
1,BFO:0000002,continuant
2,BFO:0000003,occurrent
3,BFO:0000004,independent continuant
4,BFO:0000006,spatial region
...,...,...
13129,UBERON:3000972,head external integument structure
13130,UBERON:3000977,body external integument structure
13131,UBERON:3010200,vasculature of respiratory integument
13132,UBERON:4100000,skeletal element projection


## and merging 

In [21]:
# mapping_candidates.columns

In [22]:
mapping_candidates = mapping_candidates.merge(
    onto_labels, left_on="extract", right_on="subject", how="left"
)
mapping_candidates

Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq,term_tidy,string,extract,remaining_string,remaining_tidied,subject,value
0,soil,12596,soil,soil,1,1,soil,soil,,soil,soil,,
1,Soil,439,Soil,Soil,1,1,Soil,Soil,,Soil,soil,,
2,peat soil,242,peat soil,peat soil,1,1,peat soil,peat soil,,peat soil,peat soil,,
3,rhizosphere,238,rhizosphere,rhizosphere,1,1,rhizosphere,rhizosphere,,rhizosphere,rhizosphere,,
4,ENVO:soil,214,ENVO:soil,ENVO:soil,1,1,ENVO:soil,ENVO:soil,,ENVO:soil,soil,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,rhizosphere?soil?,1,rhizosphere?soil?,rhizosphere?soil?,1,1,rhizosphere?soil?,rhizosphere?soil?,,rhizosphere?soil?,rhizosphere soil,,
111,soil sample,1,soil sample,soil sample,1,1,soil sample,soil sample,,soil sample,soil sample,,
112,"soil, water",1,"soil, water","soil, water",1,1,"soil, water","soil, water",,"soil, water",soil water,,
113,"surface sediment, water",1,"surface sediment, water","surface sediment, water",1,1,"surface sediment, water","surface sediment, water",,"surface sediment, water",surface sediment water,,


## Use cosine string distance to see if the labels match closely enough

I.e. the labels claimed by the Biosample data set and the labels asserted in the ontology. if they're close enough, consider the assigned ID legit


_How close is close enough?_

In [23]:
my_cosine_obj = Cosine(my_string_dist_arg)
mapping_candidates["value"] = mapping_candidates["value"].fillna("")
mapping_candidates["remaining_tidied"] = mapping_candidates["remaining_tidied"].fillna(
    ""
)
mapping_candidates["cosine"] = mapping_candidates.apply(
    lambda my_row: my_cosine_obj.distance(
        my_row["remaining_tidied"].lower(), my_row["value"].lower()
    ),
    axis=1,
)
mapping_candidates

Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq,term_tidy,string,extract,remaining_string,remaining_tidied,subject,value,cosine
0,soil,12596,soil,soil,1,1,soil,soil,,soil,soil,,,1.0
1,Soil,439,Soil,Soil,1,1,Soil,Soil,,Soil,soil,,,1.0
2,peat soil,242,peat soil,peat soil,1,1,peat soil,peat soil,,peat soil,peat soil,,,1.0
3,rhizosphere,238,rhizosphere,rhizosphere,1,1,rhizosphere,rhizosphere,,rhizosphere,rhizosphere,,,1.0
4,ENVO:soil,214,ENVO:soil,ENVO:soil,1,1,ENVO:soil,ENVO:soil,,ENVO:soil,soil,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,rhizosphere?soil?,1,rhizosphere?soil?,rhizosphere?soil?,1,1,rhizosphere?soil?,rhizosphere?soil?,,rhizosphere?soil?,rhizosphere soil,,,1.0
111,soil sample,1,soil sample,soil sample,1,1,soil sample,soil sample,,soil sample,soil sample,,,1.0
112,"soil, water",1,"soil, water","soil, water",1,1,"soil, water","soil, water",,"soil, water",soil water,,,1.0
113,"surface sediment, water",1,"surface sediment, water","surface sediment, water",1,1,"surface sediment, water","surface sediment, water",,"surface sediment, water",surface sediment water,,,1.0


**Previously, we did a reality check on the claimed IDs and labels. If a label is claimed without any ID, that could still be a path to an ontology term.**

We'll be doing some merging, so make sure column names are meaningful and unique


In [24]:
mapping_candidates.columns = [
    biosample_col_to_map,
    "count",
    "repeated",
    "splitted",
    "part_count",
    "seq",
    "term_tidy",
    "string",
    "extract",
    "remaining_string",
    "remaining_tidied",
    "term_id",
    "lab_from_id",
    "lfi_cosine",
]
mapping_candidates = mapping_candidates.merge(
    onto_labels, left_on="remaining_tidied", right_on="value", how="left"
)

mapping_candidates.columns = [
    biosample_col_to_map,
    "count",
    "repeated",
    "splitted",
    "part_count",
    "seq",
    "term_tidy",
    "string",
    "extract",
    "remaining_string",
    "remaining_tidied",
    "term_id",
    "lab_from_id",
    "lfi_cosine",
    "term_id_from_lab",
    "value",
]


mapping_candidates

Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq,term_tidy,string,extract,remaining_string,remaining_tidied,term_id,lab_from_id,lfi_cosine,term_id_from_lab,value
0,soil,12596,soil,soil,1,1,soil,soil,,soil,soil,,,1.0,ENVO:00001998,soil
1,soil,12596,soil,soil,1,1,soil,soil,,soil,soil,,,1.0,ENVO:00001998,soil
2,Soil,439,Soil,Soil,1,1,Soil,Soil,,Soil,soil,,,1.0,ENVO:00001998,soil
3,Soil,439,Soil,Soil,1,1,Soil,Soil,,Soil,soil,,,1.0,ENVO:00001998,soil
4,peat soil,242,peat soil,peat soil,1,1,peat soil,peat soil,,peat soil,peat soil,,,1.0,ENVO:00005774,peat soil
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,rhizosphere?soil?,1,rhizosphere?soil?,rhizosphere?soil?,1,1,rhizosphere?soil?,rhizosphere?soil?,,rhizosphere?soil?,rhizosphere soil,,,1.0,,
152,soil sample,1,soil sample,soil sample,1,1,soil sample,soil sample,,soil sample,soil sample,,,1.0,,
153,"soil, water",1,"soil, water","soil, water",1,1,"soil, water","soil, water",,"soil, water",soil water,,,1.0,,
154,"surface sediment, water",1,"surface sediment, water","surface sediment, water",1,1,"surface sediment, water","surface sediment, water",,"surface sediment, water",surface sediment water,,,1.0,,


In [25]:
mapping_candidates = mapping_candidates.drop_duplicates()

mc_tifl_split = mapping_candidates["term_id_from_lab"].str.split(":")

import math

prefix_when_possible = []
for one_split in mc_tifl_split:
    if type(one_split) == list:
        os_first = one_split[0]
        prefix_when_possible.append(os_first)
    else:
        prefix_when_possible.append("")

prefix_when_possible = [each_string.lower() for each_string in prefix_when_possible]

# prefix_when_possible

In [26]:
if "onto_prefix" in mapping_candidates.columns:
    mapping_candidates = mapping_candidates.drop("onto_prefix", 1)

mapping_candidates.loc[:, "onto_prefix"] = pd.Series(prefix_when_possible).values

# mapping_candidates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [27]:
ranks = list(range(len(additional_preferred_ontologies)))

d = {"onto_prefix": additional_preferred_ontologies, "rank": ranks}
onto_rank_frame = pd.DataFrame(d)

onto_rank_frame

Unnamed: 0,onto_prefix,rank
0,envo,0
1,ncbitaxon,1
2,po,2


In [28]:
mapping_candidates = mapping_candidates.merge(
    onto_rank_frame, on="onto_prefix", how="left"
)

# mapping_candidates

In [29]:
best_maps = mapping_candidates.groupby(biosample_col_to_map)["rank"].min().reset_index()
# best_maps

In [30]:
mapping_candidates = mapping_candidates.merge(
    best_maps, on=[biosample_col_to_map, "rank"]
)
# mapping_candidates

## Find consensus term IDs and labels


In [31]:
mapping_candidates["consensus_id"] = mapping_candidates["term_id_from_lab"]
mapping_candidates["consensus_lab"] = mapping_candidates["value"]

flag = mapping_candidates["consensus_id"].isnull() & (
    (
        ~mapping_candidates["term_id"].isnull()
        & mapping_candidates.lfi_cosine.le(my_max_string_dist)
    )
    | (
        ~mapping_candidates["term_id"].isnull()
        & mapping_candidates["remaining_tidied"].eq("")
    )
)

replacements = mapping_candidates.loc[flag, "term_id"]
mapping_candidates.loc[flag, "consensus_id"] = replacements

replacements = mapping_candidates.loc[flag, "lab_from_id"]
mapping_candidates.loc[flag, "consensus_lab"] = replacements

flag = mapping_candidates.consensus_id.isna()
antiflag = ~flag
mapping_candidates["id_or_lab_ok"] = antiflag

mapping_candidates["assembled_consensus"] = (
    mapping_candidates["consensus_lab"]
    + " ["
    + mapping_candidates["consensus_id"]
    + "]"
)

In [32]:
mapping_candidates

Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq,term_tidy,string,extract,remaining_string,...,lab_from_id,lfi_cosine,term_id_from_lab,value,onto_prefix,rank,consensus_id,consensus_lab,id_or_lab_ok,assembled_consensus
0,soil,12596,soil,soil,1,1,soil,soil,,soil,...,,1.0,ENVO:00001998,soil,envo,0.0,ENVO:00001998,soil,True,soil [ENVO:00001998]
1,Soil,439,Soil,Soil,1,1,Soil,Soil,,Soil,...,,1.0,ENVO:00001998,soil,envo,0.0,ENVO:00001998,soil,True,soil [ENVO:00001998]
2,peat soil,242,peat soil,peat soil,1,1,peat soil,peat soil,,peat soil,...,,1.0,ENVO:00005774,peat soil,envo,0.0,ENVO:00005774,peat soil,True,peat soil [ENVO:00005774]
3,rhizosphere,238,rhizosphere,rhizosphere,1,1,rhizosphere,rhizosphere,,rhizosphere,...,,1.0,ENVO:00005801,rhizosphere,envo,0.0,ENVO:00005801,rhizosphere,True,rhizosphere [ENVO:00005801]
4,ENVO:soil,214,ENVO:soil,ENVO:soil,1,1,ENVO:soil,ENVO:soil,,ENVO:soil,...,,1.0,ENVO:00001998,soil,envo,0.0,ENVO:00001998,soil,True,soil [ENVO:00001998]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,rhizosphere?soil?,1,rhizosphere?soil?,rhizosphere?soil?,1,1,rhizosphere?soil?,rhizosphere?soil?,,rhizosphere?soil?,...,,1.0,,,,,,,False,
106,soil sample,1,soil sample,soil sample,1,1,soil sample,soil sample,,soil sample,...,,1.0,,,,,,,False,
107,"soil, water",1,"soil, water","soil, water",1,1,"soil, water","soil, water",,"soil, water",...,,1.0,,,,,,,False,
108,"surface sediment, water",1,"surface sediment, water","surface sediment, water",1,1,"surface sediment, water","surface sediment, water",,"surface sediment, water",...,,1.0,,,,,,,False,


## For which Biosample annotations were no easy mappings found?
How many Biosamples uses those annotations?

In [33]:
flag = ~mapping_candidates.id_or_lab_ok
needs_search = mapping_candidates.loc[flag]

needs_search = needs_search[["remaining_tidied", "count"]]

sum_by_needed = needs_search.groupby("remaining_tidied")["count"].sum()

sum_by_needed = sum_by_needed.to_frame()
sum_by_needed["remaining_tidied"] = sum_by_needed.index

sum_by_needed = sum_by_needed.sort_values("count", ascending=False)
sum_by_needed.reset_index(drop=True)

# sum_by_needed.to_sql("sum_by_needed", biosample_cnx, if_exists="replace", index=False)

Unnamed: 0,count,remaining_tidied
0,162,organic soil
1,113,
2,99,dna
3,90,mine tailings
4,80,mineral soil
...,...,...
60,1,mud soil
61,1,paddy soil
62,1,replicate 1 soil from baodongyu of taoyuan
63,1,replicate 1 soil from gushi of taoyuan


In [34]:
# sum_by_needed

## Extract the tidied strings

In [35]:
ebs_raw_list = list(sum_by_needed["remaining_tidied"])
ebs_raw_list.sort()

## Submit those tidied strings to a search engine

Specifically OLS search. This takes roughly one second per unique post-tidied submission

_Turn logging back on to show status?_
_Print the count and pre- and post- datestamps_
`datetime.now().replace(microsecond=0).isoformat()` ?


In [36]:
print(datetime.now().replace(microsecond=0).isoformat())
print(len(sum_by_needed.index))
print((len(sum_by_needed.index)) / 60)

2021-07-05T19:10:38
65
1.0833333333333333


In [37]:
ebs_search_res = scoped_mapping.search_get_annotations_wrapper(
    ebs_raw_list,
    bad_chars=chars_to_whiteout,
    cat_name=biosample_col_to_map,
    ontoprefix=additional_preferred_ontologies_str,
    query_fields="",
    rr=5,
    string_dist_arg=my_string_dist_arg,
)
# ebs_search_res.to_sql(
#     "ols_search_results", biosample_cnx, if_exists="replace", index=False
# )

## Filter out the best of the acceptable mappings
From a string distance perspective

In [38]:
my_best_acceptable = scoped_mapping.get_best_acceptable(
    ebs_search_res, max_string_dist=my_max_string_dist
)
my_best_acceptable

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
45,env_medium,bacteria,bacteria,Bacteria,1,0.0,NCBITaxon:2,Bacteria,1,NCBITAXON,label,label,http://purl.obolibrary.org/obo/NCBITaxon_2,ncbitaxon
305,env_medium,mine tailings,mine tailings,mine tailing,1,0.036,ENVO:00000003,mine tailing,4,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_00000003,envo
425,env_medium,peat,peat,peat,1,0.0,ENVO:00005774,peat soil,1,ENVO,hasRelatedSynonym,,http://purl.obolibrary.org/obo/ENVO_00005774,envo
553,env_medium,sandy soil,sandy soil,sandy soil,1,0.0,ENVO:00002229,arenosol,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00002229,envo


----

## Filter out the submissions with no acceptable matches

In [39]:
no_acceptable_mappings = scoped_mapping.get_no_acceptable_mappings(
    ebs_search_res, my_best_acceptable
)

no_acceptable_mappings

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
0,env_medium,,,,1,0.000,,,1,,,,,
1,env_medium,13,13,bacterium A9 13-13,1,0.541,NCBITaxon:1469932,bacterium A9 13-13,1,NCBITAXON,label,label,http://purl.obolibrary.org/obo/NCBITaxon_1469932,ncbitaxon
3,env_medium,13,13,Phage 13,2,0.622,NCBITaxon:12424,Phage 13,3,NCBITAXON,label,label,http://purl.obolibrary.org/obo/NCBITaxon_12424,ncbitaxon
2,env_medium,13,13,Pasteurellaceae bacterium 13/13,3,0.667,NCBITaxon:1347399,Pasteurellaceae bacterium 13/13,2,NCBITAXON,label,label,http://purl.obolibrary.org/obo/NCBITaxon_1347399,ncbitaxon
4,env_medium,13,13,Bacteriophage 13,4,0.742,NCBITaxon:12424,Phage 13,3,NCBITAXON,hasExactSynonym,equivalent name,http://purl.obolibrary.org/obo/NCBITaxon_12424,ncbitaxon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,env_medium,topsoil,topsoil,soil erosion,1,0.631,ENVO:01000706,soil erosion,3,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_01000706,envo
647,env_medium,topsoil,topsoil,luvisol,2,0.833,ENVO:00002248,luvisol,4,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_00002248,envo
648,env_medium,topsoil,topsoil,lixisol,3,0.833,ENVO:00002242,lixisol,5,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_00002242,envo
645,env_medium,topsoil,topsoil,stagnosol,4,0.856,ENVO:00002274,stagnosol,2,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_00002274,envo


## Try searching the failures against all ontologies in OLS
what would it be like to use a local OLS instance? (latency-wise)

In [40]:
still_unmapped = list(set(list(no_acceptable_mappings["raw"])))
still_unmapped.sort()

print(len(still_unmapped))

still_unmapped

61


['',
 '13',
 '15',
 '16s rrna gene sequence',
 '22',
 '24',
 '4',
 '6',
 'alfisols',
 'biomat',
 'cryoconite',
 'decomposed litter oe layer',
 'dna',
 'fermentation humic sustrate',
 'forest soil biochar',
 'genome',
 'genomic bacterial dna of the ssu rrna gene v5 v7 hypervariable region',
 'hypolith scappings from under rocks',
 'juvenile from soil aluvial loam',
 'juvenile from soil diluvial sand',
 'juvenile from soil loess loam',
 'juveniles meloidogyne hapla from soil diluvial sand',
 'leaves',
 'litters',
 'mineral soil',
 'mixed fungi',
 'molecular ecology sample w1 3',
 'molecular ecology sample w1 6',
 'molecular ecology sample x1 3',
 'molecular ecology sample x1 6',
 'moss',
 'mud soil',
 'not applicable',
 'organic soil',
 'paddy soil',
 'permafrost soil',
 'picea litter',
 'pinus litter',
 'replicate 1 soil from baodongyu of taoyuan',
 'replicate 1 soil from gushi of taoyuan',
 'replicate 1 soil from jiaxing',
 'replicate 1 soil from leizhou',
 'replicate 1 soil from yingt

In [41]:
print(datetime.now().replace(microsecond=0).isoformat())
print(len(sum_by_needed.index))
print((len(sum_by_needed.index)) / 60)

2021-07-05T19:11:35
65
1.0833333333333333


In [42]:
salvage_search_res = scoped_mapping.search_get_annotations_wrapper(
    still_unmapped,
    bad_chars="._-",
    cat_name="salvage",
    ontoprefix="",
    query_fields="",
    rr=5,
    string_dist_arg=2,
)

# salvage_search_res.to_sql(
#     "salvage_search_res", biosample_cnx, if_exists="replace", index=False
# )

## We appear to be at a point of diminishing returns
At the very least, will require some review

In [43]:
salvage_best_acceptable = scoped_mapping.get_best_acceptable(
    salvage_search_res, max_string_dist=salvage_max_dist
)
salvage_best_acceptable

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
11,salvage,15,15,15,1,0.0,NCIT:C113429,Fifteen,5,NCIT,synonym,synonym,http://purl.obolibrary.org/obo/NCIT_C113429,ncit
65,salvage,4,4,4,1,0.0,NCIT:C172971,Worry Score 4,3,NCIT,synonym,synonym,http://purl.obolibrary.org/obo/NCIT_C172971,ncit
92,salvage,6,6,6,1,0.0,NCIT:C179016,COVID-19 Disease Severity 6,3,NCIT,hasExactSynonym,,http://purl.obolibrary.org/obo/NCIT_C179016,ncit
111,salvage,cryoconite,cryoconite,cryoconite hole,1,0.198,ENVO:03000039,cryoconite hole,2,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_03000039,envo
129,salvage,dna,dna,DNA,1,0.0,NCIT:C449,DNA,1,NCIT,label,label,http://purl.obolibrary.org/obo/NCIT_C449,ncit
166,salvage,genome,genome,Genome,1,0.0,NCIT:C16629,Genome,1,NCIT,label,label,http://purl.obolibrary.org/obo/NCIT_C16629,ncit
295,salvage,leaves,leaves,leaves,1,0.0,CO_366:0000273,leaves,1,CO_366,label,label,http://www.cropontology.org/rdf/CO_366:0000273,co_366
524,salvage,not applicable,not applicable,Not Applicable,1,0.0,NCIT:C48660,Not Applicable,1,NCIT,label,label,http://purl.obolibrary.org/obo/NCIT_C48660,ncit
580,salvage,paddy soil,paddy soil,rice paddy soil,1,0.198,ENVO:00005740,paddy field soil,1,ENVO,has_related_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00005740,envo
592,salvage,permafrost soil,permafrost soil,permafrost,1,0.198,ENVO:00000134,permafrost,1,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_00000134,envo


## Save the mappings for review!

_Note that the merge on ID or label mappings are not included here_


In [44]:
my_best_acceptable[include_col] = first_pass_include_val
salvage_best_acceptable[include_col] = salvage_include_val
best_and_salvage = pd.concat([my_best_acceptable, salvage_best_acceptable])

best_and_salvage["ols_curation_notes"] = ""

# best_and_salvage.to_csv(ols_review_file, sep=ols_review_seperator, index=False)

best_and_salvage = best_and_salvage[
    [
        "category",
        "raw",
        "query",
        "name",
        "obo_id",
        "label",
        "ontology_prefix",
        "include",
        "ols_curation_notes",
        "scope",
        "type",
        "iri",
        "ontology_name",
        "string_dist_rank",
        "string_dist",
        "search_rank",
    ]
]

In [45]:
best_and_salvage = best_and_salvage.sort_values(
    ["include", "ontology_prefix"], ascending=(True, True)
)
best_and_salvage

Unnamed: 0,category,raw,query,name,obo_id,label,ontology_prefix,include,ols_curation_notes,scope,type,iri,ontology_name,string_dist_rank,string_dist,search_rank
295,salvage,leaves,leaves,leaves,CO_366:0000273,leaves,CO_366,False,,label,label,http://www.cropontology.org/rdf/CO_366:0000273,co_366,1,0.0,1
111,salvage,cryoconite,cryoconite,cryoconite hole,ENVO:03000039,cryoconite hole,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_03000039,envo,1,0.198,2
580,salvage,paddy soil,paddy soil,rice paddy soil,ENVO:00005740,paddy field soil,ENVO,False,,has_related_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00005740,envo,1,0.198,1
592,salvage,permafrost soil,permafrost soil,permafrost,ENVO:00000134,permafrost,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_00000134,envo,1,0.198,1
778,salvage,rhizosphere soil,rhizosphere soil,rhizosphere,ENVO:00005801,rhizosphere,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_00005801,envo,1,0.184,1
968,salvage,surface sediment water,surface sediment water,sediment surface,ENVO:01001192,sediment surface,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_01001192,envo,1,0.155,2
11,salvage,15,15,15,NCIT:C113429,Fifteen,NCIT,False,,synonym,synonym,http://purl.obolibrary.org/obo/NCIT_C113429,ncit,1,0.0,5
65,salvage,4,4,4,NCIT:C172971,Worry Score 4,NCIT,False,,synonym,synonym,http://purl.obolibrary.org/obo/NCIT_C172971,ncit,1,0.0,3
92,salvage,6,6,6,NCIT:C179016,COVID-19 Disease Severity 6,NCIT,False,,hasExactSynonym,,http://purl.obolibrary.org/obo/NCIT_C179016,ncit,1,0.0,3
129,salvage,dna,dna,DNA,NCIT:C449,DNA,NCIT,False,,label,label,http://purl.obolibrary.org/obo/NCIT_C449,ncit,1,0.0,1


In [46]:
best_and_salvage.to_clipboard(index=False)

## print mapped column and scoping

In [47]:
print(scoping_col)
print(scoping_value)
print(biosample_col_to_map)

env_package_normalization.EnvPackage
soil
env_medium


In [48]:
raise SystemExit("Don't skip reviewing and saving!")

SystemExit: Don't skip reviewing and saving!

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Do some review in a spreadsheet application!
Specifically, columns `include` and `ols_curation_notes`

Then Save

**Add counts back in to motivate manual lookups if necessary**

----

## Now read the reviewed spreadsheet back in

In [49]:
# curated = pd.read_csv(ols_review_file, sep=ols_review_seperator)
# curated

## OR copy and paste

In [50]:
curated = pd.read_clipboard()
curated

Unnamed: 0,category,raw,query,name,obo_id,label,ontology_prefix,include,ols_curation_notes,scope,type,iri,ontology_name,string_dist_rank,string_dist,search_rank
0,salvage,paddy soil,paddy soil,rice paddy soil,ENVO:00005740,paddy field soil,ENVO,True,,has_related_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00005740,envo,1,0.198,1
1,salvage,leaves,leaves,leaves,CO_366:0000273,leaves,CO_366,False,,label,label,http://www.cropontology.org/rdf/CO_366:0000273,co_366,1,0.0,1
2,salvage,cryoconite,cryoconite,cryoconite hole,ENVO:03000039,cryoconite hole,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_03000039,envo,1,0.198,2
3,salvage,permafrost soil,permafrost soil,permafrost,ENVO:00000134,permafrost,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_00000134,envo,1,0.198,1
4,salvage,rhizosphere soil,rhizosphere soil,rhizosphere,ENVO:00005801,rhizosphere,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_00005801,envo,1,0.184,1
5,salvage,surface sediment water,surface sediment water,sediment surface,ENVO:01001192,sediment surface,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_01001192,envo,1,0.155,2
6,salvage,15,15,15,NCIT:C113429,Fifteen,NCIT,False,,synonym,synonym,http://purl.obolibrary.org/obo/NCIT_C113429,ncit,1,0.0,5
7,salvage,4,4,4,NCIT:C172971,Worry Score 4,NCIT,False,,synonym,synonym,http://purl.obolibrary.org/obo/NCIT_C172971,ncit,1,0.0,3
8,salvage,6,6,6,NCIT:C179016,COVID-19 Disease Severity 6,NCIT,False,,hasExactSynonym,,http://purl.obolibrary.org/obo/NCIT_C179016,ncit,1,0.0,3
9,salvage,dna,dna,DNA,NCIT:C449,DNA,NCIT,False,,label,label,http://purl.obolibrary.org/obo/NCIT_C449,ncit,1,0.0,1


## Filter on `include`

In [51]:
flag = curated["include"]
curated = curated.loc[flag]

curated.columns = "ols_" + curated.columns
# curated.to_sql("curated", biosample_cnx, if_exists="replace", index=False)
# curated

In [52]:
merge_search_merged = mapping_candidates.merge(
    curated, how="outer", left_on="remaining_tidied", right_on="ols_raw"
)

In [53]:
flag = (
    ~merge_search_merged["id_or_lab_ok"]
    & ~merge_search_merged["ols_obo_id"].eq("")
    & ~merge_search_merged["ols_obo_id"].isna()
)

replacement = merge_search_merged.loc[flag, "ols_obo_id"]

merge_search_merged.loc[flag, "consensus_id"] = replacement

replacement = merge_search_merged.loc[flag, "ols_label"]

merge_search_merged.loc[flag, "consensus_lab"] = replacement

replacement = (
    merge_search_merged.loc[flag, "consensus_lab"]
    + " ["
    + merge_search_merged.loc[flag, "consensus_id"]
    + "]"
)

merge_search_merged.loc[flag, "assembled_consensus"] = replacement

In [54]:
# merge_search_merged.to_sql(
#     "merge_search_merged", biosample_cnx, if_exists="replace", index=False
# )

In [55]:
summary = merge_search_merged[[biosample_col_to_map, "count", "assembled_consensus"]]
summary = summary.sort_values("count", ascending=False)
# summary

In [56]:
frame_for_sssom = merge_search_merged

na_flag = frame_for_sssom["splitted"].isna()
frame_for_sssom = frame_for_sssom[~na_flag]

na_flag = frame_for_sssom["consensus_id"].isna()
frame_for_sssom = frame_for_sssom[~na_flag]

raw_queries = list(frame_for_sssom["splitted"])
urlencodeds = list(map(urllib.parse.quote, raw_queries))
urlencodeds = [sssom_subject_prefix + ":" + item for item in urlencodeds]

results_rows = len(frame_for_sssom.index)
iso8601_stamp = datetime.now().replace(microsecond=0).isoformat()

sssom_frame = {
    "subject_category": [biosample_col_to_map] * results_rows,
    "subject_label": frame_for_sssom["splitted"],
    "predicate_id": ["skos:relatedMatch"] * results_rows,
    "object_id": frame_for_sssom["consensus_id"],
    "object_label": frame_for_sssom["consensus_lab"],
    "match_type": ["Lexical"] * results_rows,
    "creator_id": ["https://github.com/turbomam/scoped-mapping"] * results_rows,
    #     "mapping_tool": ["https://www.ebi.ac.uk/ols/docs/api"] * results_rows,
    "mapping_date": [iso8601_stamp] * results_rows,
    #     "confidence": 1 - frame_for_sssom["string_dist"],
    "subject_id": urlencodeds,
}

sssom_frame = pd.DataFrame(sssom_frame)

sssom_frame.to_csv(sssom_file, sep=ols_review_seperator, index=False)

# sssom_frame

# From here down requires refactoring and generalization 

That may even be true for some of the blocks above this!

## Join together pipe-delimited env_broad_scale mappings 
_Any other constraints? If part_count is > 1, do we need to check for empty or null env_broad_scale, repeated or splitted?_

In [57]:
multi_part_flag = merge_search_merged["part_count"] > 1
multi_part_flag.value_counts()

False    110
Name: part_count, dtype: int64

In [58]:
multi_part_frame = merge_search_merged.loc[multi_part_flag]

multi_part_frame = multi_part_frame[
    [
        biosample_col_to_map,
        "count",
        "repeated",
        "splitted",
        "part_count",
        "seq",
        "consensus_id",
        "consensus_lab",
    ]
]

multi_part_frame = multi_part_frame.sort_values(
    [biosample_col_to_map, "seq"], ascending=(True, True)
)

multi_part_frame

Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq,consensus_id,consensus_lab


In [59]:
part_count = multi_part_frame.groupby([biosample_col_to_map], sort=False)[
    "part_count"
].max()
part_count.value_counts()

Series([], Name: part_count, dtype: int64)

In [60]:
seq_max = multi_part_frame.groupby([biosample_col_to_map], sort=False)["seq"].max()
seq_max

Series([], Name: seq, dtype: int64)

In [61]:
parts_check = pd.DataFrame(dict(part_count=part_count, seq_max=seq_max)).reset_index()

all_parts_flag = parts_check["seq_max"] == parts_check["part_count"]
ebs_with_all_parts = parts_check.loc[all_parts_flag, biosample_col_to_map]
ebs_with_all_parts

all_parts_flag = multi_part_frame[biosample_col_to_map].isin(list(ebs_with_all_parts))
all_parts_frame = multi_part_frame.loc[all_parts_flag]
all_parts_frame

Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq,consensus_id,consensus_lab


In [62]:
unique_pipesep_ebs = list(set(list(all_parts_frame[biosample_col_to_map])))
unique_pipesep_ebs.sort()
repipe_dict_list = []
for one_pipesep in unique_pipesep_ebs:
    flag = all_parts_frame[biosample_col_to_map] == one_pipesep
    temp_frame = all_parts_frame.loc[flag]
    ids_list = list(temp_frame["consensus_id"])
    lab_list = list(temp_frame["consensus_lab"])
    s = "|"
    ids_repipe = s.join(ids_list)
    lab_repipe = s.join(lab_list)

    repipe_dict = {
        biosample_col_to_map: one_pipesep,
        "consensus_id": ids_repipe,
        "consensus_lab": lab_repipe,
    }
    repipe_dict_list.append(repipe_dict)
repipe_frame = pd.DataFrame(repipe_dict_list)
repipe_frame

In [63]:
sssom_min_cols = sssom_frame[["subject_label", "object_id", "object_label"]]

sssom_min_cols.columns = [biosample_col_to_map, "consensus_id", "consensus_lab"]

# sssom_min_cols

In [64]:
repipe_rows = len(repipe_frame.index)
if repipe_rows > 0:
    direct_and_repipe = pd.concat([sssom_min_cols, repipe_frame])
else:
    direct_and_repipe = sssom_min_cols
# direct_and_repipe

In [65]:
direct_and_repipe.to_sql(
    "direct_and_repipe", biosample_cnx, if_exists="replace", index=False
)

## refactor this even more?


In [66]:
q = (
    """
select
	b.id, b."""
    + biosample_col_to_map
    + """,
	dar.consensus_id ,
	dar.consensus_lab
from
	direct_and_repipe dar
left join biosample b on
	b."""
    + biosample_col_to_map
    + """ = dar."""
    + biosample_col_to_map
    + """
inner join biosample_sample_taxon_summary stic on
	b.taxonomy_id = stic.biosample_taxid
join env_package_normalization on
	b.env_package = env_package_normalization.env_package
where """
    + scoping_col
    + " = '"
    + scoping_value
    + "'"
)

[per_biosample_scoped_mapping_results, query_duration] = scoped_mapping.timed_query(
    q, biosample_cnx, print_timing=True
)

2021-07-05 19:30:31.127529
2021-07-05 19:30:31.367504
0:00:00.239975


In [67]:
per_biosample_scoped_mapping_results["notes"] = scoping_col + "=" + scoping_value

table_name = "repaired_" + biosample_col_to_map

per_biosample_scoped_mapping_results.to_sql(
    table_name, biosample_cnx, if_exists="append", index=False
)

per_biosample_scoped_mapping_results

Unnamed: 0,id,env_medium,consensus_id,consensus_lab,notes
0,631934,oil contaminated soil,ENVO:00002875,oil contaminated soil,env_package_normalization.EnvPackage=soil
1,631935,oil contaminated soil,ENVO:00002875,oil contaminated soil,env_package_normalization.EnvPackage=soil
2,631936,oil contaminated soil,ENVO:00002875,oil contaminated soil,env_package_normalization.EnvPackage=soil
3,631937,oil contaminated soil,ENVO:00002875,oil contaminated soil,env_package_normalization.EnvPackage=soil
4,631938,greenhouse soil,ENVO:00005780,greenhouse soil,env_package_normalization.EnvPackage=soil
...,...,...,...,...,...
15266,1906787,arable soil,ENVO:00005742,arable soil,env_package_normalization.EnvPackage=soil
15267,1906788,arable soil,ENVO:00005742,arable soil,env_package_normalization.EnvPackage=soil
15268,1906789,arable soil,ENVO:00005742,arable soil,env_package_normalization.EnvPackage=soil
15269,1906790,arable soil,ENVO:00005742,arable soil,env_package_normalization.EnvPackage=soil
