In [1]:
import pandas as pd
import re
import requests as requests
import sqlite3
import string
import urllib
import yaml

import scoped_mapping

from datetime import datetime
from pkg_resources import get_distribution, DistributionNotFound
from strsimpy.cosine import Cosine
from xml.etree import ElementTree
from tdda import rexpy

import math

# from copy import deepcopy

# Settings required from user:

See repo README for notes on setting up SQLite databases of OBO ontologies with semantic-sql, relation-graph and rdftab

### How should the mapping effort be scoped?
Additional taxonomic filters may be applied below
- soil
- sediment
- plant-associated
- ( water )

In [2]:
# Table-qualified column used in the WHERE clause that restricts which Biosamples are in scope.
scoping_col = "env_package_normalization.EnvPackage"
# Value that scoping_col must equal for a Biosample to be included.
scoping_value = "water"

### What Biosample field should be mapped to ontology classes?
- env_broad_scale
- env_local_scale
- env_medium

In [3]:
# Biosample column whose values are grouped, tidied and mapped to ontology classes.
biosample_col_to_map = "env_medium"

In [4]:
# USE CONSISTENT CAPITALIZATION FOR ONTOLOGY NAMES
# where do we require a single ontology and where can we use multiple?
# Prefix of the primary ontology; it is lower-cased later to build the SQLite
# file name and the ontology list that is sent to the search wrapper.
primary_onto_prefix = "ENVO"
# AVOID ncit, gaz
# additional_preferred_ontologies = ["obi", "pato", "efo"]
# Order matters: list position becomes the preference rank when picking the best label match.
additional_preferred_ontologies = ["po", "ncbitaxon"]
# additional_preferred_ontologies = ["ncbitaxon", "po"]

# ontology choice and ranking: "root" should probably be interpreted as a plant part,
#   not as the common ancestor of all life forms

In [5]:
# from https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml
# see also https://www.ncbi.nlm.nih.gov/biosample/docs/packages/
biosample_packages_file = "../../target/biosample_packages.xml"

# from ftp://ftp.ncbi.nlm.nih.gov//biosample/biosample_set.xml.gz
# via harmonized_table.db.gz
# in https://drive.google.com/drive/u/0/folders/1eL0v0stoduahjDpoDJIk3z2pJBAU4b2Y
biosample_sqlite_file = "../../target/harmonized-table.db"
ncbitaxon_sqlite_file = "../../../scoped-mapping/semantic-sql/db/ncbitaxon.db"

# Directory prefix; "<lower-cased primary ontology prefix>.db" is appended to it later.
primary_ontology_sqlite_prefix = "../../../scoped-mapping/semantic-sql/db/"

# Passed to the search wrapper as bad_chars.
chars_to_whiteout = "._-"
my_query_fields = ""  # OLS weighted default
# Passed to the search wrapper as rr (presumably rows requested per query -- TODO confirm).
my_row_req = 3

# what is this relative to the max distance?
# Used both as the Cosine(...) constructor argument and as the wrapper's string_dist_arg
# (presumably the k-shingle size -- TODO confirm against strsimpy).
my_string_dist_arg = 2

# Maximum cosine distance accepted in the first pass, and the looser cutoff for the salvage pass.
my_max_string_dist = 0.1
salvage_max_dist = 0.2

# Delimiter on which multi-valued annotation cells are split into their parts.
xquery_pivot_in_cell_delim = "|||"
rejoin_delim = "|"

### Settings for manual review of OLS search-based results
Results based on merging Biosample annotations to ontology classes by label or embedded ID are not exported for review at this time

In [6]:
# ols_review_file = "../local/ols_review.tsv"
# Field separator for the (currently disabled) review-file export.
ols_review_seperator = "\t"

strategy_col = "strategy"
# Name of the column that carries the default accept/reject decision for review.
include_col = "include"

# First-pass (preferred-ontology) hits are included by default ...
first_pass_include_val = True

# ... while salvage hits must be explicitly accepted by the reviewer.
salvage_include_val = False


### Settings for SSSOM output

In [7]:
# Name the SSSOM output after the mapped column and the scoping that was applied.
sssom_subject_prefix = f"{biosample_col_to_map}_{scoping_col}_{scoping_value}"
sssom_file = f"../../target/{sssom_subject_prefix}_SSSOM.tsv"

In [8]:
# per_biosample_mapping_results_file = "per_biosample_scoped_mapping_results.tsv"

----

In [9]:
# Open connections to the Biosample, NCBITaxon and primary-ontology SQLite files.
biosample_cnx = sqlite3.connect(biosample_sqlite_file)
ncbitaxon_cnx = sqlite3.connect(ncbitaxon_sqlite_file)
primary_ontology_sqlite_file = (
    primary_ontology_sqlite_prefix + primary_onto_prefix.lower() + ".db"
)
primary_ontology_cnx = sqlite3.connect(primary_ontology_sqlite_file)

# Put the primary ontology first in the preference list.
# The list is rebuilt instead of calling insert(0, ...) so that re-running this
# cell does not add the primary ontology a second time; a duplicated entry would
# later produce duplicate ranks and multiply rows when ranks are merged in.
_primary_onto_lower = primary_onto_prefix.lower()
additional_preferred_ontologies = [_primary_onto_lower] + [
    onto for onto in additional_preferred_ontologies if onto != _primary_onto_lower
]
# Comma-separated form handed to the search wrapper as its ontology filter.
additional_preferred_ontologies_str = ",".join(additional_preferred_ontologies)

In [10]:
# Load the per-ontology ID regex patterns from the Biosample SQLite database.
q = "select * from id_patterns"
id_patterns_frame, query_duration = scoped_mapping.timed_query(q, biosample_cnx)
id_patterns_frame

Unnamed: 0,ontology,id_pattern
0,BFO,BFO:\d{7}
1,CARO,CARO:\d{7}
2,CHEBI,"CHEBI:\d{4,6}"
3,ENVO,"ENVO:\d{7,8}"
4,FAO,FAO:0000001
5,FOODON,FOODON:\d{8}
6,GO,GO:\d{7}
7,IAO,IAO:\d{7}
8,NCBITaxon,NCBITaxon:\d+
9,OBI,OBI:\d{7}


In [11]:
# Lookup table: ontology prefix -> regular expression matching that ontology's IDs.
id_patterns = {
    onto: pattern
    for onto, pattern in zip(id_patterns_frame.ontology, id_patterns_frame.id_pattern)
}

- env_package = env_package annotation from NCBI Biosample file XXX
- count = number of biosamples using that env_package annotation
- lhs = checklist info
- rhs = potential package info
- extract = potential OBO ID from rhs column (currently hardcoded and only looking for ENVO IDs)
- remaining_string = rhs/string, with potential OBO IDs removed
- remaining_tidied = remaining_string with case, whitespace and punctuation normalization
- rt_override = some remaining_tidied values can be replaced according to env_package_overrides
- EnvPackage = corresponding de-normalized value from package_dictionary
- is_canonical = false when EnvPackage is NaN

In [12]:
# cursor = biosample_cnx.cursor()
# cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
# print(cursor.fetchall())

In [13]:
# Pull the whole env_package normalization table into memory.
q = "select * from env_package_normalization"
env_package_normalization, query_duration = scoped_mapping.timed_query(q, biosample_cnx)


## Get a table of MIxS annotations to be mapped to ontology classes.

Explicitly scope based on normalized package data. These values were set at the top of this notebook.


In [14]:
# Echo the settings chosen at the top of the notebook, one per line.
print(biosample_col_to_map, scoping_col, scoping_value, sep="\n")

env_medium
env_package_normalization.EnvPackage
water


**In this case, the scoping includes an inner join requirement for 'unclassified entities'**

In [15]:
# Count Biosamples per distinct value of the column being mapped, restricted to the
# chosen scope and to samples that also appear in biosample_sample_taxon_summary.
# NOTE: the column names and the scoping value are interpolated straight into the SQL
# text; they come from the settings cells above, not from external input.
q = f"""select {biosample_col_to_map}, count(*) as count
from
    biosample b
join env_package_normalization on
    b.env_package = env_package_normalization.env_package
inner join biosample_sample_taxon_summary stic on
    b.taxonomy_id = stic.biosample_taxid
where {scoping_col} = '{scoping_value}' group by {biosample_col_to_map}
order by
    count(*) desc"""

mapping_candidates, query_duration = scoped_mapping.timed_query(q, biosample_cnx)

# Snapshot of the raw candidates, written to the working directory.
mapping_candidates.to_csv("mapping_candidates.csv")


----

## The Biosample format allows for pipe-delimited environmental package lists. 

They haven't been tidied yet.

Singletons haven't been filtered out.
- keep them because some may be easy to map anyway?

Separate those out into their components.

----


In [16]:
def _split_annotation(raw_value):
    """Return one row per delimiter-separated part of a single annotation value.

    A missing (None) annotation is treated as the empty string, which yields a
    single empty part. Columns: index, repeated (the whole value), splitted
    (one part), part_count (number of parts) and seq (1-based part position).
    """
    whole = "" if raw_value is None else raw_value
    parts = whole.split(xquery_pivot_in_cell_delim)
    n_parts = len(parts)
    per_part = pd.DataFrame(
        dict(repeated=pd.Series([whole] * n_parts), splitted=pd.Series(parts))
    ).reset_index()
    per_part["part_count"] = n_parts
    per_part["seq"] = list(range(1, n_parts + 1))
    return per_part


# Position 0 of each tuple is the frame index; position 1 is the first data column,
# i.e. the annotation being mapped.
multi_frames = [
    _split_annotation(record[1])
    for record in mapping_candidates.itertuples(index=True, name="PandaZ")
]
concat_frame = pd.concat(multi_frames)[["repeated", "splitted", "part_count", "seq"]]


In [17]:
# Attach the split-out parts to their originating annotation (one row per part).
mapping_candidates = pd.merge(
    mapping_candidates,
    concat_frame,
    how="left",
    left_on=biosample_col_to_map,
    right_on="repeated",
)

mapping_candidates

Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq
0,fresh water,5847,fresh water,fresh water,1,1
1,water,1913,water,water,1,1
2,sea water [ENVO:00002149],1100,sea water [ENVO:00002149],sea water [ENVO:00002149],1,1
3,"particulate matter, including plankton (ENVO:x...",941,"particulate matter, including plankton (ENVO:x...","particulate matter, including plankton (ENVO:x...",1,1
4,2225,879,2225,2225,1,1
...,...,...,...,...,...,...
208,seawater/brine transition zone lower halocline...,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,1,1
209,seawater/brine transition zone lower halocline...,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,1,1
210,seawater/brine transition zone upper halocline...,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,1,1
211,seawater/brine transition zone upper halocline...,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,1,1


## Normalize a few different ways `OBO` IDs have been entered
In the Biosample metadata

In [18]:
# Standardize anything that could be an ID: the primary prefix followed by ":", "_" or
# a space (any letter case) becomes the canonical "<PREFIX>:".
#   If the right-hand side turns out to be text, the prefix is stripped again later.
prefix_variant_pattern = primary_onto_prefix.lower() + "[:_ ]"
canonical_prefix = primary_onto_prefix + ":"
mapping_candidates["term_tidy"] = mapping_candidates["splitted"].str.replace(
    prefix_variant_pattern, canonical_prefix, case=False, regex=True
)


# Now try to extract ontology terms that are already present

In [19]:
# should try joining against additional SQLite ontology databases
primary_id_pattern = id_patterns[primary_onto_prefix]
candidate_series_decomposition = scoped_mapping.decompose_series(
    mapping_candidates["term_tidy"], primary_id_pattern
)

# Place the decomposition columns next to the candidates (aligned on the index).
mapping_candidates = pd.concat(
    [mapping_candidates, candidate_series_decomposition], axis=1
)

# Strip any primary-ontology prefix that is left over in the tidied text.
leftover_prefix_pattern = primary_onto_prefix.lower() + "[:_ ]"
tidied_text = mapping_candidates["remaining_tidied"]
mapping_candidates["remaining_tidied"] = tidied_text.str.replace(
    leftover_prefix_pattern, "", regex=True, case=False
)


## Join the extracted IDs with their labels

_Start by connecting to the rdftab database from which the terms and label-like annotations will be obtained?_

## extracting the labels

In [20]:
# Fetch (term ID, label) pairs for the selected ontologies.
q = "select stanza as subject, value from ids_labs_selected_ontolgies"

onto_labels, query_duration = scoped_mapping.timed_query(q, biosample_cnx)

onto_labels

Unnamed: 0,subject,value
0,BFO:0000001,entity
1,BFO:0000002,continuant
2,BFO:0000003,occurrent
3,BFO:0000004,independent continuant
4,BFO:0000006,spatial region
...,...,...
13129,UBERON:3000972,head external integument structure
13130,UBERON:3000977,body external integument structure
13131,UBERON:3010200,vasculature of respiratory integument
13132,UBERON:4100000,skeletal element projection


## and merging 

In [21]:
# Look up the ontology label for every ID that was extracted from the annotation text.
mapping_candidates = pd.merge(
    mapping_candidates,
    onto_labels,
    how="left",
    left_on="extract",
    right_on="subject",
)


In [22]:
# Overwrites the mapping_candidates.csv snapshot written right after the scoped query,
# now including the tidied terms, extracted IDs and their labels.
mapping_candidates.to_csv("mapping_candidates.csv")

## Use cosine string distance to see if the labels match closely enough

That is, compare the labels claimed by the Biosample data set with the labels asserted in the ontology. If they're close enough, consider the assigned ID legitimate.

_How close is close enough?_

In [23]:
my_cosine_obj = Cosine(my_string_dist_arg)

# Missing text on either side is compared as the empty string.
mapping_candidates = mapping_candidates.fillna({"value": "", "remaining_tidied": ""})


def _claimed_vs_ontology_label_distance(candidate_row):
    """Case-insensitive cosine distance between the claimed text and the ontology label."""
    claimed_text = candidate_row["remaining_tidied"].lower()
    ontology_label = candidate_row["value"].lower()
    return my_cosine_obj.distance(claimed_text, ontology_label)


mapping_candidates["cosine"] = mapping_candidates.apply(
    _claimed_vs_ontology_label_distance, axis=1
)


**Previously, we did a reality check on the claimed IDs and labels. If a label is claimed without any ID, that could still be a path to an ontology term.**

We'll be doing some merging, so make sure column names are meaningful and unique


In [24]:
# what's a better pattern for subsetting and renaming columns?

# Positional names for the 14 columns present after the ID-based label lookup.
# The last three rename that lookup's subject / value / cosine columns.
columns_after_id_lookup = [
    biosample_col_to_map,
    "count",
    "repeated",
    "splitted",
    "part_count",
    "seq",
    "term_tidy",
    "string",
    "extract",
    "remaining_string",
    "remaining_tidied",
    "term_id",
    "lab_from_id",
    "lfi_cosine",
]
mapping_candidates.columns = columns_after_id_lookup

# Second lookup: find a term whose label equals the tidied text exactly.
mapping_candidates = pd.merge(
    mapping_candidates,
    onto_labels,
    how="left",
    left_on="remaining_tidied",
    right_on="value",
)

# The merge appended subject and value; name them for their new role.
mapping_candidates.columns = columns_after_id_lookup + ["term_id_from_lab", "value"]

mapping_candidates = mapping_candidates.drop_duplicates()

# mapping_candidates

In [25]:
# Split "PREFIX:local_id" on the colon; rows without a label-derived ID are not lists.
mc_tifl_split = mapping_candidates["term_id_from_lab"].str.split(":")

# Lower-cased ontology prefix per row, or "" when there is no label-derived ID.
prefix_when_possible = [
    pieces[0].lower() if isinstance(pieces, list) else ""
    for pieces in mc_tifl_split
]

# prefix_when_possible


In [26]:
# Record, per row, which ontology the label-derived ID belongs to (positional assignment).
mapping_candidates = mapping_candidates.assign(onto_prefix=prefix_when_possible)

# mapping_candidates

In [27]:
# Rank ontologies by their position in the preference list (0 = most preferred).
ranks = list(range(len(additional_preferred_ontologies)))
onto_rank_frame = pd.DataFrame(
    {"onto_prefix": additional_preferred_ontologies, "rank": ranks}
)

onto_rank_frame

Unnamed: 0,onto_prefix,rank
0,envo,0
1,po,1
2,ncbitaxon,2


In [28]:
# Attach the preference rank; prefixes outside the preferred list get a missing rank.
mapping_candidates = pd.merge(
    mapping_candidates, onto_rank_frame, how="left", on="onto_prefix"
)

# mapping_candidates

In [29]:
# Best (lowest) ontology rank available for each distinct annotation.
rank_by_annotation = mapping_candidates.groupby(biosample_col_to_map)["rank"]
best_maps = rank_by_annotation.min().reset_index()

# best_maps

In [30]:
# Keep only the rows whose rank equals the best rank for their annotation.
mapping_candidates = pd.merge(
    mapping_candidates, best_maps, on=[biosample_col_to_map, "rank"]
)

mapping_candidates

Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq,term_tidy,string,extract,remaining_string,remaining_tidied,term_id,lab_from_id,lfi_cosine,term_id_from_lab,value,onto_prefix,rank
0,fresh water,5847,fresh water,fresh water,1,1,fresh water,fresh water,,fresh water,fresh water,,,1.0,ENVO:00002011,fresh water,envo,0.0
1,water,1913,water,water,1,1,water,water,,water,water,,,1.0,CHEBI:15377,water,chebi,
2,sea water [ENVO:00002149],1100,sea water [ENVO:00002149],sea water [ENVO:00002149],1,1,sea water [ENVO:00002149],sea water [ENVO:00002149],ENVO:00002149,sea water,sea water,ENVO:00002149,sea water,0.0,ENVO:00002149,sea water,envo,0.0
3,"particulate matter, including plankton (ENVO:x...",941,"particulate matter, including plankton (ENVO:x...","particulate matter, including plankton (ENVO:x...",1,1,"particulate matter, including plankton (ENVO:x...","particulate matter, including plankton (ENVO:x...",,"particulate matter, including plankton (ENVO:x...",particulate matter including plankton xxxxxxxx,,,1.0,,,,
4,2225,879,2225,2225,1,1,2225,2225,,2225,2225,,,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,seawater/brine transition zone lower halocline...,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,1,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,,seawater/brine transition zone lower halocline...,seawater brine transition zone lower halocline...,,,1.0,,,,
209,seawater/brine transition zone lower halocline...,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,1,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,,seawater/brine transition zone lower halocline...,seawater brine transition zone lower halocline...,,,1.0,,,,
210,seawater/brine transition zone upper halocline...,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,1,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,,seawater/brine transition zone upper halocline...,seawater brine transition zone upper halocline...,,,1.0,,,,
211,seawater/brine transition zone upper halocline...,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,1,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,,seawater/brine transition zone upper halocline...,seawater brine transition zone upper halocline...,,,1.0,,,,


## Find consensus term IDs and labels


In [31]:
# Start from the exact-label match, when there is one.
mapping_candidates["consensus_id"] = mapping_candidates["term_id_from_lab"]
mapping_candidates["consensus_lab"] = mapping_candidates["value"]

# Fall back to the ID claimed in the annotation when there is no label match and either
# the claimed text is close enough to that ID's label or no text accompanied the ID.
lacks_label_match = mapping_candidates["consensus_id"].isnull()
has_claimed_id = mapping_candidates["term_id"].notnull()
label_close_enough = mapping_candidates["lfi_cosine"].le(my_max_string_dist)
no_claimed_text = mapping_candidates["remaining_tidied"].eq("")
flag = lacks_label_match & has_claimed_id & (label_close_enough | no_claimed_text)

mapping_candidates.loc[flag, "consensus_id"] = mapping_candidates.loc[flag, "term_id"]
mapping_candidates.loc[flag, "consensus_lab"] = mapping_candidates.loc[
    flag, "lab_from_id"
]

# A row is settled once it has any consensus ID.
flag = mapping_candidates["consensus_id"].isna()
antiflag = ~flag
mapping_candidates["id_or_lab_ok"] = antiflag

# "label [ID]" display form; stays missing when either piece is missing.
consensus_label = mapping_candidates["consensus_lab"]
consensus_term = mapping_candidates["consensus_id"]
mapping_candidates["assembled_consensus"] = consensus_label + " [" + consensus_term + "]"

In [32]:
# Display the candidates together with their consensus ID/label columns.
mapping_candidates

Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq,term_tidy,string,extract,remaining_string,...,lab_from_id,lfi_cosine,term_id_from_lab,value,onto_prefix,rank,consensus_id,consensus_lab,id_or_lab_ok,assembled_consensus
0,fresh water,5847,fresh water,fresh water,1,1,fresh water,fresh water,,fresh water,...,,1.0,ENVO:00002011,fresh water,envo,0.0,ENVO:00002011,fresh water,True,fresh water [ENVO:00002011]
1,water,1913,water,water,1,1,water,water,,water,...,,1.0,CHEBI:15377,water,chebi,,CHEBI:15377,water,True,water [CHEBI:15377]
2,sea water [ENVO:00002149],1100,sea water [ENVO:00002149],sea water [ENVO:00002149],1,1,sea water [ENVO:00002149],sea water [ENVO:00002149],ENVO:00002149,sea water,...,sea water,0.0,ENVO:00002149,sea water,envo,0.0,ENVO:00002149,sea water,True,sea water [ENVO:00002149]
3,"particulate matter, including plankton (ENVO:x...",941,"particulate matter, including plankton (ENVO:x...","particulate matter, including plankton (ENVO:x...",1,1,"particulate matter, including plankton (ENVO:x...","particulate matter, including plankton (ENVO:x...",,"particulate matter, including plankton (ENVO:x...",...,,1.0,,,,,,,False,
4,2225,879,2225,2225,1,1,2225,2225,,2225,...,,1.0,,,,,,,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,seawater/brine transition zone lower halocline...,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,1,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,,seawater/brine transition zone lower halocline...,...,,1.0,,,,,,,False,
209,seawater/brine transition zone lower halocline...,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,1,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,,seawater/brine transition zone lower halocline...,...,,1.0,,,,,,,False,
210,seawater/brine transition zone upper halocline...,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,1,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,,seawater/brine transition zone upper halocline...,...,,1.0,,,,,,,False,
211,seawater/brine transition zone upper halocline...,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,1,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,,seawater/brine transition zone upper halocline...,...,,1.0,,,,,,,False,


## For which Biosample annotations were no easy mappings found?
How many Biosamples use those annotations?

In [33]:
# Annotations that still have no consensus ID, with their Biosample counts.
needs_search = mapping_candidates.loc[
    ~mapping_candidates["id_or_lab_ok"], ["remaining_tidied", "count"]
]

# Total Biosamples per tidied string, most frequent first.
sum_by_needed = (
    needs_search.groupby("remaining_tidied")["count"]
    .sum()
    .to_frame()
    .assign(remaining_tidied=lambda totals: totals.index)
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

sum_by_needed

# sum_by_needed.to_sql("sum_by_needed", biosample_cnx, if_exists="replace", index=False)

Unnamed: 0,count,remaining_tidied
0,941,particulate matter including plankton xxxxxxxx
1,879,2225
2,850,1000060
3,844,
4,517,seawater
...,...,...
105,1,rhizosphere 6
106,1,rhizosphere 7
107,1,rhizosphere 8
108,1,rhizosphere 9


In [34]:
# Drop strings used by only a single Biosample.
sum_by_needed["keep_flag"] = sum_by_needed["count"] > 1

sum_by_needed = sum_by_needed[sum_by_needed["keep_flag"]]
sum_by_needed

Unnamed: 0,count,remaining_tidied,keep_flag
0,941,particulate matter including plankton xxxxxxxx,True
1,879,2225,True
2,850,1000060,True
3,844,,True
4,517,seawater,True
...,...,...,...
65,2,deep chlorophyll maximum,True
66,2,brownish colored sea ice,True
67,2,supraglacial lake water,True
68,2,sterivex cartridges,True


## Extract the tidied strings

In [35]:
# Alphabetically sorted list of the tidied strings that still need a search.
generic_raw_list = sorted(sum_by_needed["remaining_tidied"])

## Submit those tidied strings to a search engine

Specifically OLS search. This takes roughly one second per unique post-tidied submission

_Turn logging back on to show status?_
_Print the count and pre- and post- datestamps_
`datetime.now().replace(microsecond=0).isoformat()` ?


In [36]:
# Timestamp (to the second) and number of strings about to be submitted.
search_started_at = datetime.now().replace(microsecond=0)
print(search_started_at.isoformat())
print(len(sum_by_needed))

2021-07-07T08:08:17
70


In [37]:

# First-pass search, restricted to the preferred ontologies.
# query_fields now comes from the my_query_fields setting instead of a hardcoded "",
# so changing that setting actually affects the search.
generic_search_res = scoped_mapping.search_get_annotations_wrapper(
    generic_raw_list,
    bad_chars=chars_to_whiteout,
    cat_name=biosample_col_to_map,
    ontoprefix=additional_preferred_ontologies_str,
    query_fields=my_query_fields,
    rr=my_row_req,
    string_dist_arg=my_string_dist_arg,
)
# generic_search_res.to_sql(
#     "ols_search_results", biosample_cnx, if_exists="replace", index=False
# )

## Filter out the best of the acceptable mappings
From a string distance perspective

save generic_search_res to db?

In [38]:
# Keep the best hits whose string distance is within the first-pass cutoff.
# NOTE(review): a later cell guards against this returning None -- confirm the contract.
my_best_acceptable = scoped_mapping.get_best_acceptable(
    generic_search_res, max_string_dist=my_max_string_dist
)


In [39]:
# Label first-pass hits; nothing to label when there were no acceptable hits.
if my_best_acceptable is not None:
    my_best_acceptable = my_best_acceptable.assign(category='preferred ontologies')

In [40]:
# Display the accepted first-pass hits.
my_best_acceptable

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
26,preferred ontologies,brownish colored sea ice,brownish colored sea ice,brownish colored sea ice,1,0.0,ENVO:01001190,brown sea ice,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_01001190,envo
43,preferred ontologies,coastal water,coastal water,coastal water,1,0.0,ENVO:00002150,coastal sea water,1,ENVO,has_broad_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00002150,envo
105,preferred ontologies,freshwater,freshwater,freshwater,1,0.0,ENVO:00002011,fresh water,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00002011,envo
238,preferred ontologies,ocean water,ocean water,ocean water,1,0.0,ENVO:00002149,sea water,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00002149,envo
305,preferred ontologies,seawater,seawater,seawater,1,0.0,ENVO:00002149,sea water,1,ENVO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00002149,envo


----

## Filter out the submissions with no acceptable matches

In [41]:
# Search results for submissions that have no acceptable first-pass hit.
no_acceptable_mappings = scoped_mapping.get_no_acceptable_mappings(
    generic_search_res, my_best_acceptable
)


## Try searching the failures against all ontologies in OLS
what would it be like to use a local OLS instance? (latency-wise)

In [42]:
# Unique raw strings that are still unmapped, in alphabetical order.
still_unmapped = sorted(set(no_acceptable_mappings["raw"]))

print(len(still_unmapped))

still_unmapped

65


['',
 '00002110',
 '00002150',
 '1000060',
 '2225',
 'aluminum',
 'aquifer water',
 'borehole water',
 'brass plastic',
 'cardboard',
 'cast iron',
 'cdna libraries generated from cells filtered from seawater',
 'ci',
 'deep chlorophyll maximum',
 'diffuse fluid',
 'dna',
 'dna isolated from cells filtered from seawater',
 'encrusting benthic community',
 'env 00002010',
 'environmental dna extracted from a hot spring',
 'filtered seawater xxxxxxxxx',
 'filtered water',
 'for seawater',
 'free living fraction',
 'genomic dna',
 'genomic dna isolated from cells filtered from seawater',
 'hdpe',
 'liquid culture',
 'liquid culture fraction 8µm',
 'lysis buffer',
 'marine',
 'marine habitat',
 'marine water',
 'mat',
 'mat sediment water slurry',
 'missing',
 'na',
 'near bottom water',
 'not applicable',
 'oil water mixture',
 'particle associated fraction',
 'particulate matter including plankton xxxxxxxx',
 'particulate matter including planktonic material',
 'particulate matter xxxxxx

In [43]:
# Timestamp (to the second) and number of strings going into the salvage search.
salvage_started_at = datetime.now().replace(microsecond=0)
print(salvage_started_at.isoformat())
print(len(still_unmapped))


2021-07-07T08:08:49
65


In [44]:
# Salvage pass: retry the failures with no ontology restriction (ontoprefix="").
# bad_chars and query_fields now come from the settings cell (chars_to_whiteout,
# my_query_fields) rather than repeated literals, so both passes tidy and query alike.
salvage_search_res = scoped_mapping.search_get_annotations_wrapper(
    still_unmapped,
    bad_chars=chars_to_whiteout,
    cat_name="salvage",
    ontoprefix="",
    query_fields=my_query_fields,
    rr=my_row_req,
    string_dist_arg=my_string_dist_arg,
)

# salvage_search_res.to_sql(
#     "salvage_search_res", biosample_cnx, if_exists="replace", index=False
# )

## We appear to be at a point of diminishing returns
At the very least, will require some review

In [45]:
# remove ncit and gaz hits? 
# Same filter as the first pass, but with the looser salvage distance cutoff.
salvage_best_acceptable = scoped_mapping.get_best_acceptable(
    salvage_search_res, max_string_dist=salvage_max_dist
)
salvage_best_acceptable

Unnamed: 0,category,raw,query,name,string_dist_rank,string_dist,obo_id,label,search_rank,ontology_prefix,scope,type,iri,ontology_name
17,salvage,aluminum,aluminum,Aluminum,1,0.0,NCIT:C93162,Aluminum,1,NCIT,label,label,http://purl.obolibrary.org/obo/NCIT_C93162,ncit
58,salvage,ci,ci,ci,1,0.0,,ci,1,CCO,label,label,http://identifiers.org/ncbigene/43767,cco
63,salvage,deep chlorophyll maximum,deep chlorophyll maximum,deep chlorophyll maximum layer,1,0.109,ENVO:01000326,deep chlorophyll maximum layer,1,ENVO,label,label,http://purl.obolibrary.org/obo/ENVO_01000326,envo
83,salvage,dna,dna,DNA,1,0.0,NCIT:C449,DNA,1,NCIT,label,label,http://purl.obolibrary.org/obo/NCIT_C449,ncit
168,salvage,filtered water,filtered water,filtered seawater,1,0.132,MICRO:0001773,filtered seawater,1,MICRO,label,label,http://purl.obolibrary.org/obo/MICRO_0001773,micro
177,salvage,genomic dna,genomic dna,Genomic DNA,1,0.0,NCIT:C95940,Genomic DNA,1,NCIT,label,label,http://purl.obolibrary.org/obo/NCIT_C95940,ncit
192,salvage,hdpe,hdpe,hdpe,1,0.0,NCIT:C83759,High Density Polyethylene,1,NCIT,synonym,synonym,http://purl.obolibrary.org/obo/NCIT_C83759,ncit
226,salvage,lysis buffer,lysis buffer,tcl lysis buffer,1,0.144,NCIT:C178572,Buffer TCL,2,NCIT,synonym,synonym,http://purl.obolibrary.org/obo/NCIT_C178572,ncit
271,salvage,marine water,marine water,marine waterbody,1,0.144,ENVO:00001999,marine water body,3,MICRO,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00001999,micro
272,salvage,mat,mat,MAT,1,0.0,format:3626,MAT,1,EDAM,label,label,http://edamontology.org/format_3626,edam


## Save the mappings for review!

_Note that the mappings obtained by merging on ID or label are not included here_


In [46]:
# Tag each result set with its default review decision, then stack whichever sets exist.
# Either result may be None rather than an empty frame (the first-pass result is already
# guarded that way earlier in this notebook), so each one is checked before it is touched.
# Previously the salvage frame was written to before its None check.
review_parts = []

if my_best_acceptable is not None:
    my_best_acceptable = my_best_acceptable.assign(
        **{include_col: first_pass_include_val}
    )
    review_parts.append(my_best_acceptable)

if salvage_best_acceptable is not None:
    salvage_best_acceptable = salvage_best_acceptable.assign(
        **{include_col: salvage_include_val}
    )
    review_parts.append(salvage_best_acceptable)

if not review_parts:
    # Without this, best_and_salvage would be left undefined and the next cell would
    # fail with an unrelated-looking NameError.
    raise ValueError(
        "Neither the preferred-ontology search nor the salvage search produced "
        "acceptable mappings; there is nothing to review."
    )

best_and_salvage = pd.concat(review_parts)

In [47]:
# Blank column for the reviewer's free-text notes.
best_and_salvage = best_and_salvage.assign(ols_curation_notes="")

# best_and_salvage.to_csv(ols_review_file, sep=ols_review_seperator, index=False)

# Put the columns in review order. The decision column is referenced through include_col
# (not a literal "include") so that it matches the name used when it was created.
best_and_salvage = best_and_salvage[
    [
        "category",
        "raw",
        "query",
        "name",
        "obo_id",
        "label",
        "ontology_prefix",
        include_col,
        "ols_curation_notes",
        "scope",
        "type",
        "iri",
        "ontology_name",
        "string_dist_rank",
        "string_dist",
        "search_rank",
    ]
]

In [48]:
# Add the locally known label ("value") for each hit's ID, where one exists.
best_and_salvage = pd.merge(
    best_and_salvage,
    onto_labels,
    how="left",
    left_on="obo_id",
    right_on="subject",
)

# for esoteric ontologies, may need to look up with OLS API?
#  if we want to differentiate imported and owner labels that is

In [49]:
# Final review layout: the locally known label ("value") sits next to the ID, and the
# merge-only "subject" column is dropped. The decision column is referenced through
# include_col (not a literal "include") so a renamed setting keeps working here.
review_columns = [
    "category",
    "raw",
    "query",
    "name",
    "obo_id",
    "value",
    "label",
    "ontology_prefix",
    include_col,
    "ols_curation_notes",
    "scope",
    "type",
    "iri",
    "ontology_name",
    "string_dist_rank",
    "string_dist",
    "search_rank",
]
best_and_salvage = best_and_salvage[review_columns]

# Rows not included by default come first; within each group, order by ontology.
best_and_salvage = best_and_salvage.sort_values(
    [include_col, "ontology_prefix"], ascending=[True, True]
)

# An ID with several local labels yields repeated rows after the merge; collapse exact repeats.
best_and_salvage = best_and_salvage.drop_duplicates()

# Renumber 0..n-1 so later positional concatenation lines up.
best_and_salvage = best_and_salvage.reset_index(drop=True)

best_and_salvage

Unnamed: 0,category,raw,query,name,obo_id,value,label,ontology_prefix,include,ols_curation_notes,scope,type,iri,ontology_name,string_dist_rank,string_dist,search_rank
0,salvage,ci,ci,ci,,,ci,CCO,False,,label,label,http://identifiers.org/ncbigene/43767,cco,1,0.0,1
1,salvage,na,na,,CO_357:2100001,,,CO_357,False,,label,label,http://www.cropontology.org/rdf/CO_357:2100001,co_357,1,0.0,1
2,salvage,mat,mat,MAT,format:3626,,MAT,EDAM,False,,label,label,http://edamontology.org/format_3626,edam,1,0.0,1
3,salvage,deep chlorophyll maximum,deep chlorophyll maximum,deep chlorophyll maximum layer,ENVO:01000326,deep chlorophyll maximum layer,deep chlorophyll maximum layer,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_01000326,envo,1,0.109,1
4,salvage,sea water oligotrophic water,sea water oligotrophic water,oligotrophic water,ENVO:00002223,oligotrophic water,oligotrophic water,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_00002223,envo,1,0.123,1
5,salvage,supraglacial lake water,supraglacial lake water,supraglacial lake,ENVO:03000035,supraglacial lake,supraglacial lake,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_03000035,envo,1,0.134,1
6,salvage,polypropylene,polypropylene,polypropylene,FOODON:03500021,,polypropylene,GENEPIO,False,,label,label,http://purl.obolibrary.org/obo/FOODON_03500021,genepio,1,0.0,1
7,salvage,filtered water,filtered water,filtered seawater,MICRO:0001773,,filtered seawater,MICRO,False,,label,label,http://purl.obolibrary.org/obo/MICRO_0001773,micro,1,0.132,1
8,salvage,marine water,marine water,marine waterbody,ENVO:00001999,marine water body,marine water body,MICRO,False,,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00001999,micro,1,0.144,3
9,salvage,aluminum,aluminum,Aluminum,NCIT:C93162,,Aluminum,NCIT,False,,label,label,http://purl.obolibrary.org/obo/NCIT_C93162,ncit,1,0.0,1


## print mapped column and scoping

In [50]:
bas_rowcount = len(best_and_salvage.index)

# One row per review row, each repeating the scoping settings used for this run.
constants = pd.DataFrame(
    {
        "scoping_col": pd.Series([scoping_col] * bas_rowcount),
        "scoping_value": pd.Series([scoping_value] * bas_rowcount),
        "biosample_col_to_map": pd.Series([biosample_col_to_map] * bas_rowcount),
    }
)

# Prepend the settings columns; rows are matched on their 0..n-1 index.
best_and_salvage = pd.concat([constants, best_and_salvage], axis=1)

best_and_salvage


Unnamed: 0,scoping_col,scoping_value,biosample_col_to_map,category,raw,query,name,obo_id,value,label,ontology_prefix,include,ols_curation_notes,scope,type,iri,ontology_name,string_dist_rank,string_dist,search_rank
0,env_package_normalization.EnvPackage,water,env_medium,salvage,ci,ci,ci,,,ci,CCO,False,,label,label,http://identifiers.org/ncbigene/43767,cco,1,0.0,1
1,env_package_normalization.EnvPackage,water,env_medium,salvage,na,na,,CO_357:2100001,,,CO_357,False,,label,label,http://www.cropontology.org/rdf/CO_357:2100001,co_357,1,0.0,1
2,env_package_normalization.EnvPackage,water,env_medium,salvage,mat,mat,MAT,format:3626,,MAT,EDAM,False,,label,label,http://edamontology.org/format_3626,edam,1,0.0,1
3,env_package_normalization.EnvPackage,water,env_medium,salvage,deep chlorophyll maximum,deep chlorophyll maximum,deep chlorophyll maximum layer,ENVO:01000326,deep chlorophyll maximum layer,deep chlorophyll maximum layer,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_01000326,envo,1,0.109,1
4,env_package_normalization.EnvPackage,water,env_medium,salvage,sea water oligotrophic water,sea water oligotrophic water,oligotrophic water,ENVO:00002223,oligotrophic water,oligotrophic water,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_00002223,envo,1,0.123,1
5,env_package_normalization.EnvPackage,water,env_medium,salvage,supraglacial lake water,supraglacial lake water,supraglacial lake,ENVO:03000035,supraglacial lake,supraglacial lake,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_03000035,envo,1,0.134,1
6,env_package_normalization.EnvPackage,water,env_medium,salvage,polypropylene,polypropylene,polypropylene,FOODON:03500021,,polypropylene,GENEPIO,False,,label,label,http://purl.obolibrary.org/obo/FOODON_03500021,genepio,1,0.0,1
7,env_package_normalization.EnvPackage,water,env_medium,salvage,filtered water,filtered water,filtered seawater,MICRO:0001773,,filtered seawater,MICRO,False,,label,label,http://purl.obolibrary.org/obo/MICRO_0001773,micro,1,0.132,1
8,env_package_normalization.EnvPackage,water,env_medium,salvage,marine water,marine water,marine waterbody,ENVO:00001999,marine water body,marine water body,MICRO,False,,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00001999,micro,1,0.144,3
9,env_package_normalization.EnvPackage,water,env_medium,salvage,aluminum,aluminum,Aluminum,NCIT:C93162,,Aluminum,NCIT,False,,label,label,http://purl.obolibrary.org/obo/NCIT_C93162,ncit,1,0.0,1


In [51]:
# Put the candidate mappings on the system clipboard (without the index column)
# so they can be pasted into a spreadsheet for manual review.
best_and_salvage.to_clipboard(index=False)

In [52]:
# Deliberate stop: raising here interrupts a "Run All" so the cells below are
# not executed before the manual spreadsheet review has been done.
raise SystemExit("Don't skip reviewing and saving!")

SystemExit: Don't skip reviewing and saving!

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Do some review in a spreadsheet application!
Specifically, review the `include` and `ols_curation_notes` columns.

Then copy the reviewed table (including its header row) back onto the clipboard and resume below.

**Add counts back in to motivate manual lookups if necessary**

----

In [53]:
# Load the manually reviewed table back from the system clipboard.
# NOTE(review): relies on whatever is currently on the clipboard, so this cell
# is not reproducible from code alone.
curated = pd.read_clipboard()
curated

Unnamed: 0,scoping_col,scoping_value,biosample_col_to_map,category,raw,query,name,obo_id,value,label,ontology_prefix,include,ols_curation_notes,scope,type,iri,ontology_name,string_dist_rank,string_dist,search_rank
0,env_package_normalization.EnvPackage,water,env_medium,salvage,sea water oligotrophic water,sea water oligotrophic water,oligotrophic water,ENVO:00002223,oligotrophic water,oligotrophic water,ENVO,False,,label,label,http://purl.obolibrary.org/obo/ENVO_00002223,envo,1,0.123,1
1,env_package_normalization.EnvPackage,water,env_medium,salvage,deep chlorophyll maximum,deep chlorophyll maximum,deep chlorophyll maximum layer,ENVO:01000326,deep chlorophyll maximum layer,deep chlorophyll maximum layer,ENVO,True,,label,label,http://purl.obolibrary.org/obo/ENVO_01000326,envo,1,0.109,1
2,env_package_normalization.EnvPackage,water,env_medium,salvage,supraglacial lake water,supraglacial lake water,supraglacial lake,ENVO:03000035,supraglacial lake,supraglacial lake,ENVO,True,,label,label,http://purl.obolibrary.org/obo/ENVO_03000035,envo,1,0.134,1
3,env_package_normalization.EnvPackage,water,env_medium,salvage,polypropylene,polypropylene,polypropylene,FOODON:03500021,,polypropylene,GENEPIO,True,,label,label,http://purl.obolibrary.org/obo/FOODON_03500021,genepio,1,0.0,1
4,env_package_normalization.EnvPackage,water,env_medium,salvage,marine water,marine water,marine waterbody,ENVO:00001999,marine water body,marine water body,MICRO,True,,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00001999,micro,1,0.144,3
5,env_package_normalization.EnvPackage,water,env_medium,salvage,ci,ci,ci,,,ci,CCO,False,,label,label,http://identifiers.org/ncbigene/43767,cco,1,0.0,1
6,env_package_normalization.EnvPackage,water,env_medium,salvage,na,na,,CO_357:2100001,,,CO_357,False,,label,label,http://www.cropontology.org/rdf/CO_357:2100001,co_357,1,0.0,1
7,env_package_normalization.EnvPackage,water,env_medium,salvage,mat,mat,MAT,format:3626,,MAT,EDAM,False,,label,label,http://edamontology.org/format_3626,edam,1,0.0,1
8,env_package_normalization.EnvPackage,water,env_medium,salvage,filtered water,filtered water,filtered seawater,MICRO:0001773,,filtered seawater,MICRO,False,,label,label,http://purl.obolibrary.org/obo/MICRO_0001773,micro,1,0.132,1
9,env_package_normalization.EnvPackage,water,env_medium,salvage,aluminum,aluminum,Aluminum,NCIT:C93162,,Aluminum,NCIT,False,,label,label,http://purl.obolibrary.org/obo/NCIT_C93162,ncit,1,0.0,1


## Filter on `include`

In [54]:
# Keep only the rows flagged in the `include` column, and prefix every column
# name with "ols_" ahead of the merge with mapping_candidates.
flag = curated["include"]
curated = curated.loc[flag].add_prefix("ols_")
# curated.to_sql("curated", biosample_cnx, if_exists="replace", index=False)

curated

Unnamed: 0,ols_scoping_col,ols_scoping_value,ols_biosample_col_to_map,ols_category,ols_raw,ols_query,ols_name,ols_obo_id,ols_value,ols_label,ols_ontology_prefix,ols_include,ols_ols_curation_notes,ols_scope,ols_type,ols_iri,ols_ontology_name,ols_string_dist_rank,ols_string_dist,ols_search_rank
1,env_package_normalization.EnvPackage,water,env_medium,salvage,deep chlorophyll maximum,deep chlorophyll maximum,deep chlorophyll maximum layer,ENVO:01000326,deep chlorophyll maximum layer,deep chlorophyll maximum layer,ENVO,True,,label,label,http://purl.obolibrary.org/obo/ENVO_01000326,envo,1,0.109,1
2,env_package_normalization.EnvPackage,water,env_medium,salvage,supraglacial lake water,supraglacial lake water,supraglacial lake,ENVO:03000035,supraglacial lake,supraglacial lake,ENVO,True,,label,label,http://purl.obolibrary.org/obo/ENVO_03000035,envo,1,0.134,1
3,env_package_normalization.EnvPackage,water,env_medium,salvage,polypropylene,polypropylene,polypropylene,FOODON:03500021,,polypropylene,GENEPIO,True,,label,label,http://purl.obolibrary.org/obo/FOODON_03500021,genepio,1,0.0,1
4,env_package_normalization.EnvPackage,water,env_medium,salvage,marine water,marine water,marine waterbody,ENVO:00001999,marine water body,marine water body,MICRO,True,,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00001999,micro,1,0.144,3
19,env_package_normalization.EnvPackage,water,env_medium,preferred ontologies,brownish colored sea ice,brownish colored sea ice,brownish colored sea ice,ENVO:01001190,brown sea ice,brown sea ice,ENVO,True,,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_01001190,envo,1,0.0,1
20,env_package_normalization.EnvPackage,water,env_medium,preferred ontologies,coastal water,coastal water,coastal water,ENVO:00002150,coastal sea water,coastal sea water,ENVO,True,,has_broad_synonym,annotation,http://purl.obolibrary.org/obo/ENVO_00002150,envo,1,0.0,1
21,env_package_normalization.EnvPackage,water,env_medium,preferred ontologies,freshwater,freshwater,freshwater,ENVO:00002011,fresh water,fresh water,ENVO,True,,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00002011,envo,1,0.0,1
22,env_package_normalization.EnvPackage,water,env_medium,preferred ontologies,ocean water,ocean water,ocean water,ENVO:00002149,sea water,sea water,ENVO,True,,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00002149,envo,1,0.0,1
23,env_package_normalization.EnvPackage,water,env_medium,preferred ontologies,seawater,seawater,seawater,ENVO:00002149,sea water,sea water,ENVO,True,,synonym,synonym,http://purl.obolibrary.org/obo/ENVO_00002149,envo,1,0.0,1


In [55]:
# Outer join: keep every mapping candidate and every curated row, matching the
# tidied leftover text against the raw string that was searched.
merge_search_merged = pd.merge(
    mapping_candidates,
    curated,
    how="outer",
    left_on="remaining_tidied",
    right_on="ols_raw",
)

In [56]:
# merge_search_merged.to_csv("merge_search_merged.csv")

In [57]:
# Select rows where id_or_lab_ok is False and the curated search result
# carries a non-empty, non-null ontology ID.
ols_ids = merge_search_merged["ols_obo_id"]
flag = ~merge_search_merged["id_or_lab_ok"] & ols_ids.ne("") & ols_ids.notna()

# For those rows, overwrite the consensus ID and label with the curated values.
replacement = merge_search_merged.loc[flag, "ols_obo_id"]
merge_search_merged.loc[flag, "consensus_id"] = replacement

replacement = merge_search_merged.loc[flag, "ols_label"]
merge_search_merged.loc[flag, "consensus_lab"] = replacement

# Rebuild the "label [ID]" display string from the values just written.
flagged_rows = merge_search_merged.loc[flag]
replacement = flagged_rows["consensus_lab"] + " [" + flagged_rows["consensus_id"] + "]"
merge_search_merged.loc[flag, "assembled_consensus"] = replacement

merge_search_merged

Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq,term_tidy,string,extract,remaining_string,...,ols_ontology_prefix,ols_include,ols_ols_curation_notes,ols_scope,ols_type,ols_iri,ols_ontology_name,ols_string_dist_rank,ols_string_dist,ols_search_rank
0,fresh water,5847,fresh water,fresh water,1,1,fresh water,fresh water,,fresh water,...,,,,,,,,,,
1,fresh water [ENVO:00002011],42,fresh water [ENVO:00002011],fresh water [ENVO:00002011],1,1,fresh water [ENVO:00002011],fresh water [ENVO:00002011],ENVO:00002011,fresh water,...,,,,,,,,,,
2,water,1913,water,water,1,1,water,water,,water,...,,,,,,,,,,
3,Water,474,Water,Water,1,1,Water,Water,,Water,...,,,,,,,,,,
4,missing|||water,2,missing|||water,water,2,2,water,water,,water,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,seawater/brine transition zone lower halocline...,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,1,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,,seawater/brine transition zone lower halocline...,...,,,,,,,,,,
209,seawater/brine transition zone lower halocline...,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,1,1,seawater/brine transition zone lower halocline...,seawater/brine transition zone lower halocline...,,seawater/brine transition zone lower halocline...,...,,,,,,,,,,
210,seawater/brine transition zone upper halocline...,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,1,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,,seawater/brine transition zone upper halocline...,...,,,,,,,,,,
211,seawater/brine transition zone upper halocline...,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,1,1,seawater/brine transition zone upper halocline...,seawater/brine transition zone upper halocline...,,seawater/brine transition zone upper halocline...,...,,,,,,,,,,


In [58]:
# merge_search_merged.to_sql(
#     "merge_search_merged", biosample_cnx, if_exists="replace", index=False
# )

In [59]:
# Original value, its frequency and the assembled mapping, most frequent first.
summary_cols = [biosample_col_to_map, "count", "assembled_consensus"]
summary = merge_search_merged[summary_cols].sort_values("count", ascending=False)

In [60]:
# could still make better use of SSSOM columns

# Only rows that have both a source string and a consensus ID become mappings.
frame_for_sssom = merge_search_merged.dropna(subset=["splitted", "consensus_id"])

# Subject IDs are the URL-encoded source strings under the configured prefix.
raw_queries = list(frame_for_sssom["splitted"])
urlencodeds = [
    sssom_subject_prefix + ":" + urllib.parse.quote(raw_query)
    for raw_query in raw_queries
]

results_rows = len(frame_for_sssom)
iso8601_stamp = datetime.now().replace(microsecond=0).isoformat()

# Columns that hold the same value on every row are repeated to the frame's
# length; per-row columns come straight from frame_for_sssom.
subject_category = [biosample_col_to_map] * results_rows
predicate_id = ["skos:relatedMatch"] * results_rows
match_type = ["Lexical"] * results_rows
creator_id = ["https://github.com/turbomam/scoped-mapping"] * results_rows
mapping_date = [iso8601_stamp] * results_rows

# Not emitted yet:
#     "mapping_tool": ["https://www.ebi.ac.uk/ols/docs/api"] * results_rows
#     "confidence": 1 - frame_for_sssom["string_dist"]
sssom_frame = pd.DataFrame(
    {
        "subject_category": subject_category,
        "subject_label": frame_for_sssom["splitted"],
        "predicate_id": predicate_id,
        "object_id": frame_for_sssom["consensus_id"],
        "object_label": frame_for_sssom["consensus_lab"],
        "match_type": match_type,
        "creator_id": creator_id,
        "mapping_date": mapping_date,
        "subject_id": urlencodeds,
    }
)

sssom_frame.to_csv(sssom_file, sep=ols_review_seperator, index=False)



## Join together pipe-delimited _generic_ mappings 
_Any other constraints? If part_count is > 1, do we need to check for empty or null (generic), repeated or splitted?_

In [61]:
# Flag rows whose part_count is greater than 1, then tally the flag.
multi_part_flag = merge_search_merged["part_count"].gt(1)
multi_part_flag.value_counts()

False    207
True       6
Name: part_count, dtype: int64

In [62]:
# Restrict to the rows flagged as parts of a multi-part value.
multi_part_frame = merge_search_merged[multi_part_flag]


In [63]:
# Keep only the columns needed to reassemble multi-part values, ordered so the
# parts of each original value appear together in sequence order.
multi_part_cols = [
    biosample_col_to_map,
    "count",
    "repeated",
    "splitted",
    "part_count",
    "seq",
    "consensus_id",
    "consensus_lab",
]

multi_part_frame = multi_part_frame[multi_part_cols].sort_values(
    by=[biosample_col_to_map, "seq"], ascending=[True, True]
)

# multi_part_frame

In [64]:
# Discard every row of an original value if any one of its parts has no
# consensus ID (a NaN consensus_id), so only fully mapped values remain.
# example from another run: ENVO:01000030|||deep-sea sediment biome

bad_flag = multi_part_frame["consensus_id"].isna()
print(bad_flag)

# Original values that have at least one unmapped part.
bad_c2m = multi_part_frame.loc[bad_flag, biosample_col_to_map]
print(bad_c2m)

keep_flag = ~multi_part_frame[biosample_col_to_map].isin(bad_c2m)
print(keep_flag)

multi_part_frame = multi_part_frame.loc[keep_flag]
multi_part_frame

43     False
10     False
161     True
4      False
79     False
8      False
Name: consensus_id, dtype: bool
161    missing|||water
Name: env_medium, dtype: object
43      True
10      True
161    False
4      False
79      True
8       True
Name: env_medium, dtype: bool


Unnamed: 0,env_medium,count,repeated,splitted,part_count,seq,consensus_id,consensus_lab
43,Coastal sea water [ENVO:00002150]|||sea water ...,7,Coastal sea water [ENVO:00002150]|||sea water ...,Coastal sea water [ENVO:00002150],2,1,ENVO:00002150,coastal sea water
10,Coastal sea water [ENVO:00002150]|||sea water ...,7,Coastal sea water [ENVO:00002150]|||sea water ...,sea water [ENVO:00002149],2,2,ENVO:00002149,sea water
79,oligotrophic water [ENVO:00002223]|||sea water...,34,oligotrophic water [ENVO:00002223]|||sea water...,oligotrophic water [ENVO:00002223],2,1,ENVO:00002223,oligotrophic water
8,oligotrophic water [ENVO:00002223]|||sea water...,34,oligotrophic water [ENVO:00002223]|||sea water...,sea water [ENVO:00002149],2,2,ENVO:00002149,sea water


In [65]:
# Largest part_count recorded for each original value, in first-seen order.
grouped_by_value = multi_part_frame.groupby([biosample_col_to_map], sort=False)
part_count = grouped_by_value["part_count"].agg("max")
part_count.value_counts()

2    2
Name: part_count, dtype: int64

In [66]:
# Highest sequence number still present for each original value.
seq_max = (
    multi_part_frame.groupby([biosample_col_to_map], sort=False)["seq"].agg("max")
)
# seq_max

In [67]:
# Put the two per-value maxima side by side, with the original value as a column.
parts_check = pd.DataFrame(
    {"part_count": part_count, "seq_max": seq_max}
).reset_index()
# parts_check

In [68]:
# True where the highest surviving sequence number equals the part count.
all_parts_flag = parts_check["seq_max"].eq(parts_check["part_count"])
# all_parts_flag

In [69]:
# Original values that passed the seq_max == part_count check.
generic_with_all_parts = parts_check[biosample_col_to_map][all_parts_flag]
# generic_with_all_parts

In [70]:
# Re-derive the flag at row level: keep the part rows whose original value
# passed the completeness check.
all_parts_flag = multi_part_frame[biosample_col_to_map].isin(
    generic_with_all_parts.tolist()
)
all_parts_frame = multi_part_frame[all_parts_flag]
# all_parts_frame


In [71]:
# For each distinct multi-part original value, join the consensus IDs and
# labels of its parts with rejoin_delim, giving one row per original value.
unique_pipesep_ebs = sorted(set(all_parts_frame[biosample_col_to_map]))

repipe_dict_list = []
for one_pipesep in unique_pipesep_ebs:
    # Rows keep the order they have in all_parts_frame.
    flag = all_parts_frame[biosample_col_to_map] == one_pipesep
    temp_frame = all_parts_frame[flag]
    repipe_dict_list.append(
        {
            biosample_col_to_map: one_pipesep,
            "consensus_id": rejoin_delim.join(temp_frame["consensus_id"]),
            "consensus_lab": rejoin_delim.join(temp_frame["consensus_lab"]),
        }
    )

repipe_frame = pd.DataFrame(repipe_dict_list)
repipe_frame

Unnamed: 0,env_medium,consensus_id,consensus_lab
0,Coastal sea water [ENVO:00002150]|||sea water ...,ENVO:00002150|ENVO:00002149,coastal sea water|sea water
1,oligotrophic water [ENVO:00002223]|||sea water...,ENVO:00002223|ENVO:00002149,oligotrophic water|sea water


In [72]:
# Reduce the SSSOM frame to the three columns shared with repipe_frame and give
# them the same names, so the two frames can be stacked.
sssom_min_cols = sssom_frame[["subject_label", "object_id", "object_label"]].rename(
    columns={
        "subject_label": biosample_col_to_map,
        "object_id": "consensus_id",
        "object_label": "consensus_lab",
    }
)


In [73]:
# Stack the re-joined multi-part mappings under the direct ones; when there are
# none, use the direct mappings unchanged.
repipe_rows = len(repipe_frame)
direct_and_repipe = (
    pd.concat([sssom_min_cols, repipe_frame]) if repipe_rows > 0 else sssom_min_cols
)


In [74]:
# This is a staging area for building the repair tables.
# Same (?) stuff goes into SSSOM files?
# Writes the combined direct and re-joined mappings to the "repair_staging"
# table through biosample_cnx, replacing any existing table of that name.
direct_and_repipe.to_sql(
    "repair_staging", biosample_cnx, if_exists="replace", index=False
)

## refactor this even more?


In [75]:
# Look up, for every staged mapping, the Biosample records whose value in the
# mapped column equals the staged value, limited to the scope of this run.
#
# Column and table names cannot be bound as SQL parameters, so they are
# interpolated verbatim from the notebook settings. The scoping value, however,
# ends up inside a single-quoted SQL string literal: double any embedded single
# quote so a value containing an apostrophe cannot break or alter the statement.
scoping_value_sql = scoping_value.replace("'", "''")

# NOTE(review): the inner joins are conditioned on columns of `b`, so staged
# rows with no matching biosample do not survive the `left join` - confirm that
# this is intended.
q = """
select
    b.id, b.{col},
    rs.consensus_id,
    rs.consensus_lab
from
    repair_staging rs
left join biosample b on
    b.{col} = rs.{col}
inner join biosample_sample_taxon_summary stic on
    b.taxonomy_id = stic.biosample_taxid
join env_package_normalization on
    b.env_package = env_package_normalization.env_package
where {scoping_col} = '{scoping_value}'
""".format(
    col=biosample_col_to_map,
    scoping_col=scoping_col,
    scoping_value=scoping_value_sql,
)

[per_biosample_scoped_mapping_results, query_duration] = scoped_mapping.timed_query(
    q, biosample_cnx, print_timing=True
)

# Rename the mapped column to the generic name "raw" so results for different
# mapped columns share one layout.
per_biosample_scoped_mapping_results.columns = ['id','raw','consensus_id','consensus_lab']

# per_biosample_scoped_mapping_results

2021-07-07 08:11:44.027327
2021-07-07 08:11:44.442829
0:00:00.415502


In [76]:
# Record which column was mapped and how the run was scoped on every row, then
# arrange the columns in the layout written to the results table.

# table_name = "repaired_" + biosample_col_to_map

repaired_long_cols = [
    'id',
    'scoping_col',
    'scoping_value',
    'biosample_col_to_map',
    'raw',
    'consensus_id',
    'consensus_lab',
]

per_biosample_scoped_mapping_results = per_biosample_scoped_mapping_results.assign(
    biosample_col_to_map=biosample_col_to_map,
    scoping_col=scoping_col,
    scoping_value=scoping_value,
)[repaired_long_cols]

# per_biosample_scoped_mapping_results

In [77]:
# Append this run's per-biosample mappings to the "repaired_long" table through
# biosample_cnx. Because if_exists="append", running this cell again adds the
# same rows a second time.
per_biosample_scoped_mapping_results.to_sql(
    "repaired_long", biosample_cnx, if_exists="append", index=False
)