In [None]:
import pandas as pd
import os
import scoped_mapping
import numpy as np
from scipy.stats import entropy


## _TODO: Add Postgres example_

In [None]:
# The rows of this dataset are not meant to be related to one another;
# the file is really just three unrelated columns stored side by side.
data_file = '../../data/webmap_enums.tsv'
delimiter = '\t'

# column whose values get mapped, and the characters in those values that
# are handed to the search wrapper as `bad_chars`
column_name = 'Taxon'
chars_to_whiteout = '._-'

# settings handed to scoped_mapping.search_get_annotations_wrapper
# (as ontoprefix, query_fields, rr and string_dist_arg respectively)
my_ontoprefix = 'ncbitaxon'
my_query_fields = ''
my_row_request = 3
string_dist_shingle_size = 2

# handed to scoped_mapping.get_best_acceptable as max_string_dist
my_max_string_dist = 0.1

In [None]:
# Read the input table with the separator configured next to `data_file`,
# instead of repeating the tab literal here (the `delimiter` setting was
# otherwise never used).
data_frame = pd.read_csv(data_file, sep=delimiter)
# Missing cells become empty strings, so every column holds plain text.
data_frame = data_frame.replace(np.nan, '')
data_frame

In [None]:
# Rank the columns by the entropy of their value distributions.
row_count = len(data_frame.index)
dfc = data_frame.columns

# one (column, entropy) pair per column; the counts are scaled by the number
# of rows before being handed to scipy's entropy()
entropy_records = [
    (onecol, entropy(data_frame[onecol].value_counts(dropna=False) / row_count))
    for onecol in dfc
]

# lowest-entropy columns first
col_entropy = pd.DataFrame(
    entropy_records,
    columns=['column', 'entropy'],
).sort_values('entropy', ascending=True)

col_entropy


In [None]:
# how often each distinct value occurs in the 'Engineering' column
data_frame.loc[:, 'Engineering'].value_counts()

In [None]:
# how often each distinct value occurs in the 'Taxon' column
data_frame.loc[:, 'Taxon'].value_counts()

In [None]:
# how often each distinct value occurs in the 'FAO' column
data_frame.loc[:, 'FAO'].value_counts()


In [None]:
# Collect the distinct, non-missing values of the column that will be mapped.
data_list = data_frame[column_name]

# Missing cells were replaced with '' when the file was loaded, so isnull()
# on its own never fires here. Blank strings have to be treated as missing
# as well, otherwise an empty string is submitted to the term search.
null_flag = data_list.isnull() | data_list.astype(str).str.strip().eq('')

# de-duplicate and sort so the submissions come out in a reproducible order
no_nans = sorted(set(data_list[~null_flag]))


In [None]:
## illustrates normalization and search space reduction
## but all of that is embedded in scoped_mapping.search_get_annotations_wrapper

# whiteout_frame = scoped_mapping.get_whiteout_frame(no_nans, replaced_chars=chars_to_whiteout)
# whiteout_frame
# whiteout_list = scoped_mapping.get_wo_list(whiteout_frame)
# whiteout_list

In [None]:
# Look up annotations for every distinct input string.
# Expect roughly one second per unique submission.
mappings = scoped_mapping.search_get_annotations_wrapper(
    no_nans,
    bad_chars=chars_to_whiteout,
    cat_name=column_name,
    ontoprefix=my_ontoprefix,
    query_fields=my_query_fields,
    rr=my_row_request,
    string_dist_arg=string_dist_shingle_size,
)
mappings


In [None]:
# pick the best acceptable mappings, using the configured maximum string distance
my_best_acceptable = scoped_mapping.get_best_acceptable(
    mappings,
    max_string_dist=my_max_string_dist,
)

# reorder the rows case-insensitively by the raw input string
raw_order = my_best_acceptable.raw.str.lower().argsort()
my_best_acceptable = my_best_acceptable.iloc[raw_order]

my_best_acceptable

In the default configuration, `Simian virus 40` is incorrectly mapped to _Simian virus 41_, NCBITaxon:2560766. 

NCBITaxon:1891767 _Macaca mulatta polyomavirus 1_ is probably the correct mapping, with equivalent name _Simian virus 40_. NCBITaxon:10633 is an alternative ID. I have not found any configuration of `scoped_mapping.search_get_annotations_wrapper` to retrieve the correct term for this string.


In [None]:
# distinct raw inputs present in my_best_acceptable, in sorted order
successes = sorted(set(my_best_acceptable['raw']))
successes

In [None]:
# presumably the rows of `mappings` whose inputs are absent from
# `my_best_acceptable` -- confirm against scoped_mapping
no_acceptable_mappings = scoped_mapping.get_no_acceptable_mappings(
    mappings,
    my_best_acceptable,
)
no_acceptable_mappings

Good mappings could be available in `no_acceptable_mappings`, but they may not have been passed on to `my_best_acceptable` due to **string distances that are higher than the suggested 0.05.**

- `Ashbya.gossypii` matches NCBITaxon:33169 _Eremothecium gossypii_ through its related synonym _Ashbya goss**i**pii_, but the substitution of a _y_ for an _i_ results in a string distance of 0.143 under the suggested configuration.
Ashbya gossipii
- `Pseudomonas plasmid pVS1` matches NCBITaxon:219557 _Plasmid pVS1_ with a string distance of 0.246. No other relevant annotations are available, so the word 'Pseudomonas' becomes noise in the string distance calculation. Perhaps a string distance metric other than cosine would help?

Other inputs don't make it into the `best_acceptable` results because they consist of **a species name combined with a genus name.** **`scoped_mapping` doesn't have any method to specifically account for that yet.**

- `Lentivirus.human-immunodeficiency-virus1` hits _Human immunodeficiency virus_, NCBITaxon:12721 with a string distance of 0.120, due to the presence of the `Lentivirus` genus name.

- `Nepovirus.Tobacco-ringspot-virus` hits _Tobacco ringspot virus_, NCBITaxon:12282 with a string distance of 0.114, due to the presence of the genus name `Nepovirus`

In addition to increasing the string distance cutoff, some results can be salvaged by **changing which OLS fields are queried** or by **changing which characters in the input are replaced with whitespace.**

- `herpes.simplex.virus-1`
    - NCBITaxon:10298 _Human alphaherpesvirus 1_ has the related genbank synonym _Herpes simplex virus 1_ and can be found by prioritizing non-label annotations with the modification `...query_fields = 'annotations,label'...`
- `phage.lambda`
    - NCBITaxon:10710 has the label _Escherichia virus Lambda_. _Phage lambda_ and _lambda phage_ are assigned via several different synonym and annotation predicates. This hit can also be salvaged with `...query_fields = 'annotations,label'...`
- `SARS-CoV-2`
    - NCBITaxon:2697049 has the label _Severe acute respiratory syndrome coronavirus 2_ and the genbank acronym _SARS-CoV-2_. `...query_fields = 'annotations,label'...` is a partial solution for salvaging this term. It also requires a modification to the characters that are replaced with whitespace. We generally suggest replacing `._-`, but the hyphens are required in this case. So use `...bad_chars = '._'...` instead.


What to share? `my_best_acceptable` is of very high **but not perfect** quality, i.e. there are a few false positives. For those, one might want to re-consult the raw/complete `mappings`.

`no_acceptable_mappings` is all one would need for manual review of "false" negatives


In [None]:
# distinct raw inputs present in no_acceptable_mappings, in sorted order
failures = sorted(set(no_acceptable_mappings['raw']))
failures

In [None]:
# Second pass over the inputs that had no acceptable mapping, with settings
# that differ from the first pass:
#   - hyphens are left out of bad_chars (only '.' and '_' are listed)
#   - query_fields names 'annotations' ahead of 'label'
salvage_mappings = scoped_mapping.search_get_annotations_wrapper(
    failures,
    bad_chars='._',
    cat_name='salvage',
    ontoprefix=my_ontoprefix,
    query_fields='annotations,label',
    rr=my_row_request,
    string_dist_arg=string_dist_shingle_size,
)

# a fixed string-distance limit of 0.15 is used for the salvage pass
my_salvage_acceptable = scoped_mapping.get_best_acceptable(
    salvage_mappings,
    max_string_dist=0.15,
)

# reorder the rows case-insensitively by the raw input string
salvage_order = my_salvage_acceptable.raw.str.lower().argsort()
my_salvage_acceptable = my_salvage_acceptable.iloc[salvage_order]

my_salvage_acceptable