In [None]:
import pandas as pd
import re
import requests as requests
import sqlite3
import string
import urllib
import yaml

import scoped_mapping

from datetime import datetime
from pkg_resources import get_distribution, DistributionNotFound
from strsimpy.cosine import Cosine
from xml.etree import ElementTree
from tdda import rexpy

# import os

# User-provided data
See repo README for notes on setting up SQLite databases of OBO ontologies with semantic-sql, relation-graph and rdftab

In [None]:
# --- Ontology settings -------------------------------------------------------
# Open question: where do we require a single ontology and where can we use
# multiple? (PO is another candidate prefix.)
target_onto_prefix = "ENVO"

# Ontologies consulted in the first mapping pass.
first_pass_ontologies = ["envo", "ncbitaxon"]

# --- Input files -------------------------------------------------------------
# NCBI Biosample package definitions, downloaded from
#   https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml
# (human-readable version: https://www.ncbi.nlm.nih.gov/biosample/docs/packages/)
biosample_packages_file = "../../target/biosample_packages.xml"

# SQLite build of the Biosample harmonized table. Originates from
#   ftp://ftp.ncbi.nlm.nih.gov//biosample/biosample_set.xml.gz
# and is distributed as harmonized_table.db.gz in
#   https://drive.google.com/drive/u/0/folders/1eL0v0stoduahjDpoDJIk3z2pJBAU4b2Y
biosample_sqlite_file = "../../target/harmonized-table.db"

----

## It's nice to see everything accounted for
(In terms of manual mappings for env_package.) But we can prioritize NMDC for now:

- Soil 15,777
- Sediment 7,147
- Plant-associated 3,142

Could some of these "no environmental package" mappings be losing important granularity?

Map `None` and '' to "no environmental package"?

In [None]:
# Manually asserted replacements for tidied env_package strings.
# Keys are the tidied values seen in the data; values are the canonical package
# name to use instead. Most entries are placeholders for "nothing was supplied".
_NO_ENV_PACKAGE = "no environmental package"

env_package_overrides = {
    "built environment": "built",
    "misc environment": "miscellaneous",
    "missing": _NO_ENV_PACKAGE,
    "unknown": _NO_ENV_PACKAGE,
    "default": _NO_ENV_PACKAGE,
    "unspecified": _NO_ENV_PACKAGE,
    "not available": _NO_ENV_PACKAGE,
    "not collected": _NO_ENV_PACKAGE,
    "miscellaneous natural or artificial environment": "miscellaneous",
    "not applicable": _NO_ENV_PACKAGE,
    "soil-associated": "soil",
    "soil associated": "soil",
}

In [None]:
# Connection to the Biosample SQLite database.
biosample_cnx = sqlite3.connect(biosample_sqlite_file)

# Make the target ontology the first entry of the first-pass list.
# Any existing occurrence is dropped before it is put in front: a blind
# insert(0, ...) duplicates the target when it is already listed, and adds
# one more copy every time this cell is re-run.
target_onto_lower = target_onto_prefix.lower()
first_pass_ontologies = [target_onto_lower] + [
    each_ontology
    for each_ontology in first_pass_ontologies
    if each_ontology.lower() != target_onto_lower
]

# Comma-separated form of the same list.
first_pass_ontologies_str = ",".join(first_pass_ontologies)

## Determine ID patterns for common ontologies, like `ENVO`

In [None]:
# Load class IDs and labels for every first-pass ontology (the primary/target
# one included) except ncbitaxon, one semantic-sql SQLite file per ontology.
#
# NOTE: the existence of the DB files is not checked. sqlite3.connect() creates
# an empty database for a missing path instead of raising.

# Lower-case, de-duplicate and sort the ontology names. De-duplication matters:
# a name listed twice would be queried twice and its rows would appear twice in
# the combined frame.
ontologies_to_load = sorted(
    {each_string.lower() for each_string in first_pass_ontologies} - {"ncbitaxon"}
)
temp = ontologies_to_load  # original name, kept for any later cell that reads it

print(ontologies_to_load)

# The query is the same for every ontology, so it is defined once.
# It returns (stanza, label) for subjects typed owl:Class whose stanza equals
# the subject. Because the WHERE clause filters on s2.predicate, the LEFT JOIN
# behaves like an inner join: classes without an rdfs:label are NOT returned.
# TODO: add obsolete tags?
# TODO: may want to make a local-label-only frame for later tasks?
q = """
select
	distinct s1.stanza,
	s2.value
from
	statements s1
left join statements s2 on
	s2.subject = s1.subject
where
	s1.predicate = 'rdf:type'
	and s1.object = 'owl:Class'
	and s1.stanza = s1.subject
	and s2.predicate = 'rdfs:label'"""

first_pass_id_frames = []
for one_ontology in ontologies_to_load:
    print(one_ontology)
    one_db_file = "../../../scoped-mapping/semantic-sql/db/" + one_ontology + ".db"
    print(one_db_file)
    one_con = sqlite3.connect(one_db_file)
    try:
        [one_ontology_terms, query_duration] = scoped_mapping.timed_query(q, one_con)
    finally:
        # Release the connection even when the query fails.
        one_con.close()
    print(query_duration)
    # Tag every row with the ontology it came from before collecting the frame.
    one_ontology_terms["ontology"] = one_ontology
    first_pass_id_frames.append(one_ontology_terms)

In [None]:
# Stack the per-ontology frames into a single table, store it in the Biosample
# database (replacing any earlier copy, without the index), then display it.
ids_labs_selected_ontolgies = pd.concat(first_pass_id_frames)
ids_labs_selected_ontolgies.to_sql(
    name="ids_labs_selected_ontolgies",
    con=biosample_cnx,
    if_exists="replace",
    index=False,
)

ids_labs_selected_ontolgies

In [None]:
# Pass the ID/label frame through scoped_mapping.add_prefix_col with "stanza"
# and "prefix" as column names, then display the result.
# NOTE(review): presumably "prefix" is a new column holding the ontology prefix
# of each "stanza" ID — confirm against scoped_mapping.add_prefix_col.
ids_labs_selected_ontolgies = scoped_mapping.add_prefix_col(
    ids_labs_selected_ontolgies, "stanza", "prefix"
)
ids_labs_selected_ontolgies

In [None]:
# Ask scoped_mapping for ID patterns, passing the "stanza" and "prefix" columns.
# The result is used further down as a mapping: via .items() and via
# id_patterns[target_onto_prefix].
# NOTE(review): presumably one pattern per prefix, learned from the stanza IDs —
# confirm against scoped_mapping.get_multi_term_patterns.
id_patterns = scoped_mapping.get_multi_term_patterns(
    ids_labs_selected_ontolgies, "stanza", "prefix"
)

id_patterns

In [None]:
# Turn the pattern mapping into a two-column frame (one row per key/value pair)
# and persist it as the "id_patterns" table, replacing any previous version.
ipf = pd.DataFrame(data=id_patterns.items(), columns=["ontology", "id_pattern"])
ipf.to_sql(
    name="id_patterns",
    con=biosample_cnx,
    if_exists="replace",
    index=False,
)

In [None]:
# # round trip
# ipd = dict(zip(ipf.ontology, ipf.id_pattern))

## Retrieve `env_package` values from Biosample table

In [None]:
# How many biosamples use each raw env_package value, most frequent first.
q = """
select
    env_package,
    count(*) as count
from
    biosample b
group by
    env_package
order by
    count(*) desc
"""
env_package_count, query_duration = scoped_mapping.timed_query(q, biosample_cnx)
print(query_duration)

# Keep a CSV copy in the working directory (note: the file name has no extension).
env_package_count.to_csv(path_or_buf="env_package_count")

env_package_count

## Apply some normalization rules to the `env_package` values

In [None]:
# Normalize the raw env_package values, using the ID pattern of the target
# ontology.
# NOTE: the number of splits was bumped from two to three; this should be
# generalized.
target_id_pattern = id_patterns[target_onto_prefix]
env_package_normalization = scoped_mapping.env_package_nomralizastion(
    env_package_count, "env_package", target_id_pattern
)

# Keep only these columns, in this order (drops the apparently redundant
# 'string' column).
normalization_columns = [
    "env_package",
    "count",
    "lhs",
    "rhs",
    "extract",
    "remaining_string",
    "remaining_tidied",
]
env_package_normalization = env_package_normalization[normalization_columns]

env_package_normalization

In [None]:
# Build the package dictionary from the Biosample packages file and store it in
# the Biosample database (replacing any earlier copy, without the index).
package_dictionary = scoped_mapping.get_package_dictionary(biosample_packages_file)
package_dictionary.to_sql(
    name="package_dictionary",
    con=biosample_cnx,
    if_exists="replace",
    index=False,
)
package_dictionary

pick "EnvPackage"/"eptidy" from package dictionary as canonical, not "EnvPackageDisplay"/"epdtidy"


But still want to support making XXX values from YYY canonical according to "EnvPackageDisplay"/"epdtidy", so make a mapping/override table

In [None]:
# Add a tidied companion column for each of the two package-name columns:
#   EnvPackage        -> eptidy
#   EnvPackageDisplay -> epdtidy
for source_col, tidy_col in (
    ("EnvPackage", "eptidy"),
    ("EnvPackageDisplay", "epdtidy"),
):
    package_dictionary = scoped_mapping.make_tidy_col(
        package_dictionary, source_col, tidy_col
    )

# Refresh the SQLite copy so it carries the new columns.
package_dictionary.to_sql(
    name="package_dictionary",
    con=biosample_cnx,
    if_exists="replace",
    index=False,
)

In [None]:
# Distinct (eptidy, epdtidy) pairs that are worth turning into overrides:
#   1. de-duplicate the pairs,
#   2. discard pairs whose eptidy is the empty string,
#   3. discard pairs where both tidied values are already identical.
epd_to_ep = (
    package_dictionary[["eptidy", "epdtidy"]]
    .drop_duplicates()
    .loc[lambda pairs: ~pairs["eptidy"].eq("")]
    .loc[lambda pairs: ~(pairs["eptidy"] == pairs["epdtidy"])]
)

epd_to_ep

In [None]:
# Extra overrides derived from the package dictionary: tidied display name
# (epdtidy) -> tidied canonical name (eptidy). These are meant to be added to
# the manually asserted overrides.
overrides_supplement = {
    display_tidy: canonical_tidy
    for display_tidy, canonical_tidy in zip(epd_to_ep["epdtidy"], epd_to_ep["eptidy"])
}

overrides_supplement

In [None]:
# Merge the supplementary overrides into env_package_overrides in place.
# dict.update overwrites existing keys, so an entry from overrides_supplement
# wins over an existing entry with the same key.
env_package_overrides.update(overrides_supplement)

env_package_overrides

In [None]:
# Apply env_package_overrides via scoped_mapping.add_overrides, passing
# "remaining_tidied" and "rt_override" as column names. "rt_override" is the
# column the later merge with the package dictionary joins on.
# NOTE(review): presumably values without an override are carried over
# unchanged — confirm against scoped_mapping.add_overrides.
env_package_normalization = scoped_mapping.add_overrides(
    env_package_normalization, "remaining_tidied", "rt_override", env_package_overrides
)

In [None]:
# Lookup from tidied package name (eptidy) back to the original EnvPackage
# spelling, one row per distinct pair.
denorm_frame = package_dictionary[["EnvPackage", "eptidy"]].drop_duplicates()
denorm_frame

In [None]:
# Attach the de-normalized EnvPackage value to every row whose rt_override
# matches a tidied package name. A left join keeps unmatched rows, which end up
# with a missing EnvPackage.
merged_normalization = pd.merge(
    env_package_normalization,
    denorm_frame,
    how="left",
    left_on="rt_override",
    right_on="eptidy",
)

# Keep the working columns plus EnvPackage; the join key "eptidy" is dropped.
final_columns = [
    "env_package",
    "count",
    "lhs",
    "rhs",
    "extract",
    "remaining_string",
    "remaining_tidied",
    "rt_override",
    "EnvPackage",
]
env_package_normalization = merged_normalization[final_columns]

# A row is canonical exactly when the join found an EnvPackage for it.
non_canonical_flag = env_package_normalization["EnvPackage"].isna()
env_package_normalization["is_canonical"] = ~non_canonical_flag

- env_package = env_package annotation from NCBI Biosample file XXX
- count = number of biosamples using that env_package annotation
- lhs = checklist info
- rhs = potential package info
- extract = potential OBO ID from rhs column (currently hardcoded and only looking for ENVO IDs)
- remaining_string = rhs/string, with potential OBO IDs removed
- remaining_tidied = remaining_string with case, whitespace and punctuation normalization
- rt_override = some remaining_tidied values can be replaced according to env_package_overrides
- EnvPackage = corresponding de-normalized value from package_dictionary
- is_canonical = false when EnvPackage is NaN

In [None]:
# Persist the normalization table so it can be queried with SQL; any previous
# version of the table is replaced and the index is not written.
env_package_normalization.to_sql(
    name="env_package_normalization",
    con=biosample_cnx,
    if_exists="replace",
    index=False,
)

In [None]:
# Display the normalization table that was just written to SQLite.
env_package_normalization

## What do the successful normalizations look like?

In [None]:
# Rows whose env_package resolved to a canonical EnvPackage (is_canonical = 1).
q = """
select
    env_package,
    count,
    lhs,
    extract,
    EnvPackage
from
    env_package_normalization
where
    is_canonical = 1
"""
successful_normalizastions, query_duration = scoped_mapping.timed_query(
    q, biosample_cnx
)
print(query_duration)

successful_normalizastions

## Are there any normalization failures?

In [None]:
# Rows whose env_package could not be resolved to a canonical EnvPackage
# (is_canonical = 0).
q = """
select
    env_package,
    count,
    lhs,
    extract,
    EnvPackage
from
    env_package_normalization
where
    is_canonical = 0
"""
normalizastion_failures, query_duration = scoped_mapping.timed_query(
    q, biosample_cnx
)
print(query_duration)

normalizastion_failures

In [None]:
# (Re)build the repaired_env_package table: one row per biosample whose
# env_package was normalized to a canonical, non-empty EnvPackage value,
# holding the biosample id, the original env_package and the repaired value.
cursor = biosample_cnx.cursor()
statement = """
DROP TABLE if exists repaired_env_package ;

CREATE TABLE repaired_env_package AS
select
	b.id,
	epn.env_package as env_package_orig,
	epn.EnvPackage as env_package_rep
from
	env_package_normalization epn
join biosample b on
	b.env_package = epn.env_package
where
	is_canonical = 1
	and EnvPackage != '';
"""

cursor.executescript(statement)
biosample_cnx.commit()

# The row count is queried separately: executescript() discards the rows of any
# SELECT in the script, so a count placed there is never seen.
count_query = """
select
	count(*)
from
	repaired_env_package rep
"""
repaired_env_package_count = cursor.execute(count_query).fetchone()[0]

repaired_env_package_count