In [1]:
import os
import re
import sqlite3
import string
import urllib
from datetime import datetime
from xml.etree import ElementTree

import pandas as pd
import requests as requests
import yaml
from pkg_resources import get_distribution, DistributionNotFound
from strsimpy.cosine import Cosine
from tdda import rexpy

import scoped_mapping

# User-provided data
See repo README for notes on setting up SQLite databases of OBO ontologies with semantic-sql, relation-graph and rdftab

In [2]:
# NCBI Biosample package definitions (XML).
# from https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml
# see also https://www.ncbi.nlm.nih.gov/biosample/docs/packages/
biosample_packages_file = "../../target/biosample_packages.xml"

# SQLite database of Biosample metadata.
# from ftp://ftp.ncbi.nlm.nih.gov//biosample/biosample_set.xml.gz
# via harmonized_table.db.gz
# in https://drive.google.com/drive/u/0/folders/1eL0v0stoduahjDpoDJIk3z2pJBAU4b2Y
# NOTE(review): the archive is named harmonized_table (underscore) while the
# path below uses harmonized-table (hyphen) -- confirm the expected file name.
biosample_sqlite_file = "../../target/harmonized-table.db"
# SQLite database of the NCBITaxon ontology; the path points into a
# semantic-sql checkout (see the repo README for how these are built).
ncbitaxon_sqlite_file = "../../../scoped-mapping/semantic-sql/db/ncbitaxon.db"

In [3]:
# sqlite3.connect() silently creates a new, empty database when the path does
# not exist; a mistyped path would then only surface later as a confusing
# "no such table" error. Fail early with a clear message instead.
missing_sqlite_files = [
    db_path
    for db_path in (biosample_sqlite_file, ncbitaxon_sqlite_file)
    if not os.path.isfile(db_path)
]
if missing_sqlite_files:
    raise FileNotFoundError(
        f"SQLite database file(s) not found: {missing_sqlite_files}"
    )

# Connection to the Biosample metadata database (queried and written to below).
biosample_cnx = sqlite3.connect(biosample_sqlite_file)
# Connection to the NCBITaxon ontology database (queried below).
ncbitaxon_cnx = sqlite3.connect(ncbitaxon_sqlite_file)

# Utilizing taxonomy for broad subsetting


**This uses an SQLite database in which the transitive closure over subClassOf has already been materialized. See the README and Makefile.**

Specifically, flag the Biosamples whose `taxonomy_id` indicates that they are unclassified entities. Ignoring the others will throw out samples of multicellular organisms, like fruit flies.



## Get a listing of all taxa that are transitive subclasses of `NCBITaxon:2787823`

I.e. 'unclassified entities'

In [4]:
# Labelled terms whose entailed (pre-materialized) rdfs:subClassOf edges point
# at NCBITaxon:2787823. The join on rdfs:label statements restricts the result
# to subjects that carry a label.
q = """
select
    distinct s.subject
from
    entailed_edge ee
join statements s on
    ee.subject = s.subject
where
    ee.predicate = 'rdfs:subClassOf'
    and ee.object = 'NCBITaxon:2787823'
    and s.predicate = 'rdfs:label'
"""
# timed_query's result is unpacked as (result frame, elapsed time).
unclassified_taxa, query_duration = scoped_mapping.timed_query(q, ncbitaxon_cnx)

# Constant marker column: after a later left-merge, rows that matched one of
# these taxa carry True and all other rows carry a missing value.
unclassified_taxa = unclassified_taxa.assign(unclassified=True)

print(query_duration)

unclassified_taxa

0:00:08.865928


Unnamed: 0,subject,unclassified
0,NCBITaxon:1006967,True
1,NCBITaxon:1041057,True
2,NCBITaxon:1046002,True
3,NCBITaxon:1046003,True
4,NCBITaxon:1046004,True
...,...,...
989,NCBITaxon:939928,True
990,NCBITaxon:941420,True
991,NCBITaxon:941421,True
992,NCBITaxon:941422,True


## Get taxon counts from the Biosample metadata

In [5]:
# Number of Biosample rows per taxonomy_id, most frequent taxa first.
q = """
select
    taxonomy_id biosample_taxid,
    count(*) as count
from
    biosample b
group by
    taxonomy_id
order by
    count(*) desc
"""
biosample_sample_taxon_summary, query_duration = scoped_mapping.timed_query(
    q, biosample_cnx
)

# Build a CURIE string from each taxonomy id by prepending the "NCBITaxon:"
# prefix, giving a key in the same form as the ontology query's subjects.
biosample_sample_taxon_summary = biosample_sample_taxon_summary.assign(
    curie=lambda frame: "NCBITaxon:" + frame["biosample_taxid"].astype(str)
)

print(query_duration)

0:00:01.208535


## Merge the two taxonomy dataframes

I.e. flag the Biosample records whose `taxonomy_id` field belongs to a subclass of 'unclassified entities'.

In [6]:
# Left-join so every Biosample taxon row is kept; rows whose CURIE is not among
# the unclassified taxa get a missing value in the `unclassified` column.
biosample_sample_taxon_summary = biosample_sample_taxon_summary.merge(
    unclassified_taxa, left_on="curie", right_on="subject", how="left"
)

# Turn the True/missing marker into a real boolean column.
# This is assigned explicitly rather than via
# `df.unclassified.fillna(False, inplace=True)`: an in-place method on a column
# accessor is chained assignment, which warns on recent pandas and silently
# leaves the frame unchanged under Copy-on-Write. `.eq(True)` maps missing
# values to False and yields a bool dtype without any downcasting warnings.
biosample_sample_taxon_summary["unclassified"] = biosample_sample_taxon_summary[
    "unclassified"
].eq(True)

biosample_sample_taxon_summary

Unnamed: 0,biosample_taxid,count,curie,subject,unclassified
0,9606,7562984,NCBITaxon:9606,,False
1,10090,1344373,NCBITaxon:10090,,False
2,2697049,1006079,NCBITaxon:2697049,,False
3,410658,385970,NCBITaxon:410658,NCBITaxon:410658,True
4,408170,383639,NCBITaxon:408170,NCBITaxon:408170,True
...,...,...,...,...,...
186439,999888,1,NCBITaxon:999888,,False
186440,999889,1,NCBITaxon:999889,,False
186441,999891,1,NCBITaxon:999891,,False
186442,999892,1,NCBITaxon:999892,,False


## Add labels to all taxa

In [7]:
# rdfs:label statements whose subject equals the stanza value.
q = """
select
    subject ,
    value
from statements
where
    predicate = 'rdfs:label' and subject = stanza
"""
all_tax_labels, query_duration = scoped_mapping.timed_query(q, ncbitaxon_cnx)

# Attach labels by CURIE, keep only the reporting columns, and expose the
# ontology's `value` column under the friendlier name `label`.
biosample_sample_taxon_summary = (
    biosample_sample_taxon_summary.merge(
        all_tax_labels, left_on="curie", right_on="subject", how="left"
    )
    .loc[:, ["curie", "biosample_taxid", "count", "unclassified", "value"]]
    .rename(columns={"value": "label"})
)

print(query_duration)

# Write both frames into the Biosample database, replacing any existing
# tables of the same names.
biosample_sample_taxon_summary.to_sql(
    name="biosample_sample_taxon_summary",
    con=biosample_cnx,
    if_exists="replace",
    index=False,
)
all_tax_labels.to_sql(
    name="all_tax_labels",
    con=biosample_cnx,
    if_exists="replace",
    index=False,
)

biosample_sample_taxon_summary

0:00:11.585632


Unnamed: 0,curie,biosample_taxid,count,unclassified,label
0,NCBITaxon:9606,9606,7562984,False,Homo sapiens
1,NCBITaxon:10090,10090,1344373,False,Mus musculus
2,NCBITaxon:2697049,2697049,1006079,False,Severe acute respiratory syndrome coronavirus 2
3,NCBITaxon:410658,410658,385970,True,soil metagenome
4,NCBITaxon:408170,408170,383639,True,human gut metagenome
...,...,...,...,...,...
186439,NCBITaxon:999888,999888,1,False,Influenza A virus (A/turkey/Ontario/6213/1966(...
186440,NCBITaxon:999889,999889,1,False,Influenza A virus (A/turkey/Oregon/1/1971(mixed))
186441,NCBITaxon:999891,999891,1,False,Bacillus amyloliquefaciens TA208
186442,NCBITaxon:999892,999892,1,False,[Propionibacterium] humerusii P08


**Almost all of the taxa that are common in the biosample collection are either unclassified/metagenomes or easily recognized cellular organisms. Cellular organism samples are de-prioritized in this exercise**

Exceptions include:
- 32630 = synthetic construct (other entries; other sequences; artificial sequences)
    - 'other entries' would add 16k rows on top of the 1k 'unclassified entities'
    - metagenomes account for 331 of the 'unclassified entities'
    - there are also a small number of uncultured/unclassified microorganisms in the biosample dataset
- 77133 = uncultured bacterium (cellular organisms; Bacteria; environmental samples)
    - 'cellular organisms' would add 2M rows on top of the 1k 'unclassified entities'
    - 'cellular organisms; Bacteria; environmental samples' adds 26k
    
----