In [None]:
# ATTENTION: To run this, make sure to run jupyter notebook from physcraper's virtual environment.

# Development of function get_subtree:
# Get an opentree synthetic tree for a rank X, from a clade Y,
# distributed in geographic region Z.

# arguments
# ott_ids: Numeric. If given only one, it will get a subtree for the corresponding node
         # If given 2 or more, it will give back an induced subtree
# rank: Character string. Indicates the taxonomic rank to search for.
# clade: Character string or number (?). Indicates a phylogenetic context to search for.
# range: Character string. Indicates the geogaphic region to search for.
# ott_version: Character string


In [None]:
# define arguments
ott_ids = 494367
rank = "family"
ott_version = "3.1"

In [2]:
# download the ott file
! wget 'http://files.opentreeoflife.org/ott/ott3.1/ott3.1.tgz'

--2019-12-03 13:37:36--  http://files.opentreeoflife.org/ott/ott3.1/ott3.1.tgz
Resolving files.opentreeoflife.org... 129.237.33.155
Connecting to files.opentreeoflife.org|129.237.33.155|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.opentreeoflife.org/ott/ott3.1/ott3.1.tgz [following]
--2019-12-03 13:37:36--  https://files.opentreeoflife.org/ott/ott3.1/ott3.1.tgz
Connecting to files.opentreeoflife.org|129.237.33.155|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 106275258 (101M) [application/x-gzip]
Saving to: ‘ott3.1.tgz’


2019-12-03 13:38:11 (2.93 MB/s) - ‘ott3.1.tgz’ saved [106275258/106275258]



In [None]:
# unzip ott file
import gzip
import shutil
with gzip.open('ott3.1.tgz', 'rb') as fin:
    with open('ott3.1.tsv', 'wb') as fout:
        shutil.copyfileobj(fin, fout)

In [4]:
# subset ott file to get lines with the rank that we want and clean of invalid ranks
# this will only keep "good" ranks:
! grep -a "family" ott3.1.tsv | egrep -v "Incertae" | egrep -v "no rank" | egrep -v "major_rank_conflict" | egrep -v "uncultured" | egrep -v "barren" | egrep -v "extinct" | egrep -v "incertae" | egrep -v "unplaced" | egrep -v "hidden" | egrep -v "inconsistent"  | egrep -v "synonym" | egrep -v "in family" | egrep -v "species" | egrep -v "genus" | egrep -v "superfamily" | egrep -v "subfamily"> ott3.1red.tsv


In [5]:
# read the ott taxonomy file and get the ott ids
taxonomy_tsv = 'ott3.1red.tsv'
fi = open(taxonomy_tsv).readlines()
ott_ids = []
for lin in fi:
    lii = lin.split("\t")
    ott_ids.append(lii[0])

In [6]:
# now get the subtree of ott ids
import os
from physcraper import opentree_helpers
from physcraper.treetaxon import TreeTax

# json_file = "../OpenTree_SSB2020/tutorial/main.json"
# assert os.path.isfile(json_file) #check the file exists and the path is correct

# otu_dict = opentree_helpers.bulk_tnrs_load(json_file)

# ott_ids =set()
# for otu in otu_dict:
#    ott_ids.add(otu_dict[otu].get("^ot:ottId"))

#turn it back into a list
ott_ids = list(ott_ids)

In [15]:
ott_ids[1]

'578817'

In [16]:
_DEBUG = 1
def debug(msg):
    """short debugging command
    """
    if _DEBUG == 1:
        print(msg)
        
import sys
from physcraper.helpers import cd, standardize_label, to_string
citations_file = "citations_test.text"
cites = ''
f = open(citations_file,"w+")
linecount = 0
for study in res.json()['supporting_studies']:
        study = study.split('@')[0]
        index_url = 'https://api.opentreeoflife.org/v3/studies/find_studies'
        payload = json.dumps({"property":"ot:studyId","value":study,"verbose":"true"})
        res_cites = requests.post(index_url, data=payload, headers=headers)
        new_cite = res_cites.json()['matched_studies']
        debug(new_cite)
        linecount += 1
        f.write(to_string(new_cite[0]['ot:studyPublicationReference']) + '\n' + new_cite[0]['ot:studyPublication'] + '\n')
f.close()


NameError: name 'res' is not defined

In [17]:
_DEBUG = 1
def debug(msg):
    """short debugging command
    """
    if _DEBUG == 1:
        print(msg)
        
import sys
from physcraper.helpers import cd, standardize_label, to_string
def get_citations_from_json(study_id, citations_file):
    assert isinstance(citations_file, str) 
    f = open(citations_file,"w+")
    sys.stdout.write("Gathering citations ...")
    for study in study_id.json()['supporting_studies']:
        study = study.split('@')[0]
        index_url = 'https://api.opentreeoflife.org/v3/studies/find_studies'
        payload = json.dumps({"property":"ot:studyId","value":study,"verbose":"true"})
        res_cites = requests.post(index_url, data=payload, headers=headers)
        new_cite = res_cites.json()['matched_studies']
        debug(new_cite)
        f.write(to_string(new_cite[0]['ot:studyPublicationReference']) + '\n' + new_cite[0]['ot:studyPublication'] + '\n')
    f.close()
    sys.stdout.write("Citations printed to {}\n".format(citations_file))

        
        

In [11]:
from dendropy import Tree, DnaCharacterMatrix, DataSet, datamodel

def get_tree_from_synth(ott_ids, label_format="name", citation="cites.txt"):
    assert label_format in ['id', 'name', 'name_and_id']
    url = 'https://api.opentreeoflife.org/v3/tree_of_life/induced_subtree'
    headers = {'content-type':'application/json'}
    payload = json.dumps(dict(ott_ids=ott_ids, label_format = label_format))
    res = requests.post(url, data=payload, headers=headers)
    if res.status_code == 200:
        pass
    else:
        sys.stderr.write("error getting synth tree, {}, {}, {}\n".format(res.status_code, res.reason, res.json()['message']))
        return None
    # get_citations_from_json(res, citation) # returns file with citations
    tre = Tree.get(data=res.json()['newick'],
                   schema="newick",
                   suppress_internal_node_taxa=True)
    tre.suppress_unifurcations()
    return tre

In [13]:
json = "../OpenTree_SSB2020/tutorial/main.json"

In [14]:
tre = get_tree_from_synth(ott_ids, citation = "cites_test1.txt")

AttributeError: 'str' object has no attribute 'dumps'

In [None]:
# to plot the output, I'm using R for now
%load_ext rpy2.ipython

In [None]:
%%R -i df -w 5 -h 5 --units in -r 200
tree <- ape::read.tree("subtree.tre")
ape::plot.phylo(tree)