In [1]:
import copy
import json
import string
import time
from collections import Counter
from urllib.error import HTTPError

import Bio
import Bio.Entrez as Entrez
import numpy as np
import pandas as pd
import xmltodict

In [2]:
def trimPunc(astring):
    """Reduce a raw pathogen label to a lowercase, underscore-joined token.

    The label is cut at the first character that is not a lowercase ASCII
    letter, a space, a square bracket or a hyphen.  From the surviving prefix
    the square brackets are removed, surrounding whitespace is stripped, every
    " type" substring is dropped and the remaining spaces become underscores.

    Parameters
    ----------
    astring : str
        Raw label.

    Returns
    -------
    str
        Cleaned label; empty when the first character is already disallowed.
    """
    allowed = set(string.ascii_lowercase) | {" ", "[", "]", "-"}

    # Position of the first disallowed character (or the full length if none).
    cut = next(
        (pos for pos, ch in enumerate(astring) if ch not in allowed),
        len(astring),
    )
    prefix = astring[:cut]

    without_brackets = prefix.replace("[", "").replace("]", "")
    trimmed = without_brackets.strip()
    without_type = trimmed.replace(" type", "")
    return without_type.replace(" ", "_")

In [3]:
# Load the raw host/pathogen table (tab-separated).
data = pd.read_csv("../data/both.tsv", sep="\t")

# Keep only the first two underscore-separated tokens of each host name.
# Hosts with a single token end up as NaN because the second token is missing.
host_tokens = data['host'].str.replace(" ", "_").str.split("_")
data['host'] = host_tokens.str.get(0) + "_" + host_tokens.str.get(1)

In [4]:
# Force the pathogen label to str so every value can be iterated
# character by character later on (missing values become the text "nan").
data = data.assign(path=data['path'].astype(str))

In [5]:
# Cleaned pathogen label: trimPunc applied to every raw label.
data['path_clean'] = data['path'].map(trimPunc)

In [6]:
# Discard every row whose cleaned label mentions "uncultured" anywhere.
lacks_uncultured = data['path_clean'].str.find('uncultured').eq(-1)
data = data[lacks_uncultured]

In [7]:
def _normalize_pathogen_name(name):
    """Map one cleaned pathogen label onto its canonical microbe name.

    The rules are applied in this fixed order:

    1. Anything following the first "virus" is cut off, except for labels
       that contain "rotavirus".
    2. A trailing "_str" token is removed.
    3. Every "_subsp" is removed.
    4. A trailing "_phe" is removed.
    5. Hyphens become underscores.
    6. The bare label "visna" becomes "visna_virus".
    7. "eastern_equine" is shortened to "equine".
    8. A trailing "_pig" is removed.
    9. Any rotavirus label not ending in "rotavirus_a" becomes "rotavirus_a".
    10. Spaces become underscores and "rabies_virus" becomes
        "rabies_lyssavirus".

    Parameters
    ----------
    name : str
        Cleaned pathogen label.

    Returns
    -------
    str
        Canonical microbe name.
    """
    if 'virus' in name and 'rotavirus' not in name:
        name = name.split("virus")[0] + "virus"
    if name.endswith("_str"):
        name = "_".join(name.split("_")[:-1])
    name = name.replace("_subsp", "")
    if name.endswith("_phe"):
        # Strip the suffix only; str.replace would also delete any
        # earlier "_phe" inside the name.
        name = name[:-len("_phe")]
    name = name.replace("-", "_")
    if name == 'visna':
        name = 'visna_virus'
    name = name.replace('eastern_equine', 'equine')
    if name.endswith("_pig"):
        # Same reasoning as for "_phe": remove the trailing marker only.
        name = name[:-len("_pig")]
    if 'rotavirus' in name and not name.endswith('rotavirus_a'):
        name = 'rotavirus_a'
    name = name.replace(" ", "_")
    name = name.replace("rabies_virus", "rabies_lyssavirus")
    return name


# pdict maps each cleaned label to its canonical name.  Labels whose
# canonical name contains "sfpork" or "sfbeef" are left out on purpose, so
# membership in pdict doubles as a keep/drop filter.
pdict = dict()
for raw_name in set(data['path_clean']):
    canonical_name = _normalize_pathogen_name(raw_name)
    if 'sfpork' not in canonical_name and 'sfbeef' not in canonical_name:
        pdict[raw_name] = canonical_name
   

In [8]:
# Keep only labels that have a canonical name.  The explicit .copy() makes the
# result an independent frame, so the column assignments below do not write
# into a slice of the previous frame (SettingWithCopyWarning).
data = data[data['path_clean'].isin(pdict)].copy()

# Every remaining label is a key of pdict, so the mapping leaves no gaps.
data['path_clean'] = data['path_clean'].map(pdict)
data['microbe_name'] = data['path_clean']

In [9]:
# (regex pattern, replacement) pairs applied in order as substring rewrites of
# the country column.
country_renames = [
    (r"central african republic", "central african rep."),
    (r"czech republic", "czechia"),
    (r"falkland islands", "argentina"),
    (r"hong kong", "china"),
    (r"ivory coast", "côte d'ivoire"),
    # The look-behind keeps "democratic republic of the congo" from being
    # rewritten to "democratic congo".
    # NOTE(review): that value is now left as-is; add an explicit mapping if
    # such rows exist in the data.
    (r"(?<!democratic )republic of the congo", "congo"),
    # The look-ahead makes this rewrite idempotent: a value that already reads
    # "united states of america" (or a second run of this cell) no longer
    # becomes "united states of america of america".
    (r"united states(?! of america)", "united states of america"),
]

country = data['country']
for pattern, replacement in country_renames:
    # regex=True is spelled out because the default differs between pandas
    # versions and the last two patterns depend on regex look-arounds.
    country = country.str.replace(pattern, replacement, regex=True)

# assign() builds a new frame instead of writing into a filtered slice.
data = data.assign(country=country)

# Rows without a country are dropped.
data = data[data['country'].notna()]

In [10]:
# Restrict the table to bacterial and viral records.
is_bacteria_or_virus = data['path_type'].isin(['bacteria', 'viruses'])
data = data.loc[is_bacteria_or_virus]

In [11]:
# Minimum number of distinct pathogens a host needs in order to be kept.
MIN_PATHOGENS_PER_HOST = 5

hosts = set(data['host'])

# One grouped pass counts the distinct cleaned pathogen names per host,
# instead of re-filtering the whole frame once for every host.
pathogens_per_host = data.groupby('host')['path_clean'].nunique()
good_hosts = set(
    pathogens_per_host.index[pathogens_per_host >= MIN_PATHOGENS_PER_HOST]
)

In [12]:
# Keep only the hosts that passed the pathogen-count threshold.
data = data.loc[data['host'].isin(good_hosts)]

In [13]:
# Remove fully duplicated rows, keeping the first occurrence of each.
data = data.loc[~data.duplicated()]

In [14]:
# Common host name -> underscore-joined scientific name.
sppnames = dict(
    cow='bos_taurus',
    dog='canis_lupus',
    goat='capra_hircus',
    horse='equus_caballus',
    cat='felis_catus',
    rabbit='oryctolagus_cuniculus',
    sheep='ovis_aries',
    pig='sus_scrofa',
)

In [15]:
# Underscore-joined scientific name -> common host name.
sppnames2 = dict(
    bos_taurus='cow',
    canis_lupus='dog',
    capra_hircus='goat',
    equus_caballus='horse',
    felis_catus='cat',
    oryctolagus_cuniculus='rabbit',
    ovis_aries='sheep',
    sus_scrofa='pig',
)

In [16]:
# Renumber the rows 0..n-1 after all the filtering above.
data = data.reset_index(drop=True)

In [17]:
# Common host name looked up from the scientific name.  A host missing from
# sppnames2 raises KeyError rather than producing a silent gap.
data['host_name'] = data['host'].map(sppnames2.__getitem__)

In [18]:
# Publish the host column under its output name.
data = data.assign(host_scientific_name=data['host'])

In [19]:
# Publish the count column under its output name.
data = data.assign(number_of_samples=data['count'])

In [20]:
# Publish the path_type column under its output name.
data = data.assign(microbe_type=data['path_type'])

In [21]:
# Narrow the frame to the output columns, in their final order.
data = data.loc[
    :,
    [
        'host_name',
        'host_scientific_name',
        'microbe_name',
        'microbe_type',
        'country',
        'number_of_samples',
    ],
]

In [22]:
# Collapse the table to one row per (host, microbe, country):
#   * number_of_samples is summed over the group,
#   * host_scientific_name and microbe_type take the group's first value.
# A single agg() computes every column from the same grouping, so the sums
# cannot drift out of alignment with the other columns.  It also avoids the
# temporary names d2/d3, which a later cell rebinds to plain lists.
data = (
    data
    .groupby(['host_name', 'microbe_name', 'country'], as_index=False)
    .agg({
        'host_scientific_name': 'first',
        'microbe_type': 'first',
        'number_of_samples': 'sum',
    })
)

# Restore the output column order (agg puts the grouping keys first).
data = data[['host_name', 'host_scientific_name', 'microbe_name', 'microbe_type', 'country', 'number_of_samples']]

In [23]:
# Manually curated records appended to the table.  Field order:
# host_name, host_scientific_name, microbe_name, microbe_type, country,
# number_of_samples.
# microbe_type uses the same vocabulary as the rest of the table
# ('bacteria' / 'viruses'), and sample counts are integers in every row.
d = ['sheep', 'ovis_aries', 'visna_virus', 'viruses', 'china', 78]
d2 = ['horse', 'equus_caballus', 'borna_disease_virus', 'viruses', 'united states of america', 150]
d3 = ['pig', 'sus_scrofa', 'brucella_suis', 'bacteria', 'india', 120]

In [24]:
# Row count before the manual records are appended; used to derive their labels.
nrows = data.shape[0]

In [25]:
# Append the three manual records under the labels nrows+1 .. nrows+3.
for offset, manual_row in enumerate((d, d2, d3), start=1):
    data.loc[nrows + offset] = manual_row

In [26]:
# Order rows by microbe, then country, then host.
sort_keys = ['microbe_name', 'country', 'host_name']
data = data.sort_values(by=sort_keys)

In [27]:
# Look up the NCBI taxonomy id of every microbe name.
# txids maps microbe_name -> taxonomy id (as a string).  Only names whose
# search returns exactly one id are stored; the others are reported with an
# "x" prefix.
#
# NOTE: NCBI asks callers to set Entrez.email before using the E-utilities
# (the service prints a warning otherwise).
txids = dict()
for m in set(data['microbe_name']):
    print (m)
    search_term = m.replace("_", " ")
    raw_response = None
    while raw_response is None:
        try:
            handle = Entrez.esearch(db='taxonomy', term=search_term, retmode='json')
            try:
                raw_response = handle.read()
            finally:
                # Release the connection whether or not the read succeeded.
                handle.close()
        except HTTPError:
            # HTTPError comes from urllib.error (imported at the top of the
            # notebook).  Without that import this handler raised NameError on
            # the first HTTP failure instead of retrying.
            time.sleep(20)
    id_list = json.loads(raw_response)['esearchresult']['idlist']
    if len(id_list) == 1:
        txids[m] = id_list[0]
    else:
        print ("x", m)

equine_papillomavirus


Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.


encephalomyocarditis_virus
campylobacter_coli
mycoplasma_hyopneumoniae
ovine_lentivirus
bartonella_henselae
cowpox_virus
actinobacillus_porcitonsillarum
porcine_picobirnavirus
staphylococcus_epidermidis
border_disease_virus
bovine_rotavirus_a
vaccinia_virus
orf_virus
chlamydia_abortus
escherichia_coli
caprine_herpesvirus
x caprine_herpesvirus
candidatus_mycoplasma_haemobos
bovine_ephemeral_fever_virus
porcine_circovirus
burkholderia_mallei_gb
x burkholderia_mallei_gb
anaplasma_marginale
feline_coronavirus
swine_hepatitis_e_virus
torque_teno_virus
lactococcus_lactis
borna_disease_virus
equine_encephalitis_virus
x equine_encephalitis_virus
sapelovirus
brucella_canis
porcine_reproductive_and_respiratory_syndrome_virus
classical_swine_fever_virus
equid_herpesvirus
x equid_herpesvirus
brachyspira_canis
equine_rotavirus_a
mycoplasma_haemofelis
escherichia_albertii
hepatitis_e_virus
porcine_parvovirus
norwalk_like_virus
suid_herpesvirus
x suid_herpesvirus
influenza_a_virus
arcobacter_trophiar

In [28]:
# Keep only microbes for which a single taxonomy id was found.
data = data.loc[data['microbe_name'].isin(txids)]

In [29]:
# Attach the taxonomy id of each microbe; an unknown name raises KeyError.
data['microbe_id'] = data['microbe_name'].map(txids.__getitem__)

In [30]:
# Fetch the taxonomy record of every microbe and pull the family out of its
# lineage.  fdict maps microbe_name -> scientific name of the lineage entry
# whose rank is "family"; microbes whose lineage has no such entry get no key.
fdict = dict()
for nam, t in txids.items():
    handle = Entrez.efetch(db='taxonomy', id=t, rettype='xml', retmode='xml')
    try:
        x = handle.read()
    finally:
        # Release the connection whether or not the read succeeded.
        handle.close()
    lineage = xmltodict.parse(x)['TaxaSet']['Taxon']['LineageEx']['Taxon']
    if isinstance(lineage, dict):
        # xmltodict yields a single mapping, not a list, when an element has
        # only one child of a given tag; wrap it so the loop below always
        # iterates over lineage entries rather than over the mapping's keys.
        lineage = [lineage]
    for ancestor in lineage:
        if ancestor['Rank'] == 'family':
            fdict[nam] = ancestor['ScientificName']

Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.


In [31]:
# Attach the family of each microbe; a name absent from fdict raises KeyError.
data['microbe_family'] = data['microbe_name'].map(fdict.__getitem__)

In [32]:
# Final column set and order of the exported table.
data = data.loc[
    :,
    [
        'host_name',
        'host_scientific_name',
        'microbe_name',
        'microbe_id',
        'microbe_family',
        'microbe_type',
        'country',
        'number_of_samples',
    ],
]

In [33]:
# Sample counts are stored as integers in the exported table.
data = data.astype({'number_of_samples': int})

In [34]:
# Write the cleaned table as TSV without the row index.  index=False is the
# documented way to omit it.
data.to_csv("clean_data_geo_host.tsv", sep="\t", index=False)

In [36]:
# Spot check: show the rows recorded for rabies lyssavirus.
data.loc[data['microbe_name'].eq('rabies_lyssavirus')]

Unnamed: 0,host_name,host_scientific_name,microbe_name,microbe_id,microbe_family,microbe_type,country,number_of_samples
182,dog,canis_lupus,rabies_lyssavirus,11292,Rhabdoviridae,viruses,afghanistan,6
119,cow,bos_taurus,rabies_lyssavirus,11292,Rhabdoviridae,viruses,argentina,14
183,dog,canis_lupus,rabies_lyssavirus,11292,Rhabdoviridae,viruses,argentina,23
120,cow,bos_taurus,rabies_lyssavirus,11292,Rhabdoviridae,viruses,bangladesh,8
121,cow,bos_taurus,rabies_lyssavirus,11292,Rhabdoviridae,viruses,botswana,7
184,dog,canis_lupus,rabies_lyssavirus,11292,Rhabdoviridae,viruses,botswana,6
228,goat,capra_hircus,rabies_lyssavirus,11292,Rhabdoviridae,viruses,botswana,6
18,cat,felis_catus,rabies_lyssavirus,11292,Rhabdoviridae,viruses,brazil,14
122,cow,bos_taurus,rabies_lyssavirus,11292,Rhabdoviridae,viruses,brazil,330
185,dog,canis_lupus,rabies_lyssavirus,11292,Rhabdoviridae,viruses,brazil,34
