# parse and condense all of the App outputs into a single succinct CSV for annotation integration

In [1]:
from pandas import DataFrame, concat
from glob import glob

dfs = []
for hits in glob("HMMER_output_TAB/*.txt"):
    file = open(hits, 'r')
    lines = file.readlines()
    newLines = []
    for line in lines:
        if any(x in line for x in ["---", "[ok]", ': ', "#\n"]):   continue
        if line.strip() == '#': continue
        newLines.append(line.replace("description of target", "KBaseObj").replace(" name", ''))
    columns = [x.strip() for x in newLines[0].split()][1:]
    rows = [[x.strip() for x in line.split()] for line in newLines[1:]]
    dfs.append(DataFrame(rows, columns=columns))


totDF = concat(dfs)
totDF.to_csv("CAZY_H100_hits.csv")
display(totDF)

Unnamed: 0,target,accession,tlen,query,accession.1,qlen,E-value,score,bias,#,...,score.1,bias.1,from,to,from.1,to.1,from.2,to.2,acc,KBaseObj
0,190263/400/1.f:EIBLKL_08285,-,659,GH5_20.hmm,-,344,2.3e-34,123.6,3.2,1,...,119.5,3.2,3,250,275,536,273,596,0.85,[190263/400/1]
1,190263/412/1.f:OOIBGL_06550,-,649,GH5_20.hmm,-,344,3.7e-33,119.6,3.1,1,...,117.7,3.3,3,249,265,525,263,536,0.83,[190263/412/1]
2,190263/412/1.f:OOIBGL_08215,-,579,GH5_20.hmm,-,344,1.8e-30,110.8,0.4,1,...,110.1,0.4,3,336,66,422,64,429,0.79,[190263/412/1]
3,190263/396/1.f:KAAOMA_00970,-,708,GH5_20.hmm,-,344,2.1e-26,97.4,1.5,1,...,94.9,1.5,3,306,59,377,57,413,0.73,[190263/396/1]
4,190263/396/1.f:KAAOMA_01800,-,744,GH5_20.hmm,-,344,1.1e-21,81.9,0.5,1,...,80.4,0.5,3,265,58,348,56,361,0.77,[190263/396/1]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,190263/562/1.f:KJGNCK_06255,-,346,GH5_2.hmm,-,237,1.6e-13,55.2,0.0,1,...,54.7,0.0,56,185,92,246,57,307,0.74,[190263/562/1]
38,190263/475/1.f:MCKEEP_04335,-,346,GH5_2.hmm,-,237,2.3e-13,54.6,0.1,1,...,54.2,0.1,56,184,92,240,55,306,0.77,[190263/475/1]
39,190263/396/1.f:KAAOMA_19800,-,936,GH5_2.hmm,-,237,5e-13,53.5,0.7,1,...,52.8,0.7,3,220,549,768,547,781,0.76,[190263/396/1]
40,190263/460/1.f:IHFGKI_14410,-,464,GH5_2.hmm,-,237,1.4e-12,52.1,0.6,1,...,51.5,0.6,61,235,133,334,83,336,0.82,[190263/460/1]


In [2]:
totDF.columns

Index(['target', 'accession', 'tlen', 'query', 'accession', 'qlen', 'E-value',
       'score', 'bias', '#', 'of', 'c-Evalue', 'i-Evalue', 'score', 'bias',
       'from', 'to', 'from', 'to', 'from', 'to', 'acc', 'KBaseObj'],
      dtype='object')

# Process the CAZY HTML output into a CSV

In [3]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(open("KBase HMMER Custom Model Profile.html", "r").read() , 'lxml-xml')

## parse the labels

In [17]:
# organism labels
horizontal_rows = soup.find_all(class_="horz-text")
organisms = []
for content in horizontal_rows:
    label = content.find("nobr")
    if label is None:  continue
    organisms.append(label.text.replace(".gbff_genome.RAST", ''))
print(organisms)

# feature labels
vertical_cols = soup.find_all(class_="vertical-text")
features = []
for content in vertical_cols:
    label = content.find("nobr")
    if label is None:  continue
    features.append(label.text.replace("\n", ''))
print(features)

['Acidaminococcus-fermentans-DSM-20731-MAF-2', 'Acidaminococcus-sp-D21-MAF-2', 'Adlercreutzia-equolifaciens-DSM-19450', 'Akkermansia-muciniphila-ATCC-BAA-835-MAF-2', 'Alistipes-finegoldii-DSM-17242', 'Alistipes-ihumii-AP11-MAF-2', 'Alistipes-indistinctus-YIT-12060-DSM-22520-MAF-2', 'Alistipes-onderdonkii-DSM-19147-MAF-2', 'Alistipes-putredinis-DSM-17216-MAF-2', 'Alistipes-senegalensis-JC50-DSM-25460-MAF-2', 'Alistipes-shahii-WAL-8301-DSM-19121-MAF-2', 'Anaerofustis-stercorihominis-DSM-17244', 'Anaerostipes-caccae-DSM-14662-MAF-2', 'Anaerotruncus-colihominis-DSM-17241-MAF-2', 'Bacteroides-caccae-ATCC-43185-MAF-2', 'Bacteroides-cellulosilyticus-DSM-14838-MAF-2', 'Bacteroides-coprocola-DSM-17136-MAF-2', 'Bacteroides-coprophilus-DSM-18228-MAF-2', 'Bacteroides-dorei-5-1-36-MAF-2', 'Bacteroides-dorei-DSM-17855-MAF-2', 'Bacteroides-eggerthii-DSM-20697-MAF-2', 'Bacteroides-finegoldii-DSM-17565-MAF-2', 'Bacteroides-fragilis-3-1-12-MAF-2', 'Bacteroides-intestinalis-DSM-17393-MAF-2', 'Bacteroides

## parse the # hits from each cell

In [None]:
rows = soup.find_all("tr")
elements = {}
for rowIndex, row in enumerate(rows):
    if rowIndex < 2:  continue
    label = row.find("nobr")
    if label is None:   break
    print(rowIndex)
    label = label.text.replace(".gbff_genome.RAST", '')
    elements[label] = []
    first = True
    # print(row is None)
    for td in row.find_all("td"):
        if first:  first = False  ;  continue
        hits = td.get("title")
        # print(hits)
        if hits is None:  hits = '0'
        elements[label].append(hits)
    print(len(elements[label]))

# DataFrame creation

In [47]:
test = ["0", "0", "2"]
# test.remove("0")
testSet = set(test)
testSet.remove("0")
print(testSet)

{'2'}


In [52]:
from pandas import DataFrame
from re import search

df = DataFrame(elements, index=features).T
display(df)
df.to_csv("CAZY_hits.csv")

# create a condensed CSV version
from json import dump
miniElements = {}
for org, hits in elements.items():
    hitsSet = set(hits)
    hitsSet.remove("0")
    miniElements[org] = list(hitsSet)
dump(miniElements, open("nonZeroHits.json", 'w'), indent=3)

df = DataFrame({org: [search(r"(\d+)(?= hit)", hit).group() if "hit" in hit else "0" for hit in hits]
                for org, hits in elements.items()}, index=features).T
display(df)
df.to_csv("CAZY_hits_numerical.csv")


Unnamed: 0,GH1,GH2,GH3,GH4,GH5,GH5_1,GH5_2,GH5_4,GH5_5,GH5_7,...,PL37,PL38,PL40,PL42,AA1,AA1_2,AA1_3,AA3,AA4,AA7
Acidaminococcus-fermentans-DSM-20731-MAF-2,0,0,1 hit KEEIED_07475,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acidaminococcus-sp-D21-MAF-2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1 hit FCJLID_00635,0
Adlercreutzia-equolifaciens-DSM-19450,0,1 hit JACNIE_08440,1 hit JACNIE_10280,0,1 hit JACNIE_02790,0,1 hit JACNIE_02790,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Akkermansia-muciniphila-ATCC-BAA-835-MAF-2,0,6 hits IKGMED_01545 IKGMED_02795 IKGMED_04255 ...,1 hit IKGMED_10865,0,0,0,0,0,0,0,...,0,1 hit IKGMED_04025,0,0,1 hit IKGMED_09535,1 hit IKGMED_09535,1 hit IKGMED_09535,0,0,0
Alistipes-finegoldii-DSM-17242,0,8 hits LHJMMB_01735 LHJMMB_06230 LHJMMB_06925 ...,7 hits LHJMMB_00385 LHJMMB_05295 LHJMMB_05310 ...,0,2 hits LHJMMB_05365 LHJMMB_07895,0,1 hit LHJMMB_07895,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Solobacterium-moorei-DSM-22971-MAF-2,0,2 hits DIOFOH_06630 DIOFOH_12040,0,2 hits DIOFOH_07270 DIOFOH_12710,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Streptococcus-thermophilus-ATCC-19258-MAF-2,2 hits KJODLM_06495 KJODLM_09470,1 hit KJODLM_08190,1 hit KJODLM_05575,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Subdoligranulum-sp-4-3-54A2FAA-MAF-2,3 hits OLPOLE_00795 OLPOLE_04110 OLPOLE_14585,8 hits OLPOLE_00785 OLPOLE_03040 OLPOLE_03865 ...,8 hits OLPOLE_03035 OLPOLE_05475 OLPOLE_07015 ...,2 hits OLPOLE_04370 OLPOLE_13485,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Subdoligranulum-variabile-DSM-15176-MAF-2,6 hits LNLACP_00150 LNLACP_03700 LNLACP_06940 ...,7 hits LNLACP_00140 LNLACP_00175 LNLACP_05300 ...,11 hits LNLACP_00120 LNLACP_00955 LNLACP_05665...,2 hits LNLACP_07710 LNLACP_09015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,GH1,GH2,GH3,GH4,GH5,GH5_1,GH5_2,GH5_4,GH5_5,GH5_7,...,PL37,PL38,PL40,PL42,AA1,AA1_2,AA1_3,AA3,AA4,AA7
Acidaminococcus-fermentans-DSM-20731-MAF-2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acidaminococcus-sp-D21-MAF-2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Adlercreutzia-equolifaciens-DSM-19450,0,1,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Akkermansia-muciniphila-ATCC-BAA-835-MAF-2,0,6,1,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,0,0,0
Alistipes-finegoldii-DSM-17242,0,8,7,0,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Solobacterium-moorei-DSM-22971-MAF-2,0,2,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Streptococcus-thermophilus-ATCC-19258-MAF-2,2,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Subdoligranulum-sp-4-3-54A2FAA-MAF-2,3,8,8,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Subdoligranulum-variabile-DSM-15176-MAF-2,6,7,11,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
elements

In [None]:
print([s for s in soup.children])

# processing the other HTML file

In [None]:
rows = soup.find_all("tr")
elements = {}
for rowIndex, row in enumerate(rows):
    if rowIndex < 2:  continue
    label = row.find("nobr")
    if label is None:   break
    print(rowIndex)
    label = label.text.replace(".gbff_genome.RAST", '')
    elements[label] = []
    first = True
    # print(row is None)
    for td in row.find_all("td"):
        if first:  first = False  ;  continue
        hits = td.get("title")
        # print(hits)
        if hits is None:  hits = '0'
        elements[label].append(hits)
    print(len(elements[label]))


from pandas import DataFrame
from re import search

df = DataFrame(elements, index=features).T
display(df)
df.to_csv("CAZY_hits.csv")

# create a condensed CSV version
from json import dump
miniElements = {}
for org, hits in elements.items():
    hitsSet = set(hits)
    hitsSet.remove("0")
    miniElements[org] = list(hitsSet)
dump(miniElements, open("nonZeroHits.json", 'w'), indent=3)

df = DataFrame({org: [search(r"(\d+)(?= hit)", hit).group() if "hit" in hit else "0" for hit in hits]
                for org, hits in elements.items()}, index=features).T
display(df)
df.to_csv("CAZY_hits_numerical.csv")
