In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the two community composition tables (even and staggered) from CSV.
even, staggered = (
    pd.read_csv(csv_path)
    for csv_path in (
        "community_composition/even community.csv",
        "community_composition/staggered community.csv",
    )
)

In [3]:
def read_fasta(filename):
    """Parse a FASTA file into a two-column DataFrame.

    Every header line (starting with ``>``) opens a new record. The header
    text after ``>`` is split on ``|`` and the second field is used as the
    record's GenBank ID. All following lines up to the next header are
    concatenated, in order, into that record's sequence.

    Parameters
    ----------
    filename : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record, with object-dtype columns ``"GenBank ID"`` and
        ``"sequence"``. A record with no sequence lines gets an empty string,
        so IDs and sequences always stay aligned. A file containing no header
        yields an empty frame with the same two columns.

    Raises
    ------
    IndexError
        If a header has no ``|``-separated second field.
    """
    ids = []
    # One list of sequence chunks per record; a record is opened as soon as
    # its header is seen so that an empty record still occupies a row.
    chunks_per_record = []
    # The context manager closes the file even if parsing raises.
    with open(filename, "r") as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            if line.startswith(">"):
                fields = line[1:].split("|")
                if len(fields) < 2:
                    raise IndexError(
                        "FASTA header %r in %s has no '|'-separated second field"
                        % (line, filename)
                    )
                ids.append(fields[1])
                chunks_per_record.append([])
            elif chunks_per_record:
                chunks_per_record[-1].append(line)
            # Lines that appear before the first header belong to no record
            # and are ignored; keeping them would shift every sequence by one.
    sequences = ["".join(chunks) for chunks in chunks_per_record]
    return pd.DataFrame(
        list(zip(ids, sequences)),
        columns=["GenBank ID", "sequence"],
        dtype=object,
    )

In [4]:
# Parse the forward- and reverse-strand 16S FASTA files into DataFrames.
fwd, rev = map(
    read_fasta,
    ("genomes/strain_fwd_16S.fasta", "genomes/strain_rev_16S.fasta"),
)

In [5]:
from Bio.Seq import Seq

# Reverse-complement every sequence in `rev`, keeping the original row order.
rc = [
    str(Seq(str(raw_seq)).reverse_complement())
    for raw_seq in rev["sequence"].tolist()
]

# Same layout as `rev` (GenBank ID + sequence), with the sequences replaced by
# their reverse complements.
rev_rc = pd.DataFrame(
    [rev["GenBank ID"].tolist(), rc],
    index=["GenBank ID", "sequence"],
).transpose()

In [6]:
# Outer-merge the forward records with the reverse-complemented reverse
# records. No `on=` is given, so pandas joins on the columns the two frames
# share. Then keep only the first row for each GenBank ID.
strain_16S = pd.merge(fwd, rev_rc, how="outer").drop_duplicates(subset="GenBank ID")

# The per-strand frames are no longer needed.
del fwd, rev

In [7]:
# Write the first 10 rows of `strain_16S` as FASTA: column 0 becomes the
# header line and column 1 the sequence line.
# NOTE(review): 10 is presumably the number of strains in the mock community;
# confirm, or derive it from len(strain_16S).
#
# The records are collected before the file is opened so that an IndexError
# (fewer than 10 rows) is raised before the output file is truncated.
mock_records = [
    (str(strain_16S.iloc[i, 0]), str(strain_16S.iloc[i, 1]))
    for i in range(0, 10)
]

# The context manager closes the handle even if a write fails.
with open("genomes/mock_full_length.fasta", "w") as file:
    file.writelines(
        ">" + genbank_id + "\n" + sequence + "\n"
        for genbank_id, sequence in mock_records
    )

In [8]:
# Attach the 16S sequence of each strain to both community tables, using an
# outer join on the GenBank ID column.
even, staggered = (
    community.merge(strain_16S, on="GenBank ID", how="outer")
    for community in (even, staggered)
)

In [None]:
from anna16 import Preprocessing, CopyNumberPredictor

In [12]:
# Full-length prediction: the sequences in `even` are passed through
# Preprocessing.CountKmers and the result is fed to the "full_length"
# CopyNumberPredictor loaded from deployment/full_length.zip.
# `anna16_pred` collects the predictions per region and `pp` is reused by the
# sub-region cell further down.
anna16_pred = {}
X = even["sequence"]
pp = Preprocessing()
X = pp.CountKmers(seqs = X)
model = CopyNumberPredictor(region = "full_length")
model.load("deployment/full_length.zip")
anna16_pred["full-length"] = model.predict(X)
anna16_fl = pd.DataFrame(anna16_pred["full-length"],columns = ["anna16_pred"])

# The predictions are in the row order of `even`. Attaching them to
# `staggered` by row position would only be correct if both tables list the
# same strains in the same order, so they are looked up by GenBank ID instead.
# NOTE(review): assumes one prediction per row of `even` and that a GenBank ID
# identifies a single sequence - confirm.
fl_pred_values = anna16_fl["anna16_pred"].to_numpy()
fl_pred_by_id = dict(zip(even["GenBank ID"], fl_pred_values))

# `assign` replaces an existing "anna16_pred" column, so re-running the cell
# does not pile up duplicate columns the way repeated pd.concat would.
even = even.assign(anna16_pred=fl_pred_values)
staggered = staggered.assign(anna16_pred=staggered["GenBank ID"].map(fl_pred_by_id))

In [13]:
# Save both annotated community tables as CSV, without the index column.
for community_frame, csv_path in (
    (even, "processed/even.csv"),
    (staggered, "processed/staggered.csv"),
):
    community_frame.to_csv(csv_path, index=False)

In [14]:
def read_subregion_fasta(region):
    """Parse ``genomes/<region>.fasta`` into a two-column DataFrame.

    Every header line (starting with ``>``) opens a new record; the full
    header text after ``>`` is used as the record's GenBank ID. All following
    lines up to the next header are concatenated, in order, into that
    record's sequence.

    Parameters
    ----------
    region : str
        Region name. It selects the input file ``genomes/<region>.fasta`` and
        is also used as the name of the sequence column.

    Returns
    -------
    pandas.DataFrame
        One row per record, with object-dtype columns ``"GenBank ID"`` and
        ``region``. A record with no sequence lines gets an empty string, so
        IDs and sequences always stay aligned. A file containing no header
        yields an empty frame with the same two columns.
    """
    ids = []
    # One list of sequence chunks per record; a record is opened as soon as
    # its header is seen so that an empty record still occupies a row.
    chunks_per_record = []
    # The context manager closes the file even if parsing raises.
    with open("genomes/"+region+".fasta", "r") as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            if line.startswith(">"):
                ids.append(line[1:])
                chunks_per_record.append([])
            elif chunks_per_record:
                chunks_per_record[-1].append(line)
            # Lines that appear before the first header belong to no record
            # and are ignored; keeping them would shift every sequence by one.
    sequences = ["".join(chunks) for chunks in chunks_per_record]
    return pd.DataFrame(
        list(zip(ids, sequences)),
        columns=["GenBank ID", region],
        dtype=object,
    )

In [15]:
# For each 16S sub-region: read its FASTA, left-join the sequences onto `even`
# by GenBank ID (so the rows keep the order of `even`), turn the sequences
# into k-mer counts with the shared `pp`, and predict with the model loaded
# from deployment/<region>.zip. Results are stored in `anna16_pred[region]`.
for region in ("V1-V2", "V1-V3", "V3-V4", "V4", "V4-V5", "V6-V8", "V7-V9"):
    da = even.merge(read_subregion_fasta(region), on="GenBank ID", how="left")
    X = pp.CountKmers(seqs=da[region])

    model = CopyNumberPredictor(region=region)
    model.load("deployment/" + region + ".zip")
    anna16_pred[region] = model.predict(X)



In [16]:
# Assemble one table with a column of predictions per region, full-length
# first, by concatenating the single-column frames side by side in one call.
prediction_columns = ["full-length", "V1-V2", "V1-V3", "V3-V4", "V4", "V4-V5", "V6-V8", "V7-V9"]
anna16_preds = pd.concat(
    [
        pd.DataFrame(anna16_pred[column], columns=[column])
        for column in prediction_columns
    ],
    axis=1,
)

In [17]:
# Save the per-region prediction table as CSV, without the index column.
anna16_preds.to_csv(path_or_buf="products/anna16_pred.csv", index=False)