In [1]:
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.metrics import mean_squared_error
from statistics import mean, stdev

In [2]:
even = pd.read_csv("processed/even.csv")
staggered = pd.read_csv("processed/staggered.csv")

### Simulation of CopyRighter

In [3]:
class CopyRighterSimulator():
    def __init__(self,CopyRighterData):
        self.lib = {}
        for line in CopyRighterData:
            self.lib[line[0]] = line[1]

    def predict(self,X):
        pred = []
        for line in X:
            key=line
            value = self.lib.get(key,"N/A")
            taxalist = line.split(";")
            while value == "N/A":
                taxalist = taxalist[:-1]
                if len(taxalist) == 0:
                    value = 2.3539721350613916
                else:
                    key = self.combine(taxalist)
                    value = self.lib.get(key,"N/A")
            pred.append(value)
        return pred
                
    def combine(self,taxalist):
        string = ""
        for element in taxalist:
            string+=element
            string+=";"
        return string[:-1]

In [4]:
data = pd.read_csv("datasets/copyrighterdata.txt",sep="\t")
raw_taxa = data["taxonomy"].values.tolist()
copy_number = data["16S rRNA count"].values.tolist()
CopyRighterData = []
for i in range(0,len(raw_taxa)):
    line = raw_taxa[i]
    lineage = []
    lineage.append(line.replace("; ",";"))
    lineage.append(copy_number[i])
    CopyRighterData.append(lineage)

In [5]:
crs = CopyRighterSimulator(CopyRighterData)

In [6]:
cr_pred = {}
for region in ["full_length","V1-V2","V1-V3","V3-V4","V4","V4-V5","V6-V8","V7-V9"]:
    gg_taxa = pd.read_csv("taxa/gg_"+region+"_taxa.csv")
    gg_taxa.columns = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', "Species",'GenBank ID']
    gg_taxa = even.merge(gg_taxa,how="left",on = "GenBank ID")[['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']]
    gg_taxa = gg_taxa.values.tolist()
    new_gg = []
    for line in gg_taxa:
        taxa_str = ""
        for element in line:
            taxa_str+=str(element)
            taxa_str+=";"
        new_gg.append(taxa_str[:-2])
    cr_pred[region] = crs.predict(new_gg)

In [7]:
pd.DataFrame(cr_pred)

Unnamed: 0,full_length,V1-V2,V1-V3,V3-V4,V4,V4-V5,V6-V8,V7-V9
0,8.36435,8.36435,8.36435,8.36435,8.36435,8.36435,8.36435,8.23983
1,2.49703,2.49703,2.49703,2.49703,2.49703,2.49703,2.49703,2.49703
2,7.317,7.317,7.317,7.317,7.317,7.317,7.317,7.317
3,2.40183,2.40183,2.40183,2.40183,2.40183,2.40183,2.40183,2.40183
4,2.51393,2.51393,2.51393,2.51393,2.51393,2.51393,2.51393,2.51393
5,4.57988,7.0,4.57988,6.7533,6.7533,6.7533,4.57988,4.57988
6,5.68634,5.68634,5.68634,5.68634,5.68634,5.68634,5.68634,5.68634
7,3.58378,3.58378,3.58378,3.58378,3.58378,3.58378,3.58378,3.58378
8,5.15949,5.15949,5.15949,5.15949,5.15949,5.15949,5.15949,5.15949
9,4.93895,4.93895,4.93895,4.93895,4.93895,4.93895,4.93895,4.93895


### Simulation of rrnDB

In [8]:
class rrnDBSimulator():
    def __init__(self,rrnDBData):
        self.lib = {"domain":{},"phylum":{},"class":{},"order":{},"family":{},"genus":{}}
        self.ranks = ["domain","phylum","class","order","family","genus"]
        for line in rrnDBData:
            if line[0] in self.ranks:
                self.lib[line[0]][line[1]] = line[2]
        
            
    def predict(self,X):
        pred = []
        for line in X:
            key=line
            ranks = self.ranks
            value = self.lib.get(ranks[-1],{}).get(key[-1],"N/A")
            while value == "N/A":
                ranks = ranks[:-1]
                key = key[:-1]
                if len(key) == 0:
                    value = 2.3539721350613916
                else:
                    value = self.lib.get(ranks[-1],{}).get(key[-1],"N/A")
            pred.append(value)
        return pred

In [9]:
rrnDBData = pd.read_csv("datasets/rrnDB_pan-taxa stats_RDP.csv")
rrnDBData = rrnDBData.values.tolist()

In [10]:
rds = rrnDBSimulator(rrnDBData)

In [11]:
rrndb_pred = {}
for region in ["full_length","V1-V2","V1-V3","V3-V4","V4","V4-V5","V6-V8","V7-V9"]:
    rdp_taxa = pd.read_csv("taxa/rdp_"+region+"_taxa.csv")
    rdp_taxa.columns = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'GenBank ID']
    rdp_taxa = even.merge(rdp_taxa,how="left",on = "GenBank ID")[['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus']]
    rdp_taxa = rdp_taxa.values.tolist()
    rrndb_pred[region] = rds.predict(rdp_taxa)

In [12]:
pd.DataFrame(rrndb_pred)

Unnamed: 0,full_length,V1-V2,V1-V3,V3-V4,V4,V4-V5,V6-V8,V7-V9
0,10.33,10.33,10.33,10.33,10.33,10.33,10.33,10.33
1,3.38,3.38,3.38,3.38,3.38,3.38,3.38,3.38
2,5.32,5.32,5.32,5.32,5.32,5.32,5.32,5.32
3,3.29,3.29,3.29,3.29,3.29,3.29,3.29,3.29
4,5.59,5.59,5.59,5.59,5.59,5.59,5.59,5.59
5,7.0,7.0,7.0,7.0,7.0,7.0,6.98,7.0
6,5.33,5.33,5.33,5.33,5.33,5.33,5.33,5.33
7,2.71,2.71,2.71,2.71,2.71,2.71,2.71,2.71
8,5.66,5.66,5.66,5.66,5.66,5.66,5.66,5.66
9,5.36,5.36,5.36,5.36,5.36,5.36,5.36,5.36


In [13]:
cr_fl = pd.DataFrame(cr_pred["full_length"],columns = ["copyrighter_pred"])
rrndb_fl = pd.DataFrame(rrndb_pred["full_length"],columns = ["rrndb_pred"])

In [14]:
even = pd.concat([even,cr_fl,rrndb_fl],axis=1)
staggered = pd.concat([staggered,cr_fl,rrndb_fl],axis=1)

In [15]:
even.to_csv("processed/even_complete.csv",index=False)
staggered.to_csv("processed/staggered_complete.csv",index=False)

In [16]:
pd.DataFrame(rrndb_pred).to_csv("products/rrndb_pred.csv",index=False)
pd.DataFrame(cr_pred).to_csv("products/copyrighter_pred.csv",index=False)