In [1]:
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.metrics import mean_squared_error
from statistics import mean, stdev

In [2]:
def test_rmse(model,X_test,Y_test):
    test_preds = model.predict(X_test)
    mse = mean_squared_error(Y_test, test_preds)
    rmse = sqrt(mse)
    return rmse

In [3]:
# Filtered full-length test set. Only its "accession" column is used further
# down, to restrict every per-region taxonomy table to these accessions.
filtered = pd.read_csv("datasets/full_length_testdata.filtered.csv")

### Simulation of CopyRighter

In [4]:
class CopyRighterSimulator():
    """Lookup-table predictor keyed by ``;``-joined taxonomy strings.

    A lineage such as ``"k;p;c"`` is looked up verbatim; on a miss the last
    taxon is dropped (``"k;p"``, then ``"k"``) until an entry is found. When
    no prefix matches, the fallback value ``default`` is predicted.

    Parameters
    ----------
    CopyRighterData : iterable of rows where ``row[0]`` is the taxonomy string
        and ``row[1]`` the value to predict. Later duplicates overwrite
        earlier ones.
    default : value predicted when no prefix of a lineage is in the table.
        ``None`` (the default) selects ``DEFAULT_COPY_NUMBER``, the constant
        that used to be hard-coded inside ``predict``.
    """

    # NOTE(review): presumably a dataset-wide mean copy number; the origin of
    # this constant is not visible in the notebook -- confirm.
    DEFAULT_COPY_NUMBER = 2.3539721350613916

    def __init__(self,CopyRighterData,default=None):
        self.default = self.DEFAULT_COPY_NUMBER if default is None else default
        self.lib = {row[0]: row[1] for row in CopyRighterData}

    def predict(self,X):
        """Return one prediction per lineage string in ``X`` (as a list)."""
        return [self._lookup(lineage) for lineage in X]

    def _lookup(self,lineage):
        """Resolve a single lineage by progressively dropping its last taxon."""
        taxalist = lineage.split(";")
        key = lineage
        while True:
            value = self.lib.get(key,"N/A")
            # A stored value equal to "N/A" is treated like a missing entry,
            # exactly as the original sentinel-based loop did.
            if value != "N/A":
                return value
            taxalist = taxalist[:-1]
            if not taxalist:
                return self.default
            key = self.combine(taxalist)

    def combine(self,taxalist):
        """Join taxa with ``;`` (an empty list yields an empty string)."""
        return ";".join(taxalist)

In [5]:
# Read the CopyRighter table and pair each taxonomy string with its 16S count.
# "; " separators are collapsed to ";" so keys match the ";"-joined lineages
# built for the test data.
data = pd.read_csv("datasets/copyrighterdata.txt",sep="\t")
raw_taxa = data["taxonomy"].values.tolist()
copy_number = data["16S rRNA count"].values.tolist()
CopyRighterData = [
    [taxonomy.replace("; ",";"), count]
    for taxonomy, count in zip(raw_taxa, copy_number)
]

In [6]:
# Build the CopyRighter lookup from the [taxonomy, count] rows assembled above.
crs = CopyRighterSimulator(CopyRighterData)

In [7]:
# RMSE of the CopyRighter lookup for each region's gg_* test taxonomy file.
crs_performance = {}
for region in ["full_length","V1-V2","V1-V3","V3-V4","V4","V4-V5","V6-V8","V7-V9"]:
    greengenetaxa = (
        pd.read_csv("taxa/gg_"+region+"_testtaxa.csv")
        # keep only accessions that also appear in the filtered test set
        .merge(filtered[["accession"]],how="inner",on = ["accession"])
        .dropna(subset=["copy_number"])
    )
    ggX = greengenetaxa[['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', "Species"]]
    ggY = greengenetaxa['copy_number']
    # One ";"-joined lineage per row; str() renders a NaN rank as "nan".
    newggX = [";".join(map(str, ranks)) for ranks in ggX.values.tolist()]
    crs_performance[region] = test_rmse(crs,newggX,ggY)

In [8]:
# Display the scores as a table: one row per region, single "rmse" column.
pd.DataFrame(crs_performance,index=["rmse"]).transpose()

Unnamed: 0,rmse
full_length,1.993535
V1-V2,1.934433
V1-V3,1.988888
V3-V4,1.824587
V4,1.8485
V4-V5,1.821856
V6-V8,1.952332
V7-V9,2.180699


### Simulation of rrnDB

In [9]:
class rrnDBSimulator():
    """Per-rank lookup-table predictor with back-off to coarser ranks.

    Parameters
    ----------
    rrnDBData : iterable of rows where ``row[0]`` is a rank name, ``row[1]``
        the taxon name and ``row[2]`` the value to predict. Rows whose rank
        is not one of ``self.ranks`` are ignored.
    default : value predicted when no rank of a lineage matches. ``None``
        (the default) selects ``DEFAULT_COPY_NUMBER``, the constant that used
        to be hard-coded inside ``predict``.
    """

    # NOTE(review): presumably a dataset-wide mean copy number; the origin of
    # this constant is not visible in the notebook -- confirm.
    DEFAULT_COPY_NUMBER = 2.3539721350613916

    def __init__(self,rrnDBData,default=None):
        self.ranks = ["domain","phylum","class","order","family","genus"]
        self.lib = {rank: {} for rank in self.ranks}
        self.default = self.DEFAULT_COPY_NUMBER if default is None else default
        for line in rrnDBData:
            if line[0] in self.ranks:
                self.lib[line[0]][line[1]] = line[2]

    def predict(self,X):
        """Return one prediction per lineage in ``X`` (as a list).

        Each lineage is a sequence of taxon names ordered like ``self.ranks``
        (domain first). Taxa are paired with ranks by position and tried from
        the most specific available rank down to the domain; the first hit
        wins. An empty lineage, or one with no hit, yields ``self.default``.
        Entries beyond the last known rank are ignored.

        Pairing by position (instead of aligning both lists from the right,
        as before) gives identical results for six-element lineages and
        avoids the IndexError previously raised for empty or over-long ones.
        """
        pred = []
        for line in X:
            value = self.default
            for rank, taxon in reversed(list(zip(self.ranks, line))):
                hit = self.lib.get(rank,{}).get(taxon,"N/A")
                # A stored value equal to "N/A" counts as a miss, as before.
                if hit != "N/A":
                    value = hit
                    break
            pred.append(value)
        return pred

In [10]:
# Load the rrnDB table straight into a list of rows; rrnDBSimulator reads each
# row positionally as (rank, taxon name, value).
rrnDBData = pd.read_csv("datasets/rrnDB_pan-taxa stats_RDP.csv").values.tolist()

In [11]:
# Build the per-rank rrnDB lookup from the rows loaded above.
rds = rrnDBSimulator(rrnDBData)

In [12]:
# RMSE of the rrnDB lookup for each region's rdp_* test taxonomy file.
rds_performance = {}
for region in ["full_length","V1-V2","V1-V3","V3-V4","V4","V4-V5","V6-V8","V7-V9"]:
    rdptaxa = (
        pd.read_csv("taxa/rdp_"+region+"_testtaxa.csv")
        # keep only accessions that also appear in the filtered test set
        .merge(filtered[["accession"]],how="inner",on = ["accession"])
        .dropna(subset=["copy_number"])
    )
    # NOTE: ggX / ggY keep their names from the CopyRighter cell but hold the
    # rdp_* lineages and copy numbers here.
    ggX = rdptaxa[['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus']]
    ggY = rdptaxa['copy_number']
    lineages = ggX.values.tolist()
    observed = ggY.tolist()
    rds_performance[region] = test_rmse(rds,lineages,observed)

In [13]:
# Display the scores as a table: one row per region, single "rmse" column.
pd.DataFrame(rds_performance,index=["rmse"]).transpose()

Unnamed: 0,rmse
full_length,1.082905
V1-V2,1.093868
V1-V3,1.081651
V3-V4,1.100793
V4,1.206523
V4-V5,1.078959
V6-V8,1.108867
V7-V9,1.109841


In [14]:
# Write both tools' per-region RMSE side by side: one row per region, one
# column per tool ("CopyRighter", "rrnDB").
performance_frames = [
    pd.DataFrame(scores,index=[tool]).transpose()
    for tool, scores in (("CopyRighter", crs_performance), ("rrnDB", rds_performance))
]
pd.concat(performance_frames,axis = 1).to_csv("performance/cprt_rrndb.csv")