In [None]:
import uuid

from DeepCodon.config.server_config import *
from DeepCodon.src.services.appendixTool import convert_fasta_to_aa
from DeepCodon.src.services.foundRareCodon import (
    AlignSeqs,
    blastp4RareCodon,
    getBlastpResult,
    searchDB,
    smoon,
)

# FASTA file fed into the rare-codon pipeline below.
input_file = "data/datasets/test_data/compare/rareDB_sampled.fasta"
# -------------run-----------------
# NOTE token dict = {query: [DatabaseToken, DatabaseSeq, QueryToken, SmoothToken]}
# A fresh hex UUID is generated per run and handed to both BLAST helpers below
# (presumably to name and locate this run's BLAST output; confirm in foundRareCodon).
nowUuid = uuid.uuid4().hex

# The second argument is the input path with an "_aa" suffix appended.
# NOTE(review): presumably the return value is the path of the converted
# amino-acid FASTA; confirm against convert_fasta_to_aa.
user_input_file = convert_fasta_to_aa(input_file, str(input_file) + "_aa")
# bout / berr are not read again in the visible cells.
bout, berr = blastp4RareCodon(nowUuid, user_input_file)
result_dict = getBlastpResult(nowUuid, user_input_file)

# Each stage consumes the previous stage's result: BLAST hits -> database
# lookup -> alignment against the query file -> smoothing.
outToken = searchDB(input_query=result_dict)
alignToken = AlignSeqs(user_input_file, outToken)
# smoothToken is later queried with .get(<tid>, [None])[-1], i.e. it is used as
# a mapping whose values are sequences and whose last element is taken.
smoothToken = smoon(alignToken)

[0, 0.9998245921768111, 0.7542536397123313, 0.32601608346011735, 0.2608128667680939, 0.3332323538321721, 0.3332323538321721, 0.45440775522568916, 0.4847016055740685, 0.9998113563478589, 0.8866251650631956, 0.9998437744102484, 0.0, 0.7240131011894502, 0.9998276159282883, 0.5624121231057647, 0.9999000099990001, 0.2608128667680939, 0.21734405564007825, 0.10867202782003912, 0.99978265594436, 0.2363206689692783, 0.2544991819669151, 0.9998182148700236, 0.3090347209598255, 0.8180330848936557, 0.9998182148700236, 0.42851021282673907, 0.9998571632623912, 0.9997727789138833, 0.15905476028175417, 0.15905476028175417, 0.8179959100204499, 0.9998276159282883, 0.6033442509912085, 0.12066885019824171, 0.9999000099990001, 0.9997872793022762, 0.2765369070410551, 0.5105296745373326, 0.3403531163582217, 0.9998113563478589, 0.8866251650631956, 0.9998630324613067, 0.3698123544719902, 0.9996970614965163, 0.42411390487730993, 0.04544421722335833, 0.06816632583503748, 0.6248437890527369, 0.4498875281179705, 0.

In [6]:
import pandas as pd

In [7]:
# codonop / icor results: header=None makes pandas treat the first line as
# data, so column names are supplied explicitly. Only the id and the tool's own
# sequence column are kept from each file.
df_codonop = pd.read_csv(
    "src/tools/analysis/outs/nsrz/codonop.csv",
    header=None,
    names=["id", "input", "codonop", "gci", "gco"],
)[["id", "codonop"]]

df_icor = pd.read_csv(
    "src/tools/analysis/outs/nsrz/icor.csv",
    header=None,
    names=["id", "input", "icor", "gci", "gco"],
)[["id", "icor"]]

# genescript results keep all three columns; "protein" is the join key used
# for the deepcodon tables.
df_genescript = pd.read_csv(
    "src/tools/analysis/outs/nsrz/genescript.csv",
    header=None,
    names=["id", "protein", "genescript"],
)

# NOTE deepcodon choice
# These files carry their own header row; columns are renamed so that "id"
# becomes "tid" and the input/output sequences get join-friendly names.
df_deepcodon = pd.read_csv("src/tools/analysis/outs/nsrz/deepcodon3.csv").rename(
    columns={"id": "tid", "inputseq": "protein", "outputseq": "deepcodon"}
)[["tid", "protein", "deepcodon"]]

df_deepcodonft = pd.read_csv("src/tools/analysis/outs/nsrz/deepcodonft.csv").rename(
    columns={"id": "tid", "inputseq": "protein", "outputseq": "deepcodonft"}
)[["protein", "deepcodonft"]]

In [8]:
# Join every tool's output into one frame: the first two joins use "id",
# the deepcodon tables are attached through the shared "protein" column.
# All joins use the default (inner) merge.
df_combine = (
    df_codonop.merge(df_icor, on="id")
    .merge(df_genescript, on="id")
    .merge(df_deepcodon, on="protein")
    .merge(df_deepcodonft, on="protein")
)

In [9]:
def _last_smooth_token(tid):
    """Return the last element stored under ``tid`` in smoothToken, or None if absent."""
    return smoothToken.get(tid, [None])[-1]


# Attach the per-sequence token to every row, keyed by the deepcodon "tid".
df_combine["token"] = df_combine["tid"].apply(_last_smooth_token)

In [10]:
# Drop every row that has a missing value in any column (this includes rows
# whose "tid" had no entry in smoothToken and therefore got a None token).
df_combine = df_combine.dropna(axis=0, how="any")

In [11]:
from DeepCodon.src.services.appendixTool import *


def assert_all_codon_rare(df, incolumn, outcolumn, codon_freq=None, threshold=0.3):
    """Write a "0"/"1" rare-codon mask for every sequence in ``df[incolumn]``.

    Each sequence is cut into consecutive 3-character chunks starting at
    offset 0. A chunk contributes "1" when it is in the rare-codon set and "0"
    otherwise; a trailing chunk shorter than three characters is looked up
    like any other chunk.

    Args:
        df: DataFrame to annotate. It is modified in place.
        incolumn: Name of the column holding the sequences (strings).
        outcolumn: Name of the column that receives the mask strings.
        codon_freq: Optional mapping ``codon -> numeric score``. When omitted,
            the module-level name ``property`` is used, exactly as before.
            NOTE(review): that name is presumably a codon table brought in by
            a star import and shadowing the builtin; confirm.
        threshold: A codon is rare when its score is strictly below this
            value. Defaults to 0.3, the previously hard-coded cut-off.

    Returns:
        The same ``df`` object, with ``outcolumn`` added or overwritten.
    """
    if codon_freq is None:
        codon_freq = property

    special_tokens = {"<pad>", "<S>", "<E>"}
    # A set gives constant-time membership checks for the per-codon lookups.
    rare_codons = {
        codon
        for codon, score in codon_freq.items()
        if score < threshold and codon not in special_tokens
    }

    def _mask(seq):
        return "".join(
            "1" if seq[start : start + 3] in rare_codons else "0"
            for start in range(0, len(seq), 3)
        )

    # Assign the whole column positionally: avoids one .loc write per row and
    # keeps rows independent even when index labels are duplicated.
    df[outcolumn] = [_mask(seq) for seq in df[incolumn]]
    return df

In [12]:
# Build a rare-codon mask column for every tool's sequence column.
for seq_col, mask_col in (
    ("deepcodon", "deepcodon_token"),
    ("codonop", "codonop_token"),
    ("icor", "icor_token"),
    ("genescript", "genescript_token"),
    ("deepcodonft", "deepcodonft_token"),
):
    df_combine = assert_all_codon_rare(df_combine, seq_col, mask_col)

In [13]:
# (count column, mask column) for each tool, in the order the count columns
# are created on df_combine.
TOOL_COUNT_COLUMNS = (
    ("lineCodonop", "codonop_token"),
    ("lineIcor", "icor_token"),
    ("lineGenescript", "genescript_token"),
    ("lineDeepcodon", "deepcodon_token"),
    ("lineDeepcodonft", "deepcodonft_token"),
)

for row_label, row in df_combine.iterrows():
    # Positions whose token score exceeds 0.4 are the ones being compared.
    hit_positions = [pos for pos, score in enumerate(row["token"]) if score > 0.4]
    df_combine.loc[row_label, "lineAll"] = len(hit_positions)
    for count_col, mask_col in TOOL_COUNT_COLUMNS:
        mask = row[mask_col]
        # Number of compared positions where this tool's mask is "1".
        df_combine.loc[row_label, count_col] = sum(
            mask[pos] == "1" for pos in hit_positions
        )

In [14]:
# Column-wise totals over all rows: compared positions, then per-tool hits.
count_columns = [
    "lineAll",
    "lineCodonop",
    "lineIcor",
    "lineGenescript",
    "lineDeepcodon",
    "lineDeepcodonft",
]
df_combine[count_columns].sum()

lineAll            357.0
lineCodonop         45.0
lineIcor           106.0
lineGenescript     147.0
lineDeepcodon      119.0
lineDeepcodonft     91.0
dtype: float64