In [None]:
from global_variables import *

In [None]:
from kraft import make_variant_n_from_vcf_file_path, run_command

In [None]:
# Run the germ DNA processing script only when PROJECT_JSON has both the
# `germ_dna.1` and `germ_dna.2` entries.
if all(key in PROJECT_JSON for key in ("germ_dna.1", "germ_dna.2")):

    print("Processing germ DNA ...")

    # Positional values for the command template, in the order the template
    # consumes them ("exome" is fixed inside the template itself).
    germ_dna_arguments = (
        CODE_DIRECTORY_PATH,
        DATA_DIRECTORY_PATH,
        PROJECT_JSON["n_job"],
        PROJECT_JSON["gb_memory"],
        PROJECT_JSON["germ_dna.1"],
        PROJECT_JSON["germ_dna.2"],
        OUTPUT_GERM_DNA_DIRECTORY_PATH,
    )

    germ_dna_command = "{}/process_germ_dna.sh {} {} {} {} {} exome {}".format(
        *germ_dna_arguments
    )

    run_command(germ_dna_command)

In [None]:
# Run the soma DNA processing script when PROJECT_JSON has both the
# `soma_dna.1` and `soma_dna.2` entries.
if "soma_dna.1" in PROJECT_JSON and "soma_dna.2" in PROJECT_JSON:

    # The command below also passes `germ_dna.1` and `germ_dna.2`, which the
    # guard above does not cover. Check them up front so a missing entry is
    # reported with a clear message (still a KeyError, as before) instead of a
    # bare `KeyError: 'germ_dna.1'` raised after the progress message.
    missing_germ_keys = [
        key for key in ("germ_dna.1", "germ_dna.2") if key not in PROJECT_JSON
    ]

    if missing_germ_keys:

        raise KeyError(
            "Processing soma DNA also needs {} in PROJECT_JSON.".format(
                " and ".join(missing_germ_keys)
            )
        )

    print("Processing soma DNA ...")

    run_command(
        "{}/process_soma_dna.sh {} {} {} {} {} {} {} exome {}".format(
            CODE_DIRECTORY_PATH,
            DATA_DIRECTORY_PATH,
            PROJECT_JSON["n_job"],
            PROJECT_JSON["gb_memory"],
            PROJECT_JSON["germ_dna.1"],
            PROJECT_JSON["germ_dna.2"],
            PROJECT_JSON["soma_dna.1"],
            PROJECT_JSON["soma_dna.2"],
            OUTPUT_SOMA_DNA_DIRECTORY_PATH,
        )
    )

In [None]:
# Run the soma RNA processing script only when PROJECT_JSON has both the
# `soma_rna.1` and `soma_rna.2` entries.
if all(key in PROJECT_JSON for key in ("soma_rna.1", "soma_rna.2")):

    print("Processing soma RNA ...")

    # Positional values for the command template, in the order the template
    # consumes them.
    soma_rna_arguments = (
        CODE_DIRECTORY_PATH,
        DATA_DIRECTORY_PATH,
        PROJECT_JSON["n_job"],
        PROJECT_JSON["gb_memory"],
        PROJECT_JSON["soma_rna.1"],
        PROJECT_JSON["soma_rna.2"],
        OUTPUT_SOMA_RNA_DIRECTORY_PATH,
    )

    soma_rna_command = "{}/process_soma_rna.sh {} {} {} {} {} {}".format(
        *soma_rna_arguments
    )

    run_command(soma_rna_command)

In [None]:
from os import mkdir
from os.path import isdir, join
from shutil import rmtree

In [None]:
# Remove any existing summary directory tree, then create the directory empty.
summary_directory_exists = isdir(SUMMARY_DIRECTORY_PATH)

if summary_directory_exists:
    rmtree(SUMMARY_DIRECTORY_PATH)

mkdir(SUMMARY_DIRECTORY_PATH)

In [None]:
# Summarize germ DNA variants only when the germ DNA output directory exists.
if isdir(OUTPUT_GERM_DNA_DIRECTORY_PATH):

    # VCF expected under the "snpeff" subdirectory of the germ DNA output.
    germ_vcf_file_path = join(
        OUTPUT_GERM_DNA_DIRECTORY_PATH, "snpeff", "variant.vcf.gz"
    )

    germ_variant_n = make_variant_n_from_vcf_file_path(germ_vcf_file_path)

    # Save the result as a tab-separated table (with header) in the summary
    # directory, then show it.
    germ_variant_n_file_path = join(SUMMARY_DIRECTORY_PATH, "germ_dna.variant_n.tsv")

    germ_variant_n.to_csv(germ_variant_n_file_path, sep="\t", header=True)

    print(germ_variant_n)

In [None]:
# Summarize soma DNA variants only when the soma DNA output directory exists.
if isdir(OUTPUT_SOMA_DNA_DIRECTORY_PATH):

    # VCF expected under the "snpeff" subdirectory of the soma DNA output.
    soma_vcf_file_path = join(
        OUTPUT_SOMA_DNA_DIRECTORY_PATH, "snpeff", "variant.vcf.gz"
    )

    soma_variant_n = make_variant_n_from_vcf_file_path(soma_vcf_file_path)

    # Save the result as a tab-separated table (with header) in the summary
    # directory, then show it.
    soma_variant_n_file_path = join(SUMMARY_DIRECTORY_PATH, "soma_dna.variant_n.tsv")

    soma_variant_n.to_csv(soma_variant_n_file_path, sep="\t", header=True)

    print(soma_variant_n)

In [None]:
from pandas import Series, read_csv

In [None]:
def _read_descending_tpm(abundance_file_path):
    """Load a tab-separated abundance table and return its "tpm" column, largest first.

    The table's first column is used as the index of the returned Series.
    """
    abundance = read_csv(abundance_file_path, sep="\t", index_col=0)

    return abundance["tpm"].sort_values(ascending=False)


# Build the RNA summary tables only when the soma RNA output directory exists.
if isdir(OUTPUT_SOMA_RNA_DIRECTORY_PATH):

    kallisto_directory_path = join(OUTPUT_SOMA_RNA_DIRECTORY_PATH, "kallisto")

    # ---- Transcript-level TPM ----
    enst_tpm = _read_descending_tpm(
        join(kallisto_directory_path, "transcriptome", "abundance.tsv")
    )

    enst_tpm = enst_tpm.rename_axis("ENST").rename("TPM")

    enst_tpm.to_csv(join(SUMMARY_DIRECTORY_PATH, "enst_tpm.tsv"), sep="\t", header=True)

    print(enst_tpm)

    # ---- Gene-level TPM ----
    # Lookup from "Transcript stable ID version" to "Gene name", read from the
    # data directory.
    enst_gene_name_table = read_csv(
        join(DATA_DIRECTORY_PATH, "grch", "enst_gene_name.tsv"), sep="\t"
    )

    enst_gene_name = {
        enst: gene_name
        for enst, gene_name in zip(
            enst_gene_name_table["Transcript stable ID version"],
            enst_gene_name_table["Gene name"],
        )
    }

    # Relabel each transcript with its gene name; transcripts missing from the
    # lookup keep their own ID as the label.
    gene_labels = enst_tpm.index.map(lambda enst: enst_gene_name.get(enst, enst))

    gene_tpm = Series(enst_tpm.values, index=gene_labels)

    size_before = gene_tpm.size

    # Collapse rows sharing a label to their median, largest first.
    gene_tpm = gene_tpm.groupby(by=gene_tpm.index).median()

    gene_tpm = gene_tpm.sort_values(ascending=False)

    print("Size: {} =(groupby)=> {}".format(size_before, gene_tpm.size))

    gene_tpm = gene_tpm.rename_axis("Gene").rename("Median TPM")

    gene_tpm.to_csv(join(SUMMARY_DIRECTORY_PATH, "gene_tpm.tsv"), sep="\t", header=True)

    print(gene_tpm)

    # ---- Virus-ID-level TPM ----
    virus_id_tpm = _read_descending_tpm(
        join(kallisto_directory_path, "virus", "abundance.tsv")
    )

    virus_id_tpm = virus_id_tpm.rename_axis("Virus ID").rename("TPM")

    virus_id_tpm.to_csv(
        join(SUMMARY_DIRECTORY_PATH, "virus_id_tpm.tsv"), sep="\t", header=True
    )

    print(virus_id_tpm)

    # ---- Virus-name-level TPM ----
    # Lookup from the first column of sequences.csv to its "Species" column.
    virus_sequence_table = read_csv(
        join(DATA_DIRECTORY_PATH, "virus", "sequences.csv"), index_col=0
    )

    virus_id_name = virus_sequence_table["Species"].to_dict()

    # Relabel each virus ID with its species; IDs missing from the lookup keep
    # their own ID as the label.
    virus_labels = virus_id_tpm.index.map(
        lambda virus_id: virus_id_name.get(virus_id, virus_id)
    )

    virus_name_tpm = Series(virus_id_tpm.values, index=virus_labels)

    size_before = virus_name_tpm.size

    # Collapse rows sharing a label to their maximum, largest first.
    virus_name_tpm = virus_name_tpm.groupby(by=virus_name_tpm.index).max()

    virus_name_tpm = virus_name_tpm.sort_values(ascending=False)

    print("Size: {} =(groupby)=> {}".format(size_before, virus_name_tpm.size))

    virus_name_tpm = virus_name_tpm.rename_axis("Virus Name").rename("Max TPM")

    virus_name_tpm.to_csv(
        join(SUMMARY_DIRECTORY_PATH, "virus_name_tpm.tsv"), sep="\t", header=True
    )

    print(virus_name_tpm)