In [None]:
# Load IPython's autoreload extension and set it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

import numpy as np
import pandas as pd
import plotly as pl

In [None]:
# Put the parent directory first on the import path; the project imports in a
# later cell presumably rely on this -- confirm against the repository layout.
sys.path.insert(0, "..")

# Seed NumPy's global random number generator so stochastic steps are
# reproducible. `np.random.random(n)` would instead draw (and discard) n
# samples without seeding anything.
np.random.seed(20121020)

# Set up plotly for rendering inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook -- confirm against the installed plotly version.
pl.offline.init_notebook_mode(connected=True)

In [None]:
from sequencing_process.download_clinvar_vcf_gz import download_clinvar_vcf_gz
from sequencing_process.make_reference_genome import make_reference_genome
from sequencing_process.process_bam import (
    check_bam_using_samtools_flagstat,
    get_variants_from_bam_using_freebayes_and_multiprocess,
    get_variants_from_bam_using_strelka,
    mark_duplicates_in_bam_using_picard_markduplicates,
    sort_and_index_bam_using_samtools_sort_and_index,
)
from sequencing_process.process_fasta import faidx_fasta
from sequencing_process.process_fastq_gz import (
    align_fastq_gzs_using_bwa_mem,
    check_fastq_gzs_using_fastqc,
    trim_fastq_gzs_using_skewer,
)
from sequencing_process.process_vcf_gz import (
    annotate_vcf_gz_using_bcftools_annotate,
    annotate_vcf_gz_using_snpeff,
    filter_vcf_gz_using_bcftools_view,
    rename_chromosome_of_vcf_gz_using_bcftools_annotate,
)
from sequencing_process.support.support.path import clean_path

In [None]:
# Directory used for the reference genome and the ClinVar VCF; it must already exist.
GRCH_DIRECTORY_PATH = clean_path("~/sequencing_process_grch")

assert os.path.isdir(GRCH_DIRECTORY_PATH)

# Directory that holds the input reads and receives every output; it must already exist.
PEOPLE_DIRECTORY_PATH = clean_path("~/sequencing_process_people")

assert os.path.isdir(PEOPLE_DIRECTORY_PATH)

# chr1 ... chr22 followed by chrX, chrY and chrM.
REGIONS = tuple(
    "chr{}".format(chromosome) for chromosome in list(range(1, 23)) + ["X", "Y", "M"]
)

# Passed as `n_job` to every processing step.
N_JOB = 1

# Passed as `memory` to the Picard MarkDuplicates and SnpEff steps.
MEMORY = "8G"

# Variant caller to use: "freebayes" or "strelka".
VARIANT_METHOD = "freebayes"

# Passed as `version` to download_clinvar_vcf_gz.
# NOTE(review): None presumably selects the latest release -- confirm in the helper.
CLINVAR_VERSION = None

# Passed as `overwrite` to every processing step.
OVERWRITE = True

In [None]:
# Build the reference genome, or fall back to the copy already on disk when
# make_reference_genome reports (via FileExistsError) that it exists.
try:

    FASTA_GZ_FILE_PATH = make_reference_genome(GRCH_DIRECTORY_PATH)

    reference_genome_already_existed = False

except FileExistsError:

    FASTA_GZ_FILE_PATH = "{}/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz".format(
        GRCH_DIRECTORY_PATH
    )

    assert os.path.isfile(FASTA_GZ_FILE_PATH)

    reference_genome_already_existed = True

# Path without the trailing ".gz". Defined on BOTH paths: the variant-calling
# cell reads FASTA_FILE_PATH, which previously existed only after the
# FileExistsError fallback and raised NameError on a fresh build.
FASTA_FILE_PATH = FASTA_GZ_FILE_PATH[:-3]

# As before, indexing is only attempted for a pre-existing reference genome.
# NOTE(review): the check looks for "<fa.gz>.fai" while faidx_fasta is given the
# uncompressed path -- confirm which index file faidx_fasta actually writes.
if reference_genome_already_existed and not os.path.isfile(
    "{}.fai".format(FASTA_GZ_FILE_PATH)
):

    faidx_fasta(FASTA_FILE_PATH)

# Download the ClinVar VCF, or reuse one already present in the directory when
# download_clinvar_vcf_gz raises FileExistsError.
try:

    CLINVAR_VCF_GZ_FILE_PATH = download_clinvar_vcf_gz(
        GRCH_DIRECTORY_PATH, version=CLINVAR_VERSION
    )

except FileExistsError:

    # Sort the candidates so the choice does not depend on the arbitrary order
    # in which os.listdir returns directory entries.
    clinvar_vcf_gz_file_names = sorted(
        file_name
        for file_name in os.listdir(GRCH_DIRECTORY_PATH)
        if "clinvar" in file_name and file_name.endswith(".vcf.gz")
    )

    if not clinvar_vcf_gz_file_names:

        # Fail with an explicit message instead of a bare IndexError.
        raise FileNotFoundError(
            "No ClinVar .vcf.gz file found in {}.".format(GRCH_DIRECTORY_PATH)
        )

    CLINVAR_VCF_GZ_FILE_PATH = "{}/{}".format(
        GRCH_DIRECTORY_PATH, clinvar_vcf_gz_file_names[0]
    )

In [None]:
# Paired input reads; both files must be present before any processing starts.
fastq_gz_0_file_path = "{}/simulation.bwa.read1.fastq.gz".format(PEOPLE_DIRECTORY_PATH)
fastq_gz_1_file_path = "{}/simulation.bwa.read2.fastq.gz".format(PEOPLE_DIRECTORY_PATH)

assert os.path.isfile(fastq_gz_0_file_path)
assert os.path.isfile(fastq_gz_1_file_path)

# Trim the pair with skewer; the helper returns the two trimmed file paths.
trimmed_fastq_gz_file_paths = trim_fastq_gzs_using_skewer(
    (fastq_gz_0_file_path, fastq_gz_1_file_path),
    output_directory_path="{}/trimmed_fastq_gz".format(PEOPLE_DIRECTORY_PATH),
    n_job=N_JOB,
    overwrite=OVERWRITE,
)

fastq_gz_0_trimmed_file_path, fastq_gz_1_trimmed_file_path = trimmed_fastq_gz_file_paths

# Run FastQC on the raw reads and on the trimmed reads.
check_fastq_gzs_using_fastqc(
    (
        fastq_gz_0_file_path,
        fastq_gz_1_file_path,
        fastq_gz_0_trimmed_file_path,
        fastq_gz_1_trimmed_file_path,
    ),
    n_job=N_JOB,
    overwrite=OVERWRITE,
)

In [None]:
# Output locations for the BAM files written by this cell.
aligned_bam_file_path = "{}/aligned.bam".format(PEOPLE_DIRECTORY_PATH)
deduplicated_bam_file_path = "{}/duplicate_removed.bam".format(PEOPLE_DIRECTORY_PATH)

# Align the trimmed read pair against the compressed reference genome.
bam_file_path = align_fastq_gzs_using_bwa_mem(
    (fastq_gz_0_trimmed_file_path, fastq_gz_1_trimmed_file_path),
    FASTA_GZ_FILE_PATH,
    n_job=N_JOB,
    output_bam_file_path=aligned_bam_file_path,
    overwrite=OVERWRITE,
)

# Sort and index the alignment; the unsorted input BAM is removed.
sorted_and_indexed_bam_file_path = sort_and_index_bam_using_samtools_sort_and_index(
    bam_file_path,
    remove_input_bam_file_path=True,
    n_job=N_JOB,
    overwrite=OVERWRITE,
)

# Remove duplicates with Picard MarkDuplicates; the sorted input BAM and its
# index are removed afterwards.
duplicate_removed_bam_file_path = mark_duplicates_in_bam_using_picard_markduplicates(
    sorted_and_indexed_bam_file_path,
    memory=MEMORY,
    remove_duplicates=True,
    remove_input_bam_file_path_and_its_index=True,
    n_job=N_JOB,
    output_bam_file_path=deduplicated_bam_file_path,
    overwrite=OVERWRITE,
)

# Run samtools flagstat on the final BAM.
check_bam_using_samtools_flagstat(
    duplicate_removed_bam_file_path,
    n_job=N_JOB,
    overwrite=OVERWRITE,
)

In [None]:
# Call variants with the configured caller. Each branch also sets the
# `keep_filters` and `include_expression` values that the filtering cell
# passes to filter_vcf_gz_using_bcftools_view.
if VARIANT_METHOD == "freebayes":

    freebayes_vcf_file_path = "{}/{}.vcf".format(PEOPLE_DIRECTORY_PATH, VARIANT_METHOD)

    vcf_gz_file_path = get_variants_from_bam_using_freebayes_and_multiprocess(
        duplicate_removed_bam_file_path,
        FASTA_FILE_PATH,
        REGIONS,
        n_job=N_JOB,
        output_vcf_file_path=freebayes_vcf_file_path,
        overwrite=OVERWRITE,
    )

    include_expression = "10<=DP & 30<=QUAL & 10<=(QUAL/AO) & 1<=SRF & 1<=SRR & 1<=SAF & 1<=SAR & 1<=RPR & 1<=RPL"

    keep_filters = None

elif VARIANT_METHOD == "strelka":

    strelka_directory_path = "{}/strelka".format(PEOPLE_DIRECTORY_PATH)

    vcf_gz_file_path = get_variants_from_bam_using_strelka(
        duplicate_removed_bam_file_path,
        FASTA_FILE_PATH,
        strelka_directory_path,
        n_job=N_JOB,
        overwrite=OVERWRITE,
    )

    include_expression = "30<=QUAL"

    keep_filters = ("PASS",)

else:

    # Any other value is a configuration error; nothing has been run yet.
    raise ValueError("Unknown VARIANT_METHOD: {}.".format(VARIANT_METHOD))

In [None]:
# Step 1: filter the called variants by region, FILTER value and expression.
filtered_vcf_file_path = "{}/filtered.vcf".format(PEOPLE_DIRECTORY_PATH)

filtered_vcf_gz_file_path = filter_vcf_gz_using_bcftools_view(
    vcf_gz_file_path,
    regions=REGIONS,
    keep_filters=keep_filters,
    include_expression=include_expression,
    n_job=N_JOB,
    output_vcf_file_path=filtered_vcf_file_path,
    overwrite=OVERWRITE,
)

# Step 2: rename chromosomes. From here on every step removes its input
# .vcf.gz and index, and derives its output name from its input name.
chromosome_renamed_vcf_file_path = filtered_vcf_gz_file_path.replace(
    ".vcf.gz", ".chromosome_renamed.vcf"
)

chromosome_renamed_vcf_gz_file_path = rename_chromosome_of_vcf_gz_using_bcftools_annotate(
    filtered_vcf_gz_file_path,
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_job=N_JOB,
    output_vcf_file_path=chromosome_renamed_vcf_file_path,
    overwrite=OVERWRITE,
)

# Step 3: annotate with SnpEff using the "GRCh38.86" database name.
snpeff_annotated_vcf_file_path = chromosome_renamed_vcf_gz_file_path.replace(
    ".vcf.gz", ".snpeff.vcf"
)

snpeff_annotated_vcf_gz_file_path = annotate_vcf_gz_using_snpeff(
    chromosome_renamed_vcf_gz_file_path,
    "GRCh38.86",
    memory=MEMORY,
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_job=N_JOB,
    output_vcf_file_path=snpeff_annotated_vcf_file_path,
    overwrite=OVERWRITE,
)

# Step 4: annotate with the ClinVar VCF, passing the ID and INFO column
# selection through to bcftools annotate.
clinvar_annotated_vcf_file_path = snpeff_annotated_vcf_gz_file_path.replace(
    ".vcf.gz", ".clinvar.vcf"
)

clinvar_annotated_vcf_gz_file_path = annotate_vcf_gz_using_bcftools_annotate(
    snpeff_annotated_vcf_gz_file_path,
    CLINVAR_VCF_GZ_FILE_PATH,
    ("--columns =ID,INFO",),
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_job=N_JOB,
    output_vcf_file_path=clinvar_annotated_vcf_file_path,
    overwrite=OVERWRITE,
)