## Test Data Parser

In [None]:
import os
import pandas as pd
pd.set_option('display.max_columns', None)
import pickle

from oligo_designer_toolsuite.utils import GffParser, parse_fasta_header

%load_ext memory_profiler

In [None]:
# Create the parser instance whose read_gff method is exercised by the following cells
parser = GffParser()

In [None]:
##### Test GFF3 parser
# The input file is resolved relative to the parent of the current working directory.
parent_dir = os.path.dirname(os.getcwd())
file_gff = os.path.join(parent_dir, "data/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.gff")

# Parse the file with target_lines=10 and check the width of the resulting dataframe.
dataframe_gff = parser.read_gff(file_gff, target_lines=10)
n_columns_gff = dataframe_gff.shape[1]

assert n_columns_gff == 23, "error: GFF3 dataframe not correctly loaded"

In [None]:
# Input for the memory-profiling cell below.
# The path is resolved relative to the parent of the current working directory (the same
# convention as the parser test cells) instead of an absolute path on one developer's machine.
# NOTE(review): assumes the annotation was downloaded to <parent of cwd>/output/ -- confirm,
# or point this at wherever the full GTF annotation lives locally.
# The variable keeps the name `file_gff` although the file is a GTF, because the profiling
# cell reads it under that name.
file_gff = os.path.join(
    os.path.dirname(os.getcwd()),
    "output/GCF_000001405.40_GRCh38.p14_genomic.gtf",
)
# Value forwarded as `target_lines` to parser.read_gff in the profiling cell.
target_lines = 100000

In [None]:
# Line-by-line memory profile of parser.read_gff, called with the file and target_lines
# configured in the previous cell (the %mprun magic is provided by the memory_profiler extension).
%mprun -f parser.read_gff parser.read_gff(file_gff, target_lines=target_lines)

In [None]:
# Create a fresh parser instance for the GTF test in the next cell
parser = GffParser()

In [None]:
##### Test GTF parser
# The input file is resolved relative to the parent of the current working directory.
parent_dir = os.path.dirname(os.getcwd())
file_gtf = os.path.join(parent_dir, "data/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.gtf")

# Parse the file with target_lines=10 and check the width of the resulting dataframe.
dataframe_gtf = parser.read_gff(file_gtf, target_lines=10)
n_columns_gtf = dataframe_gtf.shape[1]

assert n_columns_gtf == 20, "error: GTF dataframe not correctly loaded"

## Test Fasta Header Parser

In [None]:
# Header made of three "::"-separated parts: region, additional information, coordinates.
header = "ARPG3::transcript_id=XM4581;exon_id=XM4581_exon1::16:70265537-70265662(-)"
region, additional_information, coordinates = parse_fasta_header(header)

assert region == "ARPG3", "error: wrong region parsed"

# Expected value and failure message for every coordinate field, checked in this order.
expected_coordinates = {
    "chromosome": (["16"], "error: wrong chrom parsed"),
    "start": ([70265537], "error: wrong start parsed"),
    "end": ([70265662], "error: wrong end parsed"),
    "strand": (["-"], "error: wrong strand parsed"),
}
for field, (expected_value, failure_message) in expected_coordinates.items():
    assert coordinates[field] == expected_value, failure_message

expected_information = "transcript_id=XM4581;exon_id=XM4581_exon1"
assert additional_information == expected_information, "error: wrong additional information parsed"

## Test VCF parser

In [1]:
import os
import subprocess
from cyvcf2 import VCF

In [None]:
# All VCF test files live in one folder and share the same basename.
dir_vcf = "./SNP_testfiles"
vcf_basename = "GCF_000001405.40"

# Input VCF, intermediate chr16 extract, final chr16 file and chromosome-name mapping
# consumed by the bcftools cells below.
file_vcf = f"{dir_vcf}/{vcf_basename}.gz"
file_vcf_chr16_tmp = f"{dir_vcf}/{vcf_basename}.chr16_tmp.vcf"
file_vcf_chr16 = f"{dir_vcf}/{vcf_basename}.chr16.vcf"
file_chr_mapping = f"{dir_vcf}/chr_mapping.txt"

# Region passed to `bcftools view -r` to select the records to extract.
chr16_refseq_accession = "NC_000016.10"

In [None]:
# Extract the records of the chr16 RefSeq accession from the full VCF into an
# uncompressed temporary VCF.
# The command is passed as an argument list (no shell), so paths containing spaces or
# shell metacharacters are handled safely.
cmd = [
    "bcftools", "view",
    "-r", chr16_refseq_accession,
    "--output-file", file_vcf_chr16_tmp,
    "--output-type", "v",
    file_vcf,
]
# check=True raises CalledProcessError on a non-zero exit status instead of silently
# continuing with a missing or partial output file. stderr is left attached so that
# bcftools warnings remain visible in the cell output.
# `process` keeps holding the integer exit status, as before.
process = subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL).returncode

[W::vcf_parse_info] Extreme INFO/RS value encountered and set to missing at NC_000016.10:566081


In [None]:
# Rename the chromosomes of the temporary chr16 VCF according to the mapping file and
# write the result as an uncompressed VCF.
# The command is passed as an argument list (no shell), so paths containing spaces or
# shell metacharacters are handled safely.
cmd = [
    "bcftools", "annotate",
    "--rename-chrs", file_chr_mapping,
    "-o", file_vcf_chr16,
    "-O", "v",
    file_vcf_chr16_tmp,
]
# check=True raises CalledProcessError on a non-zero exit status instead of silently
# continuing with a missing or partial output file.
# `process` keeps holding the integer exit status, as before.
process = subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL).returncode

In [5]:
# Value of the "VC" INFO field that selects the records printed in the next cell.
variant_type_identifier = "SNV"
# Open the chr16 VCF written by the bcftools annotate step.
vcf = VCF(file_vcf_chr16)

In [9]:
# Walk over the first records of the VCF and print the key fields of each one whose
# "VC" INFO entry equals the selected variant type.
for i, variant in enumerate(vcf):
    variant_type = variant.INFO.get('VC')

    if variant_type == variant_type_identifier:
        record_summary = [
            f"REF:{variant.REF}",
            f"ALT:{variant.ALT}",
            f"CHROM:{variant.CHROM}",
            f"start:{variant.start}",
            f"end:{variant.end}",
            f"ID:{variant.ID}",
            f"VC:{variant_type}",
        ]
        # The trailing "\n\n" plus print's own newline yields the two blank lines
        # that separate consecutive records in the output.
        print("\n".join(record_summary) + "\n\n")

    # Stop after the record with index 6 has been examined.
    if i > 5:
        break

REF:T
ALT:['G']
CHROM:16
start:10166
end:10167
ID:rs1896972751
VC:SNV


REF:T
ALT:['C']
CHROM:16
start:10167
end:10168
ID:rs1207290905
VC:SNV


REF:G
ALT:['A']
CHROM:16
start:10168
end:10169
ID:rs558005370
VC:SNV


REF:G
ALT:['A', 'T']
CHROM:16
start:10169
end:10170
ID:rs1263140383
VC:SNV


REF:G
ALT:['A', 'C', 'T']
CHROM:16
start:10170
end:10171
ID:rs1037473004
VC:SNV


REF:A
ALT:['C']
CHROM:16
start:10171
end:10172
ID:rs1282658436
VC:SNV


