## Test FTP Loader

In [1]:
import os
import sys
from pathlib import Path
import shutil

cwd = os.getcwd()

from oligo_designer_toolsuite.utils import FtpLoaderEnsembl, FtpLoaderNCBI

In [3]:
##### Test Loader Ensembl

# Parameters
# Downloads are written to an "output" directory that sits next to the current working directory.
dir_output = os.path.join(os.path.dirname(os.getcwd()), 'output')

species = 'homo_sapiens'  # available species: human or mouse
annotation_release = '108'

try:
    # Initialize the loader for the chosen species and annotation release.
    loader_ensemble = FtpLoaderEnsembl(dir_output, species, annotation_release)

    # Retrieve every supported file type and print what download_files returns
    # (in the recorded run: the local file path, the annotation release and the assembly name).
    for file_type in ("gff", "gtf", "fasta"):
        print(loader_ensemble.download_files(file_type))
finally:
    # Clean up even when the loader or one of the downloads raises, so no partially
    # downloaded files are left behind. The existence check keeps a missing directory
    # from raising here and hiding the original error.
    if os.path.isdir(dir_output):
        shutil.rmtree(dir_output)

('/Users/lisa.barros/projects/GP0002_Oligo_Designer_Toolsuite/Toolsuite/oligo-designer-toolsuite/tests/output/Homo_sapiens.GRCh38.108.gff3', '108', 'GRCh38')
('/Users/lisa.barros/projects/GP0002_Oligo_Designer_Toolsuite/Toolsuite/oligo-designer-toolsuite/tests/output/Homo_sapiens.GRCh38.108.gtf', '108', 'GRCh38')
('/Users/lisa.barros/projects/GP0002_Oligo_Designer_Toolsuite/Toolsuite/oligo-designer-toolsuite/tests/output/Homo_sapiens.GRCh38.dna_rm.primary_assembly.fa', '108', 'GRCh38')


In [4]:
##### Test Loader NCBI

# Parameters
# Downloads are written to an "output" directory that sits next to the current working directory.
dir_output = os.path.join(os.path.dirname(os.getcwd()), 'output')

taxon = 'vertebrate_mammalian'  # taxon the species belongs to
species = 'Homo_sapiens'  # available species: human or mouse
annotation_release = 'current'

try:
    # Initialize the loader for the chosen taxon, species and annotation release.
    loader_ncbi = FtpLoaderNCBI(dir_output, taxon, species, annotation_release)

    # Retrieve every supported file type and print what download_files returns
    # (in the recorded run: the local file path, the annotation release and the assembly name).
    for file_type in ("gff", "gtf", "fasta"):
        print(loader_ncbi.download_files(file_type))
finally:
    # Clean up even when the loader or one of the downloads raises, so no partially
    # downloaded files are left behind. The existence check keeps a missing directory
    # from raising here and hiding the original error.
    if os.path.isdir(dir_output):
        shutil.rmtree(dir_output)

('/Users/lisa.barros/projects/GP0002_Oligo_Designer_Toolsuite/Toolsuite/oligo-designer-toolsuite/tests/output/GCF_000001405.40_GRCh38.p14_genomic.gff', '110', 'GRCh38.p14')
('/Users/lisa.barros/projects/GP0002_Oligo_Designer_Toolsuite/Toolsuite/oligo-designer-toolsuite/tests/output/GCF_000001405.40_GRCh38.p14_genomic.gtf', '110', 'GRCh38.p14')
('/Users/lisa.barros/projects/GP0002_Oligo_Designer_Toolsuite/Toolsuite/oligo-designer-toolsuite/tests/output/GCF_000001405.40_GRCh38.p14_genomic.fna', '110', 'GRCh38.p14')


## Test Data Parser

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import pickle

from oligo_designer_toolsuite.utils import GffParser, parse_fasta_header

%load_ext memory_profiler

In [2]:
# Initialize parser: a single GffParser instance whose read_gff method is called in the cells below
parser = GffParser()

In [None]:
##### Test GFF3 parser
# Read the first 10 lines of the custom chr16 GFF3 file located in the "data"
# directory next to the current working directory.
file_gff = os.path.join(
    os.path.dirname(os.getcwd()),
    "data/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.gff",
)
dataframe_gff = parser.read_gff(file_gff, target_lines=10)

# The parsed GFF3 table is expected to have exactly 23 columns.
gff_column_count = dataframe_gff.shape[1]
assert gff_column_count == 23, "error: GFF3 dataframe not correctly loaded"


In [6]:
# File to profile: the NCBI GTF annotation inside the "output" directory next to the
# current working directory. The path is derived from the working directory rather than
# hard-coding a user-specific absolute path, so the cell also works on other machines.
# NOTE: this file is what the NCBI loader cell downloads, and that cell removes the
# output directory when it finishes, so the file must be present again before profiling.
file_gff = os.path.join(
    os.path.dirname(os.getcwd()),
    "output",
    "GCF_000001405.40_GRCh38.p14_genomic.gtf",
)
target_lines = 100000  # passed to read_gff as its target_lines argument

In [5]:
# Line-by-line memory profile of parser.read_gff while it parses file_gff with the given target_lines.
# Relies on the memory_profiler extension loaded earlier via %load_ext.
%mprun -f parser.read_gff parser.read_gff(file_gff, target_lines=target_lines)

*** KeyboardInterrupt exception caught in code being profiled.


Filename: /Users/lisa.barros/projects/GP0002_Oligo_Designer_Toolsuite/Toolsuite/oligo-designer-toolsuite/oligo_designer_toolsuite/utils/_gff_parser.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    42    106.3 MiB    106.3 MiB           1       def read_gff(self, file: str, target_lines: int = None):
    43                                                 """Open an optionally gzipped GFF3/GTF file and return a pandas.DataFrame.
    44                                         
    45                                                 :param file: Filename of GFF3/GTF file.
    46                                                 :type file: str
    47                                                 :param target_lines: Read the first n lines or leave 'None' to read all lines, default: None
    48                                                 :type target_lines: int
    49                                                 :return: DataFrame with GFF§/GTF file content.
    5

In [None]:
# Initialize parser again; the GTF test in the next cell calls read_gff on this instance
parser = GffParser()

In [None]:
##### Test GTF parser
# Read the first 10 lines of the custom chr16 GTF file located in the "data"
# directory next to the current working directory.
file_gtf = os.path.join(
    os.path.dirname(os.getcwd()),
    "data/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.gtf",
)
dataframe_gtf = parser.read_gff(file_gtf, target_lines=10)

# The parsed GTF table is expected to have exactly 20 columns.
gtf_column_count = dataframe_gtf.shape[1]
assert gtf_column_count == 20, "error: GTF dataframe not correctly loaded"

## Test Fasta Header Parser

In [None]:
# Header made of three "::"-separated parts: region, additional information, coordinates.
header = "ARPG3::transcript_id=XM4581;exon_id=XM4581_exon1::16:70265537-70265662(-)"
region, additional_information, coordinates = parse_fasta_header(header)

assert region == "ARPG3", "error: wrong region parsed"

# Each coordinate field is expected to come back as a one-element list.
expected_coordinates = (
    ("chromosome", ["16"], "error: wrong chrom parsed"),
    ("start", [70265537], "error: wrong start parsed"),
    ("end", [70265662], "error: wrong end parsed"),
    ("strand", ["-"], "error: wrong strand parsed"),
)
for field, expected_value, failure_message in expected_coordinates:
    assert coordinates[field] == expected_value, failure_message

expected_information = "transcript_id=XM4581;exon_id=XM4581_exon1"
assert additional_information == expected_information, "error: wrong additional information parsed"