## Test Sequence Generator

In [1]:
import os
import shutil
import pandas as pd
pd.set_option('display.max_columns', None)

from oligo_designer_toolsuite.sequence_generator import CustomGenomicRegionGenerator, NcbiGenomicRegionGenerator, FtpLoaderEnsembl, FtpLoaderNCBI

### FTP Loader

In [2]:
##### Test Loader Ensembl

# Parameters
# Output directory: a folder named 'output' inside the parent of the current working directory.
dir_output = os.path.join(os.path.dirname(os.getcwd()), 'output')

species = 'homo_sapiens'  # available species: human or mouse
annotation_release = '108'

try:
    # Initialize inside the try block so that the output directory is removed
    # even when the constructor or one of the downloads raises.
    loader_ensemble = FtpLoaderEnsembl(dir_output, species, annotation_release)

    # Retrieve one file per supported type and print what download_files returns
    # (the recorded output shows a (file path, release, assembly) tuple per call).
    for file_type in ("gff", "gtf", "fasta"):
        print(loader_ensemble.download_files(file_type))
finally:
    # ignore_errors=True: if the directory was never created, the cleanup must
    # not raise and hide the original error.
    shutil.rmtree(dir_output, ignore_errors=True)

('/Users/lisa.barros/Desktop/oligo-designer-toolsuite/tests/output/Homo_sapiens.GRCh38.108.gff3', '108', 'GRCh38')
('/Users/lisa.barros/Desktop/oligo-designer-toolsuite/tests/output/Homo_sapiens.GRCh38.108.gtf', '108', 'GRCh38')
('/Users/lisa.barros/Desktop/oligo-designer-toolsuite/tests/output/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa', '108', 'GRCh38')


In [3]:
##### Test Loader NCBI

## Test for release <= 110
# Parameters
# Output directory: a folder named 'output' inside the parent of the current working directory.
dir_output = os.path.join(os.path.dirname(os.getcwd()), 'output')

taxon = 'vertebrate_mammalian'  # taxon the species belongs to
species = 'Homo_sapiens'  # available species: human or mouse
annotation_release = '110'

try:
    # Initialize inside the try block so that the output directory is removed
    # even when the constructor or one of the downloads raises.
    loader_ncbi = FtpLoaderNCBI(dir_output, taxon, species, annotation_release)

    # Retrieve one file per supported type and print what download_files returns
    # (the recorded output shows a (file path, release, assembly) tuple per call).
    for file_type in ("gff", "gtf", "fasta"):
        print(loader_ncbi.download_files(file_type))
finally:
    # ignore_errors=True: if the directory was never created, the cleanup must
    # not raise and hide the original error.
    shutil.rmtree(dir_output, ignore_errors=True)

('/Users/lisa.barros/Desktop/oligo-designer-toolsuite/tests/output/GCF_000001405.40_GRCh38.p14_genomic.gff', '110', 'GRCh38.p14')
('/Users/lisa.barros/Desktop/oligo-designer-toolsuite/tests/output/GCF_000001405.40_GRCh38.p14_genomic.gtf', '110', 'GRCh38.p14')
('/Users/lisa.barros/Desktop/oligo-designer-toolsuite/tests/output/GCF_000001405.40_GRCh38.p14_genomic.fna', '110', 'GRCh38.p14')


In [5]:
## Test for release > 110 -> changed folder structure
# Parameters
# Output directory: a folder named 'output' inside the parent of the current working directory.
dir_output = os.path.join(os.path.dirname(os.getcwd()), 'output')

taxon = 'vertebrate_mammalian'  # taxon the species belongs to
species = 'Homo_sapiens'  # available species: human or mouse
annotation_release = 'current'  # resolved by the loader; the recorded output shows the concrete release id

try:
    # Initialize inside the try block so that the output directory is removed
    # even when the constructor or one of the downloads raises.
    loader_ncbi = FtpLoaderNCBI(dir_output, taxon, species, annotation_release)

    # Retrieve one file per supported type and print what download_files returns
    # (the recorded output shows a (file path, release, assembly) tuple per call).
    for file_type in ("gff", "gtf", "fasta"):
        print(loader_ncbi.download_files(file_type))
finally:
    # ignore_errors=True: if the directory was never created, the cleanup must
    # not raise and hide the original error.
    shutil.rmtree(dir_output, ignore_errors=True)

('/Users/lisa.barros/Desktop/oligo-designer-toolsuite/tests/output/GCF_000001405.40_GRCh38.p14_genomic.gff', 'GCF_000001405.40-RS_2023_10', 'GRCh38.p14')
('/Users/lisa.barros/Desktop/oligo-designer-toolsuite/tests/output/GCF_000001405.40_GRCh38.p14_genomic.gtf', 'GCF_000001405.40-RS_2023_10', 'GRCh38.p14')
('/Users/lisa.barros/Desktop/oligo-designer-toolsuite/tests/output/GCF_000001405.40_GRCh38.p14_genomic.fna', 'GCF_000001405.40-RS_2023_10', 'GRCh38.p14')


### Genomic Region Generator with FTP loader

In [None]:
# Parameters for the FTP-backed NCBI region generator (same taxon/species/release
# values as the "current" release loader test above).
taxon = "vertebrate_mammalian"
species = "Homo_sapiens"
annotation_release = "current"

# NOTE(review): no annotation or sequence file is passed here, so the generator
# presumably downloads them itself via the FTP loader — confirm.
region_generator_ncbi_ftp = NcbiGenomicRegionGenerator(taxon, species, annotation_release)

In [None]:
# Show the files and metadata held by the FTP-backed generator, one value per line.
print(
    region_generator_ncbi_ftp.annotation_file,
    region_generator_ncbi_ftp.sequence_file,
    region_generator_ncbi_ftp.files_source,
    region_generator_ncbi_ftp.species,
    region_generator_ncbi_ftp.annotation_release,
    region_generator_ncbi_ftp.genome_assembly,
    sep="\n",
)

In [None]:
# Run every region-type extraction on the FTP-backed generator. The return values
# are not stored; only the last expression is displayed by the notebook.
# NOTE(review): the same methods on the custom generator return values that are
# later passed as FASTA inputs, so these presumably return FASTA file paths — confirm.
region_generator_ncbi_ftp.get_sequence_gene()
region_generator_ncbi_ftp.get_sequence_exon()
region_generator_ncbi_ftp.get_sequence_CDS()

# Both UTR ends are requested; junctions are generated with block_size=50.
region_generator_ncbi_ftp.get_sequence_UTR(five_prime=True, three_prime=True)
region_generator_ncbi_ftp.get_sequence_exon_exon_junction(block_size=50)

region_generator_ncbi_ftp.get_sequence_intergenic()
region_generator_ncbi_ftp.get_sequence_intron()

### Custom Generator with NCBI data

In [None]:
# Local NCBI annotation (GTF) and sequence (FASTA) files; the file names indicate
# a chr16 subset of GRCh38.p14.
annotation_file = "../../data/annotations/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.gtf"
sequence_file = "../../data/annotations/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.fna"

# Build the generator from the local files; the metadata is supplied explicitly.
region_generator_ncbi = CustomGenomicRegionGenerator(
    annotation_file,
    sequence_file,
    files_source="NCBI",
    species="Homo_sapiens",
    annotation_release="110",
    genome_assembly="GRCh38",
)

# Echo the stored files and metadata, one value per line.
print(
    region_generator_ncbi.annotation_file,
    region_generator_ncbi.sequence_file,
    region_generator_ncbi.files_source,
    region_generator_ncbi.species,
    region_generator_ncbi.annotation_release,
    region_generator_ncbi.genome_assembly,
    sep="\n",
)

In [None]:
# Extract every region type from the custom NCBI generator and keep the results.
# Later cells pass ncbi_exons and ncbi_junction as `file_fasta_in` to the sliding
# window generator, and ncbi_genes and ncbi_intergenic as `file_fasta` to the
# reference database, so each result is presumably a FASTA file path — confirm.
ncbi_genes = region_generator_ncbi.get_sequence_gene()
ncbi_exons = region_generator_ncbi.get_sequence_exon()
ncbi_CDS = region_generator_ncbi.get_sequence_CDS()

# Both UTR ends are requested; junctions are generated with block_size=50.
ncbi_UTR = region_generator_ncbi.get_sequence_UTR(five_prime=True, three_prime=True)
ncbi_junction = region_generator_ncbi.get_sequence_exon_exon_junction(block_size=50)

ncbi_intergenic = region_generator_ncbi.get_sequence_intergenic()
ncbi_introns = region_generator_ncbi.get_sequence_intron()

### Custom Generator with Ensembl data

In [None]:
# Local Ensembl annotation (GTF) and sequence (FASTA) files; the file names
# indicate a chr16 subset of GRCh38, release 108.
annotation_file = "../../data/annotations/custom_Homo_sapiens.GRCh38.108.chr16.gtf"
sequence_file = "../../data/annotations/custom_Homo_sapiens.GRCh38.dna_rm.primary_assembly_chr16.fa"

# Build the generator from the local files; the metadata is supplied explicitly.
region_generator_ensembl = CustomGenomicRegionGenerator(
    annotation_file,
    sequence_file,
    files_source="Ensembl",
    species="Homo_sapiens",
    annotation_release="108",
    genome_assembly="GRCh38",
)

# Echo the stored files and metadata, one value per line.
print(
    region_generator_ensembl.annotation_file,
    region_generator_ensembl.sequence_file,
    region_generator_ensembl.files_source,
    region_generator_ensembl.species,
    region_generator_ensembl.annotation_release,
    region_generator_ensembl.genome_assembly,
    sep="\n",
)

In [None]:
# Extract every region type from the custom Ensembl generator, mirroring the
# NCBI cell above. None of the ensembl_* results is referenced again in the
# visible part of this notebook; the cell checks that the calls run on Ensembl data.
ensembl_gene = region_generator_ensembl.get_sequence_gene()
ensembl_exon = region_generator_ensembl.get_sequence_exon()
ensembl_CDS = region_generator_ensembl.get_sequence_CDS()

# Both UTR ends are requested; junctions are generated with block_size=50.
ensembl_UTR = region_generator_ensembl.get_sequence_UTR(five_prime=True, three_prime=True)
ensembl_junction = region_generator_ensembl.get_sequence_exon_exon_junction(block_size=50)

ensembl_intergenic = region_generator_ensembl.get_sequence_intergenic()
ensembl_intron = region_generator_ensembl.get_sequence_intron()

### Generate Oligo Sequences

In [None]:
from oligo_designer_toolsuite.sequence_generator import OligoSequenceGenerator

In [None]:
# Generator used by the following cells to create random and sliding-window
# sequences; constructed with its default arguments.
oligo_sequence_generator = OligoSequenceGenerator()

In [None]:
# First random set: 100 sequences of length 30 with explicit per-base
# probabilities (A/C/G/T = 0.1/0.3/0.4/0.2, summing to 1.0).
# The return value is later passed as `file_fasta_in` when loading the oligo
# database, so it is presumably the path of the written FASTA file — confirm.
file_fasta_random_seqs1 = oligo_sequence_generator.create_sequences_random(
    filename_out="random_sequences1",
    length_sequences=30,
    num_sequences=100,
    name_sequences="random_sequences1",
    base_alphabet_with_probability={"A": 0.1, "C": 0.3, "G": 0.4, "T": 0.2},
)

In [None]:
# Second random set: only 3 sequences of length 15. No base probabilities are
# passed, so the generator's default alphabet setting applies.
# The small count matters later: it is compared against min_oligos_per_region
# when regions with insufficient oligos are removed.
file_fasta_random_seqs2 = oligo_sequence_generator.create_sequences_random(
    filename_out="random_sequences2",
    length_sequences=15,
    num_sequences=3,
    name_sequences="random_sequences2",
)

In [None]:
# Sliding-window sequences generated from the NCBI exon and exon-exon junction
# results of the custom generator.
# NOTE(review): length_interval_sequences=(30, 31) — whether the upper bound is
# inclusive is not visible here; confirm against the generator's documentation.
file_fasta_exons = oligo_sequence_generator.create_sequences_sliding_window(
    filename_out="sliding_window_sequences",
    file_fasta_in=[ncbi_exons, ncbi_junction],
    length_interval_sequences=(30, 31),
)

## Test Oligo Database

In [None]:
from oligo_designer_toolsuite.database import OligoDatabase

In [None]:
# Regions to load from the sliding-window FASTA file. The last entry is a
# deliberately non-existent name; the later region-count assertion subtracts it.
region_ids = [
    "AARS1",
    "DECR2",
    "FAM234A",
    "RHBDF1",
    "WASIR2",
    "this_gene_does_not_exist",
]

# Metadata for the oligo database, copied from the custom NCBI generator.
metadata_ncbi = dict(
    annotation_source=region_generator_ncbi.files_source,
    species=region_generator_ncbi.species,
    annotation_release=region_generator_ncbi.annotation_release,
    genome_assembly=region_generator_ncbi.genome_assembly,
)

In [None]:
# Two databases that differ only in the minimum number of oligos per region
# (2 vs. 4); the later removal checks rely on this difference.
oligos = OligoDatabase(min_oligos_per_region=2, write_regions_with_insufficient_oligos=True)
oligos2 = OligoDatabase(min_oligos_per_region=4, write_regions_with_insufficient_oligos=True)

In [None]:
# Attach the NCBI metadata dictionary to the first database.
oligos.load_metadata(metadata_ncbi)
# To verify the loaded values, inspect `oligos.metadata`.

In [None]:
# Load the first random set as oligo sequences, restricted to the region
# "random_sequences1". database_overwrite=True, so this is the initial content
# of the database.
oligos.load_sequences_from_fasta(
    file_fasta_in=file_fasta_random_seqs1,
    sequence_type="oligo",
    region_ids=["random_sequences1"],
    database_overwrite=True,
)
# Display the database after the first load.
oligos.database

In [None]:
# Add the second random set without overwriting the existing content. No
# region_ids filter is passed in this call.
oligos.load_sequences_from_fasta(
    file_fasta_in=file_fasta_random_seqs2,
    sequence_type="oligo",
    database_overwrite=False,
)
# To check the merged content, inspect `oligos.database`.

In [None]:
# Add the sliding-window sequences as "target" sequences for the requested
# regions. region_ids includes one name that does not exist, which exercises
# the handling of unknown regions.
oligos.load_sequences_from_fasta(
    file_fasta_in=file_fasta_exons,
    sequence_type="target",
    region_ids=region_ids,
    database_overwrite=False,
)
# Display the database after all three loads.
oligos.database

In [None]:
# check if calculation of number of targeted transcripts and isoform consensus works
oligos.calculate_num_targeted_transcripts()
oligos.calculate_isoform_consensus()

In [None]:
# Check that removal of regions with insufficient oligos works.
oligos.remove_regions_with_insufficient_oligos("database_generation")
# Expected count: every entry of region_ids except one (-1, presumably the
# non-existent gene) plus two (+2, presumably the regions "random_sequences1"
# and "random_sequences2") — confirm against the database content.
assert len(oligos.database.keys()) == (len(region_ids) - 1 + 2), "error: wrong number of regions in database"

In [None]:
# Check that saving and loading works: save one region of the first database,
# then load the saved files into the second database.
# NOTE(review): region_ids is passed as a bare string here, while the
# load_sequences_from_fasta calls pass a list — confirm save_database accepts both.
file_database, file_metadata = oligos.save_database(
    region_ids="random_sequences2", filename_out="database_random_sequences2"
)

# Restore metadata and database from the files returned by save_database, then
# add the first random set on top without overwriting.
oligos2.load_metadata(file_metadata)
oligos2.load_database(file_database, database_overwrite=True)
oligos2.load_sequences_from_fasta(
    file_fasta_in=file_fasta_random_seqs1,
    sequence_type="oligo",
    database_overwrite=False,
)

In [None]:
# Check that removal of regions with insufficient oligos works on the second
# database. oligos2 was created with min_oligos_per_region=4 and
# "random_sequences2" was generated with only 3 sequences, so presumably only
# "random_sequences1" remains.
oligos2.remove_regions_with_insufficient_oligos("database_generation")
assert len(oligos2.database.keys()) == 1, "error: wrong number of regions in database"

In [None]:
# Check that the correct number of sequences is returned: 100 matches
# num_sequences of the first random set, the only region expected to remain.
list_sequences = oligos2.get_sequence_list()
assert len(list_sequences) == 100, "error: wrong number of sequences in database"

In [None]:
# Export the first database to FASTA using the method's default arguments; the
# return value is displayed as the cell output.
oligos.write_database_to_fasta()

## Test Reference Database


In [None]:
from oligo_designer_toolsuite.database import ReferenceDatabase
from oligo_designer_toolsuite.utils import FastaParser

In [None]:
# Parser used below to read region names from FASTA headers and to validate
# the written FASTA file.
fasta_parser = FastaParser()

# Metadata for the reference database, copied from the custom NCBI generator.
metadata = {
    "annotation_source": region_generator_ncbi.files_source,
    "species": region_generator_ncbi.species,
    "annotation_release": region_generator_ncbi.annotation_release,
    "genome_assembly": region_generator_ncbi.genome_assembly,
}

In [None]:
# Build the reference database from the NCBI gene sequences, then add the
# intergenic sequences on top (database_overwrite=False keeps the first load).
reference = ReferenceDatabase()
reference.load_metadata(metadata=metadata)
reference.load_sequences_from_fasta(file_fasta=ncbi_genes, database_overwrite=True)
reference.load_sequences_from_fasta(file_fasta=ncbi_intergenic, database_overwrite=False)

In [None]:
# Filter the reference database for "AARS1", then verify that every remaining
# entry belongs to that region.
reference.filter_database("AARS1")
for entry in reference.database:
    # The header parser returns three values; only the first (the region) is checked.
    region, _, _, = fasta_parser.parse_fasta_header(entry.id)
    assert region == 'AARS1', f"error: this region {region} should be filtered out."
# Display the filtered database.
reference.database

In [None]:
# Write the filtered reference database and its metadata to disk.
# NOTE(review): "filtered_databse" looks like a typo for "filtered_database"; it
# is left unchanged because it determines the name of the written files.
file_fasta_database = reference.write_database_to_fasta(filename = "filtered_databse")
file_metadata_database = reference.write_metadata_to_yaml(filename = "filtered_databse")

# Validate the written FASTA file. The result is asserted on directly instead of
# being compared with `== True` (PEP 8: do not compare boolean values to True).
assert fasta_parser.check_fasta_format(file_fasta_database), "error: wrong file format"