# Test Databases

In [1]:
import os
import shutil

from oligo_designer_toolsuite.utils import FastaParser
from oligo_designer_toolsuite.database import OligoDatabase, ReferenceDatabase, OligoAttributes
from oligo_designer_toolsuite.sequence_generator import OligoSequenceGenerator

## Setup

In [2]:
# Global parameters shared by every test cell in this notebook.

# Input files, given relative to the notebook's working directory.
FILE_NCBI_EXONS = "../../data/tests/annotations/sequences_ncbi_exons.fna"
FILE_DATABASE_OLIGO_ATTRIBUTES = "../../data/tests/databases/database_oligo_attributes.fna"

# Metadata that is loaded into the databases under test.
METADATA = dict(
    files_source="NCBI",
    species="Homo_sapiens",
    annotation_release="110",
    genome_assembly="GRCh38",
)

# Region IDs requested when loading sequences; the last entry is a
# deliberately bogus name.
REGION_IDS = ["AARS1", "DECR2", "FAM234A", "RHBDF1", "WASIR2"] + ["this_gene_does_not_exist"]

## Test Reference Database

In [3]:
# Scratch directory (inside the current working directory) that receives all
# output of the reference-database test.
working_dir = os.getcwd()
tmp_path = os.path.join(working_dir, "tmp_reference_database")

fasta_parser = FastaParser()

reference = ReferenceDatabase(dir_output=tmp_path)
reference.load_metadata(metadata=METADATA)

# First load: a list holding the same FASTA file twice, overwriting the database.
duplicated_inputs = [FILE_NCBI_EXONS] * 2
reference.load_sequences_from_fasta(
    file_fasta=duplicated_inputs,
    database_overwrite=True,
)
# Second load: a single file path, added without overwriting.
reference.load_sequences_from_fasta(
    file_fasta=FILE_NCBI_EXONS,
    database_overwrite=False,
)



In [4]:
# Drop one region from the reference database, then check that no remaining
# entry's header still names that region.
removed_region = "AARS1"
reference.filter_database(removed_region, remove_region=True)
for record in reference.database:
    # The parsed header is unpacked into three fields; only the first is checked.
    (region, _, _) = fasta_parser.parse_fasta_header(record.id)
    assert region != removed_region, f"error: this region {region} should be filtered out."

In [5]:
# Write the filtered reference database and its metadata to disk, then check
# that the written FASTA file passes the parser's format check.
# NOTE(review): the filename is spelled "filtered_databse"; left unchanged here
# because it is a runtime string that determines the output file names.
file_fasta_database = reference.write_database_to_fasta(filename="filtered_databse")
file_metadata_database = reference.write_metadata_to_yaml(filename="filtered_databse")

# Assert on truthiness directly instead of comparing with `== True` (PEP 8).
assert fasta_parser.check_fasta_format(
    file_fasta_database
), f"error: wrong file format for database in {file_fasta_database}"

In [6]:
# Clean up: delete the directory tree at tmp_path, which at this point is the
# output directory handed to the ReferenceDatabase.
shutil.rmtree(tmp_path)

## Test Oligo Database

In [7]:
# Scratch directory (inside the current working directory) that receives all
# output of the oligo-database tests.
working_dir = os.getcwd()
tmp_path = os.path.join(working_dir, "tmp_oligo_database")

fasta_parser = FastaParser()
oligo_sequence_generator = OligoSequenceGenerator(dir_output=tmp_path)

# Database under test, initialised with the shared METADATA.
oligo_database = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=tmp_path,
)
oligo_database.load_metadata(METADATA)

# Input 1: a file of 100 random sequences of length 30, drawn with the given
# per-base probabilities and named "random_sequences1".
base_probabilities = {"A": 0.1, "C": 0.3, "G": 0.4, "T": 0.2}
file_random_seqs = oligo_sequence_generator.create_sequences_random(
    filename_out="random_sequences1",
    length_sequences=30,
    num_sequences=100,
    name_sequences="random_sequences1",
    base_alphabet_with_probability=base_probabilities,
)

# Input 2: sliding-window sequences cut from the NCBI exon FASTA file.
# NOTE(review): (30, 31) presumably selects windows of length 30 only — confirm.
window_length_interval = (30, 31)
file_sliding_window = oligo_sequence_generator.create_sequences_sliding_window(
    filename_out="sliding_window_sequences",
    file_fasta_in=FILE_NCBI_EXONS,
    length_interval_sequences=window_length_interval,
)


In [8]:
# Load the two generated FASTA files one after the other: the random sequences
# first (overwriting the database), then the sliding-window sequences on top.
load_plan = (
    # (fasta file, sequence type, region ids, overwrite database?)
    (file_random_seqs, "oligo", ["random_sequences1"], True),
    (file_sliding_window, "target", REGION_IDS, False),
)
for fasta_file, seq_type, wanted_regions, overwrite in load_plan:
    oligo_database.load_sequences_from_fasta(
        file_fasta=fasta_file,
        sequence_type=seq_type,
        region_ids=wanted_regions,
        database_overwrite=overwrite,
    )

# The database must not be empty after both loads.
assert len(oligo_database.database) > 0, "error: no sequences loaded into database"



In [9]:
# Round trip: load the attribute test database, save two of its regions, then
# load the saved metadata and database back and check what arrived.
oligo_database.load_database(FILE_DATABASE_OLIGO_ATTRIBUTES, database_overwrite=True)

saved_region_ids = ["region_1", "region_2"]
file_database, file_metadata = oligo_database.save_database(
    region_ids=saved_region_ids,
    filename="database_region1_region2",
)

oligo_database.load_metadata(file_metadata)
oligo_database.load_database(file_database, database_overwrite=True)

# The reloaded metadata must carry the original source tag.
loaded_source = oligo_database.metadata["files_source"]
assert loaded_source == "NCBI", "error: wrong metadata loaded"

# Exactly the two saved regions must be present after reloading.
loaded_regions = oligo_database.database.keys()
assert len(loaded_regions) == 2, "error: wrong number regions saved and loaded"



In [10]:
# Export the current oligo database to a FASTA file and check that the written
# file passes the parser's format check.
file_fasta = oligo_database.write_database_to_fasta(filename="database_region1_region2")

# Assert on truthiness directly instead of comparing with `== True` (PEP 8).
assert fasta_parser.check_fasta_format(
    file_fasta
), f"error: wrong file format for database in {file_fasta}"

In [11]:
# Rebuild the database from the sliding-window targets (overwrite), then add
# the random oligo sequences on top without a region filter.
oligo_database.load_sequences_from_fasta(
    file_fasta=file_sliding_window, sequence_type="target", region_ids=REGION_IDS, database_overwrite=True
)
oligo_database.load_sequences_from_fasta(
    file_fasta=file_random_seqs, sequence_type="oligo", database_overwrite=False
)

oligo_database.remove_regions_with_insufficient_oligos("database_generation")

# NOTE(review): presumably "- 1" accounts for the bogus
# "this_gene_does_not_exist" entry in REGION_IDS and "+ 1" for the region
# contributed by the random-sequence file — confirm.
expected_region_count = len(REGION_IDS) - 1 + 1
remaining_regions = oligo_database.database.keys()
assert len(remaining_regions) == expected_region_count, "error: wrong number of regions in database"



In [12]:
# Reset the database so it holds only the random oligo sequences.
oligo_database.load_sequences_from_fasta(
    file_fasta=file_random_seqs, sequence_type="oligo", database_overwrite=True
)

# The sequence list is expected to contain 100 entries.
list_sequences = oligo_database.get_sequence_list()
num_sequences_found = len(list_sequences)
assert num_sequences_found == 100, "error: wrong number of sequences in database"

In [13]:
# Reload the full attribute test database (no region filter).
oligo_database.load_database(
    file_database=FILE_DATABASE_OLIGO_ATTRIBUTES, region_ids=None, database_overwrite=True
)

# Look up one known oligo sequence in the sequence -> oligo-ID mapping; it is
# expected to map to four entries.
mapping = oligo_database.get_sequence_oligoid_mapping(sequence_type="oligo")
oligo_ids_for_sequence = mapping["CTCACTCGACTCTTACACAGTCATA"]
assert len(oligo_ids_for_sequence) == 4, "error: wrong number of oligos for sequence"



In [14]:
# Reload the full attribute test database (no region filter).
oligo_database.load_database(
    file_database=FILE_DATABASE_OLIGO_ATTRIBUTES, region_ids=None, database_overwrite=True
)

# Fetch "test_attribute" and check that it takes exactly two distinct values.
attribute = oligo_database.get_oligo_attribute(attribute="test_attribute")
distinct_values = attribute["test_attribute"].unique()
assert len(distinct_values) == 2, "error: wrong attribute returned"



In [15]:
# Reload the attribute test database, requesting only "region_3".
oligo_database.load_database(
    file_database=FILE_DATABASE_OLIGO_ATTRIBUTES, region_ids="region_3", database_overwrite=True
)

# GC content value to attach to each of the five oligo IDs.
gc_content_by_oligo = {
    "region_3::1": 63,
    "region_3::2": 66,
    "region_3::3": 80,
    "region_3::4": 70,
    "region_3::5": 40,
}
new_attribute = {
    oligo_id: {"GC_content": gc_value} for oligo_id, gc_value in gc_content_by_oligo.items()
}
oligo_database.update_oligo_attributes(new_attribute)

# Reading the attribute back must give one entry per updated oligo.
attribute = oligo_database.get_oligo_attribute(attribute="GC_content")
num_entries = len(attribute)
assert num_entries == 5, "error: attribute not correctly updated"

