# Test Databases

In [1]:
import os
import shutil

from oligo_designer_toolsuite.utils import FastaParser
from oligo_designer_toolsuite.database import OligoDatabase, ReferenceDatabase, OligoAttributes
from oligo_designer_toolsuite.sequence_generator import OligoSequenceGenerator

## Setup

In [2]:
# Global Parameters
# Input fixtures, given relative to the notebook's working directory.
FILE_NCBI_EXONS = "../data/genomic_regions/sequences_ncbi_exons.fna"
FILE_DATABASE_OLIGO_ATTRIBUTES = "../data/databases/database_oligo_attributes.csv"

# Region identifiers requested from the exon FASTA file.
# NOTE(review): judging by its name, the last entry is a deliberately unknown
# region used to exercise the "region not found" path - confirm.
REGION_IDS = ["AARS1", "DECR2", "FAM234A", "RHBDF1", "WASIR2", "this_gene_does_not_exist"]

## Test Reference Database

In [3]:
# Scratch directory for the reference-database tests, located in the current working directory.
tmp_path = os.path.join(os.getcwd(), "tmp_reference_database")

fasta_parser = FastaParser()
reference = ReferenceDatabase(dir_output=tmp_path)

# First load: the same FASTA file passed twice as a list, replacing any existing content.
reference.load_database_from_fasta(
    files_fasta=[FILE_NCBI_EXONS, FILE_NCBI_EXONS],
    database_overwrite=True,
)
# Second load: the file passed as a plain string, without overwriting what was loaded above.
reference.load_database_from_fasta(
    files_fasta=FILE_NCBI_EXONS,
    database_overwrite=False,
)

In [4]:
# Filter the reference database for "AARS1" with remove_region=True, then
# check that no remaining entry's header still reports that region.
reference.filter_database("AARS1", remove_region=True)
for entry in reference.database:
    # The parsed header is unpacked into three fields; only the first (the region) is checked.
    header_fields = fasta_parser.parse_fasta_header(entry.id)
    region, _, _ = header_fields
    assert region != "AARS1", f"error: this region {region} should be filtered out."

In [5]:
# Export the filtered reference database to a FASTA file.
file_fasta_database = reference.write_database_to_fasta(filename="filtered_database")

# Minimal sanity check so this cell verifies something.
# Assumes write_database_to_fasta returns the path of the written file, as the
# disabled format check below also implies - TODO confirm.
assert os.path.isfile(file_fasta_database), f"error: database file {file_fasta_database} was not written"

# NOTE(review): the format check below was already disabled; the reason is not
# visible in this notebook - confirm before re-enabling.
# assert fasta_parser.check_fasta_format(file_fasta_database) == True, f"error: wrong file format for database in {file_fasta_database}"

In [6]:
# Clean up: delete the temporary reference-database directory and everything in it.
shutil.rmtree(path=tmp_path)

## Test Oligo Database

In [7]:
# Scratch directory for the oligo-database tests, located in the current working directory.
tmp_path = os.path.join(os.getcwd(), "tmp_oligo_database")

# Helper objects shared by the oligo-database cells.
fasta_parser = FastaParser()
attribute_calculator = OligoAttributes()
oligo_sequence_generator = OligoSequenceGenerator(dir_output=tmp_path)
oligo_database = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=tmp_path,
)

# Fixture 1: randomly generated sequences drawn with the given per-base probabilities.
random_sequence_settings = {
    "filename_out": "random_sequences1",
    "length_sequences": 30,
    "num_sequences": 100,
    "name_sequences": "random_sequences1",
    "base_alphabet_with_probability": {"A": 0.1, "C": 0.3, "G": 0.4, "T": 0.2},
}
file_random_seqs = oligo_sequence_generator.create_sequences_random(**random_sequence_settings)

# Fixture 2: sliding-window sequences from the exon FASTA file for the requested regions.
sliding_window_settings = {
    "files_fasta_in": FILE_NCBI_EXONS,
    "length_interval_sequences": (30, 31),
    "region_ids": REGION_IDS,
}
file_sliding_window = oligo_sequence_generator.create_sequences_sliding_window(**sliding_window_settings)


In [8]:
# Load both generated FASTA files into the same database: the random sequences
# first (overwriting), then the sliding-window sequences appended on top.
fasta_load_plan = [
    {
        "files_fasta": file_random_seqs,
        "sequence_type": "oligo",
        "region_ids": ["random_sequences1"],
        "database_overwrite": True,
    },
    {
        "files_fasta": file_sliding_window,
        "sequence_type": "target",
        "region_ids": REGION_IDS,
        "database_overwrite": False,
    },
]
for fasta_load_kwargs in fasta_load_plan:
    oligo_database.load_database_from_fasta(**fasta_load_kwargs)

num_loaded_entries = len(oligo_database.database)
assert num_loaded_entries > 0, "error: no sequences loaded into database"

Output()

Output()



In [9]:
# Run two attribute calculations back to back: the isoform consensus first, then
# the GC content of the "oligo" sequences on the database returned by the first call.
oligo_database = attribute_calculator.calculate_GC_content(
    oligo_database=attribute_calculator.calculate_isoform_consensus(oligo_database=oligo_database),
    sequence_type="oligo",
)

In [10]:
# Replace the database content with the attribute table and print each of its three regions.
oligo_database.load_database_from_table(FILE_DATABASE_OLIGO_ATTRIBUTES, database_overwrite=True)
for region_id in ("region_1", "region_2", "region_3"):
    print(oligo_database.database[region_id])

{'region_1::1': {'oligo': 'ATGCCCCAATGGATGACGAT', 'target': 'ATCGTCATCCATTGGGGCAT', 'test_attribute': [['red']], 'ligation_site': [[10]], 'chromosome': [['16']], 'start': [[70289456]], 'end': [[70289485]], 'strand': [['-']], 'regiontype': [['exon']], 'gene_id': [['region_1', 'region_1']], 'transcript_id': [['NM_001605.3', 'XM_047433666.1']], 'exon_number': [['1', '1']], 'number_transcripts': [['2']]}, 'region_1::2': {'oligo': 'GGCTAGGGAATCGAATGGTTCCAATAGAG', 'target': 'CTCTATTGGAACCATTCGATTCCCTAGCC', 'test_attribute': [['blue']], 'ligation_site': [[10]], 'chromosome': [['16']], 'start': [[70289456]], 'end': [[70289485]], 'strand': [['-']], 'regiontype': [['exon']], 'gene_id': [['region_1', 'region_1']], 'transcript_id': [['NM_001605.3', 'XM_047433666.1']], 'exon_number': [['1', '1']], 'number_transcripts': [['2']]}, 'region_1::3': {'oligo': 'CTCACTCGACTCTTACACAGTCATA', 'target': 'TATGACTGTGTAAGAGTCGAGTGAG', 'test_attribute': [['red']], 'ligation_site': [[10]], 'chromosome': [['16']], '

In [11]:
# Round trip: save only two regions, load the saved file back with overwrite,
# and check that exactly those two regions are present afterwards.
regions_to_save = ["region_1", "region_2"]
file_database = oligo_database.save_database(region_ids=regions_to_save)
oligo_database.load_database(file_database, database_overwrite=True)

num_regions_loaded = len(oligo_database.database.keys())
assert num_regions_loaded == 2, "error: wrong number regions saved and loaded"

Output()

In [12]:
# Start from a fresh copy of the attribute table.
oligo_database.load_database_from_table(FILE_DATABASE_OLIGO_ATTRIBUTES, database_overwrite=True)

# Filter on the "exon_number" attribute using the categories "1" and "21".
# NOTE(review): remove_if_equals_category=False presumably keeps only oligos
# matching one of the categories - confirm against the library documentation.
exon_number_categories = ["1", "21"]
oligo_database.filter_oligo_attribute_by_category(
    name_attribute="exon_number",
    category_attribute=exon_number_categories,
    remove_if_equals_category=False,
)

num_oligos_region_3 = len(oligo_database.database["region_3"])
assert num_oligos_region_3 == 3, "error: wrong number of oligos removed"

In [13]:
# Compute the isoform consensus, then filter on it with a threshold of 70.
attribute_calculator = OligoAttributes()
oligo_database = attribute_calculator.calculate_isoform_consensus(oligo_database=oligo_database)
oligo_database.filter_oligo_attribute_by_threshold(
    name_attribute="isoform_consensus",
    thr_attribute=70,
    remove_if_smaller_threshold=True,
)

num_oligos_region_1 = len(oligo_database.database["region_1"])
assert num_oligos_region_1 == 3, "error: wrong number of oligos removed"

In [14]:
# Export the current oligo database to a FASTA file and validate the file format.
file_fasta = oligo_database.write_database_to_fasta(filename="database_region1_region2")

# Assert on the truthiness of the check instead of comparing to the literal True
# (PEP 8: do not compare boolean values to True/False with ==).
assert fasta_parser.check_fasta_format(file_fasta), f"error: wrong file format for database in {file_fasta}"

In [15]:
# Rebuild the database: sliding-window targets first (overwriting), then the
# random oligo sequences appended without a region filter.
oligo_database.load_database_from_fasta(
    files_fasta=file_sliding_window,
    sequence_type="target",
    region_ids=REGION_IDS,
    database_overwrite=True,
)
oligo_database.load_database_from_fasta(
    files_fasta=file_random_seqs,
    sequence_type="oligo",
    database_overwrite=False,
)

oligo_database.remove_regions_with_insufficient_oligos("database_generation")

# NOTE(review): the "- 1" presumably accounts for the requested region that does
# not exist and the "+ 1" for the region from the random-sequence file - confirm.
num_regions_expected = len(REGION_IDS) - 1 + 1
num_regions_found = len(oligo_database.database.keys())
assert num_regions_found == num_regions_expected, "error: wrong number of regions in database"

Output()

Output()



In [16]:
# Load only the random oligo sequences (overwriting) and fetch them as a list.
random_only_load_kwargs = {
    "files_fasta": file_random_seqs,
    "sequence_type": "oligo",
    "database_overwrite": True,
}
oligo_database.load_database_from_fasta(**random_only_load_kwargs)

list_sequences = oligo_database.get_sequence_list()
num_sequences_found = len(list_sequences)
assert num_sequences_found == 100, "error: wrong number of sequences in database"

Output()

In [17]:
# Reload the full attribute table (no region filter), replacing the current content.
oligo_database.load_database_from_table(
    file_database=FILE_DATABASE_OLIGO_ATTRIBUTES,
    region_ids=None,
    database_overwrite=True,
)

# Look up one known oligo sequence in the sequence -> oligo-id mapping and
# check how many entries it maps to.
mapping = oligo_database.get_sequence_oligoid_mapping(sequence_type="oligo")
oligo_ids_for_sequence = mapping["CTCACTCGACTCTTACACAGTCATA"]
assert len(oligo_ids_for_sequence) == 4, "error: wrong number of oligos for sequence"

In [18]:
# Reload the full attribute table (no region filter), replacing the current content.
oligo_database.load_database_from_table(
    file_database=FILE_DATABASE_OLIGO_ATTRIBUTES,
    region_ids=None,
    database_overwrite=True,
)

# Fetch the table for "test_attribute" and count its distinct values.
attribute = oligo_database.get_oligo_attribute_table(attribute="test_attribute")
distinct_attribute_values = attribute["test_attribute"].unique()
assert len(distinct_attribute_values) == 2, "error: wrong attribute returned"

In [19]:
# Load only "region_3" from the attribute table, replacing the current content.
oligo_database.load_database_from_table(
    file_database=FILE_DATABASE_OLIGO_ATTRIBUTES,
    region_ids="region_3",
    database_overwrite=True,
)

# GC_content values to attach to oligos region_3::1 ... region_3::5, in that order.
gc_content_values = [63, 66, 80, 70, 40]
new_attribute = {
    f"region_3::{oligo_number}": {"GC_content": gc_content}
    for oligo_number, gc_content in enumerate(gc_content_values, start=1)
}
oligo_database.update_oligo_attributes(new_attribute)

# After the update, the GC_content table is expected to have five entries.
attribute = oligo_database.get_oligo_attribute_table(attribute="GC_content")
num_attribute_rows = len(attribute)
assert num_attribute_rows == 5, "error: attribute not correctly updated"