## Test Region Generator

In [1]:
import os
import sys

import pandas as pd
pd.set_option('display.max_columns', None)

from oligo_designer_toolsuite.database import CustomGenomicRegionGenerator, NcbiGenomicRegionGenerator, EnsemblGenomicRegionGenerator, OligoDatabase
from oligo_designer_toolsuite.utils import GffParser
from oligo_designer_toolsuite.oligo_efficiency_filter import (
    PadlockOligoScoring,
    PadlockSetScoring,
)
from oligo_designer_toolsuite.oligo_selection import (
    OligosetGenerator,
    padlock_heuristic_selection,
)

### Custom Generator with NCBI data

In [2]:
# Custom region generator built from local chr16 annotation (.gtf) and
# sequence (.fna) files that are declared as coming from NCBI.
annotation_file = "../data/annotations/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.gtf"
sequence_file = "../data/annotations/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.fna"
region_generator_ncbi = CustomGenomicRegionGenerator(
    annotation_file,
    sequence_file,
    files_source="NCBI",
    species="Homo_sapiens",
    annotation_release="110",
    genome_assembly="GRCh38",
)

# Echo the generator's configuration, one attribute per output line.
print(
    region_generator_ncbi.annotation_file,
    region_generator_ncbi.sequence_file,
    region_generator_ncbi.files_source,
    region_generator_ncbi.species,
    region_generator_ncbi.annotation_release,
    region_generator_ncbi.genome_assembly,
    sep="\n",
)

../data/annotations/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.gtf
../data/annotations/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.fna
NCBI
Homo_sapiens
110
GRCh38


In [3]:
# Call generate_genome() on the NCBI-based generator and keep its result.
# NOTE(review): the return value is presumably the path of a generated FASTA file — confirm.
ncbi_genome = region_generator_ncbi.generate_genome()

In [4]:
# Call generate_transcript_reduced_representation() on the NCBI-based generator.
# NOTE(review): the return value is presumably the path of the written transcriptome FASTA — confirm.
ncbi_transcriptome = region_generator_ncbi.generate_transcript_reduced_representation()

In [5]:
# Call generate_CDS_reduced_representation() on the NCBI-based generator.
# NOTE(review): the return value is presumably the path of the written CDS FASTA — confirm.
ncbi_CDS = region_generator_ncbi.generate_CDS_reduced_representation()

### Custom Generator with Ensembl data

In [6]:
# Custom region generator built from local chr16 annotation (.gtf) and
# sequence (.fa) files that are declared as coming from Ensembl.
# Note: this rebinds `annotation_file` and `sequence_file`.
annotation_file = "../data/annotations/custom_Homo_sapiens.GRCh38.108.chr16.gtf"
sequence_file = "../data/annotations/custom_Homo_sapiens.GRCh38.dna_rm.primary_assembly_chr16.fa"
region_generator_ensembl = CustomGenomicRegionGenerator(
    annotation_file,
    sequence_file,
    files_source="Ensembl",
    species="Homo_sapiens",
    annotation_release="108",
    genome_assembly="GRCh38",
)

# Echo the generator's configuration, one attribute per output line.
print(
    region_generator_ensembl.annotation_file,
    region_generator_ensembl.sequence_file,
    region_generator_ensembl.files_source,
    region_generator_ensembl.species,
    region_generator_ensembl.annotation_release,
    region_generator_ensembl.genome_assembly,
    sep="\n",
)

../data/annotations/custom_Homo_sapiens.GRCh38.108.chr16.gtf
../data/annotations/custom_Homo_sapiens.GRCh38.dna_rm.primary_assembly_chr16.fa
Ensembl
Homo_sapiens
108
GRCh38


In [7]:
# Call generate_genome() on the Ensembl-based generator and keep its result.
# NOTE(review): the return value is presumably the path of a generated FASTA file — confirm.
ensembl_genome = region_generator_ensembl.generate_genome()

In [8]:
# Call generate_transcript_reduced_representation() on the Ensembl-based generator.
# NOTE(review): the return value is presumably the path of the written transcriptome FASTA — confirm.
ensembl_transcriptome = region_generator_ensembl.generate_transcript_reduced_representation()

In [9]:
# Call generate_CDS_reduced_representation() on the Ensembl-based generator.
# NOTE(review): the return value is presumably the path of the written CDS FASTA — confirm.
ensembl_CDS = region_generator_ensembl.generate_CDS_reduced_representation()

## Test Reference Database


In [10]:
from oligo_designer_toolsuite.database import ReferenceDatabase
from oligo_designer_toolsuite.utils import parse_fasta_header, check_fasta_format

In [11]:
# Hard-coded location of the NCBI transcriptome FASTA. This rebinds
# `ncbi_transcriptome`, replacing the value that was returned earlier by
# generate_transcript_reduced_representation().
ncbi_transcriptome = "./output/annotation/transcriptome_source_NCBI_species_Homo_sapiens_annotation_release_110_genome_assemly_GRCh38_incl_exonjunctions_of_size_100.fna"

# Provenance metadata copied from the attributes of the NCBI region generator.
metadata = {
    "annotation_source": region_generator_ncbi.files_source,
    "species": region_generator_ncbi.species,
    "annotation_release": region_generator_ncbi.annotation_release,
    "genome_assembly": region_generator_ncbi.genome_assembly,
}

reference = ReferenceDatabase(ncbi_transcriptome, metadata=metadata)

In [12]:
# Load the reference FASTA into the database, filter it with the region list
# ["AARS1"], and check that every remaining entry belongs to AARS1.
reference.load_fasta_into_database()
reference.filter_database(["AARS1"])
for entry in reference.database:
    # Only the first of the three header fields (the region) is needed here.
    region, _, _ = parse_fasta_header(entry.id)
    assert region == "AARS1", f"error: this region {region} should be filtered out."

# Write the filtered database and its metadata, then validate the FASTA output.
# NOTE: the filename spelling ("databse") is kept as-is so output paths do not change.
file_fasta_database = reference.write_fasta_from_database(filename="filtered_databse")
file_metadata_database = reference.write_metadata_from_database(filename="filtered_databse")
assert check_fasta_format(file_fasta_database), "error: wrong file format"

## Test Oligo Database

In [13]:
from Bio.SeqUtils import MeltingTemp as mt
import copy

from oligo_designer_toolsuite.database import OligoDatabase
from oligo_designer_toolsuite.utils import check_fasta_format
from oligo_designer_toolsuite.oligo_property_filter import (
    GCContent,
    MaskedSequences,
    MeltingTemperatureNN,
    PropertyFilter,
)

# Regions (gene symbols) used to build the oligo database.
genes = [
    "AARS1",
    "DECR2",
    "FAM234A",
    "RHBDF1",
    "WASIR2",
]

# Hard-coded location of the NCBI transcriptome FASTA used as oligo source.
ncbi_transcriptome = "./output/annotation/transcriptome_source_NCBI_species_Homo_sapiens_annotation_release_110_genome_assemly_GRCh38_incl_exonjunctions_of_size_100.fna"

# Provenance metadata copied from the attributes of the NCBI region generator.
metadata = {
    "annotation_source": region_generator_ncbi.files_source,
    "species": region_generator_ncbi.species,
    "annotation_release": region_generator_ncbi.annotation_release,
    "genome_assembly": region_generator_ncbi.genome_assembly,
}


In [14]:
# Create an oligo database for the selected genes from the transcriptome FASTA,
# with oligo lengths between 30 and 35.
oligos = OligoDatabase(min_oligos_per_region=0, metadata=metadata, n_jobs=2)
oligos.create_database(
    file_fasta=ncbi_transcriptome,
    oligo_length_min=30,
    oligo_length_max=35,
    region_ids=genes,
)

# Round-trip: write the database to disk and load it back. The test expects
# the in-memory metadata to be empty after loading.
file_database, file_metadata = oligos.write_database(filename="test_db")
oligos.load_database(file_database)
assert oligos.metadata == {}, "error: metadata not cleared"

# Export the loaded database as FASTA and validate the file format.
file_fasta = oligos.write_fasta_from_database()
assert check_fasta_format(file_fasta), "error: wrong file format"

In [15]:
# Keyword arguments for the nearest-neighbour melting temperature computation.
# The thermodynamic lookup tables are taken directly from
# Bio.SeqUtils.MeltingTemp (imported as `mt`).
Tm_parameters = {
    "check": True,
    "strict": True,
    "c_seq": None,
    "shift": 0,
    "nn_table": mt.DNA_NN3,
    "tmm_table": mt.DNA_TMM1,
    "imm_table": mt.DNA_IMM1,
    "de_table": mt.DNA_DE1,
    "dnac1": 50,  # [nM]
    "dnac2": 0,
    "selfcomp": False,
    "dNTPs": 0,
    "saltcorr": 7,
    "Na": 1.25,  # [mM]
    "K": 75,  # [mM]
    "Tris": 20,  # [mM]
    "Mg": 10,  # [mM]
}

# Keyword arguments for the chemical correction of the melting temperature.
Tm_correction_parameters = {
    "DMSO": 0,
    "DMSOfactor": 0.75,
    "fmdfactor": 0.65,
    "fmdmethod": 1,
    "GC": None,
    "fmd": 20,
}

# Individual property filters: masked bases, GC content window, Tm window.
masked_sequences = MaskedSequences(mask="N")
GC_content = GCContent(GC_content_min=40, GC_content_max=60)
melting_temperature = MeltingTemperatureNN(
    Tm_min=52,
    Tm_max=67,
    Tm_parameters=Tm_parameters,
    Tm_chem_correction_parameters=Tm_correction_parameters,
)

# Combine the filters and apply them to the oligo database. The call is left
# as the final bare expression so the notebook displays the returned object.
filters = [masked_sequences, GC_content, melting_temperature]
property_filter = PropertyFilter(filters=filters)
property_filter.apply(oligos)

<oligo_designer_toolsuite.database._oligos_database.OligoDatabase at 0x12c509370>

In [16]:

def get_sequences_from_database(database):
    """Return all oligo sequences stored in ``database`` as a sorted list.

    Args:
        database: Two-level mapping ``region id -> oligo id -> attributes``,
            where every attributes mapping has a ``"sequence"`` entry.

    Returns:
        A new list with the ``"sequence"`` value of every oligo, sorted in
        ascending order so that the contents of two databases can be compared.

    Raises:
        KeyError: If an oligo's attributes have no ``"sequence"`` entry.
    """
    return sorted(
        attributes["sequence"]
        for oligos_of_region in database.values()
        for attributes in oligos_of_region.values()
    )

# Collect the sorted sequences currently held in the oligo database and
# report how many there are.
sequences = get_sequences_from_database(oligos.database)
print(len(sequences))

39788
