In [2]:
from Bio.SeqUtils import MeltingTemp as mt
from Bio.SeqUtils import Seq

from oligo_designer_toolsuite.database import OligoAttributes

In [3]:
## Melting temp parameters

Tm_parameters = {
    "check": True,
    "strict": True,
    "c_seq": None,
    "shift": 0,
    "nn_table": getattr(mt, "DNA_NN3"),
    "tmm_table": getattr(mt, "DNA_TMM1"),
    "imm_table": getattr(mt, "DNA_IMM1"),
    "de_table": getattr(mt, "DNA_DE1"),
    "dnac1": 50,  # [nM]
    "dnac2": 0,
    "selfcomp": False,
    "saltcorr": 7,
    "Na": 50,  # [mM]
    "K": 75,  # [mM]
    "Tris": 20,  # [mM]
    "Mg": 10,  # [mM]
    "dNTPs": 0,
}

Tm_chem_correction_parameters = {
    "DMSO": 0,
    "DMSOfactor": 0.75,
    "fmdfactor": 0.65,
    "fmdmethod": 1,
    "GC": None,
    "fmd": 20,
}

Tm_salt_correction_parameters = {
    "method": 7,
    "Na": 50,  # [mM]
    "K": 75,  # [mM]
    "Tris": 20,  # [mM]
    "Mg": 10,  # [mM]
    "dNTPs": 0,
}

In [4]:
class TmGCOligoScoringOld():
    """Scoring class that calculates oligo scores based on their melting temperature (Tm) and GC content.
    The scores are computed by assessing how far each oligo's properties deviate from the optimal conditions.

    $score = w_{Tm}\dfrac{|Tm_{opt} - Tm_{oligo}|}{Tm_{max} - Tm_{min}} + w_{GC}\dfrac{|GC_{opt} - GC_{oligo}|}{GC_{max} - GC_{min}}$.

    :param Tm_min: Minimum acceptable melting temperature.
    :type Tm_min: float
    :param Tm_opt: Optimal melting temperature.
    :type Tm_opt: float
    :param Tm_max: Maximum acceptable melting temperature.
    :type Tm_max: float
    :param GC_content_min: Minimum acceptable GC content.
    :type GC_content_min: float
    :param GC_content_opt: Optimal GC content.
    :type GC_content_opt: float
    :param GC_content_max: Maximum acceptable GC content.
    :type GC_content_max: float
    :param Tm_parameters: Parameters for calculating melting temperature.
    :type Tm_parameters: dict
    :param Tm_salt_correction_parameters: Parameters for salt correction in Tm calculation, optional.
    :type Tm_salt_correction_parameters: dict, optional
    :param Tm_chem_correction_parameters: Parameters for chemical correction in Tm calculation, optional.
    :type Tm_chem_correction_parameters: dict, optional
    :param Tm_weight: Weight factor for Tm deviations in score calculation.
    :type Tm_weight: float
    :param GC_weight: Weight factor for GC content deviations in score calculation.
    :type GC_weight: float
    """

    def __init__(
        self,
        Tm_min: float,
        Tm_opt: float,
        Tm_max: float,
        GC_content_min: float,
        GC_content_opt: float,
        GC_content_max: float,
        Tm_parameters: dict,
        Tm_salt_correction_parameters: dict = None,
        Tm_chem_correction_parameters: dict = None,
        Tm_weight: float = 1,
        GC_weight: float = 1,
    ):
        """Constructor for the TmGCOligoScoring class."""
        self.Tm_min = Tm_min
        self.Tm_opt = Tm_opt
        self.Tm_max = Tm_max
        self.GC_min = GC_content_min
        self.GC_opt = GC_content_opt
        self.GC_max = GC_content_max
        self.Tm_parameters = Tm_parameters
        self.Tm_salt_correction_parameters = Tm_salt_correction_parameters
        self.Tm_chem_correction_parameters = Tm_chem_correction_parameters
        self.Tm_weight = Tm_weight
        self.GC_weight = GC_weight
        self.__generate_scoring_functions()

    def scoring_function(self, sequence: Seq):
        """Calculates the oligo score based on Tm and GC content deviations from their optimal values.

        :param sequence: The DNA sequence of the oligo to score.
        :type sequence: Seq
        :return: The calculated score based on Tm and GC content.
        :rtype: float
        """
        # distance from the optimal melting temperature weightend by the how far is the optimum from the min/ max
        # the scoring is the lower the better
        Tm_oligo = OligoAttributes._calc_TmNN(
            sequence=sequence,
            Tm_parameters=self.Tm_parameters,
            Tm_salt_correction_parameters=self.Tm_salt_correction_parameters,
            Tm_chem_correction_parameters=self.Tm_chem_correction_parameters,
        )
        GC_oligo = OligoAttributes._calc_GC_content(sequence=sequence)
        Tm_dif = Tm_oligo - self.Tm_opt  # check the names of the columns
        GC_dif = GC_oligo - self.GC_opt
        score = self.Tm_weight * self.Tm_error(Tm_dif) + self.GC_weight * self.GC_error(GC_dif)
        return score

    def __generate_scoring_functions(self):
        """
        Generates scoring functions for Tm and GC content based on provided parameters.

        Sets up dynamic functions to calculate errors in Tm and GC content based on the deviation from optimal values.
        This method is intended for internal use to configure the scoring calculations upon initialization.
        """
        # define the error function for the melting temperature
        Tm_dif_max = self.Tm_max - self.Tm_opt
        Tm_dif_min = self.Tm_opt - self.Tm_min
        if Tm_dif_max == Tm_dif_min:
            self.Tm_error = lambda Tm_dif: abs(Tm_dif) / Tm_dif_max
        else:
            self.Tm_error = lambda Tm_dif: abs(Tm_dif) / Tm_dif_max * (Tm_dif > 0) + abs(
                Tm_dif
            ) / Tm_dif_min * (Tm_dif < 0)
        # define the error function for the GC content
        GC_dif_max = self.GC_max - self.GC_opt
        GC_dif_min = self.GC_opt - self.GC_min
        if GC_dif_max == GC_dif_min:
            self.GC_error = lambda GC_dif: abs(GC_dif) / GC_dif_max
        else:
            self.GC_error = lambda GC_dif: abs(GC_dif) / GC_dif_max * (GC_dif > 0) + abs(
                GC_dif
            ) / GC_dif_min * (GC_dif < 0)



In [15]:
class TmGCOligoScoringNew():

    def __init__(
        self,
        Tm_min: float,
        Tm_opt: float,
        Tm_max: float,
        GC_content_min: float,
        GC_content_opt: float,
        GC_content_max: float,
        Tm_parameters: dict,
        Tm_salt_correction_parameters: dict = None,
        Tm_chem_correction_parameters: dict = None,
        Tm_weight: float = 1,
        GC_weight: float = 1,
    ):
        """Constructor for the TmGCOligoScoring class."""
        self.Tm_min = Tm_min
        self.Tm_opt = Tm_opt
        self.Tm_max = Tm_max
        self.GC_min = GC_content_min
        self.GC_opt = GC_content_opt
        self.GC_max = GC_content_max
        self.Tm_parameters = Tm_parameters
        self.Tm_salt_correction_parameters = Tm_salt_correction_parameters
        self.Tm_chem_correction_parameters = Tm_chem_correction_parameters
        self.Tm_weight = Tm_weight
        self.GC_weight = GC_weight
        self._generate_scoring_functions()

    def scoring_function(self, sequence: Seq):
        """Calculate the score for a given sequence based on Tm and GC content."""
        Tm_oligo = OligoAttributes._calc_TmNN(
            sequence=sequence,
            Tm_parameters=self.Tm_parameters,
            Tm_salt_correction_parameters=self.Tm_salt_correction_parameters,
            Tm_chem_correction_parameters=self.Tm_chem_correction_parameters,
        )
        GC_oligo = OligoAttributes._calc_GC_content(sequence=sequence)

        print(Tm_oligo, GC_oligo)

        Tm_dif = Tm_oligo - self.Tm_opt
        GC_dif = GC_oligo - self.GC_opt
        score = self.Tm_weight * self.Tm_error(Tm_dif) + self.GC_weight * self.GC_error(GC_dif)
        return score

    def _generate_scoring_functions(self):
        """Generate the scoring functions for Tm and GC content errors."""
        self.Tm_error = self._generate_error_function(self.Tm_min, self.Tm_opt, self.Tm_max)
        self.GC_error = self._generate_error_function(self.GC_min, self.GC_opt, self.GC_max)

    def _generate_error_function(self, min_val, opt_val, max_val):
        dif_max = max_val - opt_val
        dif_min = opt_val - min_val
        if dif_max == dif_min:
            return lambda dif: abs(dif) / dif_max
        else:
            return lambda dif: abs(dif) / (dif_max if dif > 0 else dif_min)


In [73]:
TM_PARAMETERS = {
    "check": True, #default
    "strict": True, #default
    "c_seq": None, #default
    "shift": 0, #default
    "nn_table": getattr(mt, "DNA_NN3"),
    "tmm_table": getattr(mt, "DNA_TMM1"),
    "imm_table": getattr(mt, "DNA_IMM1"),
    "de_table": getattr(mt, "DNA_DE1"),
    "dnac1": 50, #[nM]
    "dnac2": 0, #[nM]
    "selfcomp": False, #default
    "saltcorr": 7, # Owczarzy et al. (2008)
    "Na": 1.25, #[mM]
    "K": 75, #[mM]
    "Tris": 20, #[mM]
    "Mg": 10, #[mM]
    "dNTPs": 0, #[mM] default
}

TM_PARAMETERS_CHEM_CORR = {
    "DMSO": 0, #default
    "fmd": 20,
    "DMSOfactor": 0.75, #default
    "fmdfactor": 0.65, #default
    "fmdmethod": 1, #default
    "GC": None, #default
}

sequence = "AATTAGAAGCGTGTGCGCACATCCCGG"

Tm_min=52
Tm_opt=60
Tm_max=67
GC_content_min=40
GC_content_opt=50
GC_content_max=60

# Usage of the class
obj = TmGCOligoScoringOld(Tm_min=Tm_min, Tm_opt=Tm_opt, Tm_max=Tm_max, GC_content_min=GC_content_min, GC_content_opt=GC_content_opt, GC_content_max=GC_content_max, Tm_parameters=TM_PARAMETERS, Tm_chem_correction_parameters=TM_PARAMETERS_CHEM_CORR)
score = obj.scoring_function(sequence)  # Example call
print("Score:", score)

# Usage of the class
obj = TmGCOligoScoringNew(Tm_min=Tm_min, Tm_opt=Tm_opt, Tm_max=Tm_max, GC_content_min=GC_content_min, GC_content_opt=GC_content_opt, GC_content_max=GC_content_max, Tm_parameters=TM_PARAMETERS, Tm_chem_correction_parameters=TM_PARAMETERS_CHEM_CORR)
score = obj.scoring_function(sequence)  # Example call
print("Score:", score)


Score: 0.7088571428571432
61.07 55.56
Score: 0.7088571428571432


In [33]:
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix

class OligoOverlapCalculatorNew:

    def _get_overlapping_matrix(self, database_region):
        """Creates a matrix that encodes the overlapping of different oligos.
        The matrix has dimensions n_oligos * n_oligos where each row and column
        belong to an oligo. Each entry contains 1 if the correspondent oligos don't
        overlap and 0 if they overlap. This matrix will be used as an adjacency
        matrix, and the sets of non-overlapping oligos are cliques of this graph.

        :param database_region: dictionary containing all the oligos of the region
        :type database_region: dict
        :return: overlapping matrix
        :rtype: pandas.DataFrame
        """

        def _get_overlap(seq1_intervals, seq2_intervals, distance_between_oligos=0):
            for a in seq1_intervals:
                for b in seq2_intervals:
                    if min(a[1], b[1]) - max(a[0], b[0]) >= -distance_between_oligos:
                        return True
            return False

        oligos_indices = list(database_region.keys())  # Keep track of the indices
        intervals = [
            [
                [start[0], end[0]]
                for start, end in zip(
                    database_region[oligo_id]["start"], database_region[oligo_id]["end"]
                )
            ]
            for oligo_id in oligos_indices
        ]

        n_oligos = len(intervals)
        overlapping_matrix = lil_matrix((n_oligos, n_oligos), dtype=int)

        for i in range(n_oligos):
            for j in range(i + 1, n_oligos):
                if _get_overlap(intervals[i], intervals[j]):
                    overlapping_matrix[i, j] = 1

        overlapping_matrix = overlapping_matrix.maximum(overlapping_matrix.transpose())
        overlapping_matrix.setdiag(1)  # Set diagonal elements to 1

        # Create a sparse matrix containing only ones
        ones_matrix = lil_matrix((n_oligos, n_oligos), dtype=int)
        ones_matrix[:, :] = 1

        # Invert the matrix by subtracting the overlapping matrix from the ones matrix
        overlapping_matrix = ones_matrix - overlapping_matrix

        overlapping_matrix = pd.DataFrame(
            data=overlapping_matrix.toarray(),
            columns=oligos_indices,
            index=oligos_indices,
            dtype=int,
        )

        return overlapping_matrix

In [15]:
class OligoOverlapCalculatorOld:
    def _get_overlapping_matrix(self, database_region: dict, distance_between_oligos: int = 0):
        """Creates a matrix that encodes the overlapping of different oligos. the matrix has dimensions n_oligos * n_oligos where each
        row and column belong to a oligo. Each entry contains 1 if the the correspondent oligos don't overlap and 0 if they overlap, this
        is done because in the next this matrix will be used as an adjacency matrix and the sets of non overlapping oligos are cliques of this graph.

        :pram database_region: dictionary containing all the oligos of the region
        :type database_region: dict
        :return: overlapping matrix
        :rtype: pandas.DataFrame
        """

        def _get_overlap(seq1_intervals, seq2_intervals, distance_between_oligos):
            for a in seq1_intervals:
                for b in seq2_intervals:
                    if min(a[1], b[1]) - max(a[0], b[0]) >= -1 * distance_between_oligos:
                        return True
            return False

        intervals = []
        oligos_indices = []
        
        for oligo_id in database_region.keys():
            interval = []
            oligos_indices.append(oligo_id)  # keep track of the indices
            
            for start, end in zip(database_region[oligo_id]["start"], database_region[oligo_id]["end"]):
                interval.append(
                    [start[0], end[0]]
                )  # save a list of couples of [start,end] of the duplicates of that oligo
            intervals.append(interval)

        overlapping_matrix = np.zeros(
            (len(intervals), len(intervals)), dtype=int
        )  # on the diagonal we have overlaps
        
        for i in range(len(intervals)):
            for j in range(i + 1, len(intervals)):
                if _get_overlap(intervals[i], intervals[j], distance_between_oligos):
                    overlapping_matrix[i, j] = 1
        
        overlapping_matrix = overlapping_matrix + np.transpose(overlapping_matrix) + np.eye(len(intervals))
        overlapping_matrix = np.ones((len(intervals), len(intervals)), dtype=int) - overlapping_matrix
        overlapping_matrix = pd.DataFrame(
            data=overlapping_matrix,
            columns=oligos_indices,
            index=oligos_indices,
            dtype=int,
        )

        return overlapping_matrix

In [38]:
# Test example
database_region = {
    "oligo1": {
        "start": [[1]],
        "end": [[5]]
    },
    "oligo2": {
        "start": [[6]],
        "end": [[12]]
    },
    "oligo3": {
        "start": [[13]],
        "end": [[35]]
    },
    "oligo4": {
        "start": [[16]],
        "end": [[22]]
    },
    "oligo5": {
        "start": [[16]],
        "end": [[22]]
    },
    "oligo6": {
        "start": [[6]],
        "end": [[12]]
    },
}


calculator = OligoOverlapCalculatorOld()
overlapping_matrix = calculator._get_overlapping_matrix(database_region)
print(overlapping_matrix)

calculator = OligoOverlapCalculatorNew()
overlapping_matrix = calculator._get_overlapping_matrix(database_region)
print(overlapping_matrix)

        oligo1  oligo2  oligo3  oligo4  oligo5  oligo6
oligo1       0       1       1       1       1       1
oligo2       1       0       1       1       1       0
oligo3       1       1       0       0       0       1
oligo4       1       1       0       0       0       1
oligo5       1       1       0       0       0       1
oligo6       1       0       1       1       1       0
        oligo1  oligo2  oligo3  oligo4  oligo5  oligo6
oligo1       0       1       1       1       1       1
oligo2       1       0       1       1       1       0
oligo3       1       1       0       0       0       1
oligo4       1       1       0       0       0       1
oligo5       1       1       0       0       0       1
oligo6       1       0       1       1       1       0
