In [None]:
# Reference paper for the nearest-neighbor mismatch analysis in this notebook:
# https://pubs.acs.org/doi/full/10.1021/bi9825091

In [2]:
import numpy as np
import pandas as pd
from numpy.linalg import svd
from sklearn.linear_model import LinearRegression

# Example matrices.
# These are placeholders with the shapes the fit below expects: 51 rows and
# 16 unknown parameters. They are drawn from a seeded generator so that a
# fresh "Restart & Run All" reproduces exactly the same numbers.
rng = np.random.default_rng(0)
S = rng.random((51, 16))  # Replace with actual stacking matrix
GMISM = rng.random((51, 1))  # Replace with actual free energy increments

# SVD decomposition of S matrix.
# NOTE(review): U, Sigma and Vt are not consumed by the fit below (at least in
# the cells visible here); they are kept for inspection of S only.
U, Sigma, Vt = svd(S)

# Solve the least-squares problem S @ GNN.T ~= GMISM with no intercept term.
model = LinearRegression(fit_intercept=False)
model.fit(S, GMISM)
# GMISM is passed as a 2-D column, so scikit-learn reports coef_ with shape
# (n_targets, n_features) = (1, 16) rather than a flat 16-vector.
GNN = model.coef_  # The 16 unknown mismatch parameters

In [15]:
import numpy as np

def count_mismatch_pairs(seq1, seq2):
    """
    Count nearest-neighbor dimers of seq1 that contain a mismatched base pair.

    The strands are aligned position by position: seq1[i] is paired with
    seq2[i]. A position is a Watson-Crick pair when seq2[i] is the complement
    of seq1[i] (A-T, C-G, G-C, T-A). Every adjacent pair of positions
    (i, i+1) in which at least one position is NOT a Watson-Crick pair is
    counted once, under the seq1 dimer seq1[i:i+2].

    seq1: 5' to 3' direction sequence (string, uppercase A/C/G/T)
    seq2: 3' to 5' direction sequence aligned under seq1 (string, uppercase A/C/G/T)

    Returns:
    A 16-element integer vector with counts of each mismatch dimer, indexed in
    the order "AA", "AC", "AG", "AT", "CA", ..., "TT".

    Raises:
    ValueError: if the two sequences differ in length.
    KeyError: if a counted dimer of seq1 contains a character other than
        uppercase A, C, G or T.
    """
    # Positions are paired index by index, so the lengths must agree.
    if len(seq1) != len(seq2):
        raise ValueError("Sequences must be of equal length.")

    # The 16 possible seq1 dimers in the order ["AA", "AC", ..., "TT"], mapped
    # to their slot in the result vector.
    mismatch_pairs = [first + second for first in "ACGT" for second in "ACGT"]
    mismatch_indices = {pair: i for i, pair in enumerate(mismatch_pairs)}

    # Watson-Crick partner of each base.
    complement = {"A": "T", "C": "G", "G": "C", "T": "A"}

    # paired[i] is True only when position i is a Watson-Crick pair. A base of
    # seq1 that is not in the table has no partner (.get returns None), so it
    # is treated as mismatched and surfaces as a KeyError at the lookup below.
    paired = [complement.get(top) == bottom for top, bottom in zip(seq1, seq2)]

    mismatch_vector = np.zeros(16, dtype=int)

    # A dimer is a mismatch dimer when either of its two positions is unpaired.
    # (Comparing seq1[i:i+2] with the reversed seq2 dimer, without complementing,
    # would flag fully paired duplexes such as "AC"/"TG" as mismatches.)
    for i in range(len(seq1) - 1):
        if not (paired[i] and paired[i + 1]):
            mismatch_vector[mismatch_indices[seq1[i:i+2]]] += 1

    return mismatch_vector

# Example usage: seq2 is written 3' to 5' and aligned base-by-base under seq1.
# Only the first two positions (A-T, T-A) are Watson-Crick pairs; the other
# eight positions are not, so seq2 is a demo input, not the true complement of seq1.
seq1 = "ATCGTACGTA"  # 5' to 3' direction
seq2 = "TACGCATGCA"  # 3' to 5' direction (aligned under seq1)

mismatch_vector = count_mismatch_pairs(seq1, seq2)
# Entries follow the dimer order "AA", "AC", ..., "TT".
print("Mismatch Vector:", mismatch_vector)


Mismatch Vector: [0 1 0 0 0 0 2 0 0 0 0 2 2 1 0 0]


In [23]:

def count_mismatches_with_case(sequence):
    """
    Tally the dimers of a case-annotated sequence that touch a mismatch site.

    sequence: DNA string in which a lowercase letter marks a mismatched base.

    Every pair of neighbouring characters in which at least one character is
    lowercase is uppercased and counted. Dimers that are not made of A/C/G/T
    after uppercasing are ignored.

    Returns:
    A 16-element integer vector, indexed in the order "AA", "AC", ..., "TT".
    """
    alphabet = "ACGT"

    # Slot of every dimer in the result vector: "AA" -> 0, "AC" -> 1, ..., "TT" -> 15
    dimer_order = [five + three for five in alphabet for three in alphabet]
    slot_for = dict(zip(dimer_order, range(len(dimer_order))))

    # Uppercase copy used for the lookup; the original keeps the case markers
    shouted = sequence.upper()

    counts = np.zeros(len(dimer_order), dtype=int)

    # Walk over neighbouring characters; skip pairs without a lowercase marker
    for pos, (here, after) in enumerate(zip(sequence, sequence[1:])):
        if not (here.islower() or after.islower()):
            continue

        slot = slot_for.get(shouted[pos:pos + 2])
        if slot is not None:
            counts[slot] += 1

    return counts

# Example usage: the single lowercase 'a' (index 3) marks the mismatch site.
# Both dimers that include it ("Aa" and "aA") are uppercased to "AA", so the
# expected result has a count of 2 in slot 0 and zeros elsewhere.
sequence = "TAAaAAT"  # The sequence where lowercase indicates mismatches
mismatch_vector = count_mismatches_with_case(sequence)

print("Mismatch Vector:", mismatch_vector)

Mismatch Vector: [2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [24]:
import numpy as np

def count_mismatch_categories(sequence):
    """
    Tally mismatch-touching dimers of a case-annotated sequence by category label.

    sequence: DNA string in which a lowercase letter marks a mismatched base.

    Each pair of neighbouring characters with at least one lowercase letter is
    uppercased and matched against the part of a category label that precedes
    the '/'. Dimers that match no label are ignored.

    NOTE(review): only the left half of each "Xy/ZW" label takes part in the
    match, and the position of the lowercase letter inside the dimer is not
    considered ("Aa" and "aA" both land on the label starting with "AA").
    Confirm this is the intended category assignment.

    Returns:
    A 16-element integer vector, one count per label in the order listed below.
    """
    labels = (
        "AA/TA", "CA/GA", "GA/CA", "TA/AA",  # A·A mismatches
        "AC/TC", "CC/GC", "GC/CC", "TC/AC",  # C·C mismatches
        "AG/TG", "CG/GG", "GG/CG", "TG/AG",  # G·G mismatches
        "AT/TT", "CT/GT", "GT/CT", "TT/AT",  # T·T mismatches
    )

    # Left-hand dimer -> slot of the first label that carries it
    slot_by_left = {}
    for slot, label in enumerate(labels):
        left_half, _, _ = label.partition("/")
        slot_by_left.setdefault(left_half, slot)

    tally = np.zeros(len(labels), dtype=int)

    # Walk over neighbouring characters; only pairs with a lowercase marker count
    for pos, (here, after) in enumerate(zip(sequence, sequence[1:])):
        if not (here.islower() or after.islower()):
            continue

        slot = slot_by_left.get(sequence[pos:pos + 2].upper())
        if slot is not None:
            tally[slot] += 1

    return tally

# Example usage with several lowercase (mismatch) sites, including the adjacent
# lowercase pairs "gt" and "tt".
# Note: this reassigns `sequence` and `mismatch_vector`, overwriting the values
# from the previous example.
sequence = "TAAaAATGCGgtAGTTtt"  # The sequence where lowercase indicates mismatches
mismatch_vector = count_mismatch_categories(sequence)

print("Mismatch Vector:", mismatch_vector)


Mismatch Vector: [2 0 0 1 0 0 0 0 0 0 1 0 0 0 1 2]


In [8]:
import pandas as pd

In [25]:
# Load the mismatch table from a CSV file located in the working directory.
# NOTE(review): the displayed output suggests the file has a second header row
# (kept as data row 0) and values written with the Unicode minus sign (U+2212),
# so the columns presumably load as strings — confirm before numeric use.
df = pd.read_csv("NN_mismatches.csv")

In [27]:
# Display only the rows without missing values. The result is shown, not
# assigned, so `df` itself still contains every row.
df.dropna()

Unnamed: 0.1,Unnamed: 0,H,Unnamed: 2,S,Unnamed: 4,G,Unnamed: 6,Tm,Unnamed: 8
0,sequencesa,experimentb,prediction,experimentb,prediction,experimentb,prediction,experimentc,predictionc
2,CAAAaAAAG/d,−36.9,−41.9,−107.0,−123.8,−3.71,−3.47,21.3,21.5
3,CGATaATCG,−50.8,−42.4,−148.0,−120.8,−4.86,−4.96,32.1,31.9
4,GGAAaTTCC,−51.5,−45.7,−151.4,−132.2,−4.59,−4.70,30.6,30.5
5,GGACaGTCC,−53.7,−50.7,−153.2,−144.4,−6.22,−5.94,40.2,38.6
6,GGAGaCTCC,−51.6,−53.5,−145.7,−152.8,−6.38,−6.14,41.3,39.7
7,CATGAaGCTAC/,−65.2,−65.1,−185.4,−185.4,−7.70,−7.62,46.9,46.5
8,CATGTaACTAC/,−48.0,−54.8,−133.8,−155.5,−6.52,−6.62,42.5,42.4
9,GATCTaTGTAC/,−59.3,−57.9,−170.6,−165.9,−6.42,−6.41,40.9,41.0
10,GGATGaATAGC/,−69.3,−61.9,−198.2,−174.9,−7.81,−7.63,46.9,47.1
