In [4]:
import pandas as pd 
import os
import Bio
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import ClustalwCommandline
import collections
from Bio import AlignIO
import random
import numpy as np
import sys

#sys.path.append(os.path.abspath('/home/nimar/PLP_directRNA_design_V2/PLP_directRNA_design/'))
#from PLP_directRNA_design import probedesign as plp

In [None]:
number_of_seqs= 10 # number of regions mapped/PLP
final_designed=5 # number of PLPs designed at the end/gene
plp_length=30
mismatches = 6


In [21]:
# Define IUPAC nucleotide codes for mismatches
IUPAC_CODES = {
    "R": ["A", "G"],
    "Y": ["C", "T"],
    "S": ["G", "C"],
    "W": ["A", "T"],
    "K": ["G", "T"],
    "M": ["A", "C"],
    "B": ["C", "G", "T"],
    "D": ["A", "G", "T"],
    "H": ["A", "C", "T"],
    "V": ["A", "C", "G"],
    "N": ["A", "C", "G", "T"]  # Any base
}

ligation_junctions_dict = {
    'TA': 'preferred', 'GA': 'preferred', 'AG': 'preferred',
    'TT': 'neutral', 'CT': 'neutral', 'CA': 'neutral',
    'TC': 'neutral', 'AC': 'neutral', 'CC': 'neutral',
    'TG': 'neutral', 'AA': 'neutral',
    'CG': 'non-preferred', 'GT': 'non-preferred',
    'GG': 'non-preferred', 'GC': 'non-preferred'
}

def evaluate_ligation_junction(targets, iupac_mismatches=None, plp_length=30):
    """
    Evaluates the ligation junction of a probe and introduces mismatches if needed.

    Args:
        probe_seq (str): The probe sequence.
        iupac_mismatches (list of tuples): List of positions and IUPAC codes to introduce mismatches.
                                            Example: [(5, 'R'), (10, 'Y')]
        plp_length (int): Probe length (default: 30).

    Returns:
        tuple: (updated probe sequence, ligation junction category)
    """
    # Extract the ligation junction (2 bases around the center of the probe)
    junction_position = int((plp_length / 2) - 1)
    for idx in targets.index:
        #print(f"Index: {idx}, Row Data: {targets.loc[idx]}")
        probe_seq = targets.loc[idx]['Sequence']
        ligation_junction = probe_seq[junction_position] + probe_seq[junction_position + 2]

        # Determine the ligation status
        ligation_status = ligation_junctions_dict.get(ligation_junction, "non-preferred")
        targets.loc[idx]['ligation_status'] = ligation_status

        # Apply IUPAC mismatches if provided
        num_mismatches = len(iupac_mismatches)
        if iupac_mismatches:
            for r in range(1, num_mismatches + 1):
                for subset in itertools.combinations(range(num_mismatches), r):
                    selected_mismatches = [iupac_mismatches[i] for i in subset]
                    replacement_options = [IUPAC_CODES[symbol] for pos, symbol in selected_mismatches]
                    # Generate all possible combinations of replacements
                    for replacement in itertools.product(*replacement_options):
                        new_probe_seq = list(probe_seq)
                        new_id_suffix = []
                        for (pos, iupac_symbol), new_base in zip(selected_mismatches, replacement):
                            new_probe_seq[pos] = new_base
                            new_id_suffix.append(f"{pos}_{iupac_symbol}_{new_base}")
                        new_probe_seq = "".join(new_probe_seq)
                        new_probe_id = f"{idx}|{'_'.join(new_id_suffix)}"
                        # Revaluate the ligation junction
                        new_ligation_junction = new_probe_seq[junction_position] + new_probe_seq[junction_position + 2]
                        new_ligation_status = ligation_junctions_dict.get(new_ligation_junction, "non-preferred")
                        new_row = targets.loc[idx].copy()
                        new_row['Sequence'] = new_probe_seq
                        new_row['Ligation junction'] = new_ligation_status
                        targets.loc[new_probe_id] = new_row
            
    
    return targets


In [60]:
prob_seq = 'AGAGGAGGAGAGCAAAGAGGCCAGTGCTCT'
iupac_mismatches = [(5, 'R'), (10, 'Y')]
plp_length = 30
probeSeq, ligation_status = evaluate_ligation_junction(prob_seq, iupac_mismatches, plp_length)
print(probeSeq, ligation_status)


TypeError: 'builtin_function_or_method' object is not iterable

In [61]:
import os
import pandas as pd
junction_position = int((plp_length / 2) - 1)

targets = pd.read_csv('../targets.txt', sep='\t', index_col=0)
for idx in targets.index:
    #print(f"Index: {idx}, Row Data: {targets.loc[idx]}")
    probe_seq = targets.loc[idx]['Sequence']
    ligation_junction = probe_seq[junction_position] + probe_seq[junction_position + 2]

    # Determine the ligation status
    ligation_status = ligation_junctions_dict.get(ligation_junction, "non-preferred")
    targets.loc[idx]['ligation_status'] = ligation_status
    #print(f"Ligation junction: {ligation_junction} ({ligation_status})")
    # Apply IUPAC mismatches if provided
    if iupac_mismatches:
        probe_seq = list(probe_seq)  # Convert to list for mutability
        for pos, iupac_symbol in iupac_mismatches:
            if 0 <= pos < len(probe_seq) and iupac_symbol in IUPAC_CODES:
                for base in IUPAC_CODES[iupac_symbol]:
                    new_probe_seq = probe_seq.copy()
                    new_probe_seq[pos] = base
                    new_probe_seq = "".join(new_probe_seq)  # Convert back to string
                    new_probe_id = f"{idx}|{pos}_{iupac_symbol}_{base}"
                    new_row = targets.loc[idx].copy()
                    new_row['Sequence'] = new_probe_seq
                    new_row['Ligation junction'] = ligation_junctions_dict.get(
                        new_probe_seq[junction_position] + new_probe_seq[junction_position + 2], "non-preferred"
                    )
                    targets.loc[new_probe_id] = new_row


targets

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets.loc[idx]['ligation_status'] = ligation_status
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets.loc[idx]['ligation_status'] = ligation_status
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets.loc[idx]['ligation_status'] = ligation_status
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target

Unnamed: 0_level_0,Gene,Region,Sequence,GC,Coverage,Ligation junction
Probe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Prlh|90880853-90880882,Prlh,1:90880853-90880882,GTGCTTGCTGCTGCTAGGCTTAGTCCTCCC,60.000000,1.0,neutral
Prlh|90880854-90880883,Prlh,1:90880854-90880883,TGCTTGCTGCTGCTAGGCTTAGTCCTCCCA,56.666667,1.0,preferred
Prlh|90880855-90880884,Prlh,1:90880855-90880884,GCTTGCTGCTGCTAGGCTTAGTCCTCCCAG,60.000000,1.0,non-preferred
Prlh|90880856-90880885,Prlh,1:90880856-90880885,CTTGCTGCTGCTAGGCTTAGTCCTCCCAGG,60.000000,1.0,non-preferred
Prlh|90880857-90880886,Prlh,1:90880857-90880886,TTGCTGCTGCTAGGCTTAGTCCTCCCAGGA,56.666667,1.0,neutral
...,...,...,...,...,...,...
Grik2|49659291-49659320|10_Y_T,Grik2,10:49659291-49659320,TAAAGTCCTGTTCTGCTTGTTGTGGATCGG,50.000000,6.0,non-preferred
Grik2|49659292-49659321|5_R_A,Grik2,10:49659292-49659321,AAAGTACTGCTCTGCTTGTTGTGGATCGGA,50.000000,6.0,neutral
Grik2|49659292-49659321|5_R_G,Grik2,10:49659292-49659321,AAAGTGCTGCTCTGCTTGTTGTGGATCGGA,50.000000,6.0,neutral
Grik2|49659292-49659321|10_Y_C,Grik2,10:49659292-49659321,AAAGTCCTGCCCTGCTTGTTGTGGATCGGA,50.000000,6.0,neutral


In [59]:
import itertools

num_mismatches = len(iupac_mismatches)
for subset in itertools.combinations(range(num_mismatches), 2):
    selected_mismatches = [iupac_mismatches[i] for i in subset]
    replacement_options = [IUPAC_CODES[symbol] for pos, symbol in selected_mismatches]
    for replacement in itertools.product(*replacement_options):
        print(replacement)
    #print(replacement_options)
    #for pos, symbol in selected_mismatches:
        #print(pos, symbol)
        #print(IUPAC_CODES[symbol])



('A', 'C')
('A', 'T')
('G', 'C')
('G', 'T')


In [None]:
for replacement in itertools.product(*replacement_options):
    print(replacement)
    GTGCTTGCTGCTGCTAGGCTTAGTCCTCCC

('A', 'C')
('A', 'T')
('G', 'C')
('G', 'T')
