In [None]:
import random
import re
import sys

import pandas as pd
from mhcflurry import Class1AffinityPredictor
from tqdm import tqdm

In [None]:
# MHCflurry class I affinity predictor, loaded without an explicit models
# directory; get_binding_affinity() reads this module-level object.
predictor = Class1AffinityPredictor.load()

In [None]:
# Table of synthetic results; the distinct MHC molecules and TCR sequences
# used by the sampler are taken from its columns.
synth = pd.read_csv('results/synthetic_results.csv')

# Fail early, with a readable message, if the columns read downstream are absent.
assert {'mhc_molecule', 'tcr_sequence'} <= set(synth.columns), \
    "results/synthetic_results.csv must contain 'mhc_molecule' and 'tcr_sequence' columns"

In [None]:
# Distinct values of the `mhc_molecule` and `tcr_sequence` columns of `synth`,
# as plain Python lists.
mhc_molecule_list = synth['mhc_molecule'].unique().tolist()
tcr_sequence_list = synth['tcr_sequence'].unique().tolist()

In [None]:
def tcr_reward(curr_seq, tcr_sequences):
    """Score one peptide against each TCR sequence.

    Args:
        curr_seq: Peptide sequence to score.
        tcr_sequences: Iterable of TCR sequences.

    Returns:
        A list with one entry per element of ``tcr_sequences`` (same order):
        the first item of ``tcrgp.predict([curr_seq], [tcr_sequence])``.

    NOTE(review): ``tcrgp`` is not defined in the visible part of the notebook;
    it is assumed to be a module-level model exposing
    ``predict(peptides, tcrs)`` -- confirm.
    """
    rewards = []
    for tcr_sequence in tcr_sequences:
        # Pair the peptide with this single TCR; passing the whole list here
        # would make every entry of the result identical.
        tcr_predict = tcrgp.predict([curr_seq], [tcr_sequence])
        rewards.append(tcr_predict[0])

    return rewards

def classification_reward(curr_seq):
    """Return the class-1 probability of a peptide as a one-element list.

    The peptide is wrapped in a single-item batch and handed to
    ``classifier.predict_proba``; column 1 of every returned row is collected.

    NOTE(review): ``classifier`` is not defined in the visible part of the
    notebook; it is assumed to be a module-level model -- confirm.
    """
    batch_probs = classifier.predict_proba([curr_seq])
    positive_probs = []
    for row in batch_probs:
        positive_probs.append(row[1])
    return positive_probs


# Load valid amino acids
def load_valid_aas(dictname):
    """Read the amino-acid alphabet from a whitespace-delimited text file.

    Args:
        dictname: Path of the dictionary file. The first whitespace-separated
            token of each line is taken as one symbol; any further tokens on
            the line are ignored.

    Returns:
        List of symbols in file order. Blank or whitespace-only lines are
        skipped so that no empty-string symbol enters the alphabet.

    Raises:
        OSError: If the file cannot be opened.
    """
    aas = []
    # `with` guarantees the handle is closed even if reading fails midway.
    with open(dictname, 'r') as fh:
        for line in fh:
            fields = line.split()
            if fields:
                aas.append(fields[0])
    return aas


# Eliminate empty characters from a sequence
def eliminate_empty(seq):
    """Return ``seq`` with every '0' placeholder character removed."""
    return ''.join(seq.split('0'))

# Randomly select an amino acid
def random_aa(aas):
    """Pick one element of ``aas`` uniformly at random."""
    position = random.randrange(len(aas))
    return aas[position]

# Generate a sequence within a length range
def generate_seq(lmin, lmax, aas):
    """Build a random sequence whose length is at least a random target.

    The target length is drawn uniformly from ``[lmin, lmax]`` (inclusive).
    Symbols are drawn with ``random_aa`` and passed through
    ``eliminate_empty`` until the accumulated length reaches the target.
    """
    target_len = random.randint(lmin, lmax)
    pieces = []
    built_len = 0
    while built_len < target_len:
        residue = eliminate_empty(random_aa(aas))
        pieces.append(residue)
        built_len += len(residue)
    return ''.join(pieces)

# Mutate a sequence: replace one position, or append one amino acid at the end; '0' placeholders are then removed
def alter_seq(seq, aas):
    """Return a mutated copy of ``seq``.

    A position is drawn uniformly from ``0..len(seq)`` (inclusive) and a
    symbol is drawn with ``random_aa``. A position inside the sequence is
    overwritten with the symbol; the one-past-the-end position appends it.
    The result is passed through ``eliminate_empty``, so drawing the '0'
    placeholder removes the chosen position instead of filling it.
    """
    position = random.randint(0, len(seq))
    replacement = random_aa(aas)
    if position < len(seq):
        mutated = seq[:position] + replacement + seq[position + 1:]
    else:
        mutated = seq + replacement
    return eliminate_empty(mutated)

# Calculate MHC binding affinity
def get_binding_affinity(peptide_sequence, mhc_proteins, max_ic50=50000):
    """Convert predicted peptide-MHC IC50 values into rewards.

    Args:
        peptide_sequence: Peptide to score.
        mhc_proteins: Iterable of MHC allele names; one prediction is
            requested per allele from the module-level ``predictor``.
        max_ic50: IC50 value that maps to a reward of 0. Defaults to 50000.

    Returns:
        A list with one reward per allele (same order), computed as
        ``1.0 - predicted_ic50 / max_ic50``. Lower IC50 gives a higher
        reward. The value is not clamped: it lies in [0, 1] only while the
        prediction lies in ``[0, max_ic50]``.
    """
    rewards = []
    for mhc_protein in mhc_proteins:
        # One (peptide, allele) pair per call; take the single prediction.
        predicted_ic50 = predictor.predict([peptide_sequence], [mhc_protein])[0]
        rewards.append(1.0 - (predicted_ic50 / max_ic50))

    return rewards

# Update selected sequences
def update_selected(selected, threshold_mhc, threshold_tcr, seq, mhc, rew_mhc, rew_tcr, rew_class, tcr_sequence):
    """Record ``seq`` in ``selected`` when it clears every acceptance test.

    The sequence is stored only if ``rew_mhc`` and ``rew_tcr`` are strictly
    above their thresholds, ``rew_class`` equals 1, and ``seq`` is not already
    a key of ``selected``. On success the dict is mutated in place and
    ' selected' is printed without a newline. The same dict object is always
    returned.
    """
    clears_thresholds = rew_mhc > threshold_mhc and rew_tcr > threshold_tcr and rew_class == 1
    if not clears_thresholds:
        return selected
    if seq in selected:
        return selected

    record = {
        'MHC': mhc,
        'Rew_MHC': rew_mhc,
        'TCR_Seq': tcr_sequence,
        'Rew_TCR': rew_tcr,
        'Rew_Class': rew_class,
    }
    selected[seq] = record
    print(' selected', end='')
    return selected

In [None]:
import numpy as np

def needleman_wunsch_normalized(seq1, seq2, match=2, mismatch=1, gap=0):
    """Global-alignment (Needleman-Wunsch) score of two sequences, normalized.

    Args:
        seq1, seq2: Sequences to align (any indexable with ``len``).
        match: Score added for an aligned pair of equal symbols.
        mismatch: Score added for an aligned pair of different symbols.
        gap: Score added for each gap position.

    Returns:
        The optimal alignment score divided by ``max(len(seq1), len(seq2)) *
        match``, as a float. Two empty sequences return 1.0 (they are
        identical) instead of dividing zero by zero. With the default
        scores the result lies in [0, 1]; equal-length sequences sharing no
        symbol score 0.5, because every mismatch still earns ``mismatch``.
    """
    len1, len2 = len(seq1), len(seq2)
    if len1 == 0 and len2 == 0:
        return 1.0

    # Upper bound used for normalization: every position of the longer
    # sequence aligned as a match.
    max_score = max(len1, len2) * match

    # scores[i, j] = best score aligning seq1[:i] with seq2[:j].
    scores = np.zeros((len1 + 1, len2 + 1))

    # First row/column: a prefix aligned entirely against gaps.
    for i in range(len1 + 1):
        scores[i, 0] = i * gap
    for j in range(len2 + 1):
        scores[0, j] = j * gap

    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            pair_score = match if seq1[i - 1] == seq2[j - 1] else mismatch
            scores[i, j] = max(
                scores[i - 1, j - 1] + pair_score,  # align the two symbols
                scores[i - 1, j] + gap,             # gap in seq2
                scores[i, j - 1] + gap,             # gap in seq1
            )

    return float(scores[len1, len2] / max_score)

# Check whether a new sequence is too similar to any already-selected sequence (normalized Needleman-Wunsch score)
def is_sequence_similar_to_selected(new_seq, selected_seq, similarity_threshold):
    """Tell whether ``new_seq`` is close to any sequence in ``selected_seq``.

    Returns True as soon as one member of ``selected_seq`` has a
    ``needleman_wunsch_normalized`` score against ``new_seq`` that is greater
    than or equal to ``similarity_threshold``; False if none does (including
    when ``selected_seq`` is empty).
    """
    return any(
        needleman_wunsch_normalized(new_seq, candidate) >= similarity_threshold
        for candidate in selected_seq
    )


# --- Sampler settings ---------------------------------------------------------
lmin = 8                      # Shortest peptide length kept
lmax = 12                     # Longest peptide length kept
thr_rew_mhc = 0.82  # Threshold for binding affinity
thr_rew_tcr = 0.53  # Threshold for TCR reward
similarity_threshold = 0.7    # Peptides scoring >= this against a selected one are not selected
target_requested = 100        # Number of peptides we would like to collect
max_per_molecule = 5          # Selection cap per MHC molecule and per TCR sequence
class_prob_cutoff = 0.5       # Minimum class-1 probability for a proposal
acceptance_slack = 0.35       # Scale of the random tolerance in the acceptance test

# Work on copies so the lists derived from `synth` are never mutated here.
mhc_proteins = list(mhc_molecule_list)
tcr_sequences = list(tcr_sequence_list)
# Forward slash works on both Windows and POSIX (a backslash does not).
aas = load_valid_aas('generators/Dictionary')

# Selected peptides (sequence -> details) and one full reward row per selection.
selected_seq = {}
output_data = []

# How many selected peptides were assigned to each MHC molecule / TCR sequence.
mhc_counts = {mhc: 0 for mhc in mhc_proteins}
tcr_seq_counts = {tcr: 0 for tcr in tcr_sequences}

# Every selection consumes one slot of one MHC molecule and one TCR sequence,
# so the caps bound the number of selections; asking for more could never finish.
target_selected = min(
    target_requested,
    max_per_molecule * len(mhc_proteins),
    max_per_molecule * len(tcr_sequences),
)

pbar = tqdm(total=target_selected, desc="Processing sequences")

# Starting state of the chain and its rewards.
curr_seq = generate_seq(lmin, lmax, aas)
curr_rew_mhc_values = get_binding_affinity(curr_seq, mhc_proteins)
curr_rew_tcr = tcr_reward(curr_seq, tcr_sequences)
# classification_reward returns a one-element list; keep the scalar probability.
curr_rew_class = classification_reward(curr_seq)[0]

# NOTE(review): there is no iteration cap; if the thresholds are never met, or
# some molecules are never the best match, this loop does not terminate.
while len(selected_seq) < target_selected:
    new_seq = alter_seq(curr_seq, aas)

    # Skip no-op mutations and proposals outside the allowed length range.
    if new_seq == curr_seq or not (lmin <= len(new_seq) <= lmax):
        continue

    # Only proposals predicted as class 1 are considered at all.
    new_rew_class = classification_reward(new_seq)[0]
    if not new_rew_class > class_prob_cutoff:
        continue

    new_rew_mhc_values = get_binding_affinity(new_seq, mhc_proteins)
    new_rew_tcr_values = tcr_reward(new_seq, tcr_sequences)

    # Best-scoring MHC molecule and TCR sequence for the proposal.
    max_mhc_idx = new_rew_mhc_values.index(max(new_rew_mhc_values))
    max_tcr_idx = new_rew_tcr_values.index(max(new_rew_tcr_values))

    # Acceptance test: compare proposal and current state on the proposal's
    # best pair, tolerating a random drop of up to `acceptance_slack`.
    new_product = new_rew_mhc_values[max_mhc_idx] * new_rew_tcr_values[max_tcr_idx]
    curr_product = curr_rew_mhc_values[max_mhc_idx] * curr_rew_tcr[max_tcr_idx]
    if not new_product > curr_product - acceptance_slack * random.random():
        continue

    # Accepted: the chain moves to the proposal. This happens before the
    # selection filters below; a single mutation of a selected peptide is
    # always "similar" to it, so the chain could otherwise never leave it.
    curr_seq = new_seq
    curr_rew_mhc_values = new_rew_mhc_values
    curr_rew_tcr = new_rew_tcr_values
    curr_rew_class = new_rew_class

    selected_mhc = mhc_proteins[max_mhc_idx]
    selected_tcr = tcr_sequences[max_tcr_idx]

    # Selection filter 1: neither molecule may exceed its cap.
    if mhc_counts[selected_mhc] >= max_per_molecule or tcr_seq_counts[selected_tcr] >= max_per_molecule:
        continue

    # Selection filter 2: must differ enough from everything already selected.
    if is_sequence_similar_to_selected(new_seq, selected_seq.keys(), similarity_threshold):
        continue

    # Selection filter 3: reward thresholds, checked inside update_selected.
    # It expects a class label (== 1), not a probability; the proposal has
    # already passed the probability cutoff, so the label below is 1.
    n_before = len(selected_seq)
    selected_seq = update_selected(
        selected_seq, thr_rew_mhc, thr_rew_tcr, new_seq, selected_mhc,
        new_rew_mhc_values[max_mhc_idx], new_rew_tcr_values[max_tcr_idx],
        int(new_rew_class > class_prob_cutoff), selected_tcr,
    )
    if len(selected_seq) == n_before:
        continue  # thresholds not met; nothing was recorded

    # Bookkeeping happens only for peptides that were actually recorded.
    mhc_counts[selected_mhc] += 1
    tcr_seq_counts[selected_tcr] += 1
    output_data.append({
        "Sequence": new_seq,
        **{f"MHC_{i+1}": new_rew_mhc_values[i] for i in range(len(mhc_proteins))},
        **{f"TCR_{i+1}": new_rew_tcr_values[i] for i in range(len(tcr_sequences))},
        "Classification": new_rew_class,
    })
    pbar.update(1)

pbar.close()

In [None]:
# Display the selected peptides (sequence -> stored rewards and assigned molecules).
selected_seq 

In [None]:
# Display the per-MHC-molecule counters maintained by the sampling loop.
mhc_counts

In [None]:
# Display the per-TCR-sequence counters maintained by the sampling loop.
tcr_seq_counts

In [None]:
# Calculate the product of Rew_MHC, Rew_TCR, and Rew_Class for each entry.
# update_selected() stores the classification value under the key 'Rew_Class'.
for key, value in selected_seq.items():
    value['Product'] = value['Rew_MHC'] * value['Rew_TCR'] * value['Rew_Class']

# Sort the entries by the product value in descending order and get the top 5
top_5_entries = sorted(selected_seq.items(), key=lambda x: x[1]['Product'], reverse=True)[:5]

# Print top 5 entries
for key, value in top_5_entries:
    print(f"Peptide: {key}, MHC: {value['MHC']}, TCR_Seq: {value['TCR_Seq']}, "
          f"Rew_MHC: {value['Rew_MHC']}, Rew_TCR: {value['Rew_TCR']}, "
          f"Classification: {value['Rew_Class']}, Product: {value['Product']}")