In [None]:
import os
from tqdm import tqdm
from typing import List

import numpy as np
import pandas as pd

from Bio import pairwise2
from Bio.Align import substitution_matrices

In [None]:
# TODO: this notebook as-is does not make any distinction between true negatives
# and false negatives – you will want to add some logic to the
# `neg_samples = greedy_sample(...)` line of the dataset-building cell (cell 9)
# if you want to consider true negatives only!

In [None]:
# Pin NumPy's global RNG so that the random draws made later in the notebook
# come out the same on every Restart & Run All.
SEED = 1047
np.random.seed(SEED)

In [None]:
# Peptides to keep from the binding dataset; every later cell iterates over
# this list, and one output dataset is produced per entry.
peptides = ["FLCMKALLL", "LLWNGPMAV"]

In [None]:
# Load the binding dataset, keep only the peptides of interest and add a
# `cdr_full` column holding the CDR3a string followed by the CDR3b string.
# Built as a single chain so the new column is added to a fresh frame rather
# than assigned onto a filtered slice (which triggers SettingWithCopyWarning).
df = (
    pd.read_csv("STEGG_controler/stegg_binding_dataset.csv")
    .loc[lambda frame: frame.peptide.isin(peptides)]
    .assign(cdr_full=lambda frame: frame.CDR3a + frame.CDR3b)
)
df

In [None]:
# Show, per peptide, how many rows carry each label value.
for peptide in peptides:
    label_counts = df.loc[df.peptide == peptide, "label"].value_counts()
    print(peptide)
    print(label_counts)

In [None]:
_BLOSUM62 = None  # filled in by _blosum62() the first time it is needed


def _blosum62():
    """Return the BLOSUM62 matrix, loading it only once per kernel session."""
    global _BLOSUM62
    if _BLOSUM62 is None:
        _BLOSUM62 = substitution_matrices.load("BLOSUM62")
    return _BLOSUM62


def blosum_distance(s1: str, s2: str) -> float:
    """Distance between two sequences as the inverse of their alignment score.

    The score comes from ``pairwise2.align.globaldx`` (global alignment,
    BLOSUM62 match scores, no gap penalties) run in ``score_only`` mode.

    Args:
        s1: First sequence.
        s2: Second sequence.

    Returns:
        ``1 / score`` for a positive score, so more similar sequences are
        closer. ``float("inf")`` when either sequence is empty or the score
        is not positive, i.e. the pair is treated as maximally distant
        instead of raising or producing a negative distance.
    """
    # An empty sequence has nothing to align; do not hand it to the aligner.
    if not s1 or not s2:
        return float("inf")

    score = pairwise2.align.globaldx(
        s1, s2, _blosum62(), one_alignment_only=True, score_only=True
    )

    # Guard the division: a zero score would raise ZeroDivisionError and a
    # negative one would rank the pair as "closer" than any real match.
    if score <= 0:
        return float("inf")

    return 1 / score

In [None]:
def greedy_sample(seqs: List[str], k: int) -> List[str]:
    """Greedily pick up to ``k`` dissimilar sequences from ``seqs``.

    The first pick is drawn at random with ``np.random``. Every following
    step adds the not-yet-chosen sequence with the largest
    ``blosum_distance`` to the most recently chosen one (the comparison is
    against the latest pick only, not against the whole chosen set).

    Args:
        seqs: Candidate sequences. Entries are tracked by position, so
            duplicated strings count as separate candidates.
        k: Number of sequences to return.

    Returns:
        ``min(k, len(seqs))`` sequences, listed in the order they appear in
        ``seqs``. An empty list when ``seqs`` is empty or ``k <= 0``.
    """
    n = len(seqs)
    n_target = min(k, n)
    if n_target <= 0:
        return []

    chosen = np.zeros(n, dtype=bool)
    curr_i = np.random.randint(0, n)
    chosen[curr_i] = True

    # The random starting pick already counts towards k, so only
    # n_target - 1 greedy steps remain. Capping at n also guarantees that
    # every step still has at least one unchosen candidate to select.
    for _ in tqdm(range(n_target - 1)):
        # -inf keeps already-chosen entries from ever winning the argmax,
        # whatever values the distance function returns.
        scores = np.array([
            -np.inf if chosen[i] else blosum_distance(seqs[curr_i], seqs[i])
            for i in range(n)
        ])
        curr_i = int(np.argmax(scores))
        chosen[curr_i] = True

    return [seq for seq, picked in zip(seqs, chosen) if picked]

In [None]:
# Number of sequences requested from greedy_sample for each label.
N_PER_CLASS = 120

# peptide -> sub-sampled DataFrame for that peptide
datasets = {}

for peptide in peptides:
    pep_df = df[df.peptide == peptide]
    is_pos = pep_df.label == 1
    is_neg = pep_df.label == 0

    pos_samples = greedy_sample(pep_df[is_pos].cdr_full.tolist(), N_PER_CLASS)
    neg_samples = greedy_sample(pep_df[is_neg].cdr_full.tolist(), N_PER_CLASS)

    # Match each sampled sequence only against rows of the label it was
    # sampled from; a plain isin() over both lists would also pull in rows
    # of the other label that happen to share the same cdr_full string.
    keep = (is_pos & pep_df.cdr_full.isin(pos_samples)) | (
        is_neg & pep_df.cdr_full.isin(neg_samples)
    )
    datasets[peptide] = pep_df[keep]
    print(datasets[peptide].label.value_counts())


In [None]:
# Write one CSV per peptide. The output folder is created first because
# DataFrame.to_csv does not create missing directories, so a fresh checkout
# would otherwise fail on the first write.
out_dir = "peptide_datasets/"
os.makedirs(out_dir, exist_ok=True)

for peptide in peptides:
    datasets[peptide].to_csv(os.path.join(out_dir, f"{peptide}.csv"))