In [1]:
import os
from tqdm import tqdm
from typing import List

import numpy as np
import pandas as pd

from Bio import pairwise2
from Bio.Align import substitution_matrices



In [2]:
# TODO: this notebook as-is does not make any distinction between true negatives
# and false negatives â€“ will want to add some logic to cell 9, line 6 if you
# want to consider true negatives only!

In [3]:
np.random.seed(1047)

In [4]:
peptides = [
    "GILGFVFTL",
    "ELAGIGILTV",
    "AVFDRKSDAK",
    "KLGGALQAK",
    "IVTDFSVIK",
    "NLVPMVATV",
    "RAKFKQLL",
    "GLCTLVAML"
]

In [5]:
df = pd.read_csv("stag_llm_dataset.csv")
df = df[df.peptide.isin(peptides)]
df["cdr_full"] = df.CDR3a + df.CDR3b
df.head()

Unnamed: 0,CDR3a,CDR3b,MHC Sequence,peptide,TCR_A_sequence,TCR_B_sequence,label,split,cdr_full
435,CAVNAPTGNQFYF,CATASGRLRTQYF,GSHSMRYFYTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,AVFDRKSDAK,AQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPGQHLQL...,DADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLRLI...,0,train,CAVNAPTGNQFYFCATASGRLRTQYF
436,CAVNSGYSTLTF,CATASGTVTDTQYF,GSHSMRYFYTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,AVFDRKSDAK,AQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPGQHLQL...,DADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLRLI...,1,train,CAVNSGYSTLTFCATASGTVTDTQYF
437,CAEYGGATNKLIF,CATGLAGNEQFF,GSHSMRYFYTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,AVFDRKSDAK,GEDVEQSLFLSVREGDSSVINCTYTDSSSTYLYWYKQEPGAGLQLL...,DADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLRLI...,0,test,CAEYGGATNKLIFCATGLAGNEQFF
438,CALEYGGSQGNLIF,CATGTDTQYF,GSHSMRYFYTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,AVFDRKSDAK,SQKIEQNSEALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVF...,DADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLRLI...,0,train,CALEYGGSQGNLIFCATGTDTQYF
439,CARETSGSRLTF,CATPRWLAKNIQYF,GSHSMRYFYTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,AVFDRKSDAK,AQTVTQSQPEMSVQEAETVTLSCTYDTSESDYYLFWYKQPPSRQMI...,DADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLRLI...,1,train,CARETSGSRLTFCATPRWLAKNIQYF


In [6]:
df10x = pd.read_csv('10x(random_neg).csv')
df10x["cdr_full"] = df10x.CDR3a + df10x.CDR3b

In [7]:

# Load the datasets
df = pd.read_csv('stag_llm_dataset.csv')
df10x = pd.read_csv('10x(random_neg).csv')

# Get the number of rows before dropping
rows_before = len(df)

# Create a set of unique (CDR3a, CDR3b) tuples from the 10x(random_neg) file
neg_cdr3_set = set(df10x[['CDR3a', 'CDR3b']].apply(tuple, axis=1))

# Combine CDR3a and CDR3b in df_stag into a tuple
stag_cdr3_tuples = df[['CDR3a', 'CDR3b']].apply(tuple, axis=1)

# Define the condition to DROP: label == 0 AND the CDR3 pair is NOT in the negative set
# The '~' operator negates the boolean series from .isin(), selecting combinations *not* found in the negative set.
drop_mask = (df['label'] == 0) & (~stag_cdr3_tuples.isin(neg_cdr3_set))

# Filter df_stag to KEEP rows that do not match the drop_mask
df = df[~drop_mask].copy()

# Get the number of rows after dropping
rows_after = len(df)

# Print the before and after counts
print(f"Number of rows in stag_llm_dataset before filtering: {rows_before}")
print(f"Number of rows in stag_llm_dataset after filtering: {rows_after}")

Number of rows in stag_llm_dataset before filtering: 46201
Number of rows in stag_llm_dataset after filtering: 14079


In [8]:
for peptide in peptides:
    print(peptide)
    pep_df = df[df.peptide == peptide]
    print(pep_df.label.value_counts())

GILGFVFTL
label
0    1548
1    1490
Name: count, dtype: int64
ELAGIGILTV
label
1    407
0    355
Name: count, dtype: int64
AVFDRKSDAK
label
1    796
0    699
Name: count, dtype: int64
KLGGALQAK
label
1    1067
0     873
Name: count, dtype: int64
IVTDFSVIK
label
1    378
0    361
Name: count, dtype: int64
NLVPMVATV
label
1    277
0    227
Name: count, dtype: int64
RAKFKQLL
label
1    1159
0     978
Name: count, dtype: int64
GLCTLVAML
label
0    235
1    210
Name: count, dtype: int64


In [9]:
def blosum_distance(s1: str, s2: str) -> float:

    score = pairwise2.align.globaldx(
        s1, s2, substitution_matrices.load("BLOSUM62"), one_alignment_only=True,
        score_only=True
    )

    return 1 / score

In [10]:
def greedy_sample(seqs: List[str], k: int) -> List[str]:

    n = len(seqs)
    chosen = np.zeros(n)
    curr_i = np.random.randint(0, n)
    chosen[curr_i] = 1

    for _ in tqdm(range(k)):
        scores = np.array([blosum_distance(seqs[curr_i], seqs[i]) if chosen[i] == 0 else 0.0 for i in range(n)])
        curr_i = np.argmax(scores)
        # print(f"chose {seqs[curr_i]}")
        chosen[curr_i] = 1

    return [seqs[i] for i in range(n) if chosen[i] == 1]

In [11]:
datasets = {peptide: None for peptide in peptides}

for peptide in peptides:
    pep_df = df[df.peptide == peptide]
    pos_samples = greedy_sample(pep_df[pep_df.label == 1].cdr_full.tolist(), 120)
    neg_samples = greedy_sample(pep_df[pep_df.label == 0].cdr_full.tolist(), 120)
    datasets[peptide] = pep_df[pep_df.cdr_full.isin(pos_samples + neg_samples)]
    print(datasets[peptide].label.value_counts())


AttributeError: 'DataFrame' object has no attribute 'cdr_full'

In [None]:
for peptide in peptides:
    datasets[peptide].to_csv(os.path.join("peptide_datasets/", f"{peptide}.csv"))