# Dataset 1 

This is the main dataset used for training and testing the models.

It is developed by mapping pHLA structures modeled with APE-Gen (Abella et al., 2020) to the experimental binding affinities curated by MHCFlurry 2.0.
The train/test split is made by extracting 10% of binders and 10% of nonbinders into the test portion, while the rest constitutes the training set. For 5-fold cross-validation, the training set is split into 5 folds with an equal distribution of binding affinities.

We follow these steps:

1. Map modeled Abella2020 structures to their MHCFlurry2.0 binding affinities (Step 1)
2. Filter the dataset to the alleles for which sufficient data is available (end of Step 1)
3. Split the dataset into train/test portions (Step 2)
4. Make the cross-validation splits (Step 3)

## Step 1 - Map modeled Abella2020 structures to their MHCFlurry2.0 binding affinities

**input:** Abella2020 and MHCFlurry2.0 datasets

**output:** full_dataset.csv 

In [5]:
import pandas as pd
import plotly.express as px
from ipywidgets import Output, VBox
from IPython.display import display, clear_output
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import logomaker
from utils import *
import math
#%matplotlib inline


data_path = "./source_data/"

In [6]:
#read in the MHCFlurry dataset
mhcf2_path = data_path + "MHCFlurry_Data_S3.csv"
mhcf_df = pd.read_csv(mhcf2_path)

#read in Abella2020 datasets
struct_path = data_path + "Abella2020_data.csv"
struct_df = pd.read_csv(struct_path)

#format MHCFlurry dataset: keep only rows whose allele name contains "HLA"
mhcf_hla_df = mhcf_df[mhcf_df["allele"].str.contains("HLA")].reset_index(drop=True)
#keep the original allele name, then shorten "allele" by concatenating characters 4, 6-7 and 9+
#(e.g. "HLA-A*02:01" -> "A0201") so it can be used as a merge key against struct_df
mhcf_hla_df["allele_full"] = mhcf_hla_df["allele"]
mhcf_hla_df["allele"] = mhcf_hla_df["allele_full"].apply(lambda name: name[4]+name[6:8]+name[9:])

#merge the datasets: keep only (allele, peptide) pairs present in both sources
data_merged = pd.merge(struct_df, mhcf_hla_df, how='inner', on=['allele', 'peptide'])

data_merged = data_merged[["allele", "peptide", "binder", "allele_type", "path", "measurement_inequality",
                          "measurement_value", "measurement_type"]]

#add  Abella2020 C decoys
Cd_path = data_path + "Abella2020_C_decoys.csv"
Cd_df = pd.read_csv(Cd_path)
#DataFrame.append was removed in pandas 2.0; pd.concat with default arguments is the
#equivalent call (original row labels kept, columns not sorted)
data_merged = pd.concat([data_merged, Cd_df])

#remove duplicated entries
#NOTE(review): drop_duplicates is a helper defined outside this notebook (presumably utils), not the pandas method
data_merged = drop_duplicates(data_merged)

#filter only the data with sufficient number of binders and nonbinders
data_merged = extract_sufficient_data(data_merged)

In [7]:
#transform the binding affinity labels
def nM_transform(val, max_nM=50000):
    """Map a binding affinity in nM to a score in [0, 1]; higher means stronger binding.

    The score is ``1 - log(val) / log(max_nM)``, clamped to the unit interval:
    affinities at or above ``max_nM`` give 0.0 and affinities at or below 1 nM
    give 1.0 (without the clamp, sub-nanomolar values would exceed 1).

    Parameters
    ----------
    val : float
        Measured affinity in nM. NaN is passed through unchanged.
    max_nM : float, optional
        Affinity that maps to a score of 0.0 (default 50000).

    Returns
    -------
    float
        Transformed affinity in [0, 1], or NaN for NaN input.

    Raises
    ------
    ValueError
        If ``val`` is zero or negative (the logarithm is undefined there).
    """
    if val != val:  # NaN is the only value not equal to itself; keep missing values missing
        return val
    if val <= 0:
        raise ValueError("binding affinity must be positive, got " + repr(val))
    if val >= max_nM:
        return 0.0
    if val <= 1:
        return 1.0
    return 1 - math.log(val)/math.log(max_nM)

#derive the normalised affinity label "ba" from the raw nM measurements, one value at a time
data_merged["ba"] = data_merged["measurement_value"].map(nM_transform)

In [8]:

#visualize the data
#x: number of rows per (allele_type, allele); it is not referenced again in this cell
x = pd.DataFrame({'count_all' : data_merged.groupby(["allele_type", "allele"]).size()}).reset_index()
#y: per (allele_type, allele, binder) non-null counts of the remaining columns;
#unstack(fill_value=0).stack() re-inserts missing binder classes with a count of 0
y = pd.DataFrame(data_merged.groupby(["allele_type", "allele", "binder"]).count().unstack(fill_value=0).stack().reset_index())

#container widget; its children are filled in at the end of the cell
vbox = VBox(children=[])
logos = []
#turn matplotlib interactive mode off
plt.ioff()
#output widget that captures whatever the click handler below produces
out = Output()

#three side-by-side axes for sequence logos of all binder peptides
fig, axs = plt.subplots(1, 3,figsize=(15,5))
crp_counts_df =logomaker.alignment_to_matrix(sequences=data_merged[data_merged['binder']==1]["peptide"].to_numpy(), to_type='counts')
clear_output()

#same count matrix drawn with two named colour schemes and the logomaker default
logomaker.Logo(crp_counts_df, ax = axs[0], color_scheme="charge")
logomaker.Logo(crp_counts_df, ax = axs[1], color_scheme="hydrophobicity")
logomaker.Logo(crp_counts_df, ax = axs[2])

#click handler for the sunburst; its output is captured into `out`, which is cleared on each call
#NOTE: `fig` is looked up when the handler runs, i.e. after it has been rebound to the plotly
#FigureWidget below, so utils_logo_event receives the widget rather than the matplotlib figure
#NOTE(review): utils_logo_event is defined outside this notebook; presumably it redraws the logos
#on `axs` for the clicked allele - confirm against utils
@out.capture(clear_output=True)
def event(trace, points, selector):
    utils_logo_event(trace, points, selector, fig, data_merged, axs)
    
#sunburst of allele_type -> allele, sized by the "peptide" count and coloured by binder class
sb = px.sunburst(y, path=['allele_type', 'allele'], values="peptide", color='binder', color_discrete_map={'(?)':'black', 0:'gold', 1:'red'})
fig = go.FigureWidget(sb)
fig.data[0].on_click(event)

#show the interactive sunburst with the captured handler output underneath
vbox.children = [fig, out]
vbox

VBox(children=(FigureWidget({
    'data': [{'branchvalues': 'total',
              'customdata': array([[0.478…

In [23]:
#Information logo only
#Draws the information-content logo of the binder peptides of a single allele and saves it
#as "<allele>_logo.pdf" in the working directory.

logo_allele = "A0201"
logo_data = data_merged[data_merged['allele']==logo_allele]

#information matrix of the binder peptides (binder == 1) of the selected allele
crp_counts_df = logomaker.alignment_to_matrix(sequences=logo_data[logo_data['binder']==1]["peptide"].to_numpy(),
                                             to_type='information')
clear_output()
fig, axs = plt.subplots(1, 1,figsize=(7,5))
logomaker.Logo(crp_counts_df, ax= axs)
#derive the file name from logo_allele so that changing the allele above cannot
#silently overwrite the logo of another allele
fig.savefig(logo_allele + "_logo.pdf")

In [15]:
#persist the merged dataset (including the "ba" column) without the row index; Step 2 reloads this file
data_merged.to_csv("full_dataset.csv", index=False)

## Step 2 - split data to train/test sets

The split is created by extracting 10% of all binders and 10% of all nonbinders of the full dataset to the test set.

**input:** full_dataset.csv

**output:** train_set.csv / test_set.csv

In [2]:
#reload the dataset written at the end of Step 1 so that this step can start from the saved file
data_merged = pd.read_csv("full_dataset.csv")

In [4]:
data_merged

Unnamed: 0,allele,peptide,binder,allele_type,path,measurement_inequality,measurement_value,measurement_type,ba
0,A0101,YLEQLHQLY,1,HLA-A,A0101-YLEQLHQLY.pdb,<,100.0,qualitative,0.574375
1,A0101,HSERHVLLY,1,HLA-A,A0101-HSERHVLLY.pdb,<,100.0,qualitative,0.574375
2,A0101,MTDPEMVEV,1,HLA-A,A0101-MTDPEMVEV.pdb,<,100.0,qualitative,0.574375
3,A0101,LTDFIREEY,1,HLA-A,A0101-LTDFIREEY.pdb,<,100.0,qualitative,0.574375
4,A0101,LLDQRPAWY,1,HLA-A,A0101-LLDQRPAWY.pdb,<,100.0,qualitative,0.574375
...,...,...,...,...,...,...,...,...,...
77576,C1601,QQTTTSFQN,0,HLA-C,confs/C1601-QQTTTSFQN/full_system_confs/39.pdb,>,70000.0,decoy,0.000000
77577,C1601,QQVEQMEIP,0,HLA-C,confs/C1601-QQVEQMEIP/full_system_confs/0.pdb,>,70000.0,decoy,0.000000
77578,C1601,QQWQVFSAE,0,HLA-C,confs/C1601-QQWQVFSAE/full_system_confs/0.pdb,>,70000.0,decoy,0.000000
77579,C1601,QRCVVLRFL,0,HLA-C,confs/C1601-QRCVVLRFL/full_system_confs/39.pdb,>,70000.0,decoy,0.000000


In [9]:
#create_split / apply_split are not defined in this notebook (presumably provided by `from utils import *`).
#NOTE(review): the two 0.1 arguments presumably are the test fractions for binders and nonbinders,
#matching the 10%/10% described above - confirm against the helper's signature.
#NOTE(review): no random seed is set in this notebook; if create_split samples randomly the split
#will differ between runs - verify inside utils.
split, split_df = create_split(data_merged, 0.1, 0.1)
train_set, test_set = apply_split(data_merged, split_df)

----------------------------------
Allele: A0101
Number of peptides (train/test): 2418.4/604.6
Number of binders (train/test): 1300.5/144.5
Number of nonbinders (train/test): 1420.2/157.8
----------------------------------
Allele: A0201
Number of peptides (train/test): 6272.8/1568.2
Number of binders (train/test): 5247.9/583.1
Number of nonbinders (train/test): 1809.0/201.0
----------------------------------
Allele: A0203
Number of peptides (train/test): 2210.4/552.6
Number of binders (train/test): 1080.0/120.0
Number of nonbinders (train/test): 1406.7/156.3
----------------------------------
Allele: A0206
Number of peptides (train/test): 1104.8/276.2
Number of binders (train/test): 729.9/81.10000000000001
Number of nonbinders (train/test): 513.0/57.0
----------------------------------
Allele: A0301
Number of peptides (train/test): 4539.2/1134.8
Number of binders (train/test): 3656.7/406.3
Number of nonbinders (train/test): 1449.9/161.10000000000002
----------------------------------
A

In [10]:
#expose the structure path under the name "fileloc" and keep only the listed columns.
#rename() returns a new frame, so the frames returned by apply_split are not modified in place
#(no SettingWithCopyWarning if they are slices), and re-running this cell does not fail on a
#"path" column that has already been renamed.
split_columns = ["allele", "peptide", "ba", "binder", "fileloc", "allele_type"]
train_set = train_set.rename(columns={"path": "fileloc"})[split_columns]
test_set = test_set.rename(columns={"path": "fileloc"})[split_columns]

In [11]:
#writing the split is disabled; uncomment to (re)generate the files
#NOTE(review): these paths point to the parent directory, while Step 3 reads "train_set.csv"
#from the working directory - confirm which location is intended before re-enabling
#train_set.to_csv("../train_set.csv", index=False)
#test_set.to_csv("../test_set.csv", index=False)

## Step 3 - construct equally distributed folds for 5-fold cross-validation

Label the 5 folds within the train_set such that the binder/nonbinder distribution is preserved.

**input:** train_set.csv

**output:** train_set.csv with updated folds

In [2]:
#load the training portion from the working directory
#NOTE(review): the (commented-out) save cells in this notebook write to "../train_set.csv" - confirm the path
train_set = pd.read_csv("train_set.csv")

In [3]:
#initialize every row to the same fold (0); a scalar assignment broadcasts to all rows,
#so no Python-level row-by-row apply is needed
train_set["fold_num"] = 0

In [None]:
#Assign cross-validation folds per allele, sampling binders and nonbinders separately so that
#each fold receives the same number of rows of each class.
import random  #no visible cell imports it explicitly; a local import keeps this cell self-contained

N_FOLDS = 5
FOLD_SEED = 42  #fixed seed so that the fold assignment is reproducible across runs
fold_rng = random.Random(FOLD_SEED)  #private generator: the global random state is left untouched

alleles = train_set["allele"].unique()

for allele in alleles:
    print(allele)
    allele_data = train_set[train_set["allele"]==allele]
    allele_data = allele_data.sort_values(by='ba', ascending=False)

    #reset_index() without drop keeps the original train_set row label in an "index" column
    allele_data_b = allele_data[allele_data["binder"]==1].reset_index()
    allele_data_nb = allele_data[allele_data["binder"]==0].reset_index()

    allele_cnt = len(allele_data.index)
    print("peptide count: "+str(allele_cnt))
    b_cnt = len(allele_data_b.index)
    print("binders count: "+str(b_cnt))
    nb_cnt = len(allele_data_nb.index)
    print("nonbinders count: "+str(nb_cnt))

    #rows drawn per fold and class; up to N_FOLDS-1 rows per class are left over and are never
    #passed to update_fold (presumably they keep the fold_num they already had - TODO confirm)
    b_fold_cnt = b_cnt // N_FOLDS
    nb_fold_cnt = nb_cnt // N_FOLDS

    for fold in range(N_FOLDS):
        #draw positions (not labels) from the rows that are still unassigned
        fold_indexes_b = fold_rng.sample(range(len(allele_data_b)), b_fold_cnt)
        fold_indexes_nb = fold_rng.sample(range(len(allele_data_nb)), nb_fold_cnt)

        fold_b = allele_data_b[allele_data_b.index.isin(fold_indexes_b)]
        fold_nb = allele_data_nb[allele_data_nb.index.isin(fold_indexes_nb)]

        #NOTE(review): update_fold is defined outside this notebook; presumably it writes `fold`
        #into train_set["fold_num"] for the rows in fold_b / fold_nb - confirm against utils
        train_set = update_fold(train_set, fold, fold_b, fold_nb)

        #remove the rows just assigned and renumber positions 0..n-1 for the next draw
        #(drop=True keeps the "index" column holding the original row labels)
        allele_data_b = allele_data_b[~allele_data_b.index.isin(fold_indexes_b)].reset_index(drop=True)
        allele_data_nb = allele_data_nb[~allele_data_nb.index.isin(fold_indexes_nb)].reset_index(drop=True)
        

In [None]:
#train_set.to_csv("../train_set.csv", index=False)