The negative peptides are sequence segments that are randomly chosen from the source proteins of IEDB HLA immunopeptidomes.

In this notebook, the exported IEDB HLA immunopeptidome dataset is loaded, and three important files are created from the IEDB dataset and the TransPHLA dataset.

- all_neg: **List of amino acid sequence strings**

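This file contains all possible peptide segments from the exported IEDB HLA immunopeptidome dataset, so it is shared across all HLA alleles. We first filter the IEDB negative peptides (dropping any peptide that contains a non-standard residue code B, J, O, U, X or Z, or any lowercase or non-letter character), then enumerate every contiguous 9-14-mer window of each remaining peptide. 8-mers are included only when the source peptide is itself exactly 8 residues long.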

- HLA2pos_peptide_segments: **Dict: HLA name (string) -> set of possible segments from positive peptides (set)**

This file provides all possible peptide segments of positive peptides for each HLA allele.

- HLA2candidate: **Dict: HLA name (string) -> set of filtered candidate peptide segments (set)**

This file provides all possible **candidate peptide segments** for each HLA allele, which are obtained by **subtracting positive segments from all_neg segments**.


In [2]:
import os 
import pandas as pd 
from collections import Counter

# Load the raw IEDB export of negative HLA class I peptides.
# epitope_table_export_1664809776.csv
# exported date: Oct 3, 2022
# 41469 samples
# inclusion criterion: Linear peptide; T cell & MHC ligand assay, negative only; Class I restriction; Human Host; Any Disease
iedb_neg_raw = pd.read_csv("../neoag_data/iedb_neg/epitope_table_export_1664809776.csv", skiprows=1)
print("Raw IEDB dataset peptide num: {}".format(len(iedb_neg_raw)))

# Keep only the sequence column (renamed to "peptide") and record each length.
iedb_neg = iedb_neg_raw[['Description']].rename(columns={"Description": "peptide"})
iedb_neg['pep_length'] = iedb_neg['peptide'].map(len)

# Show the 20 most frequent peptide lengths, then every length that occurs.
length_counts = iedb_neg.groupby(by='pep_length').count()
print(length_counts.sort_values(by='peptide', ascending=False).head(20))
print(sorted(iedb_neg['pep_length'].unique()))

# De-duplicate, then keep a peptide only if it contains none of the unusual
# residue codes and consists purely of uppercase letters.
raw_all_pep_set = list(set(iedb_neg['peptide']))
unusualAA = ['B', 'J', 'O', 'U', 'X', 'Z']
pep_set_epitopes_refined = [
    candidate
    for candidate in raw_all_pep_set
    if not any(code in candidate for code in unusualAA)
    and candidate.isupper()
    and candidate.isalpha()
]

pep_set_epitopes_refined = list(set(pep_set_epitopes_refined))
print(len(pep_set_epitopes_refined))
print(Counter([len(_) for _ in pep_set_epitopes_refined]))

Raw IEDB dataset peptide num: 41469
            peptide
pep_length         
9             21698
10             9217
15             4517
11             1556
20             1088
25              933
8               779
18              651
14              411
16              101
12               88
13               66
19               64
27               55
21               41
17               28
7                28
23               21
22               19
29               18
[3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 42, 43, 44, 47, 49, 50, 51]
41302
Counter({9: 21695, 10: 9217, 15: 4517, 11: 1556, 20: 1047, 25: 929, 8: 778, 18: 634, 14: 411, 16: 101, 12: 88, 13: 66, 27: 48, 17: 28, 7: 28, 19: 27, 29: 18, 21: 16, 23: 14, 22: 11, 24: 11, 32: 10, 33: 7, 26: 7, 30: 7, 36: 5, 28: 4, 35: 3, 34: 3, 31: 2, 37: 2, 47: 2, 6: 2, 44: 2, 42: 1, 3: 1, 50: 1, 43: 1, 40: 1, 49: 1})


In [12]:
import numpy as np

# Build the allele-independent pool of negative peptide segments.
#
# Window lengths enumerated over every filtered IEDB peptide. 8 is absent from
# this list: 8-mers enter the pool only when the source peptide is itself
# exactly 8 residues long (first statement below).
# NOTE(review): the positive-segment cell does cut 8-mer windows out of longer
# peptides -- confirm this asymmetry is intended.
lengths = ['9', '10', '11', '12', '13', '14']

# Peptides that are already 8 residues long are taken as-is.
neg_pep_seg_pools = [pep for pep in pep_set_epitopes_refined if len(pep) == 8]

for length in lengths:
    window = int(length)
    for pep in pep_set_epitopes_refined:
        if len(pep) < window:
            continue
        # Every contiguous `window`-mer of the peptide: len(pep) - window + 1 segments.
        cur_pep_segs = [pep[start: start + window] for start in range(len(pep) - window + 1)]
        neg_pep_seg_pools.extend(cur_pep_segs)
print(len(neg_pep_seg_pools), len(set(neg_pep_seg_pools)))

# De-duplicate and sort. Iteration order of a set of strings depends on hash
# randomization and therefore changes between kernel restarts; sorting makes
# the saved array reproducible, so downstream index-based sampling is stable.
neg_pep_seg_pools = sorted(set(neg_pep_seg_pools))
print(Counter([len(_) for _ in neg_pep_seg_pools]))
np.save("../neoag_data/main_task/IEDB_negative_segments.npy", neg_pep_seg_pools)


385399 306858
Counter({9: 85607, 10: 59041, 11: 47780, 12: 43885, 13: 38074, 14: 31693, 8: 778})


In [13]:
from collections import defaultdict
# Load the train / validation split of the chosen fold plus the independent
# test set, keeping only the peptide, HLA and label columns, and stack them
# into one frame.
data_path = '../neoag_data/'
fold = 4

train_data_raw, val_data_raw, test_data_raw = (
    pd.read_csv(os.path.join(data_path, csv_name), index_col=0)[['peptide', 'HLA', 'label']]
    for csv_name in (
        'train_data_fold{}.csv'.format(fold),
        'val_data_fold{}.csv'.format(fold),
        'independent_set.csv',
    )
)

merged_data = pd.concat([train_data_raw, val_data_raw, test_data_raw])

# No rows may be lost or duplicated by the concatenation.
assert len(merged_data) == sum(
    len(part) for part in (train_data_raw, val_data_raw, test_data_raw)
)

In [15]:
# Summary of the merged dataset, displayed as a tuple:
# (rows, rows with label 1, unique peptides,
#  unique peptides with label 0, unique peptides with label 1)
(
    len(merged_data),
    len(merged_data[merged_data.label == 1]),
    merged_data.peptide.nunique(),
    merged_data[merged_data.label == 0].peptide.nunique(),
    merged_data[merged_data.label == 1].peptide.nunique(),
)

(889808, 445061, 669644, 433439, 241864)

In [19]:
# For every HLA allele, collect all contiguous 8-14-mer windows of its
# positive (label == 1) peptides.
lengths = ['8', '9', '10', '11', '12', '13', '14']
pos_merged_data = merged_data[merged_data['label'] == 1]
print(len(pos_merged_data))
HLA_list = list(set(merged_data['HLA']))
print(len(HLA_list))

pos_pep_segs_dict = defaultdict(set)
for HLA in sorted(HLA_list):
    # Unique positive peptides recorded for this allele.
    curr_pep_set = set(pos_merged_data.loc[pos_merged_data['HLA'] == HLA, 'peptide'].to_list())

    # Indexing the defaultdict creates the (possibly empty) entry for this allele.
    allele_segments = pos_pep_segs_dict[HLA]
    for window in map(int, lengths):
        for pep in curr_pep_set:
            # Peptides shorter than the window yield an empty range and add nothing.
            allele_segments.update(
                pep[offset: offset + window] for offset in range(len(pep) - window + 1)
            )
    print("HLA: {}\tpositive samples: {}\tsegment num: {}".format(HLA, len(curr_pep_set), len(allele_segments)))

445061
112
HLA: HLA-A*01:01	positive samples: 8902	segment num: 53895
HLA: HLA-A*02:01	positive samples: 25174	segment num: 111462
HLA: HLA-A*02:02	positive samples: 2106	segment num: 7567
HLA: HLA-A*02:03	positive samples: 4170	segment num: 15624
HLA: HLA-A*02:04	positive samples: 1404	segment num: 5365
HLA: HLA-A*02:05	positive samples: 2308	segment num: 8259
HLA: HLA-A*02:06	positive samples: 4244	segment num: 14053
HLA: HLA-A*02:07	positive samples: 2742	segment num: 11312
HLA: HLA-A*02:11	positive samples: 346	segment num: 1033
HLA: HLA-A*02:12	positive samples: 290	segment num: 869
HLA: HLA-A*02:16	positive samples: 158	segment num: 471
HLA: HLA-A*02:17	positive samples: 247	segment num: 927
HLA: HLA-A*02:19	positive samples: 210	segment num: 622
HLA: HLA-A*02:20	positive samples: 841	segment num: 2521
HLA: HLA-A*02:50	positive samples: 74	segment num: 222
HLA: HLA-A*03:01	positive samples: 11150	segment num: 45641
HLA: HLA-A*11:01	positive samples: 9524	segment num: 43484
HLA: H

In [20]:
# Sanity-check the per-allele positive segment sets:
#   * every stored segment is 8-14 residues long,
#   * every positive peptide of the allele is itself in the set,
#   * every 8-14-mer window of every positive peptide is in the set.
valid_lengths = set(range(8, 15))
for k, v in pos_pep_segs_dict.items():
    print(k)
    length_hist = Counter(len(seg) for seg in v)
    print(length_hist)
    for seg_len in length_hist:
        assert seg_len in valid_lengths

    allele_peptides = set(pos_merged_data.loc[pos_merged_data['HLA'] == k, 'peptide'].to_list())
    for pos_train_pep in allele_peptides:
        assert pos_train_pep in v
        for length in range(8, 15):
            # Empty range when the peptide is shorter than the window.
            for startpos in range(len(pos_train_pep) - length + 1):
                assert pos_train_pep[startpos: startpos + length] in v

HLA-A*01:01
Counter({8: 23005, 9: 15459, 10: 7944, 11: 4249, 12: 2160, 13: 863, 14: 215})
HLA-A*02:01
Counter({8: 55590, 9: 34472, 10: 12648, 11: 5498, 12: 2245, 13: 794, 14: 215})
HLA-A*02:02
Counter({8: 4153, 9: 2578, 10: 836})
HLA-A*02:03
Counter({8: 8732, 9: 5244, 10: 1542, 11: 106})
HLA-A*02:04
Counter({8: 3126, 9: 1751, 10: 367, 11: 121})
HLA-A*02:05
Counter({8: 4997, 9: 2735, 10: 454, 11: 73})
HLA-A*02:06
Counter({8: 8422, 9: 4738, 10: 893})
HLA-A*02:07
Counter({8: 6319, 9: 3671, 10: 1006, 11: 316})
HLA-A*02:11
Counter({8: 687, 9: 346})
HLA-A*02:12
Counter({8: 579, 9: 290})
HLA-A*02:16
Counter({8: 313, 9: 158})
HLA-A*02:17
Counter({8: 487, 9: 314, 10: 126})
HLA-A*02:19
Counter({8: 412, 9: 210})
HLA-A*02:20
Counter({8: 1680, 9: 841})
HLA-A*02:50
Counter({8: 148, 9: 74})
HLA-A*03:01
Counter({8: 24407, 9: 14542, 10: 4551, 11: 1559, 12: 508, 13: 74})
HLA-A*11:01
Counter({8: 22205, 9: 13841, 10: 5291, 11: 1673, 12: 400, 13: 74})
HLA-A*23:01
Counter({8: 6736, 9: 3774, 10: 705, 11: 110

In [21]:
import numpy as np

# Build, for every allele that has positive peptides, the pool of candidate
# negative segments: all IEDB negative segments minus that allele's positive
# segments. Then persist both dictionaries.
main_task_HLA2_candidate_pool = defaultdict(set)
print(len(neg_pep_seg_pools))

# neg_pep_seg_pools is a list after the segment-pool cell, and a list does not
# support `-`. Convert it to a set once here so the set difference below works
# on a fresh kernel instead of relying on leftover kernel state.
neg_seg_set = set(neg_pep_seg_pools)

for HLA in pos_merged_data.HLA.unique():
    main_task_HLA2_candidate_pool[HLA] = neg_seg_set - pos_pep_segs_dict[HLA]
for k, v in main_task_HLA2_candidate_pool.items():
    print(k, len(v))

# np.save stores a dict as a pickled 0-d object array; read it back with
# np.load(path, allow_pickle=True).item().
np.save(data_path+"main_task/allele2positive_segs.npy", pos_pep_segs_dict)
np.save(data_path+"main_task/allele2candidate_pools.npy", main_task_HLA2_candidate_pool)

306858
HLA-B*27:08 306838
HLA-B*49:01 306847
HLA-B*27:05 306155
HLA-A*02:02 304586
HLA-A*01:01 305517
HLA-A*11:01 303072
HLA-A*02:01 299887
HLA-A*02:06 304418
HLA-B*37:01 306835
HLA-A*29:02 305742
HLA-B*57:03 306823
HLA-A*24:02 304718
HLA-B*40:02 306275
HLA-C*03:04 306786
HLA-C*04:01 306543
HLA-A*68:01 304265
HLA-C*12:03 306757
HLA-B*44:02 306292
HLA-B*46:01 306766
HLA-B*08:01 306123
HLA-A*30:02 305834
HLA-B*38:01 306830
HLA-A*03:01 303622
HLA-B*35:03 306824
HLA-B*27:01 306802
HLA-B*57:01 306377
HLA-B*07:02 304541
HLA-B*45:01 306424
HLA-B*58:01 306277
HLA-C*05:01 306763
HLA-B*27:06 306796
HLA-A*31:01 304372
HLA-B*39:01 306575
HLA-B*50:01 306854
HLA-B*14:02 306782
HLA-B*27:09 306768
HLA-B*15:01 305021
HLA-A*32:01 306617
HLA-B*15:42 306658
HLA-B*18:01 306425
HLA-C*02:02 306798
HLA-B*35:01 305072
HLA-B*18:03 306858
HLA-B*13:02 306841
HLA-A*68:02 305031
HLA-B*40:01 306185
HLA-B*52:01 306854
HLA-B*44:03 306252
HLA-A*23:01 306196
HLA-C*01:02 306829
HLA-A*24:13 306854
HLA-A*02:07 306788
HLA-C