In [37]:
import pandas as pd
import numpy as np
import random
random.seed(621)
data = pd.read_csv('iedb_mira_pos_uniq.txt', sep="\t", header=None)
data.columns = ['peptide', 'cdr3', 'type', 'score']
data

Unnamed: 0,peptide,cdr3,type,score
0,AAGIGILTV,AISEVGVGQPQH,HLA-A02:01,1
1,AAGIGILTV,ASSLSFGTEAF,HLA-A02:01,1
2,AAGIGILTV,ASSWSFGTEAF,HLA-A02:01,1
3,AAGIGILTV,AWSETGLGTGELF,HLA-A02:01,1
4,AIMDKNIIL,ASSQLTGQETQY,HLA-A02:01,1
...,...,...,...,...
9323,YVLDHLIVV,SVGGTSGGATKNEQY,HLA-A02:01,1
9324,YVLDHLIVV,SVGGTSGGQIQETQY,HLA-A02:01,1
9325,YVLDHLIVV,SVGQALYNEQF,HLA-A02:01,1
9326,YVLDHLIVV,SVGRPLDNEQF,HLA-A02:01,1


In [38]:
peptides = data.drop(columns = ['cdr3', 'type', 'score'])
peptides = peptides.drop_duplicates()
peptidesList = peptides['peptide'].tolist()
cdr3List = data['cdr3'].tolist()

# Create a list to check if the data is in the original dataset
referenceList = data.drop(columns = ['type', 'score'])

# Create the mismatched dataset to make 50% positive and 50% negative data
rowsList = []
for i in cdr3List:
    randomPeptide = random.choice(peptidesList)
    a = np.array([randomPeptide,i])
    if not (referenceList == a).all(1).any():
        aDict = {'peptide': randomPeptide, 'cdr3': i, 'type': 'HLA-A02:01', 'score': 0}
        rowsList.append(aDict)
        
# Put negatively generated data into a dataframe and save
negData = pd.DataFrame(rowsList)
data = pd.concat([data, negData])
data

Unnamed: 0,peptide,cdr3,type,score
0,AAGIGILTV,AISEVGVGQPQH,HLA-A02:01,1
1,AAGIGILTV,ASSLSFGTEAF,HLA-A02:01,1
2,AAGIGILTV,ASSWSFGTEAF,HLA-A02:01,1
3,AAGIGILTV,AWSETGLGTGELF,HLA-A02:01,1
4,AIMDKNIIL,ASSQLTGQETQY,HLA-A02:01,1
...,...,...,...,...
9224,WLLPTWGV,SVGGTSGGATKNEQY,HLA-A02:01,0
9225,EAAGIGILTV,SVGGTSGGQIQETQY,HLA-A02:01,0
9226,LLFGYPVYV,SVGQALYNEQF,HLA-A02:01,0
9227,ELAGIGIATV,SVGRPLDNEQF,HLA-A02:01,0


In [39]:
# Get the list of peptides again
peptidesList = data['peptide'].tolist()

cdr3Data = pd.read_csv('healthy_tcrs.tsv', sep="\t", header=0)
cdr3List = cdr3Data['amino_acid'].tolist()

# Create a list to check if the data is in the original dataset
referenceList = data.drop(columns = ['type', 'score'])

# Match healthy TCRs to randomly picked peptides from the mismatched data
rowsList = []
for i in peptidesList:
    for j in range(3):
        randomcdr3 = random.choice(cdr3List)
        a = np.array([i,randomcdr3])
        if not (referenceList == a).all(1).any():
            aDict = {'peptide': i, 'cdr3': randomcdr3, 'score': 0}
            rowsList.append(aDict)
            
# Put negatively generated data into a dataframe
negData = pd.DataFrame(rowsList)
data = pd.concat([data, negData], sort=False)
data

Unnamed: 0,peptide,cdr3,type,score
0,AAGIGILTV,AISEVGVGQPQH,HLA-A02:01,1
1,AAGIGILTV,ASSLSFGTEAF,HLA-A02:01,1
2,AAGIGILTV,ASSWSFGTEAF,HLA-A02:01,1
3,AAGIGILTV,AWSETGLGTGELF,HLA-A02:01,1
4,AIMDKNIIL,ASSQLTGQETQY,HLA-A02:01,1
...,...,...,...,...
55666,ELAGIGIATV,CASSWTGFWEQYF,,0
55667,ELAGIGIATV,CASSPPQADLREQYF,,0
55668,YLEPAPVTA,CASSADLYNEQFF,,0
55669,YLEPAPVTA,CASSEGTGNSPLHF,,0


In [40]:
cdr3List = data['cdr3'].tolist()

peptides = pd.read_csv('eluted_peptide_ligands.csv', sep=",", header=0)
peptideList = peptides['Description'].tolist()
# Split out any post translation modifications for netTCR parsing
peptideList = list(map(lambda x: x.split('+'),peptideList))
peptideList = [i[0] for i in peptideList]

# Create a list to check if the data is in the original dataset
referenceList = data.drop(columns = ['type', 'score'])

rowsList = []
for i in peptideList:
    randomcdr3 = random.choice(cdr3List)
    a = np.array([i,randomcdr3])
    if not (referenceList == a).all(1).any():
        aDict = {'peptide': i, 'cdr3': randomcdr3, 'score': 0}
        rowsList.append(aDict)
        
# Put negatively generated data into a dataframe
negData = pd.DataFrame(rowsList)
data = pd.concat([data, negData], sort=False)
data.to_csv('finalData.csv', sep='\t', index=False)
data

Unnamed: 0,peptide,cdr3,type,score
0,AAGIGILTV,AISEVGVGQPQH,HLA-A02:01,1
1,AAGIGILTV,ASSLSFGTEAF,HLA-A02:01,1
2,AAGIGILTV,ASSWSFGTEAF,HLA-A02:01,1
3,AAGIGILTV,AWSETGLGTGELF,HLA-A02:01,1
4,AIMDKNIIL,ASSQLTGQETQY,HLA-A02:01,1
...,...,...,...,...
7210,PGQRLEWMGRIDPAN,ASTSEGGGGTEAF,,0
7211,PRLLIHYTSALQPGI,CASSQSSSGANVLTF,,0
7212,HYNYMCNSSCMGGMNRRPILTIITL,ASSQGFLLGAMETEAF,,0
7213,NTVFGAERKKRLSIIGPTSRDRSSP,CASSSSIRGRLAGGPTAQYF,,0


In [42]:
# Create a testing data file without the actual scores
testingData = data.drop(columns = ['type', 'score'])
testingData.to_csv('testingData.csv', sep='\t', index=False)

In [36]:

# Other things

peptideList = list(map(lambda x: (x.split('+')),peptides['Description'].tolist()))
peptideList = [i[0] for i in peptideList]
peptideList

['AAAAAIFVI',
 'AAAAALDKKQRNFDKILA',
 'AAGIGILTV',
 'AAMMAEELKKEQDTSAHL',
 'AAVEEGIVLGGG',
 'ACDPHSGHFV',
 'ADVEFCLSL',
 'AEERADIAESQVNKLRAK',
 'AEFQMTFHLFIAAFVGAAAT',
 'AELLNIPFLY',
 'AEQIALKGGKKQLQK',
 'AEQKRNAESVKGMRKSER',
 'AGDGTTTATVLA',
 'AGFKGEQGPKGEP',
 'AGWLADRSVRYPI',
 'AHFSLIHYAGIVDYN',
 'AIIDPLIYA',
 'AIISGDSPV',
 'AKRKTVTAMDVVYAL',
 'ALAPSTMKI',
 'ALIHHNTHL',
 'ALKRAQSEL',
 'ALLAGLVSL',
 'ALLAVGATK',
 'ALMDKSLHV',
 'ALRTDYNASV',
 'ALSDHHIYL',
 'AMLDLLKSV',
 'AMLGTHTMEV',
 'AMVGAVLTA',
 'AMYVAIQAV',
 'ANFSFRNTL',
 'APYVLIGTGTTIV',
 'ASFDKAKLK',
 'ASQKRPSQRHGSKY',
 'ASQKRPSQRHGSKYLATAST',
 'ATGLCFFGVALFCGCGHEAL',
 'ATQIPSYKK',
 'ATVLARSIAKEGFER',
 'ATYNFAVLKLMGRGTKF',
 'AVIGALLAV',
 'AVPVYIYFNTWTTCQSIAFP',
 'CADARMYGVLPWNAFPGKVC',
 'CDGERPTLAFLQDVM',
 'CLGKWLGHPDKFVGITYALT',
 'CMTWNQMNL',
 'CSSLEKTKHRLQNEIEDL',
 'DAEFRHDSGYEVHHQK',
 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA',
 'DINIYDLFV',
 'DKILAEWKQKYEESQSEL',
 'DLESYLQLNCERGTWR',
 'DLTTKNVSI',
 'DNITSSVLFN',
 'DRASFIKNL