In [29]:
import pandas as pd
import numpy as np
import re
from dataclasses import dataclass
import sys, os
from tqdm.notebook import tqdm

In [30]:
# Load the combined epitope/TCR binding table and preview its first five rows.
contents = pd.read_csv(filepath_or_buffer='./data/combined_dataset_repTCRs.csv')
contents.head(n=5)

Unnamed: 0,epi,tcr,binding
0,EAAGIGILTV,CASSLGNEQF,1
1,EAAGIGILTV,CASSLGVATGELF,1
2,EAAGIGILTV,CASSQEEGGGSWGNTIYF,1
3,EAAGIGILTV,CASSQEGLAGASQYF,1
4,EAAGIGILTV,CASSQETDIVFNOPQHF,1


In [31]:
# Build the integer lookup tables for TCRs and epitopes and persist them.
# Distinct values are kept in order of first appearance (Series.unique).
epi_set = contents['epi'].unique()
tcr_set = contents['tcr'].unique()

# TCR ids are numbered from 2 and epitope ids from 3.
# NOTE(review): the smaller ids are presumably reserved downstream -- confirm.
epi_rank = np.arange(len(epi_set)) + 3
tcr_rank = np.arange(len(tcr_set)) + 2

epi_pd = pd.DataFrame({"epi_set": epi_set, "epi_rank": epi_rank})
tcr_pd = pd.DataFrame({"tcr_set": tcr_set, "tcr_rank": tcr_rank})

# Tab-separated, headerless, two columns per file: sequence, id.
tcr_pd.to_csv('./data/tcr.csv', sep='\t', header=False, index=False)
epi_pd.to_csv('./data/epitopes.csv', sep='\t', header=False, index=False)

# Second, independent read of the same table used by the split cells below.
training_set = pd.read_csv('./data/combined_dataset_repTCRs.csv')


In [32]:
def _rows_with_values(column, wanted, show_progress=False):
    """Return the positions in ``column`` whose value is one of ``wanted``."""
    # A set gives O(1) membership tests instead of scanning an array per row.
    wanted = set(np.asarray(wanted).tolist())
    rows = enumerate(column)
    if show_progress:
        # enumerate() has no length, so pass the total for a meaningful bar.
        rows = tqdm(rows, total=len(column))
    return [pos for pos, value in rows if value in wanted]


def load_data_split(x_pep, x_tcr, args):
    """Compute train/test row positions for one cross-validation fold.

    The fold layout comes from a pre-shuffled index file whose name is derived
    from ``args.infile`` by replacing the trailing ``.csv`` with
    ``_{split}_data_shuffle.txt``.

    Parameters
    ----------
    x_pep, x_tcr : sequence
        Per-row epitope and TCR values; both are expected to have one entry
        per data row.
    args : object
        Must expose the attributes:
        ``split`` -- ``'random'`` (folds over rows), ``'epitope'`` (folds over
        unique epitopes) or ``'tcr'`` (folds over unique TCRs);
        ``infile`` -- path of the dataset CSV;
        ``n_fold`` -- number of folds;
        ``idx_test_fold`` -- index of the held-out fold;
        ``idx_val_fold`` -- index of the validation fold, or a negative value
        when there is none.

    Returns
    -------
    idx_train, idx_test, idx_test_remove
        Row positions. Without a validation fold, ``idx_test`` is the held-out
        fold and ``idx_test_remove`` is ``None``. With a validation fold,
        ``idx_test`` is the validation fold and ``idx_test_remove`` is the
        held-out fold, which is also excluded from ``idx_train``.

    Raises
    ------
    ValueError
        If ``args.split`` is not one of the supported split types.

    Notes
    -----
    Each fold holds ``round(n_total / n_fold)`` entries, so when ``n_total``
    is not a multiple of ``n_fold`` the last fold may be shorter, or a few
    trailing entries may belong to no fold at all.
    """
    split_type = args.split

    # `column` is the per-row value used for grouping; `unique_values` is what
    # the shuffled indices point into (None when they point at rows directly).
    if split_type == 'random':
        column, unique_values = x_pep, None
        n_total = len(x_pep)
    elif split_type == 'epitope':
        column, unique_values = x_pep, np.unique(x_pep)
        n_total = len(unique_values)
    elif split_type == 'tcr':
        column, unique_values = x_tcr, np.unique(x_tcr)
        n_total = len(unique_values)
    else:
        raise ValueError(
            f"Unknown split type {split_type!r}; expected 'random', 'epitope' or 'tcr'"
        )

    # Escape the dot and anchor at the end so only the extension is replaced
    # (an unescaped '.csv' also matches e.g. '_csv' in the middle of a name).
    indexfile = re.sub(r'\.csv$', f'_{split_type}_data_shuffle.txt', args.infile)
    # ndmin=1 keeps a one-line index file sliceable.
    idx_shuffled = np.loadtxt(indexfile, dtype=np.int32, ndmin=1)

    # Determine data split from folds
    n_test = int(round(n_total / args.n_fold))

    def fold(fold_index):
        """Slice of the shuffled indices that belongs to ``fold_index``."""
        return idx_shuffled[fold_index * n_test:(fold_index + 1) * n_test]

    has_validation_fold = args.idx_val_fold >= 0
    held_out = fold(args.idx_test_fold)
    evaluated = fold(args.idx_val_fold) if has_validation_fold else held_out

    if split_type == 'random':
        # Shuffled indices are row positions already.
        idx_test = evaluated
        idx_test_remove = held_out if has_validation_fold else None
        candidates = set(idx_shuffled)
    else:
        # Shuffled indices select unique epitopes/TCRs; expand them to rows.
        idx_test = _rows_with_values(column, unique_values[evaluated], show_progress=True)
        idx_test_remove = (
            _rows_with_values(column, unique_values[held_out])
            if has_validation_fold else None
        )
        candidates = set(range(len(column)))

    remaining = candidates.difference(set(idx_test))
    if idx_test_remove is not None:
        remaining = remaining.difference(set(idx_test_remove))
    idx_train = list(remaining)

    return idx_train, idx_test, idx_test_remove

# TCR split: hold out whole TCRs for testing

In [45]:
# Read the pre-shuffled index file for the TCR split (one integer per line).
with open('./data/combined_dataset_repTCRs_tcr_data_shuffle.txt', 'r') as f:
    # Skip blank lines so a trailing newline does not break int().
    lines_tcr = [int(line) for line in f if line.strip()]
train_tcr = np.array(lines_tcr)
# Row positions of training_set that do not occur in the index file.
# np.array(<set>) would give a 0-d object array wrapping the set, so the
# difference is materialised as a sorted list first.
# NOTE(review): load_data_split uses these indices as positions into the
# unique-TCR array, while this complement is taken against row positions --
# confirm that is intended. Neither array is used later in this notebook.
test_tcr = np.array(sorted(set(range(len(training_set))) - set(lines_tcr)), dtype=int)

@dataclass
class Args:
    """Settings read by ``load_data_split`` for the TCR-wise split."""
    # Split strategy; 'tcr' makes the folds run over unique TCRs.
    split: str = 'tcr'
    # Dataset path; the shuffle index file name is derived from it.
    # (Was `infile = str = ...`, a chained assignment that bound a stray `str`
    # attribute and left `infile` out of the dataclass fields.)
    infile: str = './data/combined_dataset_repTCRs.csv'
    # Number of cross-validation folds.
    n_fold: int = 5
    # Index of the fold held out for testing.
    idx_test_fold: int = 0
    # Index of the validation fold; a negative value means "no validation fold".
    idx_val_fold: int = -1

# Run the TCR-wise split. The Args class itself (not an instance) is passed;
# load_data_split only reads its attributes.
id_train_tcr, id_test_tcr, id_test_rmv_tcr = load_data_split(
    x_pep=training_set['epi'],
    x_tcr=training_set['tcr'],
    args=Args,
)

0it [00:00, ?it/s]

In [46]:
# Select the rows of each partition by position.
train_dataset_tcr = training_set.iloc[id_train_tcr]
test_dataset_tcr = training_set.iloc[id_test_tcr]

# Load the sequence -> integer id lookup tables (tab-separated, no header:
# column 0 is the sequence, column 1 its id).
tcr_readin = pd.read_csv('./data/tcr.csv', sep='\t', header=None)
epi_readin = pd.read_csv('./data/epitopes.csv', sep='\t', header=None)
tcr_dict = dict(zip(tcr_readin[0], tcr_readin[1]))
epi_dict = dict(zip(epi_readin[0], epi_readin[1]))

# Encode both partitions as (ligand_name, sequence_id, label); sequences that
# are missing from a lookup table are encoded as -1. The index is renumbered
# from 0 so that the exported index column is contiguous.
train_dataset_tcr_ready = pd.DataFrame({
    'ligand_name': train_dataset_tcr['epi'].map(epi_dict).fillna(-1),
    'sequence_id': train_dataset_tcr['tcr'].map(tcr_dict).fillna(-1),
    'label': train_dataset_tcr['binding'],
}).reset_index(drop=True)

test_dataset_tcr_ready = pd.DataFrame({
    'ligand_name': test_dataset_tcr['epi'].map(epi_dict).fillna(-1),
    'sequence_id': test_dataset_tcr['tcr'].map(tcr_dict).fillna(-1),
    'label': test_dataset_tcr['binding'],
}).reset_index(drop=True)

# Write comma-separated files that keep the index as the first column.
train_dataset_tcr_ready.to_csv('./data/tcr_train_split.csv', sep=',', index=True)
test_dataset_tcr_ready.to_csv('./data/tcr_test_split.csv', sep=',', index=True)
# test_dataset_tcr.loc[:,'tcr_idx'] = test_dataset_tcr['tcr'].map(tcr_dict).fillna(-1)
# test_dataset_tcr.loc[:,'epi_idx'] = test_dataset_tcr['epi'].map(epi_dict).fillna(-1)
# test_dataset_tcr.loc[:,'idx'] = np.arange(len(test_dataset_tcr))
# test_dataset_tcr.set_index('idx', inplace=True)

# train_csv_tcr = train_dataset_tcr[['epi_idx', 'tcr_idx', 'binding']]
# test_csv_tcr = test_dataset_tcr[['epi_idx', 'tcr_idx', 'binding']]
# train_csv_tcr.to_csv('./data/tcr_train_split.csv', index=True, header=['ligand_name','sequence_id','label'], sep=',')
# test_csv_tcr.to_csv('./data/tcr_test_split.csv', index=True, header=['ligand_name','sequence_id','label'], sep=',')

# Epitope split: hold out whole epitopes for testing

In [47]:
# Read the pre-shuffled index file for the epitope split (one integer per line).
with open('./data/combined_dataset_repTCRs_epitope_data_shuffle.txt', 'r') as f:
    # Skip blank lines so a trailing newline does not break int().
    lines_epi = [int(line) for line in f if line.strip()]
train_epi = np.array(lines_epi)
# Row positions of training_set that do not occur in the index file. This used
# to overwrite train_epi with np.array(<set>), i.e. a 0-d object array wrapping
# a set; it is now kept separately as a sorted integer array, mirroring the
# train_tcr/test_tcr pair of the TCR section.
# NOTE(review): load_data_split uses these indices as positions into the
# unique-epitope array, while this complement is taken against row positions --
# confirm that is intended. Neither array is used later in this notebook.
test_epi = np.array(sorted(set(range(len(training_set))) - set(lines_epi)), dtype=int)

@dataclass
class Args:
    """Settings read by ``load_data_split`` for the epitope-wise split."""
    # Split strategy; 'epitope' makes the folds run over unique epitopes.
    split: str = 'epitope'
    # Dataset path; the shuffle index file name is derived from it.
    # (Was `infile = str = ...`, a chained assignment that bound a stray `str`
    # attribute and left `infile` out of the dataclass fields.)
    infile: str = './data/combined_dataset_repTCRs.csv'
    # Number of cross-validation folds.
    n_fold: int = 5
    # Index of the fold held out for testing.
    idx_test_fold: int = 0
    # Index of the validation fold; a negative value means "no validation fold".
    idx_val_fold: int = -1

# Run the epitope-wise split. The Args class itself (not an instance) is
# passed; load_data_split only reads its attributes.
id_train_epi, id_test_epi, id_test_rmv_epi = load_data_split(
    x_pep=training_set['epi'],
    x_tcr=training_set['tcr'],
    args=Args,
)

0it [00:00, ?it/s]

In [55]:
# Select the rows of each partition by position.
train_dataset_epi = training_set.iloc[id_train_epi]
test_dataset_epi = training_set.iloc[id_test_epi]

# Load the sequence -> integer id lookup tables (tab-separated, no header:
# column 0 is the sequence, column 1 its id).
# The TCR table used to be read into epi_readin/epi_dict and then overwritten,
# so tcr_dict was only available if an earlier cell had left it in the kernel.
tcr_readin = pd.read_csv('./data/tcr.csv', header=None, sep='\t')
tcr_dict = dict(zip(tcr_readin[0], tcr_readin[1]))
epi_readin = pd.read_csv('./data/epitopes.csv', header=None, sep='\t')
epi_dict = dict(zip(epi_readin[0], epi_readin[1]))

# Encode both partitions as (ligand_name, sequence_id, label); sequences that
# are missing from a lookup table are encoded as -1. The index is renumbered
# from 0 so that the exported index column is contiguous.
train_dataset_epi_ready = pd.DataFrame({'ligand_name': train_dataset_epi['epi'].map(epi_dict).fillna(-1),
                                        'sequence_id': train_dataset_epi['tcr'].map(tcr_dict).fillna(-1),
                                        'label': train_dataset_epi['binding']})
train_dataset_epi_ready.reset_index(drop=True, inplace=True)

test_dataset_epi_ready = pd.DataFrame({'ligand_name': test_dataset_epi['epi'].map(epi_dict).fillna(-1),
                                       'sequence_id': test_dataset_epi['tcr'].map(tcr_dict).fillna(-1),
                                       'label': test_dataset_epi['binding']})
test_dataset_epi_ready.reset_index(drop=True, inplace=True)

# Write comma-separated files that keep the index as the first column.
train_dataset_epi_ready.to_csv('./data/epi_train_split.csv', index=True, sep=',')
test_dataset_epi_ready.to_csv('./data/epi_test_split.csv', index=True, sep=',')
# train_csv_epi = train_dataset_epi[['epi_idx', 'tcr_idx', 'binding']]
# test_csv_epi = test_dataset_epi[['epi_idx', 'tcr_idx', 'binding']]
# train_csv_epi.to_csv('./data/epi_train_split.csv', index=True, header=['ligand_name','sequence_id','label'], sep=',')
# test_csv_epi.to_csv('./data/epi_test_split.csv', index=True, header=['ligand_name','sequence_id','label'], sep=',')

In [56]:
# Reload both sequence -> id lookup tables from disk
# (tab-separated, no header: column 0 is the sequence, column 1 its id).
tcr_readin = pd.read_csv('./data/tcr.csv', sep='\t', header=None)
epi_readin = pd.read_csv('./data/epitopes.csv', sep='\t', header=None)

tcr_dict = dict(zip(tcr_readin[0], tcr_readin[1]))
epi_dict = dict(zip(epi_readin[0], epi_readin[1]))

In [51]:
# Inverse lookups: integer id -> sequence.
epi_dict_rev = {
    epi_id: epi_seq for epi_seq, epi_id in zip(epi_readin[0], epi_readin[1])
}
tcr_dict_rev = {
    tcr_id: tcr_seq for tcr_seq, tcr_id in zip(tcr_readin[0], tcr_readin[1])
}

In [52]:
# Read the exported TCR split back and rebuild "<epitope><tcr>" -> label maps
# by decoding the integer ids through the inverse lookups.
tcr_train_split = pd.read_csv('./data/tcr_train_split.csv', index_col=0)
tcr_test_split = pd.read_csv('./data/tcr_test_split.csv', index_col=0)

train_pred = {
    epi_dict_rev[record['ligand_name']] + tcr_dict_rev[record['sequence_id']]: record['label']
    for _row_pos, record in tcr_train_split.iterrows()
}
test_pred = {
    epi_dict_rev[record['ligand_name']] + tcr_dict_rev[record['sequence_id']]: record['label']
    for _row_pos, record in tcr_test_split.iterrows()
}

In [53]:
# Reference labels from the original table, keyed by "<epitope><tcr>".
training_set = pd.read_csv('./data/combined_dataset_repTCRs.csv')
label = {
    record['epi'] + record['tcr']: record['binding']
    for _row_pos, record in training_set.iterrows()
}

In [54]:
# Check every reference pair against the exported split: the label must match
# in whichever partition contains the pair; pairs found in neither partition
# are reported as 'wrong'.
for idx, val in enumerate(label):
    if val in train_pred:
        assert label[val] == train_pred[val]
    elif val in test_pred:
        assert label[val] == test_pred[val]
    else:
        print('wrong')
# Zero-based position of the last pair that was checked.
print(idx)

300015


In [57]:
# Read the exported epitope split back and rebuild "<epitope><tcr>" -> label
# maps by decoding the integer ids through the inverse lookups.
epi_train_split = pd.read_csv('./data/epi_train_split.csv', index_col=0)
epi_test_split = pd.read_csv('./data/epi_test_split.csv', index_col=0)

train_pred = {
    epi_dict_rev[record['ligand_name']] + tcr_dict_rev[record['sequence_id']]: record['label']
    for _row_pos, record in epi_train_split.iterrows()
}
test_pred = {
    epi_dict_rev[record['ligand_name']] + tcr_dict_rev[record['sequence_id']]: record['label']
    for _row_pos, record in epi_test_split.iterrows()
}

In [58]:
# Check every reference pair against the exported split: the label must match
# in whichever partition contains the pair; pairs found in neither partition
# are reported as 'wrong'.
for idx, val in enumerate(label):
    if val in train_pred:
        assert label[val] == train_pred[val]
    elif val in test_pred:
        assert label[val] == test_pred[val]
    else:
        print('wrong')
# Zero-based position of the last pair that was checked.
print(idx)

300015


# Convert the TCR sequences in tcr.csv to upper case

In [60]:
# Upper-case the TCR sequences (column 0) of the lookup table and write it back.
tcr = pd.read_csv('./data/tcr.csv', header=None, sep='\t')
tcr[0] = tcr[0].str.upper()
# index=False keeps the file in its two-column (sequence, id) layout; the
# default index=True would prepend a row-number column, so readers taking
# columns 0 and 1 would get (row number, sequence) instead.
tcr.to_csv('./data/tcr.csv', index=False, header=False, sep='\t')