In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold

# Seed NumPy's global random generator before any random sampling is done.
np.random.seed(621)

# finalData.csv is tab-delimited; its first row holds the column names.
data = pd.read_csv('finalData.csv', delimiter='\t', header=0)

# Preview the first five rows.
data.head()

Unnamed: 0,peptide,cdr3,type,score
0,AAGIGILTV,AISEVGVGQPQH,HLA-A02:01,1
1,AAGIGILTV,ASSLSFGTEAF,HLA-A02:01,1


In [2]:
# Keep only the three columns used from here on, then separate the rows by
# their binding label: score == 1 is a positive pair, score == 0 a negative one.
# (81443 rows in total per the recorded output: 72115 negative + 9328 positive.)
data = data.filter(items=['peptide', 'cdr3', 'score'])

scoreColumn = data['score']
posData = data.loc[scoreColumn == 1]
negData = data.loc[scoreColumn == 0]

# Report how many non-missing peptides each class contains.
for tag, subset in (("neg: ", negData), ("pos: ", posData)):
    print(tag + str(subset['peptide'].count()))

neg: 72115
pos: 9328


In [36]:
# Build repeated, class-balanced train/test resamples of the data.
# Each round trains on a random fraction of the positive pairs plus an equally
# sized random draw of negative pairs; every remaining row of `data` (the other
# positives and negatives) becomes that round's test set.
N_RESAMPLES = 10        # number of independent train/test pairs to generate
POS_TRAIN_FRAC = 0.3    # share of the positive rows drawn into each training set

# A dedicated generator makes this cell produce the same splits every time it
# is run, instead of depending on how much of NumPy's global random stream
# earlier cell executions have already consumed. The same instance is passed to
# every draw, so successive rounds still receive different samples.
splitRng = np.random.RandomState(621)

trainArray = []
testArray = []
for i in range(N_RESAMPLES):
    posTrainSample = posData.sample(frac=POS_TRAIN_FRAC, random_state=splitRng)
    # len() counts rows; .count() on a single column skips missing values and
    # could leave the two classes unbalanced.
    negTrainSample = negData.sample(n=len(posTrainSample), random_state=splitRng)
    trainSample = pd.concat([posTrainSample, negTrainSample])
    # Rows are removed by index label; posData/negData are slices of `data`,
    # so their labels match (assumes the index labels are unique).
    testSample = data.drop(trainSample.index)
    trainArray.append(trainSample)
    testArray.append(testSample)

In [9]:
# 5-fold cross-validation splits, made separately for the positive and the
# negative rows so that each fold holds one fifth of each class.
# NOTE(review): the folds are taken over all of posData / negData rather than
# over one of the resampled training sets - confirm that this is the intent.
# random_state pins the shuffles so re-running the cell gives the same folds
# (621 is the seed value used for np.random.seed at the top of the notebook).
poskf = KFold(n_splits=5, shuffle=True, random_state=621)
negkf = KFold(n_splits=5, shuffle=True, random_state=621)

# One (trainingData, testingData) pair per fold, so that earlier folds are not
# lost when the loop variables are reassigned.
kfoldSplits = []

# Each split() yields a (train_positions, test_positions) pair; zip pairs the
# k-th positive fold with the k-th negative fold, so every item is a pair of
# pairs and must be unpacked as such.
for (pos_train_index, pos_test_index), (neg_train_index, neg_test_index) in zip(
        poskf.split(posData), negkf.split(negData)):
    pos_train, pos_test = posData.iloc[pos_train_index], posData.iloc[pos_test_index]
    neg_train, neg_test = negData.iloc[neg_train_index], negData.iloc[neg_test_index]
    # pd.concat expects the frames as a single list argument.
    trainingData = pd.concat([pos_train, neg_train])
    testingData = pd.concat([pos_test, neg_test])
    kfoldSplits.append((trainingData, testingData))

TRAIN: [   0    2    4 ... 9325 9326 9327] TEST: [   1    3   17 ... 9301 9312 9321]
TRAIN: [   1    2    3 ... 9325 9326 9327] TEST: [   0    8   15 ... 9320 9322 9324]
TRAIN: [   0    1    3 ... 9324 9326 9327] TEST: [   2    6    9 ... 9311 9313 9325]
TRAIN: [   0    1    2 ... 9324 9325 9326] TEST: [  10   21   24 ... 9318 9319 9327]
TRAIN: [   0    1    2 ... 9324 9325 9327] TEST: [   4    5    7 ... 9306 9323 9326]


Unnamed: 0,peptide,cdr3,score
0,AAGIGILTV,AISEVGVGQPQH,1
1,AAGIGILTV,ASSLSFGTEAF,1
2,AAGIGILTV,ASSWSFGTEAF,1
3,AAGIGILTV,AWSETGLGTGELF,1
6,AIMDKNIIL,SVEGLSYGYT,1
...,...,...,...
9321,YVLDHLIVV,SLTANDAYGYT,1
9322,YVLDHLIVV,STSATENYGYT,1
9324,YVLDHLIVV,SVGGTSGGQIQETQY,1
9325,YVLDHLIVV,SVGQALYNEQF,1


In [38]:
# Write every resampled split to disk as a tab-separated file without header
# row or index column. Training sets are written first, then test sets; the
# number in each file name is the split's position in its list.
for pathPrefix, frames in (
    ('NetTCROriginalData/TCRTrainingData', trainArray),
    ('NetTCROriginalData/TCRTestingActual', testArray),
):
    for splitNo, frame in enumerate(frames):
        outPath = pathPrefix + str(splitNo) + '.txt'
        frame.to_csv(outPath, sep='\t', header=False, index=False)