In [1]:
import pandas as pd
import os
import numpy as np
dataset_dir = "/home/lujd/Hi-TpH/data"
os.listdir(dataset_dir)

['Hi-TpH-level-I.csv',
 'Hi-TpH-level-IV.csv',
 'Hi-TpH-level-IIA.csv',
 'Hi-TpH-tcr_gene2seq.json',
 'Hi-TpH-hla_allele2seq.json',
 'Hi-TpH-level-IIB.csv',
 'README.md',
 'Hi-TpH-level-III.csv']

--- 

### level-I

In [2]:
df1 = pd.read_csv(os.path.join(
    dataset_dir,
    "Hi-TpH-level-I.csv"
))
print(df1.shape)
df1.head()

(720038, 2)


Unnamed: 0,antigen.epitope,beta.cdr3
0,VMAPRTLIL,ASSQDRDTQY
1,FLRGRFYGL,ASSLGQAYEQY
2,FLRGRAYGL,ASSLGQAYEQY
3,EEYLKAWTF,ASSLGQAYEQY
4,EEYLQAFTY,ASSLGQAYEQY


In [3]:
df1 = df1.dropna(subset=['antigen.epitope', 'beta.cdr3'])
df1.rename(columns={"antigen.epitope":"pep", "beta.cdr3":"tcr"}, inplace=True)
df1 = df1.drop_duplicates(ignore_index=True)
print(df1.shape)

(720038, 2)


In [4]:
df1.pep.nunique()

1949

In [5]:
pep_length_counts = df1['pep'].apply(len).value_counts()

print("pep不同长度的个数统计：")
print(pep_length_counts)

pep不同长度的个数统计：
pep
9     377424
10    237861
11     33931
14     14264
25     11578
13     11504
8       8065
12      6028
16      4709
18      4245
15      3462
19      2415
27      1712
43      1080
17       929
20       510
38       253
37        42
41        15
35         6
7          2
31         1
22         1
24         1
Name: count, dtype: int64


In [6]:
tcr_length_counts = df1['tcr'].apply(len).value_counts()

print("cdr3beta不同长度的个数统计：")
print(tcr_length_counts)

cdr3beta不同长度的个数统计：
tcr
15    145198
14    135689
13    123712
16     85253
12     81915
17     47468
11     42963
18     21585
10     14267
19      9334
9       4812
20      3649
21      1535
8       1044
22       575
7        327
23       224
6        117
24       108
5        104
4         56
3         44
25        40
2          6
1          3
27         3
26         3
31         1
34         1
38         1
33         1
Name: count, dtype: int64


pep切成15肽，短于15的padding，max——len15
tcr长于20的就扔掉，max——len20
保证配对

In [7]:
df1 = df1[df1['pep'].str.len() <= 15]
df1 = df1[df1['pep'].str.len() >= 8]
print(len(df1))

692539


In [8]:
df1 = df1[df1['tcr'].str.len() <= 19]
df1 = df1[df1['tcr'].str.len() >= 9]
print(len(df1))

685033


In [9]:
pep_length_counts = df1['pep'].apply(len).value_counts()

print("pep 不同长度的个数统计：")
print(pep_length_counts)

pep 不同长度的个数统计：
pep
9     373243
10    235249
11     33618
14     14184
13     11368
8       7966
12      5988
15      3417
Name: count, dtype: int64


In [10]:
tcr_length_counts = df1['tcr'].apply(len).value_counts()

print("beta 不同长度的个数统计：")
print(tcr_length_counts)

beta 不同长度的个数统计：
tcr
15    143127
14    132052
13    118078
16     84347
12     76338
17     47085
11     38538
18     21422
10     11192
19      9265
9       3589
Name: count, dtype: int64


In [11]:
df1 = df1.drop_duplicates(ignore_index=True)
len(df1)

685033

In [12]:
# 提取唯一的tcr值，转换为列表
tcr_list = df1['tcr'].unique().tolist()
print(len(tcr_list))

# 保存去重后的tcr列表为numpy数组
data_path = "/data/lujd/TCRdata/benchmarks/level1"
os.makedirs(data_path, exist_ok=True)
np.save(os.path.join(data_path, "tcr2candidates_pools.npy"), tcr_list)

261190


In [13]:
df1['label'] = 1

# 将数据集按照8:1:1的比例分割成训练集、验证集和测试集
# 先打乱数据集
df_shuffled = df1.sample(frac=1, random_state=42).reset_index(drop=True)

# 计算各个数据集的数量
total_samples = len(df_shuffled)
train_samples = int(0.8 * total_samples)
val_samples = int(0.1 * total_samples)
test_samples = total_samples - train_samples - val_samples

# 分割数据集
train_df = df_shuffled.iloc[:train_samples]
val_df = df_shuffled.iloc[train_samples:train_samples + val_samples]
test_df = df_shuffled.iloc[train_samples + val_samples:]

# 保存分割后的数据集为CSV文件
train_df.to_csv(os.path.join(data_path, "train_data_fold0.csv"), index=False)
val_df.to_csv(os.path.join(data_path, "valid_data_fold0.csv"), index=False)
test_df.to_csv(os.path.join(data_path, "test_data_fold0.csv"), index=False)

--- 

### level-II A

In [14]:
df_2a = pd.read_csv(os.path.join(
    dataset_dir,
    "Hi-TpH-level-IIA.csv"
))
print(df_2a.shape)
df_2a.head()

(78679, 6)


Unnamed: 0,antigen.epitope,hla.allele,beta.cdr3,hla.full.seq,hla.clip.seq,hla.short.seq
0,VMAPRTLIL,HLA-E*01:01,ASSQDRDTQY,MVDGTLLLLLSEALALTQTWAGSHSLKYFHTSVSRPGRGEPRFISV...,GSHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDAASPRMV...,YHSMYRESADTIFVNTLYLWHEFYSSAEQAYTWY
1,VMAPRTLIL,HLA-E*01:03,ASSQDRDTQY,MVDGTLLLLLSEALALTQTWAGSHSLKYFHTSVSRPGRGEPRFISV...,GSHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDAASPRMV...,YHSMYRESADTIFVNTLYLWHEFYSSAEQAYTWY
2,FLRGRFYGL,HLA-B*08,ASSLGQAYEQY,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFDTAMSRPGRGEPRF...,GSHSMRYFDTAMSRPGRGEPRFISVGYVDDTQFVRFDSDAASPREE...,YDSEYRNIFTNTDESNLYLSYNYYTWAVDAYTWY
3,FLRGRAYGL,HLA-B*08,ASSLGQAYEQY,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFDTAMSRPGRGEPRF...,GSHSMRYFDTAMSRPGRGEPRFISVGYVDDTQFVRFDSDAASPREE...,YDSEYRNIFTNTDESNLYLSYNYYTWAVDAYTWY
4,EEYLKAWTF,HLA-B*44:05,ASSLGQAYEQY,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...,GSHSMRYFYTAMSRPGRGEPRFITVGYVDDTLFVRFDSDATSPRKE...,YYTKYREISTNTYENTAYIRYDYYTWAVDAYLSY


In [15]:
df_2a = df_2a.dropna(subset=['antigen.epitope', 'beta.cdr3',"hla.short.seq"])
df_2a.rename(columns={"antigen.epitope":"pep", "hla.short.seq":"hla", "beta.cdr3":"beta"}, inplace=True)
df_2a = df_2a.drop_duplicates(ignore_index=True)
print(df_2a.shape)

(78679, 6)


In [16]:
df_2a = df_2a[["beta","hla","pep"]]
df_2a.pep.nunique()

1401

In [17]:
pep_length_counts = df_2a['pep'].apply(len).value_counts()

print("pep不同长度的个数统计：")
print(pep_length_counts)

pep不同长度的个数统计：
pep
9     65364
10    10213
8      2040
11      890
12      106
13       40
20       17
15        6
7         2
24        1
Name: count, dtype: int64


In [18]:
tcr_length_counts = df_2a['beta'].apply(len).value_counts()

print("cdr3beta不同长度的个数统计：")
print(tcr_length_counts)

cdr3beta不同长度的个数统计：
beta
13    16091
14    13839
15    12395
12     9958
11     8085
16     7109
17     3570
10     2870
18     1581
9      1094
19     1013
20      262
8       242
7       117
21       88
5        87
6        86
4        55
22       47
3        44
23       16
24       10
2         6
1         3
25        3
26        3
31        1
34        1
38        1
27        1
33        1
Name: count, dtype: int64


In [19]:
df_2a = df_2a[df_2a['beta'].str.len() <= 19]
df_2a = df_2a[df_2a['beta'].str.len() >= 9]
df_2a.shape

(77605, 3)

In [20]:
df_2a = df_2a[df_2a['pep'].str.len() <= 10 ]
df_2a = df_2a[df_2a['pep'].str.len() >= 8]
df_2a.shape

(76555, 3)

In [22]:
pep_length_counts = df_2a['pep'].apply(len).value_counts()

print("pep 不同长度的个数统计：")
print(pep_length_counts)

pep 不同长度的个数统计：
pep
9     64409
10    10132
8      2014
Name: count, dtype: int64


In [23]:
tcr_length_counts = df_2a['beta'].apply(len).value_counts()

print("beta 不同长度的个数统计：")
print(tcr_length_counts)

beta 不同长度的个数统计：
beta
13    15929
14    13633
15    12216
12     9772
11     8014
16     7008
17     3520
10     2851
18     1563
9      1051
19      998
Name: count, dtype: int64


In [24]:
hla_length_counts = df_2a['hla'].apply(len).value_counts()

print("hla 不同长度的个数统计：")
print(hla_length_counts)

hla 不同长度的个数统计：
hla
34    76555
Name: count, dtype: int64


In [25]:
df_2a = df_2a.drop_duplicates(subset=['pep', 'beta', 'hla'], ignore_index=True)
len(df_2a)

76545

- all components

In [26]:
# 提取唯一的tcr值，转换为列表
tcr_list = df_2a['beta'].unique().tolist()
print(len(tcr_list))

# 保存去重后的tcr列表为numpy数组
data_path = "/data/lujd/TCRdata/benchmarks/level2a"
os.makedirs(data_path, exist_ok=True)
np.save(os.path.join(data_path, "tcr2candidates_pools.npy"), tcr_list)

71699


In [27]:
df_2a['label'] = 1

# 将数据集按照8:1:1的比例分割成训练集、验证集和测试集
# 先打乱数据集
df_shuffled = df_2a.sample(frac=1, random_state=42).reset_index(drop=True)

# 计算各个数据集的数量
total_samples = len(df_shuffled)
train_samples = int(0.8 * total_samples)
val_samples = int(0.1 * total_samples)
test_samples = total_samples - train_samples - val_samples

# 分割数据集
train_df = df_shuffled.iloc[:train_samples]
val_df = df_shuffled.iloc[train_samples:train_samples + val_samples]
test_df = df_shuffled.iloc[train_samples + val_samples:]

# 保存分割后的数据集为CSV文件
train_df.to_csv(os.path.join(data_path, "train_data_fold0.csv"), index=False)
val_df.to_csv(os.path.join(data_path, "valid_data_fold0.csv"), index=False)
test_df.to_csv(os.path.join(data_path, "test_data_fold0.csv"), index=False)

In [30]:
train_df

Unnamed: 0,beta,hla,pep,label
0,CAWTWGTLNTEAFF,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,KLGGALQAK,1
1,CASSPPGTGYGYTF,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,KLGGALQAK,1
2,CASSVGVSGSFYEQYF,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,MLNIPSINV,1
3,ASSLASGRSTEAF,YSAMYEEKVAHTDENIAYLMFHYYTWAVQAYTGY,AYAQKIFKI,1
4,CASSSVPNNEAFF,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,GILGFVFTL,1
...,...,...,...,...
61231,CASSTGLLTDTQYF,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,KLGGALQAK,1
61232,CASSPPRQRDTQYF,YSAMYEEKVAHTDENIAYLMFHYYTWAVQAYTGY,SFHSLHLLF,1
61233,CSGFGDRGNTDTQYF,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,CLGGLLTMV,1
61234,CASSEAGLGHEQFF,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,NLVPMVATV,1


- 2 component

In [29]:
# 提取唯一的tcr值，转换为列表
tcr_list = df_2a['beta'].unique().tolist()
print(len(tcr_list))

# 保存去重后的tcr列表为numpy数组
data_path = "/data/lujd/TCRdata/benchmarks/level2a_basic"
os.makedirs(data_path, exist_ok=True)
np.save(os.path.join(data_path, "tcr2candidates_pools.npy"), tcr_list)

71699


In [31]:
df_2a['label'] = 1

# 将数据集按照8:1:1的比例分割成训练集、验证集和测试集
# 先打乱数据集
df_shuffled = df_2a.sample(frac=1, random_state=42).reset_index(drop=True)

# 计算各个数据集的数量
total_samples = len(df_shuffled)
train_samples = int(0.8 * total_samples)
val_samples = int(0.1 * total_samples)
test_samples = total_samples - train_samples - val_samples

# 分割数据集
train_df = df_shuffled.iloc[:train_samples]
val_df = df_shuffled.iloc[train_samples:train_samples + val_samples]
test_df = df_shuffled.iloc[train_samples + val_samples:]

# 去除掉每个数据集中重复的 ['pep', 'beta'] pair
print(len(train_df))
train_df = train_df[["beta", "pep", 'label']]
train_df = train_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(train_df))

print(len(val_df))
val_df = val_df[["beta", "pep", 'label']]
val_df = val_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(val_df))

print(len(test_df))
test_df = test_df[["beta", "pep", 'label']]
test_df = test_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(test_df))

# 去除数据集之间重复的 ['pep', 'beta'] pair
train_df['tag'] = 'train'
val_df['tag'] = 'val'
test_df['tag'] = 'test'
merge_df = pd.concat((train_df, val_df, test_df), axis=0).reset_index(drop=True)
print(len(merge_df))
merge_df = merge_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True, keep='first')
print(len(merge_df))

train_df = merge_df[merge_df['tag']=='train'].reset_index(drop=True)
train_df = train_df[["beta", "pep", 'label']]
print(len(train_df))

val_df = merge_df[merge_df['tag']=='val'].reset_index(drop=True)
val_df = val_df[["beta", "pep", 'label']]
print(len(val_df))

test_df = merge_df[merge_df['tag']=='test'].reset_index(drop=True)
test_df = test_df[["beta", "pep", 'label']]
print(len(test_df))

# 保存分割后的数据集为CSV文件
train_df.to_csv(os.path.join(data_path, "train_data_fold0.csv"), index=False)
val_df.to_csv(os.path.join(data_path, "valid_data_fold0.csv"), index=False)
test_df.to_csv(os.path.join(data_path, "test_data_fold0.csv"), index=False)

61236
61185
7654
7653
7655
7655
76493
76461
61185
7637
7639


In [32]:
train_df

Unnamed: 0,beta,pep,label
0,CAWTWGTLNTEAFF,KLGGALQAK,1
1,CASSPPGTGYGYTF,KLGGALQAK,1
2,CASSVGVSGSFYEQYF,MLNIPSINV,1
3,ASSLASGRSTEAF,AYAQKIFKI,1
4,CASSSVPNNEAFF,GILGFVFTL,1
...,...,...,...
61180,CASSTGLLTDTQYF,KLGGALQAK,1
61181,CASSPPRQRDTQYF,SFHSLHLLF,1
61182,CSGFGDRGNTDTQYF,CLGGLLTMV,1
61183,CASSEAGLGHEQFF,NLVPMVATV,1


---

### level-II B

In [33]:
df_tcr = pd.read_csv(os.path.join(
    dataset_dir,
    "Hi-TpH-level-IIB.csv"
))
print(df_tcr.shape)
df_tcr.head()

(28375, 3)


Unnamed: 0,antigen.epitope,alpha.cdr3,beta.cdr3
0,VMAPRTLIL,IVVRSSNTGKLI,ASSQDRDTQY
1,SLLMWITQC,AVRPLLDGTYIPT,ASSYLGNTGELF
2,EENLLDFVRF,IVWGGYQKVT,ASRYRDDSYNEQF
3,LLFGYPVYV,AVTTDSWGKLQ,ASRPGLAGGRPEQY
4,LLFGYPVFV,AVTTDSWGKLQ,ASRPGLAGGRPEQY


In [34]:
df_tcr = df_tcr.dropna(subset=['antigen.epitope', 'beta.cdr3', "alpha.cdr3"])
df_tcr.rename(columns={"antigen.epitope":"pep", "beta.cdr3":"beta", "alpha.cdr3":"alpha"}, inplace=True)
df_tcr = df_tcr.drop_duplicates(ignore_index=True)
print(df_tcr.shape)

(28375, 3)


In [35]:
df_tcr.pep.nunique(), df_tcr.beta.nunique(), df_tcr.alpha.nunique()

(1154, 22566, 20979)

In [36]:
pep_length_counts = df_tcr['pep'].apply(len).value_counts()

print("pep不同长度的个数统计：")
print(pep_length_counts)

pep不同长度的个数统计：
pep
9     23484
10     3221
8      1336
11      188
12      102
13       35
25        5
7         3
24        1
Name: count, dtype: int64


In [37]:
df_tcr = df_tcr[df_tcr['pep'].str.len() <= 10 ]
df_tcr = df_tcr[df_tcr['pep'].str.len() >= 8]
df_tcr.shape

(28041, 3)

In [38]:
beta_length_counts = df_tcr['beta'].apply(len).value_counts()

print("beta 不同长度的个数统计：")
print(beta_length_counts)

beta 不同长度的个数统计：
beta
15    6367
14    5729
13    5627
16    3654
12    2051
17    2033
18     898
11     870
19     390
20     157
10     149
21      51
9       23
22      17
8       11
23       7
26       3
25       2
24       1
7        1
Name: count, dtype: int64


In [39]:
alpha_length_counts = df_tcr['alpha'].apply(len).value_counts()

print("alpha 不同长度的个数统计：")
print(alpha_length_counts)

alpha 不同长度的个数统计：
alpha
13    5574
14    5223
12    4519
15    4518
11    2540
16    2307
17    1424
10     934
18     444
9      199
19     177
8       90
20      40
21      15
7       13
22       8
6        4
24       3
26       3
25       3
23       2
5        1
Name: count, dtype: int64


In [40]:
df_tcr = df_tcr[df_tcr['alpha'].str.len() <= 19]
df_tcr = df_tcr[df_tcr['alpha'].str.len() >= 9]
df_tcr.shape

(27859, 3)

In [41]:
df_tcr = df_tcr[df_tcr['beta'].str.len() <= 19]
df_tcr = df_tcr[df_tcr['beta'].str.len() >= 9]
df_tcr.shape

(27611, 3)

In [43]:
pep_length_counts = df_tcr['pep'].apply(len).value_counts()

print("pep 不同长度的个数统计：")
print(pep_length_counts)

pep 不同长度的个数统计：
pep
9     23124
10     3172
8      1315
Name: count, dtype: int64


In [44]:
df_tcr["ab"] = df_tcr['alpha'] + '/' + df_tcr['beta']

df_tcr = df_tcr.drop_duplicates(subset=['pep', 'ab'], ignore_index=True)
print(len(df_tcr))
df_tcr = df_tcr.drop_duplicates(subset=['pep', 'alpha', 'beta'], ignore_index=True)
print(len(df_tcr))

27611
27611


- all component

In [45]:
df_tcr_all = df_tcr[['pep', 'ab']]

# 提取唯一的tcr值，转换为列表
tcr_list = df_tcr_all['ab'].unique().tolist()
print(len(tcr_list))

# 保存去重后的tcr列表为numpy数组
data_path = "/data/lujd/TCRdata/benchmarks/level2b"
os.makedirs(data_path, exist_ok=True)
np.save(os.path.join(data_path, "tcr2candidates_pools.npy"), tcr_list)

25812


In [46]:
# 添加一列标签为1
df_tcr_all['label'] = 1

# 将数据集按照8:1:1的比例分割成训练集、验证集和测试集
# 先打乱数据集
df_shuffled = df_tcr_all.sample(frac=1, random_state=42).reset_index(drop=True)

# 计算各个数据集的数量
total_samples = len(df_shuffled)
train_samples = int(0.8 * total_samples)
val_samples = int(0.1 * total_samples)
test_samples = total_samples - train_samples - val_samples

# 分割数据集
train_df = df_shuffled.iloc[:train_samples]
val_df = df_shuffled.iloc[train_samples:train_samples + val_samples]
test_df = df_shuffled.iloc[train_samples + val_samples:]

# 保存分割后的数据集为CSV文件
train_df.to_csv(os.path.join(data_path, "train_data_fold0.csv"), index=False)
val_df.to_csv(os.path.join(data_path, "valid_data_fold0.csv"), index=False)
test_df.to_csv(os.path.join(data_path, "test_data_fold0.csv"), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr_all['label'] = 1


In [47]:
train_df

Unnamed: 0,pep,ab,label
0,IQPGQTFSV,CAMREGENARLMF/CASSLADLLAEKLFF,1
1,AVFDRKSDAK,CIPADNTGNQFYF/CASRIASRGGDTQYF,1
2,AIFYLITPV,CAVVLNNNDMRF/CATTPAGKTADQETQYF,1
3,NLVPMVATV,CAGTNTGGFKTIF/CASRIELPGAGELFF,1
4,IVTDFSVIK,CAVNMYGDSSYKLIF/CASSWGGGSHYGYTF,1
...,...,...,...
22083,VLFHRAFLV,CASYSGGGADGLTF/CASSLGAGSYEQYF,1
22084,YLQPRTFLL,CAVRGTGRRALTF/CSAGGTGIYGSYEQYF,1
22085,IVTDFSVIK,CVVSAARGATNKLIF/CASRTGLASTDTQYF,1
22086,KLGGALQAK,CAGAAGGYQKVTF/CASSHVNQPQHF,1


- 2 component

In [48]:
df_tcr_basic = df_tcr[['pep', 'beta']]

# 提取唯一的tcr值，转换为列表
tcr_list = df_tcr_basic['beta'].unique().tolist()
print(len(tcr_list))

# 保存去重后的tcr列表为numpy数组
data_path = "/data/lujd/TCRdata/benchmarks/level2b_basic"
os.makedirs(data_path, exist_ok=True)
np.save(os.path.join(data_path, "tcr2candidates_pools.npy"), tcr_list)

21956


In [49]:
df_tcr_basic['label'] = 1

# 将数据集按照8:1:1的比例分割成训练集、验证集和测试集
# 先打乱数据集
df_shuffled = df_tcr_basic.sample(frac=1, random_state=42).reset_index(drop=True)

# 计算各个数据集的数量
total_samples = len(df_shuffled)
train_samples = int(0.8 * total_samples)
val_samples = int(0.1 * total_samples)
test_samples = total_samples - train_samples - val_samples

# 分割数据集
train_df = df_shuffled.iloc[:train_samples]
val_df = df_shuffled.iloc[train_samples:train_samples + val_samples]
test_df = df_shuffled.iloc[train_samples + val_samples:]

# 去除掉每个数据集中重复的 ['pep', 'beta'] pair
print(len(train_df))
train_df = train_df[["beta", "pep", 'label']]
train_df = train_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(train_df))

print(len(val_df))
val_df = val_df[["beta", "pep", 'label']]
val_df = val_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(val_df))

print(len(test_df))
test_df = test_df[["beta", "pep", 'label']]
test_df = test_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(test_df))

# 去除数据集之间重复的 ['pep', 'beta'] pair
train_df['tag'] = 'train'
val_df['tag'] = 'val'
test_df['tag'] = 'test'
merge_df = pd.concat((train_df, val_df, test_df), axis=0).reset_index(drop=True)
print(len(merge_df))
merge_df = merge_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True, keep='first')
print(len(merge_df))

train_df = merge_df[merge_df['tag']=='train'].reset_index(drop=True)
train_df = train_df[["beta", "pep", 'label']]
print(len(train_df))

val_df = merge_df[merge_df['tag']=='val'].reset_index(drop=True)
val_df = val_df[["beta", "pep", 'label']]
print(len(val_df))

test_df = merge_df[merge_df['tag']=='test'].reset_index(drop=True)
test_df = test_df[["beta", "pep", 'label']]
print(len(test_df))

# 保存分割后的数据集为CSV文件
train_df.to_csv(os.path.join(data_path, "train_data_fold0.csv"), index=False)
val_df.to_csv(os.path.join(data_path, "valid_data_fold0.csv"), index=False)
test_df.to_csv(os.path.join(data_path, "test_data_fold0.csv"), index=False)

22088
19505
2761
2628
2762
2603
24736
24074
19505
2313
2256


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr_basic['label'] = 1


In [50]:
train_df

Unnamed: 0,beta,pep,label
0,CASSLADLLAEKLFF,IQPGQTFSV,1
1,CASRIASRGGDTQYF,AVFDRKSDAK,1
2,CATTPAGKTADQETQYF,AIFYLITPV,1
3,CASRIELPGAGELFF,NLVPMVATV,1
4,CASSWGGGSHYGYTF,IVTDFSVIK,1
...,...,...,...
19500,CASSLGGTDTQYF,KLGGALQAK,1
19501,CASSLGAGSYEQYF,VLFHRAFLV,1
19502,CSAGGTGIYGSYEQYF,YLQPRTFLL,1
19503,CASSHVNQPQHF,KLGGALQAK,1


---

### level-III

In [51]:
df_tcr = pd.read_csv(os.path.join(
    dataset_dir,
    "Hi-TpH-level-III.csv"
))
print(df_tcr.shape)
df_tcr.head()

(28262, 7)


Unnamed: 0,antigen.epitope,hla.allele,alpha.cdr3,beta.cdr3,hla.full.seq,hla.clip.seq,hla.short.seq
0,VMAPRTLIL,HLA-E*01:01,IVVRSSNTGKLI,ASSQDRDTQY,MVDGTLLLLLSEALALTQTWAGSHSLKYFHTSVSRPGRGEPRFISV...,GSHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDAASPRMV...,YHSMYRESADTIFVNTLYLWHEFYSSAEQAYTWY
1,VMAPRTLIL,HLA-E*01:03,IVVRSSNTGKLI,ASSQDRDTQY,MVDGTLLLLLSEALALTQTWAGSHSLKYFHTSVSRPGRGEPRFISV...,GSHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDAASPRMV...,YHSMYRESADTIFVNTLYLWHEFYSSAEQAYTWY
2,SLLMWITQC,HLA-A*02:01,AVRPLLDGTYIPT,ASSYLGNTGELF,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY
3,EENLLDFVRF,HLA-B*44:02,IVWGGYQKVT,ASRYRDDSYNEQF,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...,GSHSMRYFYTAMSRPGRGEPRFITVGYVDDTLFVRFDSDATSPRKE...,YYTKYREISTNTYENTAYIRYDDYTWAVDAYLSY
4,EENLLDFVRF,HLA-B*44:03,IVWGGYQKVT,ASRYRDDSYNEQF,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...,GSHSMRYFYTAMSRPGRGEPRFITVGYVDDTLFVRFDSDATSPRKE...,YYTKYREISTNTYENTAYIRYDDYTWAVLAYLSY


In [52]:
df_tcr = df_tcr.dropna(subset=['antigen.epitope', 'beta.cdr3', "alpha.cdr3", "hla.short.seq"])
df_tcr.rename(columns={"antigen.epitope":"pep", "hla.short.seq":"hla", 
                       "beta.cdr3":"beta", "alpha.cdr3":"alpha"}, inplace=True)
df_tcr = df_tcr.drop_duplicates(ignore_index=True)
print(df_tcr.shape)

(28262, 7)


In [53]:
tcr_length_counts = df_tcr['pep'].apply(len).value_counts()

print("pep 不同长度的个数统计：")
print(tcr_length_counts)

pep 不同长度的个数统计：
pep
9     23420
10     3223
8      1274
11      198
12      102
13       41
7         3
24        1
Name: count, dtype: int64


In [54]:
df_tcr = df_tcr[df_tcr['pep'].str.len() <= 10]
df_tcr = df_tcr[df_tcr['pep'].str.len() >= 8]
df_tcr.shape

(27917, 7)

In [55]:
beta_length_counts = df_tcr['beta'].apply(len).value_counts()

print("beta 不同长度的个数统计：")
print(beta_length_counts)

beta 不同长度的个数统计：
beta
15    6331
14    5702
13    5607
16    3634
12    2047
17    2025
18     894
11     868
19     387
20     157
10     150
21      50
9       23
22      17
8       11
23       7
26       3
25       2
24       1
7        1
Name: count, dtype: int64


In [56]:
alpha_length_counts = df_tcr['alpha'].apply(len).value_counts()

print("alpha 不同长度的个数统计：")
print(alpha_length_counts)

alpha 不同长度的个数统计：
alpha
13    5545
14    5195
15    4502
12    4496
11    2542
16    2292
17    1416
10     934
18     439
9      197
19     177
8       90
20      40
21      15
7       13
22       8
6        4
24       3
26       3
25       3
23       2
5        1
Name: count, dtype: int64


In [57]:
df_tcr = df_tcr[df_tcr['alpha'].str.len() <= 19]
df_tcr = df_tcr[df_tcr['alpha'].str.len() >= 9]
df_tcr.shape

(27735, 7)

In [58]:
df_tcr = df_tcr[df_tcr['beta'].str.len() <= 19]
df_tcr = df_tcr[df_tcr['beta'].str.len() >= 9]
df_tcr.shape

(27488, 7)

In [60]:
pep_length_counts = df_tcr['pep'].apply(len).value_counts()

print("pep 不同长度的个数统计：")
print(pep_length_counts)

pep 不同长度的个数统计：
pep
9     23061
10     3174
8      1253
Name: count, dtype: int64


In [61]:
df_tcr["ab"] = df_tcr['alpha'] + '/' + df_tcr['beta']

df_tcr = df_tcr.drop_duplicates(subset=['pep', 'hla', 'ab'], ignore_index=True)
print(len(df_tcr))
df_tcr = df_tcr.drop_duplicates(subset=['pep', 'hla', 'alpha', 'beta'], ignore_index=True)
print(len(df_tcr))

27478
27478


- all component

In [62]:
df_tcr_all = df_tcr[['pep', 'hla', 'ab']]

# 提取唯一的tcr值，转换为列表
tcr_list = df_tcr_all['ab'].unique().tolist()
print(len(tcr_list))

# 保存去重后的tcr列表为numpy数组
data_path = "/data/lujd/TCRdata/benchmarks/level3"
os.makedirs(data_path, exist_ok=True)
np.save(os.path.join(data_path, "tcr2candidates_pools.npy"), tcr_list)

25669


In [64]:
# 添加一列标签为1
df_tcr_all['label'] = 1

# 将数据集按照8:1:1的比例分割成训练集、验证集和测试集
# 先打乱数据集
df_shuffled = df_tcr_all.sample(frac=1, random_state=42).reset_index(drop=True)

# 计算各个数据集的数量
total_samples = len(df_shuffled)
train_samples = int(0.8 * total_samples)
val_samples = int(0.1 * total_samples)
test_samples = total_samples - train_samples - val_samples

# 分割数据集
train_df = df_shuffled.iloc[:train_samples]
val_df = df_shuffled.iloc[train_samples:train_samples + val_samples]
test_df = df_shuffled.iloc[train_samples + val_samples:]

# 保存分割后的数据集为CSV文件
train_df.to_csv(os.path.join(data_path, "train_data_fold0.csv"), index=False)
val_df.to_csv(os.path.join(data_path, "valid_data_fold0.csv"), index=False)
test_df.to_csv(os.path.join(data_path, "test_data_fold0.csv"), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr_all['label'] = 1


- 2 component

In [66]:
df_tcr_basic = df_tcr[['pep', 'beta']]

# 提取唯一的tcr值，转换为列表
tcr_list = df_tcr_basic['beta'].unique().tolist()
print(len(tcr_list))

# 保存去重后的tcr列表为numpy数组
data_path = "/data/lujd/TCRdata/benchmarks/level3_basic"
os.makedirs(data_path, exist_ok=True)
np.save(os.path.join(data_path, "tcr2candidates_pools.npy"), tcr_list)

21824


In [67]:
df_tcr_basic['label'] = 1

# 将数据集按照8:1:1的比例分割成训练集、验证集和测试集
# 先打乱数据集
df_shuffled = df_tcr_basic.sample(frac=1, random_state=42).reset_index(drop=True)

# 计算各个数据集的数量
total_samples = len(df_shuffled)
train_samples = int(0.8 * total_samples)
val_samples = int(0.1 * total_samples)
test_samples = total_samples - train_samples - val_samples

# 分割数据集
train_df = df_shuffled.iloc[:train_samples]
val_df = df_shuffled.iloc[train_samples:train_samples + val_samples]
test_df = df_shuffled.iloc[train_samples + val_samples:]

# 去除掉每个数据集中重复的 ['pep', 'beta'] pair
print(len(train_df))
train_df = train_df[["beta", "pep", 'label']]
train_df = train_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(train_df))

print(len(val_df))
val_df = val_df[["beta", "pep", 'label']]
val_df = val_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(val_df))

print(len(test_df))
test_df = test_df[["beta", "pep", 'label']]
test_df = test_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(test_df))

# 去除数据集之间重复的 ['pep', 'beta'] pair
train_df['tag'] = 'train'
val_df['tag'] = 'val'
test_df['tag'] = 'test'
merge_df = pd.concat((train_df, val_df, test_df), axis=0).reset_index(drop=True)
print(len(merge_df))
merge_df = merge_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True, keep='first')
print(len(merge_df))

train_df = merge_df[merge_df['tag']=='train'].reset_index(drop=True)
train_df = train_df[["beta", "pep", 'label']]
print(len(train_df))

val_df = merge_df[merge_df['tag']=='val'].reset_index(drop=True)
val_df = val_df[["beta", "pep", 'label']]
print(len(val_df))

test_df = merge_df[merge_df['tag']=='test'].reset_index(drop=True)
test_df = test_df[["beta", "pep", 'label']]
print(len(test_df))

# 保存分割后的数据集为CSV文件
train_df.to_csv(os.path.join(data_path, "train_data_fold0.csv"), index=False)
val_df.to_csv(os.path.join(data_path, "valid_data_fold0.csv"), index=False)
test_df.to_csv(os.path.join(data_path, "test_data_fold0.csv"), index=False)

21982
19394
2747
2620
2749
2600
24614
23941
19394
2281
2266


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr_basic['label'] = 1


---

### level-IV

In [68]:
df_tcr = pd.read_csv(os.path.join(
    dataset_dir,
    "Hi-TpH-level-IV.csv"
))
print(df_tcr.shape)
df_tcr.head()

(26704, 13)


Unnamed: 0,antigen.epitope,hla.allele,alpha.v,alpha.j,alpha.cdr3,beta.v,beta.j,beta.cdr3,alpha.vseq.reconstructed,beta.vseq.reconstructed,hla.full.seq,hla.clip.seq,hla.short.seq
0,VMAPRTLIL,HLA-E*01:01,TRAV26-1*01,TRAJ37*01,IVVRSSNTGKLI,TRBV14*01,TRBJ2-3*01,ASSQDRDTQY,DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQY...,EAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFL...,MVDGTLLLLLSEALALTQTWAGSHSLKYFHTSVSRPGRGEPRFISV...,GSHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDAASPRMV...,YHSMYRESADTIFVNTLYLWHEFYSSAEQAYTWY
1,VMAPRTLIL,HLA-E*01:03,TRAV26-1*01,TRAJ37*01,IVVRSSNTGKLI,TRBV14*01,TRBJ2-3*01,ASSQDRDTQY,DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQY...,EAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFL...,MVDGTLLLLLSEALALTQTWAGSHSLKYFHTSVSRPGRGEPRFISV...,GSHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDAASPRMV...,YHSMYRESADTIFVNTLYLWHEFYSSAEQAYTWY
2,SLLMWITQC,HLA-A*02:01,TRAV21*01,TRAJ6*01,AVRPLLDGTYIPT,TRBV6-5*01,TRBJ2-2*01,ASSYLGNTGELF,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLI...,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY
3,EENLLDFVRF,HLA-B*44:02,TRAV26-1*01,TRAJ13*01,IVWGGYQKVT,TRBV7-9*01,TRBJ2-1*01,ASRYRDDSYNEQF,DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQY...,DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFL...,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...,GSHSMRYFYTAMSRPGRGEPRFITVGYVDDTLFVRFDSDATSPRKE...,YYTKYREISTNTYENTAYIRYDDYTWAVDAYLSY
4,EENLLDFVRF,HLA-B*44:03,TRAV26-1*01,TRAJ13*01,IVWGGYQKVT,TRBV7-9*01,TRBJ2-1*01,ASRYRDDSYNEQF,DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQY...,DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFL...,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...,GSHSMRYFYTAMSRPGRGEPRFITVGYVDDTLFVRFDSDATSPRKE...,YYTKYREISTNTYENTAYIRYDDYTWAVLAYLSY


In [69]:
df_tcr = df_tcr.dropna(subset=['antigen.epitope', 'beta.cdr3', "alpha.cdr3", "hla.short.seq",
                               "alpha.vseq.reconstructed",  "beta.vseq.reconstructed"])
df_tcr.rename(columns={"antigen.epitope":"pep", "hla.short.seq":"hla", 
                       "beta.cdr3":"beta", "alpha.cdr3":"alpha",
                       "beta.vseq.reconstructed":"beta_full",
                       "alpha.vseq.reconstructed":"alpha_full",}, inplace=True)
df_tcr = df_tcr.drop_duplicates(ignore_index=True)
print(df_tcr.shape)

(26704, 13)


In [70]:
tcr_length_counts = df_tcr['pep'].apply(len).value_counts()

print("pep 不同长度的个数统计：")
print(tcr_length_counts)

pep 不同长度的个数统计：
pep
9     22391
10     2822
8      1238
11      122
12       92
13       35
7         3
24        1
Name: count, dtype: int64


In [71]:
df_tcr = df_tcr[df_tcr['pep'].str.len() <= 10]
df_tcr = df_tcr[df_tcr['pep'].str.len() >= 8]
df_tcr.shape

(26451, 13)

In [72]:
beta_length_counts = df_tcr['beta'].apply(len).value_counts()

print("beta 不同长度的个数统计：")
print(beta_length_counts)

beta 不同长度的个数统计：
beta
15    6051
14    5351
13    5324
16    3427
12    1955
17    1948
18     854
11     790
19     375
20     149
10     129
21      49
22      17
8       10
9       10
23       7
26       2
25       2
24       1
Name: count, dtype: int64


In [73]:
alpha_length_counts = df_tcr['alpha'].apply(len).value_counts()

print("alpha 不同长度的个数统计：")
print(alpha_length_counts)

alpha 不同长度的个数统计：
alpha
13    5218
14    4893
15    4292
12    4219
11    2423
16    2189
17    1399
10     878
18     441
19     170
9      156
8       74
20      38
7       19
21      14
22       8
6        7
24       4
25       4
5        3
23       2
Name: count, dtype: int64


In [74]:
df_tcr = df_tcr[df_tcr['alpha'].str.len() <= 19]
df_tcr = df_tcr[df_tcr['alpha'].str.len() >= 9]
df_tcr.shape

(26278, 13)

In [75]:
df_tcr = df_tcr[df_tcr['beta'].str.len() <= 19]
df_tcr = df_tcr[df_tcr['beta'].str.len() >= 9]
df_tcr.shape

(26043, 13)

In [76]:
alpha_length_counts = df_tcr['alpha_full'].apply(len).value_counts()

print("alpha_full 不同长度的个数统计：")
print(alpha_length_counts)

alpha_full 不同长度的个数统计：
alpha_full
112    4199
113    3673
111    3619
110    2772
114    2635
109    2320
115    1903
116    1512
108    1114
117     889
118     704
107     286
119     159
106     103
120      61
105      55
104      30
103       8
102       1
Name: count, dtype: int64


In [77]:
beta_length_counts = df_tcr['beta_full'].apply(len).value_counts()

print("beta_full 不同长度的个数统计：")
print(beta_length_counts)

beta_full 不同长度的个数统计：
beta_full
114    5743
113    4676
115    4433
112    4107
116    2719
117    1464
111    1386
118     769
110     400
119     220
109      46
120      45
121      32
108       2
104       1
Name: count, dtype: int64


In [78]:
df_tcr = df_tcr[df_tcr['alpha_full'].str.len() <= 121]
df_tcr = df_tcr[df_tcr['alpha_full'].str.len() >= 105]
df_tcr.shape

(26004, 13)

In [79]:
df_tcr = df_tcr[df_tcr['beta_full'].str.len() <= 121]
df_tcr = df_tcr[df_tcr['beta_full'].str.len() >= 109]
df_tcr.shape

(26001, 13)

In [81]:
pep_length_counts = df_tcr['pep'].apply(len).value_counts()

print("pep 不同长度的个数统计：")
print(pep_length_counts)

pep 不同长度的个数统计：
pep
9     22003
10     2779
8      1219
Name: count, dtype: int64


In [82]:
df_tcr["ab"] = df_tcr['alpha_full'] + '/' + df_tcr['beta_full']

df_tcr = df_tcr.drop_duplicates(subset=['pep', 'hla', 'ab'], ignore_index=True)
print(len(df_tcr))
df_tcr = df_tcr.drop_duplicates(subset=['pep', 'hla', 'alpha_full', 'beta_full'], ignore_index=True)
print(len(df_tcr))

25987
25987


- all component

In [83]:
df_tcr_all = df_tcr[['pep', 'hla', 'ab']]

# 提取唯一的tcr值，转换为列表
tcr_list = df_tcr_all['ab'].unique().tolist()
print(len(tcr_list))

# 保存去重后的tcr列表为numpy数组
data_path = "/data/lujd/TCRdata/benchmarks/level4"
os.makedirs(data_path, exist_ok=True)
np.save(os.path.join(data_path, "tcr2candidates_pools.npy"), tcr_list)

23988


In [84]:
# 添加一列标签为1
df_tcr_all['label'] = 1

# 将数据集按照8:1:1的比例分割成训练集、验证集和测试集
# 先打乱数据集
df_shuffled = df_tcr_all.sample(frac=1, random_state=42).reset_index(drop=True)

# 计算各个数据集的数量
total_samples = len(df_shuffled)
train_samples = int(0.8 * total_samples)
val_samples = int(0.1 * total_samples)
test_samples = total_samples - train_samples - val_samples

# 分割数据集
train_df = df_shuffled.iloc[:train_samples]
val_df = df_shuffled.iloc[train_samples:train_samples + val_samples]
test_df = df_shuffled.iloc[train_samples + val_samples:]

# 保存分割后的数据集为CSV文件
train_df.to_csv(os.path.join(data_path, "train_data_fold0.csv"), index=False)
val_df.to_csv(os.path.join(data_path, "valid_data_fold0.csv"), index=False)
test_df.to_csv(os.path.join(data_path, "test_data_fold0.csv"), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr_all['label'] = 1


In [85]:
train_df

Unnamed: 0,pep,hla,ab,label
0,KLGGALQAK,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,RKEVEQDPGPFNVPEGATVAFNCTYSNSASQSFFWYRQDCRKEPKL...,1
1,IVTDFSVIK,YYAMYQENVAQTDVDTLYIIYRDYTWAAQAYRWY,SQQGEEDPQALSIQEGENATMNCSYKTSINNLQWYRQNSGRGLVHL...,1
2,RAKFKQLL,YDSEYRNIFTNTDESNLYLSYNYYTWAVDAYTWY,AQSVTQLDSQVPVFEEAPVELRCNYSSSVSVYLFWYVQYPNQGLQL...,1
3,KLGGALQAK,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,AQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPGQHLQL...,1
4,GILGFVFTL,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,TQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEGPVLL...,1
...,...,...,...,...
20784,KLGGALQAK,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,QKEVEQDPGPLSVPEGAIVSLNCTYSNSAFQYFMWYRQYSRKGPEL...,1
20785,KLGGALQAK,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,QQQVKQSPQSLIVQKGGISIINCAYENTAFDYFPWYQQFPGKGPAL...,1
20786,KLGGALQAK,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,KNEVEQSPQNLTAQEGEFITINCSYSVGISALHWLQQHPGGGIVSL...,1
20787,KLGGALQAK,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,GQNIDQPTEMTATEGAIVQINCTYQTSGFNGLFWYQQHAGEAPTFL...,1


- 2 component

In [86]:
df_tcr_basic = df_tcr[['pep', 'beta']]

# 提取唯一的tcr值，转换为列表
tcr_list = df_tcr_basic['beta'].unique().tolist()
print(len(tcr_list))

# 保存去重后的tcr列表为numpy数组
data_path = "/data/lujd/TCRdata/benchmarks/level4_basic"
os.makedirs(data_path, exist_ok=True)
np.save(os.path.join(data_path, "tcr2candidates_pools.npy"), tcr_list)

20067


In [87]:
df_tcr_basic['label'] = 1

# 将数据集按照8:1:1的比例分割成训练集、验证集和测试集
# 先打乱数据集
df_shuffled = df_tcr_basic.sample(frac=1, random_state=42).reset_index(drop=True)

# 计算各个数据集的数量
total_samples = len(df_shuffled)
train_samples = int(0.8 * total_samples)
val_samples = int(0.1 * total_samples)
test_samples = total_samples - train_samples - val_samples

# 分割数据集
train_df = df_shuffled.iloc[:train_samples]
val_df = df_shuffled.iloc[train_samples:train_samples + val_samples]
test_df = df_shuffled.iloc[train_samples + val_samples:]

# 去除掉每个数据集中重复的 ['pep', 'beta'] pair
print(len(train_df))
train_df = train_df[["beta", "pep", 'label']]
train_df = train_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(train_df))

print(len(val_df))
val_df = val_df[["beta", "pep", 'label']]
val_df = val_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(val_df))

print(len(test_df))
test_df = test_df[["beta", "pep", 'label']]
test_df = test_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True)
print(len(test_df))

# 去除数据集之间重复的 ['pep', 'beta'] pair
train_df['tag'] = 'train'
val_df['tag'] = 'val'
test_df['tag'] = 'test'
merge_df = pd.concat((train_df, val_df, test_df), axis=0).reset_index(drop=True)
print(len(merge_df))
merge_df = merge_df.drop_duplicates(subset=['beta', 'pep'], ignore_index=True, keep='first')
print(len(merge_df))

train_df = merge_df[merge_df['tag']=='train'].reset_index(drop=True)
train_df = train_df[["beta", "pep", 'label']]
print(len(train_df))

val_df = merge_df[merge_df['tag']=='val'].reset_index(drop=True)
val_df = val_df[["beta", "pep", 'label']]
print(len(val_df))

test_df = merge_df[merge_df['tag']=='test'].reset_index(drop=True)
test_df = test_df[["beta", "pep", 'label']]
print(len(test_df))

# 保存分割后的数据集为CSV文件
train_df.to_csv(os.path.join(data_path, "train_data_fold0.csv"), index=False)
val_df.to_csv(os.path.join(data_path, "valid_data_fold0.csv"), index=False)
test_df.to_csv(os.path.join(data_path, "test_data_fold0.csv"), index=False)

20789
17932
2598
2455
2600
2438
22825
22099
17932
2103
2064


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcr_basic['label'] = 1


In [88]:
train_df

Unnamed: 0,beta,pep,label
0,CASSLAQVQYTEAFF,KLGGALQAK,1
1,CASSLATVSYEQYF,IVTDFSVIK,1
2,CASSYGILVGGELFF,RAKFKQLL,1
3,CASSLTVAGVYPGYEQYF,KLGGALQAK,1
4,CASSIRSAYEQYF,GILGFVFTL,1
...,...,...,...
17927,CASSSKSGTSLGYNEQFF,KLGGALQAK,1
17928,CSASGGMGYNEQFF,KLGGALQAK,1
17929,CASSWGWSYEQYF,KLGGALQAK,1
17930,CSASGGSSYNEQFF,KLGGALQAK,1
