# Data from pMTnet

- download from: https://github.com/tianshilu/pMTnet/tree/master/data
- Place the downloaded files in ``TCRdata/raw/paper/pMTnet/``

In [1]:
import pandas as pd
from utils import save_csv

# Root directory prepended to every data path in this notebook.
# Paths are built by plain string concatenation (to_your_path + "TCRdata/..."),
# so the trailing "/" is required.
to_your_path = '/data/lujd/'

### Training set

In [2]:
import pandas as pd

# Folder holding the raw pMTnet files, and the training file inside it
datapath = f"{to_your_path}TCRdata/raw/paper/pMTnet/"
filename1 = "training_data.csv"

# Read the training CSV (comma is read_csv's default separator) and preview it
training_csv = datapath + filename1
df1 = pd.read_csv(training_csv)
df1.head(3)

Unnamed: 0,CDR3,Antigen,HLA
0,CASSSRSSYEQYF,GILGFVFTL,A*02:01
1,CASSPVTGGIYGYTF,NLVPMVATV,A*02:01
2,CSARDGTGNGYTF,GLCTLVAML,A*02:01


In [3]:
# Distinct (non-NaN) value count for every column, followed by the row count
for column_name, distinct_count in df1.nunique().items():
    print(f"{column_name}'s number: {distinct_count} ")

print(f"total number: {df1.shape[0]} ")

CDR3's number: 29226 
Antigen's number: 428 
HLA's number: 64 
total number: 32044 


In [4]:
df1["Antigen"].nunique(dropna=False)  # NaN is counted as a distinct value here, unlike the default nunique()

429

In [6]:
df1["CDR3"].nunique(dropna=False)  # number of distinct CDR3 strings

29226

In [7]:
# Count the distinct CDR3 entries that contain a ';'.
# A dedicated variable name is used so the builtin `sum` is not shadowed
# for the remainder of the kernel session.
cdr3_distinct = df1["CDR3"].drop_duplicates()
n_with_semicolon = int(cdr3_distinct.str.contains(";", regex=False).sum())
print(n_with_semicolon)

1799


- **The data is a little dirty: the `Antigen` column contains NaN, and some `CDR3` entries contain `;`.**

In [8]:
# Keep only rows whose "Antigen" is present, then renumber the index from 0
has_antigen = df1["Antigen"].notna()
df1 = df1.loc[has_antigen].reset_index(drop=True)

# (distinct antigens, remaining rows)
df1["Antigen"].nunique(dropna=False), df1.shape[0]

(428, 32041)

In [9]:
def remove_semicolons(data):
    """Return ``data`` with any leading and trailing ';' characters removed.

    Semicolons in the interior of the string are left untouched.
    """
    trimmed = data.strip(";")
    return trimmed

df1['CDR3'] = df1['CDR3'].apply(remove_semicolons)

# Sanity check: how many distinct CDR3 entries still contain a ';',
# and how many distinct CDR3 entries there are after cleaning.
# A dedicated variable name is used so the builtin `sum` is not shadowed.
cdr3_distinct = df1["CDR3"].drop_duplicates()
n_with_semicolon = int(cdr3_distinct.str.contains(";", regex=False).sum())
print(n_with_semicolon, df1["CDR3"].nunique())

0 28621


### Testing set

In [10]:
# Read the testing CSV from the same raw-data folder and preview it
filename2 = "testing_data.csv"
testing_csv = datapath + filename2
df2 = pd.read_csv(testing_csv)
df2.head(3)

Unnamed: 0,CDR3,Antigen,HLA
0,CASSLSFGTEAFF,SMLGIGIYPV,A*02:01
1,CASSLSFGTEAFF,LMFDRGMSLL,A*02:01
2,CASSLSFGTEAFF,ILEDRGFNQV,A*02:01


In [11]:
# Distinct (non-NaN) value count for every column, followed by the row count
for column_name, distinct_count in df2.nunique().items():
    print(f"{column_name}'s number: {distinct_count} ")

print(f"total number: {df2.shape[0]} ")

CDR3's number: 272 
Antigen's number: 224 
HLA's number: 24 
total number: 619 


In [12]:
df2["Antigen"].nunique(dropna=False)  # distinct antigens, NaN (if any) counted as one value

224

In [13]:
# First 30 distinct antigens, in order of first appearance, to eyeball malformed entries
antigens_seen = df2["Antigen"].unique()
antigens_seen[:30]

array(['SMLGIGIYPV', 'LMFDRGMSLL', 'ILEDRGFNQV', 'MMWDRGLGMM',
       'NLSNLGILPV', 'IMEDVGWLNV', 'NMGGLGIMPV', 'MMWDRGMGLL',
       'SMAGIGIVDV', 'SMLGIGIVPV', 'ALDSRSEHFM', 'AMDSRADMFV',
       'SMNSREEVFV', 'IMDSKSENFL', 'SMNSHSGTFL', 'ELAGIGILTV',
       'AAGIGILTV', 'EAAGIGILTV', 'SLYNTVATL', 'GILGFVFTL',
       'ELAGIGILTV\xa0', 'ALFNTVATL', 'CLFNTVATL', 'DLFNTVATL',
       'ELFNTVATL', 'FLFNTVATL', 'GLFNTVATL', 'HLFNTVATL', 'ILFNTVATL',
       'KLFNTVATL'], dtype=object)

Unexpected antigens containing a non-breaking space (`\xa0`):
- 'ELAGIGILTV\xa0' --> 'ELAGIGILTV '
- '\xa0NEGVKAAW' --> ' NEGVKAAW'

In [14]:
def remove_space(data):
    """Return ``data`` with leading and trailing whitespace removed.

    ``str.strip()`` is called without an argument so that every Unicode
    whitespace character is removed, including the non-breaking space
    (``'\\xa0'``) found in some antigen strings. ``strip(' ')`` would only
    remove the ASCII space and leave ``'\\xa0'`` in place.
    """
    return data.strip()

# Clean every antigen string, then re-inspect the first 30 distinct values
df2['Antigen'] = df2['Antigen'].map(remove_space)

distinct_antigens = df2["Antigen"].unique()
distinct_antigens[:30]

array(['SMLGIGIYPV', 'LMFDRGMSLL', 'ILEDRGFNQV', 'MMWDRGLGMM',
       'NLSNLGILPV', 'IMEDVGWLNV', 'NMGGLGIMPV', 'MMWDRGMGLL',
       'SMAGIGIVDV', 'SMLGIGIVPV', 'ALDSRSEHFM', 'AMDSRADMFV',
       'SMNSREEVFV', 'IMDSKSENFL', 'SMNSHSGTFL', 'ELAGIGILTV',
       'AAGIGILTV', 'EAAGIGILTV', 'SLYNTVATL', 'GILGFVFTL', 'ALFNTVATL',
       'CLFNTVATL', 'DLFNTVATL', 'ELFNTVATL', 'FLFNTVATL', 'GLFNTVATL',
       'HLFNTVATL', 'ILFNTVATL', 'KLFNTVATL', 'LLFNTVATL'], dtype=object)

In [15]:
# The clean spelling should be among the antigens after whitespace removal
"ELAGIGILTV" in set(df2["Antigen"])

True

### Concat

In [16]:
# Stack the training and testing frames row-wise with a fresh 0..n-1 index
frames = [df1, df2]
result = pd.concat(frames, axis=0, ignore_index=True)

# Distinct (non-NaN) value count for every column, followed by the row count
for column_name, distinct_count in result.nunique().items():
    print(f"{column_name}'s number: {distinct_count} ")

print(f"total number: {result.shape[0]} ")

CDR3's number: 28864 
Antigen's number: 602 
HLA's number: 71 
total number: 32660 


In [17]:
# Normalise HLA gene names so that every entry carries the "HLA-" marker
def add_prefix(data):
    """Return ``data`` prefixed with "HLA-" unless "HLA-" already occurs in it."""
    if "HLA-" in data:
        return data
    return "HLA-" + data

# Prefix every HLA name, then switch to the column names used downstream
result['HLA'] = result['HLA'].map(add_prefix)

column_renames = {"CDR3":"CDR3b", "Antigen":"peptide"}
result = result.rename(columns=column_renames)

result.head(3)

Unnamed: 0,CDR3b,peptide,HLA
0,CASSSRSSYEQYF,GILGFVFTL,HLA-A*02:01
1,CASSPVTGGIYGYTF,NLVPMVATV,HLA-A*02:01
2,CSARDGTGNGYTF,GLCTLVAML,HLA-A*02:01


In [18]:
# Every distinct HLA name in the combined frame, in order of first appearance
hla_alleles = result["HLA"].unique()
hla_alleles

array(['HLA-A*02:01', 'HLA-A*01:01', 'HLA-B*07:02', 'HLA-A*02',
       'HLA-A*01', 'HLA-A*11', 'HLA-B*08', 'HLA-B*44:05', 'HLA-B*57:01',
       'HLA-B*57:03', 'HLA-B*08:01', 'HLA-B*57', 'HLA-B*15', 'HLA-B*27',
       'HLA-B*27:05', 'HLA-B*42', 'HLA-B*35:01', 'HLA-A*24:02',
       'HLA-A*11:01', 'HLA-B*35:02', 'HLA-B*42:01', 'HLA-A*02:02',
       'HLA-A*02:03', 'HLA-A*02:04', 'HLA-A*02:05', 'HLA-A*02:06',
       'HLA-A*02:07', 'HLA-A*02:08', 'HLA-A*02:09', 'HLA-A*02:10',
       'HLA-A*02:11', 'HLA-A*02:12', 'HLA-A*02:13', 'HLA-A*02:14',
       'HLA-A*02:15', 'HLA-A*02:16', 'HLA-A*02:17', 'HLA-B*18',
       'HLA-A*02:01:59', 'HLA-A*02:01:48', 'HLA-B*08:01:29',
       'HLA-A*02:256', 'HLA-B*35:08:01', 'HLA-E*01:01:01:03',
       'HLA-B*35:42:01', 'HLA-B*57:06', 'HLA-B*44:05:01',
       'HLA-A*02:01:98', 'HLA-A*24:02:84', 'HLA-B*27:05:31',
       'HLA-B*51:193', 'HLA-A*02:01:110', 'HLA-B*51:01', 'HLA-B*07',
       'HLA-B*81:01', 'HLA-B*35:08', 'HLA-B*35', 'HLA-B*44:03:08',
       'HLA-B*58

In [19]:
# All positive samples
# Add a new column with all elements as 1 between columns 'peptide' and 'HLA'.
# DataFrame.insert mutates `result` in place and raises ValueError when the
# column already exists, so the guard keeps this cell safe to re-run.
if 'binder' not in result.columns:
    position = result.columns.get_loc('HLA')  # Get the index of 'HLA' column
    result.insert(position, 'binder', [1]*len(result))

result.head(3)

Unnamed: 0,CDR3b,peptide,binder,HLA
0,CASSSRSSYEQYF,GILGFVFTL,1,HLA-A*02:01
1,CASSPVTGGIYGYTF,NLVPMVATV,1,HLA-A*02:01
2,CSARDGTGNGYTF,GLCTLVAML,1,HLA-A*02:01


- Save data to ``TCRdata/otherpapers/``

In [20]:
# save
print(f"number of brief data: {len(result)}")

# Drop exact duplicate rows. The second count is the number of rows that
# remain after deduplication (not the number of duplicates), so the label
# says "deduplicated".
result = result.drop_duplicates(ignore_index=True)
print(f"number of deduplicated brief data: {len(result)}")

# Write the combined table with the save_csv helper imported from utils
savepath = to_your_path + "TCRdata/otherpapers/"
savename = "pMTnet.csv"
save_csv(result, savepath, savename)

number of brief data: 32660
number of duplicated brief data: 32070
File /data/lujd/TCRdata/otherpapers/pMTnet.csv has been successfully saved.
