In [1]:
import json
import pandas as pd

# Load the immrep2023 solutions table and show which HLA alleles it contains.
test_df = pd.read_csv("../data/immrep2023_solutions.csv")
print(pd.unique(test_df['HLA']))

# HLA allele name -> record of sequences for that allele
with open("../data/Hi-TpH-hla_allele2seq.json", "r") as f:
    seq_dict = json.loads(f.read())

def get_hla_short_seq(hla):
    """Return the 'short sequence' entry stored in ``seq_dict`` for allele ``hla``.

    Raises:
        KeyError: if ``hla`` is not a key of ``seq_dict`` or its record has no
            'short sequence' field.
    """
    allele_record = seq_dict[hla]
    return allele_record['short sequence']

# seq_dict keys carry an 'HLA-' prefix that the CSV values lack, so add it
# before looking up each allele's short sequence.
test_df['HLA'] = test_df['HLA'].map(lambda allele: 'HLA-' + allele)
test_df['hla.short.seq'] = test_df['HLA'].map(get_hla_short_seq)

# TCR gene name -> record holding that gene's amino acid sequence
with open("../data/tcr_gene2seq.json", "r") as f:
    gene_dict = json.loads(f.read())

for col in ('Va', 'Ja', 'Vb', 'Jb'):
    # Translate each gene name into its amino acid sequence.
    test_df[col+'.imgt.seq'] = test_df[col].map(lambda gene: gene_dict[gene]['amino acid sequence'])
    # Report how many rows ended up with the '(UNK)' placeholder.
    print(f"# {col+'.imgt.seq'} (UNK): {(test_df[col+'.imgt.seq'] == '(UNK)').sum()}")

test_df.head(2)

['A*01:01' 'B*35:01' 'A*02:01' 'B*08:01' 'B*07:02' 'A*11:01']
# Va.imgt.seq (UNK): 0
# Ja.imgt.seq (UNK): 0
# Vb.imgt.seq (UNK): 0
# Jb.imgt.seq (UNK): 0


Unnamed: 0,ID,Peptide,HLA,Va,Ja,TCRa,CDR1a,CDR2a,CDR3a,CDR3a_extended,...,CDR2b,CDR3b,CDR3b_extended,Label,Usage,hla.short.seq,Va.imgt.seq,Ja.imgt.seq,Vb.imgt.seq,Jb.imgt.seq
0,0,SALPTNADLY,HLA-A*01:01,TRAV8-3*01,TRAJ21*01,MLLELIPLLGIHFVLRTARAQSVTQPDIHITVSEGASLELRCNYSY...,YGATPY,YFSGDTLV,AVGDNFNKFY,CAVGDNFNKFYF,...,SYDVKM,ASAPTSAMGEQY,CASAPTSAMGEQYF,0,Private,YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGY,AQSVTQPDIHITVSEGASLELRCNYSYGATPYLFWYVQSPGQGLQL...,YNFNKFYFGSGTKLNVKP,DVKVTQSSRYLVKRTGEKVFLECVQDMDHENMFWYRQDPGLGLRLI...,SYEQYFGPGTRLTVT
1,1,IPSINVHHY,HLA-B*35:01,TRAV30*01,TRAJ26*01,METLLKVLSGTLLWQLTWVRSQQPVQSPQAVILREGEDAVINCSSS...,KALYS,LLKGGEQ,GTYDNYGQNFV,CGTYDNYGQNFVF,...,FQGTGA,ASSRLAGGVGDTQY,CASSRLAGGVGDTQYF,0,Private,YYATYRNIFTNTYESNLYIRYDSYTWAVLAYLWY,QQPVQSPQAVILREGEDAVINCSSSKALYSVHWYRQKHGEAPVFLM...,DNYGQNFVFGPGTRLSVLP,GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...,STDTQYFGPGTRLTVL


In [2]:
from utils import reconstruct_vseq

# Pack "<V gene seq>-<extended CDR3>-<J gene seq>" into one string per chain;
# this is the input handed to reconstruct_vseq below.
test_df['alpha.vseq.reconstructed'] = test_df['Va.imgt.seq'].str.cat(
    [test_df['CDR3a_extended'], test_df['Ja.imgt.seq']], sep='-')
test_df['beta.vseq.reconstructed'] = test_df['Vb.imgt.seq'].str.cat(
    [test_df['CDR3b_extended'], test_df['Jb.imgt.seq']], sep='-')

# Run the project helper on every packed string.
test_df['alpha.vseq.reconstructed-e'] = test_df['alpha.vseq.reconstructed'].map(reconstruct_vseq)
test_df['beta.vseq.reconstructed-e'] = test_df['beta.vseq.reconstructed'].map(reconstruct_vseq)

# Number of rows per chain whose result is the '(UNK)' placeholder.
for col in ('alpha.vseq.reconstructed-e', 'beta.vseq.reconstructed-e'):
    print(int((test_df[col] == "(UNK)").sum()))

0
0


In [3]:
# Count, per chain, the rows whose reconstructed sequence occurs verbatim inside
# the full chain sequence, and remember the index labels of the rows where it
# does not.
alpha_inds, beta_inds = [], []
alpha_right = beta_right = 0
alpha_all = beta_all = 0

for ind, row in test_df.iterrows():
    # Show the row labelled 0 so reconstruction and full chain can be compared by eye.
    if ind == 0:
        print(
            row['alpha.vseq.reconstructed-e'], row['TCRa'],
            row['beta.vseq.reconstructed-e'], row['TCRb'],
            sep='\n',
        )

    if row['alpha.vseq.reconstructed-e'] not in row['TCRa']:
        alpha_inds.append(ind)
    else:
        alpha_right += 1

    if row['beta.vseq.reconstructed-e'] not in row['TCRb']:
        beta_inds.append(ind)
    else:
        beta_right += 1

    alpha_all += 1
    beta_all += 1

print(f"\nMatched alpha: {alpha_right}/{alpha_all} ({alpha_right/alpha_all*100:.2f}%)")
print(f"Matched beta: {beta_right}/{beta_all} ({beta_right/beta_all*100:.2f}%)")
print(f"Matched alpha/beta: {alpha_right+beta_right}/{alpha_all+beta_all} ({(alpha_right+beta_right)/(alpha_all+beta_all)*100:.2f}%)")

AQSVTQPDIHITVSEGASLELRCNYSYGATPYLFWYVQSPGQGLQLLLKYFSGDTLVQGIKGFEAEFKRSQSSFNLRKPSVHWSDAAEYFCAVGDNFNKFYFGSGTKLNVKP
MLLELIPLLGIHFVLRTARAQSVTQPDIHITVSEGASLELRCNYSYGATPYLFWYVQSPGQGLQLLLKYFSGDTLVQGIKGFEAEFKRSQSSFNLRKPSVHWSDAAEYFCAVGDNFNKFYFGSGTKLNVKPNIQNPDPAVYQLRDSKSSDKSVCLFTDFDSQTNVSQSKDSDVYITDKTVLDMRSMDFKSNSAVAWSNKSDFACANAFNNSIIPEDTFFPSPESSCDVKLVEKSFETDTNLNFQNLSVIGFRILLLKVAGFNLLMTLRLWSS
DVKVTQSSRYLVKRTGEKVFLECVQDMDHENMFWYRQDPGLGLRLIYFSYDVKMKEKGDIPEGYSVSREKKERFSLILESASTNQTSMYLCASAPTSAMGEQYFGPGTRLTVT
MGIRLLCRVAFCFLAVGLVDVKVTQSSRYLVKRTGEKVFLECVQDMDHENMFWYRQDPGLGLRLIYFSYDVKMKEKGDIPEGYSVSREKKERFSLILESASTNQTSMYLCASAPTSAMGEQYFGPGTRLTVTEDLKNVFPPEVAVFEPSEAEISHTQKATLVCLATGFYPDHVELSWWVNGKEVHSGVSTDPQPLKEQPALNDSRYCLSSRLRVSATFWQNPRNHFRCQVQFYGLSENDEWTQDRAKPVTQIVSAEAWGRADCGFTSESYQQGVLSATILYEILLGKATLYAVLVSALVLMAMVKRKDSRG

Matched alpha: 3484/3484 (100.00%)
Matched beta: 3469/3484 (99.57%)
Matched alpha/beta: 6953/6968 (99.78%)


In [4]:
# Pack "<V gene seq>-<CDR3>-<J gene seq>-<full chain seq>" into one string per
# chain; extract_vseq splits these four fields apart again on '-'.
test_df['alpha.vseq'] = test_df['Va.imgt.seq'].str.cat(
    [test_df['CDR3a'], test_df['Ja.imgt.seq'], test_df['TCRa']], sep='-')
test_df['beta.vseq'] = test_df['Vb.imgt.seq'].str.cat(
    [test_df['CDR3b'], test_df['Jb.imgt.seq'], test_df['TCRb']], sep='-')

def extract_vseq(data, v_anchor_len=30, j_anchor_len=10):
    """Cut the V-to-J span out of a full chain sequence.

    ``data`` is a '-'-joined string with four fields:
    ``<V gene seq>-<CDR3>-<J gene seq>-<full chain seq>``. The first
    ``v_anchor_len`` residues of the V gene sequence and the last
    ``j_anchor_len`` residues of the J gene sequence are used as anchors; the
    returned slice of the full chain starts where the V anchor begins and stops
    where the J anchor ends.

    Args:
        data: The four '-'-separated fields described above.
        v_anchor_len: Number of leading V-gene residues used to locate the start.
        j_anchor_len: Number of trailing J-gene residues used to locate the end
            (must be positive).

    Returns:
        The slice of the full chain between the anchors, or ``'(UNK)'`` (after
        printing ``data``) when an anchor is empty or not found, or when the
        slice is not longer than the CDR3 field.

    Raises:
        IndexError: If ``data`` has fewer than four '-'-separated fields.
    """
    fields = data.split('-')
    vgene_seq, cdr3, jgene_seq, full_seq = fields[0], fields[1], fields[2], fields[3]

    v_anchor = vgene_seq[:v_anchor_len]
    j_anchor = jgene_seq[-j_anchor_len:]

    # An empty anchor would "match" at position 0, so treat it as not found.
    vseq_start = full_seq.find(v_anchor) if v_anchor else -1

    # The J segment lies downstream of V, so only search from the V start on.
    # The -1 check must happen BEFORE the anchor length is added; otherwise a
    # missing anchor turns into a plausible-looking positive end offset.
    j_start = -1
    if vseq_start != -1 and j_anchor:
        j_start = full_seq.find(j_anchor, vseq_start)

    if j_start != -1:
        # Use the real anchor length: the J sequence may be shorter than j_anchor_len.
        vseq_end = j_start + len(j_anchor)
        if vseq_end - vseq_start > len(cdr3):
            return full_seq[vseq_start:vseq_end]

    print(data)
    return '(UNK)'

# Extract the V-to-J span from the full chain for every row of each chain.
test_df['alpha.vseq-e'] = test_df['alpha.vseq'].map(extract_vseq)
test_df['beta.vseq-e'] = test_df['beta.vseq'].map(extract_vseq)

# Number of rows per chain for which extraction returned '(UNK)'.
for col in ('alpha.vseq-e', 'beta.vseq-e'):
    print(int((test_df[col] == "(UNK)").sum()))

GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSA-SAKEILGGYGYT-NYGYTFGSGTRLTVV-MLLLLLLLGPAGSGLGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAKEILGGYGYTFGSGTRLTVVEDLNKVFPPEVAVFEPSEAEISHTQKATLVCLATGFFPDHVELSWWVNGKEVHSGVSTDPQPLKEQPALNDSRYCLSSRLRVSATFWQNPRNHFRCQVQFYGLSENDEWTQDRAKPVTQIVSAEAWGRADCGFTSVSYQQGVLSATILYEILLGKATLYAVLVSALVLMAMVKRKDF
GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAR-SAIPHSYNEQF-SYNEQFFGPGTRLTVL-MLLLLLLLGPGISLLLPGSLAGSGLGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAIPHSYNEQFFGPGTRLTVLEDLKNVFPPEVAVFEPSEAEISHTQKATLVCLATGFYPDHVELSWWVNGKEVHSGVSTDPQPLKEQPALNDSRYCLSSRLRVSATFWQNPRNHFRCQVQFYGLSENDEWTQDRAKPVTQIVSAEAWGRADCGFTSESYQQGVLSATILYEILLGKATLYAVLVSALVLMAMVKRKDSRG
GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSA-SAKEILGGYGYT-NYGYTFGS

In [5]:
# Alpha-chain rows where the extracted and the reconstructed sequences disagree.
test_df.loc[
    test_df['alpha.vseq-e'].ne(test_df['alpha.vseq.reconstructed-e']),
    ['ID', 'Peptide', 'HLA', 'Va', 'Ja', 'CDR3a_extended', 'alpha.vseq.reconstructed-e', 'alpha.vseq-e'],
]

Unnamed: 0,ID,Peptide,HLA,Va,Ja,CDR3a_extended,alpha.vseq.reconstructed-e,alpha.vseq-e


In [6]:
# Beta-chain rows where the extracted and the reconstructed sequences disagree.
test_df.loc[
    test_df['beta.vseq-e'].ne(test_df['beta.vseq.reconstructed-e']),
    ['ID', 'Peptide', 'HLA', 'Vb', 'Jb', 'CDR3b_extended', 'beta.vseq.reconstructed-e', 'beta.vseq-e'],
]

Unnamed: 0,ID,Peptide,HLA,Vb,Jb,CDR3b_extended,beta.vseq.reconstructed-e,beta.vseq-e
935,935,VTEHDTLLY,HLA-A*01:01,TRBV20-1*02,TRBJ1-2*01,CSAKEILGGYGYTF,GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,(UNK)
1125,1125,GILGFVFTL,HLA-A*02:01,TRBV20-1*05,TRBJ2-1*01,CSAIPHSYNEQFF,GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLML...,(UNK)
1209,1209,NLVPMVATV,HLA-A*02:01,TRBV20-1*02,TRBJ1-2*01,CSAKEILGGYGYTF,GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,(UNK)
1403,1403,YVLDHLIVV,HLA-A*02:01,TRBV20-1*02,TRBJ1-2*01,CSAKEILGGYGYTF,GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,(UNK)
1413,1413,YVLDHLIVV,HLA-A*02:01,TRBV20-1*05,TRBJ2-1*01,CSAIPHSYNEQFF,GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLML...,(UNK)
1890,1890,GLCTLVAML,HLA-A*02:01,TRBV20-1*05,TRBJ2-1*01,CSAIPHSYNEQFF,GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLML...,(UNK)
1926,1926,RPHERNGFTVL,HLA-B*07:02,TRBV20-1*05,TRBJ2-1*01,CSAIPHSYNEQFF,GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLML...,(UNK)
2153,2153,RPPIFIRRL,HLA-B*07:02,TRBV20-1*05,TRBJ2-1*01,CSAIPHSYNEQFF,GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLML...,(UNK)
2247,2247,RAKFKQLL,HLA-B*08:01,TRBV20-1*02,TRBJ1-2*01,CSAKEILGGYGYTF,GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,(UNK)
2582,2582,IPSINVHHY,HLA-B*35:01,TRBV20-1*05,TRBJ2-1*01,CSAIPHSYNEQFF,GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLML...,(UNK)


In [7]:
# How long are the reconstructed alpha-chain sequences?
(
    test_df['alpha.vseq.reconstructed-e']
    .str.len()        # length of each sequence
    .value_counts()   # rows per length, most frequent first
)

alpha.vseq.reconstructed-e
112    737
113    640
111    467
114    417
110    327
109    221
115    192
116    151
108    148
117     66
118     66
107     38
119      4
104      3
120      3
121      2
105      1
102      1
Name: count, dtype: int64

In [8]:
# How long are the reconstructed beta-chain sequences?
(
    test_df['beta.vseq.reconstructed-e']
    .str.len()        # length of each sequence
    .value_counts()   # rows per length, most frequent first
)

beta.vseq.reconstructed-e
115    724
113    702
112    616
114    561
116    395
111    181
117    131
118     69
120     32
110     25
108     25
119     17
122      4
121      2
Name: count, dtype: int64