In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [2]:
# McPAS Analytics
# Make sure to delete the empty entry at the end of the file
mcpas = pd.read_csv('McPAS-TCR.csv', encoding = 'latin')
mcpas

Unnamed: 0,CDR3.alpha.aa,CDR3.beta.aa,Species,Category,Pathology,Pathology.Mesh.ID,Additional.study.details,Antigen.identification.method,Single.cell,NGS,Antigen.protein,Protein.ID,Epitope.peptide,Epitope.ID,MHC,Tissue,T.Cell.Type,T.cell.characteristics,CDR3.alpha.nt,TRAV,TRAJ,TRBV,TRBD,TRBJ,Reconstructed.J.annotation,CDR3.beta.nt,Mouse.strain,PubMed.ID,Remarks
0,,CASSDAGANTEVF,Mouse,Pathogens,Lymphocytic choriomeningitis virus (LCMV),D008217,,2.1,No,No,Pre-glycoprotein polyprotein GP complex,P09991,IKAVYNFATCG,26751,H-2db,,CD8,,,,,TRBV8-1,,TRBJ1-1,No,,P14,1716213,
1,,CASSDAGAYAEQF,Mouse,Pathogens,Lymphocytic choriomeningitis virus (LCMV),D008217,,2.1,No,No,Pre-glycoprotein polyprotein GP complex,P09991,IKAVYNFATCG,26751,H-2db,,CD8,,,,,TRBV8-1,,TRBJ2-1,No,,P14,1716213,
2,,CASSDAGGAAEVF,Mouse,Pathogens,Lymphocytic choriomeningitis virus (LCMV),D008217,,2.1,No,No,Pre-glycoprotein polyprotein GP complex,P09991,IKAVYNFATCG,26751,H-2db,,CD8,,,,,TRBV8-3,,TRBJ1-1,No,,P14,1716213,
3,,CASSDAGHSPLYF,Mouse,Pathogens,Lymphocytic choriomeningitis virus (LCMV),D008217,,2.1,No,No,Pre-glycoprotein polyprotein GP complex,P09991,IKAVYNFATCG,26751,H-2db,,CD8,,,,,TRBV8-1,,TRBJ1-6,No,,P14,1716213,
4,,CASSDAWGGAEQYF,Mouse,Pathogens,Lymphocytic choriomeningitis virus (LCMV),D008217,,2.1,No,No,Pre-glycoprotein polyprotein GP complex,P09991,IKAVYNFATCG,26751,H-2db,,CD8,,,,,TRBV8-3,,TRBJ2-6,No,,P14,1716213,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38787,,CSVRPHHEQFF,Human,Pathogens,M.Tuberculosis,D009169,Bulk,2.2,No,Yes,,,,,,PBMC,CD4,,,,,TRBV29-1,,,,,,32341563,
38788,,CSVSDGNQPQHF,Human,Pathogens,M.Tuberculosis,D009169,Bulk,2.2,No,Yes,,,,,,PBMC,CD4,,,,,TRBV29-1,,,,,,32341563,
38789,,CSVTEGQRIF,Human,Pathogens,M.Tuberculosis,D009169,Bulk,2.2,No,Yes,,,,,,PBMC,CD4,,,,,TRBV29-1,,,,,,32341563,
38790,,CSVVWSVMENEKLFF,Human,Pathogens,M.Tuberculosis,D009169,Bulk,2.2,No,Yes,,,,,,PBMC,CD4,,,,,TRBV29-1,,,,,,32341563,


In [3]:
# McPAS Summary Statistics 
def describe_mcpas(df):
    print('Mouse Samples: ', len(df[df['Species'] == 'Mouse']))
    print('Human Samples: ', len(df[df['Species'] == 'Human']))
    print('Unique Antigen Sequences: ', len(df['Epitope.peptide'].unique()))
    print('Cancer Based Samples: ', len(df[df['Category'] == 'Cancer']))
    print('CD8 T-Cell Samples', len(df[df['T.Cell.Type'] == 'CD8']))
    print('CD4 T-Cell Samples', len(df[df['T.Cell.Type'] == 'CD4']))
    print('TCR-Epitope Pairs Available: ', len(df))
# TCR Beta Sequences
mcpas_beta_pairs = mcpas.loc[(mcpas['CDR3.beta.aa'].notnull()) & (mcpas['Epitope.peptide'].notnull())]
print('McPAS TCR Beta Sequences')
print('Unique CDR3 Beta Sequences: ', len(mcpas_beta_pairs['CDR3.beta.aa'].unique()))
describe_mcpas(mcpas_beta_pairs)

# TCR Alpha Sequences
mcpas_alpha_pairs = mcpas.loc[(mcpas['CDR3.alpha.aa'].notnull()) & (mcpas['Epitope.peptide'].notnull())]
print('\nMcPAS TCR Alpha Sequences')
print('Unique CDR3 Alpha Sequences: ', len(mcpas_alpha_pairs['CDR3.alpha.aa'].unique()))
describe_mcpas(mcpas_alpha_pairs)

# TCR Alpha Sequences
mcpas_both_pairs = mcpas.loc[(mcpas['CDR3.alpha.aa'].notnull()) & (mcpas['Epitope.peptide'].notnull()) & (mcpas['CDR3.beta.aa'].notnull())]
print('\nMcPAS TCR Both Beta and Alpha Available')
describe_mcpas(mcpas_both_pairs)

McPAS TCR Beta Sequences
Unique CDR3 Beta Sequences:  11774
Mouse Samples:  2914
Human Samples:  11870
Unique Antigen Sequences:  350
Cancer Based Samples:  2398
CD8 T-Cell Samples 11732
CD4 T-Cell Samples 1114
TCR-Epitope Pairs Available:  14828

McPAS TCR Alpha Sequences
Unique CDR3 Alpha Sequences:  3908
Mouse Samples:  2143
Human Samples:  3851
Unique Antigen Sequences:  279
Cancer Based Samples:  1203
CD8 T-Cell Samples 4283
CD4 T-Cell Samples 233
TCR-Epitope Pairs Available:  5994

McPAS TCR Both Beta and Alpha Available
Mouse Samples:  2047
Human Samples:  2795
Unique Antigen Sequences:  258
Cancer Based Samples:  930
CD8 T-Cell Samples 3655
CD4 T-Cell Samples 132
TCR-Epitope Pairs Available:  4842


In [4]:
# VDJDB Analytics
vdjdb = pd.read_csv('VDJDB.csv', encoding = 'latin')
vdjdb

Unnamed: 0,complex.id,Gene,CDR3,V,J,Species,MHC A,MHC B,MHC class,Epitope,Epitope gene,Epitope species,Reference,Method,Meta,CDR3fix,Score
0,0,TRB,CSVWGTGKTYEQYF,TRBV29-1*01,TRBJ2-7*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:17287271,"{""frequency"": ""19/75"", ""identification"": ""tetr...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CSVWGTGKTYEQYF"", ""cdr3_old"": ""CSVWGT...",1
1,0,TRB,CSVWGEGRSYEQYF,TRBV29-1*01,TRBJ2-7*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:17287271,"{""frequency"": ""5/75"", ""identification"": ""tetra...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CSVWGEGRSYEQYF"", ""cdr3_old"": ""CSVWGE...",1
2,0,TRB,CSATILAGVPYGEQYF,TRBV20-1*01,TRBJ2-7*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:17287271,"{""frequency"": ""17/75"", ""identification"": ""tetr...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CSATILAGVPYGEQYF"", ""cdr3_old"": ""CSAT...",1
3,0,TRB,CSASEGTSSYEQYF,TRBV20-1*01,TRBJ2-7*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:17287271,"{""frequency"": ""1/75"", ""identification"": ""tetra...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CSASEGTSSYEQYF"", ""cdr3_old"": ""CSASEG...",0
4,0,TRB,CASSFDREVTGELFF,TRBV7-3*01,TRBJ2-2*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:17287271,"{""frequency"": ""7/75"", ""identification"": ""tetra...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFDREVTGELFF"", ""cdr3_old"": ""CASSF...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76686,24605,TRA,CLVGLFSDGQKLLF,TRAV4*01,,HomoSapiens,HLA-A*68:01,B2M,MHCI,DATYQRTRALVR,NP,InfluenzaA,PMID:31811120,"{""frequency"": ""1/20"", ""identification"": ""tetra...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CLVGLFSDGQKLLF"", ""cdr3_old"": ""CLVGLF...",0
76687,24617,TRA,CAVKYGQKLLF,TRAV12-2*01,,HomoSapiens,HLA-A*68:01,B2M,MHCI,DATYQRTRALVR,NP,InfluenzaA,PMID:31811120,"{""frequency"": ""6/62"", ""identification"": ""tetra...","{""cell.subset"": ""CD8+TetHigh"", ""clone.id"": """",...","{""cdr3"": ""CAVKYGQKLLF"", ""cdr3_old"": ""CAVKYGQKL...",0
76688,24618,TRA,CLVGLYSDGQKLLF,TRAV4*01,,HomoSapiens,HLA-A*68:01,B2M,MHCI,DATYQRTRALVR,NP,InfluenzaA,PMID:31811120,"{""frequency"": ""4/62"", ""identification"": ""tetra...","{""cell.subset"": ""CD8+TetHigh"", ""clone.id"": """",...","{""cdr3"": ""CLVGLYSDGQKLLF"", ""cdr3_old"": ""CLVGLY...",0
76689,24633,TRA,CAVKHGQKLLF,TRAV12-2*01,,HomoSapiens,HLA-A*68:01,B2M,MHCI,DATYQRTRALVR,NP,InfluenzaA,PMID:31811120,"{""frequency"": ""1/62"", ""identification"": ""tetra...","{""cell.subset"": ""CD8+TetLow"", ""clone.id"": """", ...","{""cdr3"": ""CAVKHGQKLLF"", ""cdr3_old"": ""CAVKHGQKL...",0


In [5]:
# VDJDB Summary Statistics
def describe_vdjdb(df):
    print('Human Samples: ', len(df[df['Species'] == 'HomoSapiens']))
    print('Mouse Samples: ', len(df[df['Species'] == 'MusMusculus']))
    print('Monkey Samples: ', len(df[df['Species'] == 'MacacaMulatta']))
    print('Unique Antigen Sequences: ', len(df['Epitope'].unique()))
    print('MHC Class I: ', len(df[df['MHC class'] == 'MHCI']))
    print('MHC Class II: ', len(df[df['MHC class'] == 'MHCII']))
    print('Score 0: ', len(df[df['Score'] == 0]))
    print('Score 1: ', len(df[df['Score'] == 1]))
    print('Score 2: ', len(df[df['Score'] == 2]))
    print('Score 3: ', len(df[df['Score'] == 3]))

vdjdb_beta_pairs = vdjdb.loc[(vdjdb['Gene'] == 'TRB') & (vdjdb['Epitope'].notnull())]
print('VDJDB Beta Sequences')
print('Unique CDR3 Beta Sequences: ', len(vdjdb_beta_pairs['CDR3'].unique()))
describe_vdjdb(vdjdb_beta_pairs)

vdjdb_alpha_pairs = vdjdb.loc[(vdjdb['Gene'] == 'TRA') & (vdjdb['Epitope'].notnull())]
print('\nVDJDB Alpha Sequences')
print('Unique CDR3 Alpha Sequences: ', len(vdjdb_alpha_pairs['CDR3'].unique()))
describe_vdjdb(vdjdb_alpha_pairs)

print('\nVDJDB Alpha or Beta Sequences')
print('Unique CDR3 Alpha or Beta Sequences: ', len(vdjdb['CDR3'].unique()))
describe_vdjdb(vdjdb)

VDJDB Beta Sequences
Unique CDR3 Beta Sequences:  34361
Human Samples:  40550
Mouse Samples:  2926
Monkey Samples:  1982
Unique Antigen Sequences:  229
MHC Class I:  43674
MHC Class II:  1784
Score 0:  39070
Score 1:  5335
Score 2:  622
Score 3:  431

VDJDB Alpha Sequences
Unique CDR3 Alpha Sequences:  22197
Human Samples:  28969
Mouse Samples:  2264
Monkey Samples:  0
Unique Antigen Sequences:  163
MHC Class I:  30353
MHC Class II:  880
Score 0:  28821
Score 1:  1974
Score 2:  211
Score 3:  227

VDJDB Alpha or Beta Sequences
Unique CDR3 Alpha or Beta Sequences:  56556
Human Samples:  69519
Mouse Samples:  5190
Monkey Samples:  1982
Unique Antigen Sequences:  230
MHC Class I:  74027
MHC Class II:  2664
Score 0:  67891
Score 1:  7309
Score 2:  833
Score 3:  658


In [6]:
# Combine the datasets (Beta Sequences Only)
sequences_only_mcpas = mcpas.loc[(mcpas['Epitope.peptide'].notnull())]
sequences_only_mcpas = sequences_only_mcpas[['CDR3.beta.aa', 'Epitope.peptide']]
sequences_only_mcpas.columns = ['CDR3', 'Epitope']
vdjdb_beta_pairs = vdjdb.loc[(vdjdb['Gene'] == 'TRB') & (vdjdb['Epitope'].notnull())]
sequences_only_vdjdb = vdjdb[['CDR3', 'Epitope', 'Score']]
combined_sequences = pd.concat([sequences_only_vdjdb, sequences_only_mcpas])
combined_sequences.reset_index(drop=True)

Unnamed: 0,CDR3,Epitope,Score
0,CSVWGTGKTYEQYF,FLKEKGGL,1.0
1,CSVWGEGRSYEQYF,FLKEKGGL,1.0
2,CSATILAGVPYGEQYF,FLKEKGGL,1.0
3,CSASEGTSSYEQYF,FLKEKGGL,0.0
4,CASSFDREVTGELFF,FLKEKGGL,1.0
...,...,...,...
93055,CASSARSTGELFF,GILGFVFTL,
93056,CASSARSTGELFF,GILGFVFTL,
93057,CASSARSTGELFF,GILGFVFTL,
93058,CASSARSTGELFF,GILGFVFTL,


In [7]:
def combined_summary_stats(df):
    print('Unique CDR3 Sequences: ', len(df['CDR3'].unique()))
    print('Unique Antigen Sequences: ', len(df['Epitope'].unique()))
    print('VDJDB Pairs: ', len(df[df['Score'].notnull()]))
    print('McPAS Pairs: ', len(df[df['Score'].isna()]))

combined_summary_stats(combined_sequences)
combined_sequences.to_csv('vdjdb_mcpas_combined_data.csv', index=False)

Unique CDR3 Sequences:  63613
Unique Antigen Sequences:  532
VDJDB Pairs:  76691
McPAS Pairs:  16369
