In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
pd.set_option('display.max_columns', None)

In [None]:
# McPAS Analytics
# Manual prerequisite: delete the empty entry at the end of McPAS-TCR.csv
# before running this cell.
mcpas = pd.read_csv(
    'McPAS-TCR.csv',
    encoding='latin',
    low_memory=False,
)
mcpas

In [None]:
# McPAS Summary Statistics 
def describe_mcpas(df):
    """Print row counts describing a McPAS-TCR frame.

    Reads the 'Species', 'Epitope.peptide', 'Category', 'T.Cell.Type' and
    'MHC' columns of ``df`` and prints one labelled count per line.
    Nothing is returned.
    """
    def rows_equal(column, value):
        # Number of rows whose `column` equals `value` (missing values never match).
        return int((df[column] == value).sum())

    def rows_with_mhc(pattern):
        # str.match anchors `pattern` at the start of each MHC string; entries
        # that are not True (including missing ones) are not counted.
        return int(df['MHC'].str.match(pattern).eq(True).sum())

    print('Mouse Samples: ', rows_equal('Species', 'Mouse'))
    print('Human Samples: ', rows_equal('Species', 'Human'))
    print('Unique Antigen Sequences: ', len(pd.unique(df['Epitope.peptide'])))
    print('Cancer Based Samples: ', rows_equal('Category', 'Cancer'))
    print('CD8 T-Cell Samples: ', rows_equal('T.Cell.Type', 'CD8'))
    print('CD4 T-Cell Samples: ', rows_equal('T.Cell.Type', 'CD4'))
    print('MHC I Samples: ', rows_with_mhc('^HLA-[ABC]'))
    print('MHC II Sample: ', rows_with_mhc('^(HLA-D|DR|D)'))
    print('TCR-Epitope Pairs Available: ', df.shape[0])

# Row masks reused by the three McPAS reports below
mcpas_has_epitope = mcpas['Epitope.peptide'].notna()
mcpas_has_beta = mcpas['CDR3.beta.aa'].notna()
mcpas_has_alpha = mcpas['CDR3.alpha.aa'].notna()

# McPAS TCR Beta Sequences: rows with both a beta CDR3 and an epitope
mcpas_beta_pairs = mcpas.loc[mcpas_has_beta & mcpas_has_epitope]
print('McPAS TCR Beta Sequences')
print('Unique CDR3 Beta Sequences: ', len(pd.unique(mcpas_beta_pairs['CDR3.beta.aa'])))
describe_mcpas(mcpas_beta_pairs)

# McPAS TCR Alpha Sequences: rows with both an alpha CDR3 and an epitope
mcpas_alpha_pairs = mcpas.loc[mcpas_has_alpha & mcpas_has_epitope]
print('\nMcPAS TCR Alpha Sequences')
print('Unique CDR3 Alpha Sequences: ', len(pd.unique(mcpas_alpha_pairs['CDR3.alpha.aa'])))
describe_mcpas(mcpas_alpha_pairs)

# McPAS TCR Alpha AND Beta Sequences: rows with both chains and an epitope
mcpas_both_pairs = mcpas.loc[mcpas_has_alpha & mcpas_has_epitope & mcpas_has_beta]
print('\nMcPAS TCR Both Beta and Alpha Available')
describe_mcpas(mcpas_both_pairs)

In [None]:
# VDJDB Analytics
# The export is tab-separated and read with the 'latin' codec.
vdjdb = pd.read_csv(
    'VDJDB.tsv',
    sep='\t',
    encoding='latin',
)
vdjdb

In [None]:
# VDJDB Summary Statistics
def describe_vdjdb(df):
    """Print row counts describing a VDJDB frame.

    Reads the 'Species', 'Epitope', 'MHC class' and 'Score' columns of
    ``df`` and prints one labelled count per line. Nothing is returned.
    """
    def rows_equal(column, value):
        # Number of rows whose `column` equals `value`.
        return int((df[column] == value).sum())

    species_labels = (
        ('Human Samples: ', 'HomoSapiens'),
        ('Mouse Samples: ', 'MusMusculus'),
        ('Monkey Samples: ', 'MacacaMulatta'),
    )
    for label, species in species_labels:
        print(label, rows_equal('Species', species))

    print('Unique Antigen Sequences: ', len(pd.unique(df['Epitope'])))

    mhc_labels = (
        ('MHC Class I: ', 'MHCI'),
        ('MHC Class II: ', 'MHCII'),
    )
    for label, mhc_class in mhc_labels:
        print(label, rows_equal('MHC class', mhc_class))

    print('Total Pairs: ', df.shape[0])

    score_labels = (
        ('Score 0: ', 0),
        ('Score 1: ', 1),
        ('Score 2: ', 2),
        ('Score 3: ', 3),
    )
    for label, score in score_labels:
        print(label, rows_equal('Score', score))

# Row mask reused by the per-chain VDJDB reports below
vdjdb_has_epitope = vdjdb['Epitope'].notna()

# VDJDB TCR Beta Sequences: TRB rows that carry an epitope
vdjdb_beta_pairs = vdjdb.loc[vdjdb['Gene'].eq('TRB') & vdjdb_has_epitope]
print('VDJDB Beta Sequences')
print('Unique CDR3 Beta Sequences: ', len(pd.unique(vdjdb_beta_pairs['CDR3'])))
describe_vdjdb(vdjdb_beta_pairs)

# VDJDB TCR Alpha Sequences: TRA rows that carry an epitope
vdjdb_alpha_pairs = vdjdb.loc[vdjdb['Gene'].eq('TRA') & vdjdb_has_epitope]
print('\nVDJDB Alpha Sequences')
print('Unique CDR3 Alpha Sequences: ', len(pd.unique(vdjdb_alpha_pairs['CDR3'])))
describe_vdjdb(vdjdb_alpha_pairs)

# VDJDB TCR Alpha OR Beta Sequences: the whole, unfiltered table
print('\nVDJDB Alpha or Beta Sequences')
print('Unique CDR3 Alpha or Beta Sequences: ', len(pd.unique(vdjdb['CDR3'])))
describe_vdjdb(vdjdb)

In [None]:
# IEDB Analytics
def _load_iedb_export(csv_name, heading):
    """Read one IEDB CSV export, print `heading`, display the frame and return it."""
    frame = pd.read_csv(csv_name, low_memory=False)
    print(heading)
    display(frame)
    return frame

iedb_class_i = _load_iedb_export('iedb_mhc_class_i.csv', 'IEDB MHC Class I')

iedb_class_ii = _load_iedb_export('iedb_mhc_class_ii.csv', 'IEDB MHC Class II')

In [None]:
# Filter down the data to relevant columns
def extract_relevant_columns(df):
    """Return a three-column view of an IEDB frame with short column names.

    Keeps 'Description', 'Chain 1 CDR3 Curated' and 'Chain 2 CDR3 Curated'
    (in that order) and renames them to 'Peptide', 'Alpha CDR3' and
    'Beta CDR3'. The input frame is not modified.
    """
    short_names = {
        'Description': 'Peptide',
        'Chain 1 CDR3 Curated': 'Alpha CDR3',
        'Chain 2 CDR3 Curated': 'Beta CDR3',
    }
    selected = df[list(short_names)]
    return selected.rename(columns=short_names)

# Rebind both IEDB frames to their trimmed, renamed three-column form
iedb_class_i, iedb_class_ii = map(
    extract_relevant_columns,
    (iedb_class_i, iedb_class_ii),
)

In [None]:
# Generate Some Summary Statistics about IEDB
def summary_stats(df):
    """Print the number of distinct 'Peptide' values and the row count of ``df``."""
    distinct_peptides = pd.unique(df['Peptide'])
    print('Unique Peptides: ', len(distinct_peptides))
    print('Total Pairs: ', df.shape[0])
    
def filter_alpha(df):
    """Return the rows of ``df`` whose 'Alpha CDR3' value is not missing."""
    has_alpha = df['Alpha CDR3'].notna()
    return df[has_alpha]

def filter_beta(df):
    """Return the rows of ``df`` whose 'Beta CDR3' value is not missing."""
    has_beta = df['Beta CDR3'].notna()
    return df[has_beta]

def filter_both_alpha_beta(df):
    """Return the rows of ``df`` where neither 'Alpha CDR3' nor 'Beta CDR3' is missing."""
    both_present = df[['Alpha CDR3', 'Beta CDR3']].notna().all(axis=1)
    return df[both_present]

# One report per chain filter (alpha, beta, both), each covering both MHC classes
iedb_frames_by_class = (
    ('MHC Class I', iedb_class_i),
    ('MHC Class II', iedb_class_ii),
)
iedb_chain_reports = (
    ('CDR3 Alpha Pairs', filter_alpha),
    ('\nCDR3 Beta Pairs', filter_beta),
    ('\nCDR3 Alpha and Beta Pairs', filter_both_alpha_beta),
)
for report_heading, chain_filter in iedb_chain_reports:
    print(report_heading)
    for class_heading, class_frame in iedb_frames_by_class:
        print(class_heading)
        summary_stats(chain_filter(class_frame))

In [None]:
"""
Combine the datasets
Conditions:
- Beta Sequences Only
- VDJDB Score > 0
"""
def final_filter_mcpas(raw_mcpas):
    """Reduce the raw McPAS frame to beta-chain CDR3/epitope pairs with an MHC class.

    Keeps only rows that have both an 'Epitope.peptide' and a 'CDR3.beta.aa'
    value, maps the 'MHC' string onto 'MHCI' / 'MHCII', and drops rows whose
    MHC string matches neither pattern (including missing MHC values).

    Parameters
    ----------
    raw_mcpas : pandas.DataFrame
        Frame with at least the 'CDR3.beta.aa', 'Epitope.peptide' and 'MHC'
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'CDR3', 'Epitope', 'MHC Class'; original row labels are kept.
    """
    # Beta Sequences Only: selecting the beta column alone would keep rows with
    # no beta CDR3 as NaN entries, so those rows are filtered out explicitly.
    has_pair = raw_mcpas['Epitope.peptide'].notnull() & raw_mcpas['CDR3.beta.aa'].notnull()
    # .copy() so the MHC assignment below writes to an independent frame.
    df = raw_mcpas.loc[has_pair, ['CDR3.beta.aa', 'Epitope.peptide', 'MHC']].copy()
    # Strings starting with HLA-A/B/C become 'MHCI'; strings starting with
    # 'HLA-D' or 'D' become 'MHCII'. Anything else is left as-is and dropped next.
    df['MHC'] = df['MHC'].replace(regex={r'^HLA-[ABC].*': 'MHCI', r'^(HLA-D|D).*': 'MHCII'})
    df = df.loc[df['MHC'].isin(['MHCI', 'MHCII'])]
    return df.rename(columns={
        'CDR3.beta.aa': 'CDR3',
        'Epitope.peptide': 'Epitope',
        'MHC': 'MHC Class',
    })

def final_filter_vdjdb(raw_vdjdb):
    """Reduce the raw VDJDB frame to scored beta-chain CDR3/epitope pairs.

    Keeps rows where 'Gene' is 'TRB', 'Epitope' is present and 'Score' is
    greater than 0, and returns the columns 'CDR3', 'Epitope', 'MHC Class'
    and 'VDJDB Score' with the original row labels.
    """
    is_beta = raw_vdjdb['Gene'].eq('TRB')  # Beta Sequences Only
    has_epitope = raw_vdjdb['Epitope'].notna()
    is_scored = raw_vdjdb['Score'].gt(0)  # VDJDB Score > 0
    kept = raw_vdjdb.loc[
        is_beta & has_epitope & is_scored,
        ['CDR3', 'Epitope', 'MHC class', 'Score'],
    ]
    return kept.rename(columns={'MHC class': 'MHC Class', 'Score': 'VDJDB Score'})

def final_filter_iedb(raw_iedb_class_i, raw_iedb_class_ii):
    """Stack the two IEDB frames into beta-chain CDR3/epitope pairs with an MHC class.

    Rows from ``raw_iedb_class_i`` are labelled 'MHCI' and rows from
    ``raw_iedb_class_ii`` are labelled 'MHCII'. Rows missing either the
    'Beta CDR3' or the 'Peptide' value are dropped.

    Parameters
    ----------
    raw_iedb_class_i, raw_iedb_class_ii : pandas.DataFrame
        Frames with at least the 'Beta CDR3' and 'Peptide' columns. Neither
        frame is modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'CDR3', 'Epitope', 'MHC Class'; row labels come from the two
        inputs and may therefore repeat.
    """
    # Label copies rather than the callers' frames, so calling this function
    # does not add an 'MHC Class' column to the notebook-level IEDB frames.
    class_i = raw_iedb_class_i.copy()
    class_i['MHC Class'] = 'MHCI'
    class_ii = raw_iedb_class_ii.copy()
    class_ii['MHC Class'] = 'MHCII'

    df = pd.concat([class_i, class_ii])
    df = df[['Beta CDR3', 'Peptide', 'MHC Class']]
    # Beta Sequences Only: a row without a beta CDR3 (or without a peptide) is
    # not a usable pair, so it is removed instead of being carried as NaN.
    df = df.loc[df['Beta CDR3'].notnull() & df['Peptide'].notnull()]
    return df.rename(columns={'Beta CDR3': 'CDR3', 'Peptide': 'Epitope'})


# Apply the per-source filters, then stack the three results into one frame.
final_mcpas = final_filter_mcpas(mcpas)
final_vdjdb = final_filter_vdjdb(vdjdb)
final_iedb = final_filter_iedb(iedb_class_i, iedb_class_ii)
# ignore_index=True stores a fresh 0..n-1 index on the combined frame.
# Calling reset_index(drop=True) without assigning its result only affected
# what the cell displayed and left repeated row labels in combined_sequences.
combined_sequences = pd.concat([final_mcpas, final_vdjdb, final_iedb], ignore_index=True)
combined_sequences

In [None]:
def combined_summary_stats(df):
    """Print summary counts for the combined TCR-epitope frame.

    Reads the 'CDR3', 'Epitope' and 'MHC Class' columns of ``df``. The
    per-source pair counts are taken from the notebook-level frames
    ``final_vdjdb``, ``final_mcpas`` and ``final_iedb``, so those names must
    be defined before this function is called. Nothing is returned.
    """
    # nunique() ignores missing values, so rows without a CDR3 or epitope do
    # not add a phantom entry to the distinct counts.
    print('Unique CDR3 Sequences: ', df['CDR3'].nunique())
    print('Unique Antigen Sequences: ', df['Epitope'].nunique())
    print('MHC Class I: ', len(df[df['MHC Class'] == 'MHCI']))
    print('MHC Class II: ', len(df[df['MHC Class'] == 'MHCII']))
    print('VDJDB Pairs: ', len(final_vdjdb))
    print('McPAS Pairs: ', len(final_mcpas))
    print('IEDB Pairs: ', len(final_iedb))
    print('Total Pairs: ', len(df))

# combined_sequences is built from VDJDB, McPAS and IEDB, so the heading names all three.
print('Combined VDJDB, McPAS and IEDB Statistics')
combined_summary_stats(combined_sequences)

# Save to File (row labels are not written)
combined_sequences.to_csv('combined_dataset.csv', index=False)