In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Column-name prefix used to pick out the per-subject z-score columns
# (e.g. "VW_001") via str.startswith when classifying and when dropping them.
VW_PREFIX = "VW_"

In [None]:
# Lift pandas' display truncation: passing None removes the limit for each option.
for _display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(_display_option, None)

In [2]:
# Read the tab-separated z-score table and preview its first rows.
z_path = Path("../data/SHERC_combined_wSB_4-24-24_Z-HDI95_avg_round.tsv").resolve()
z_df = pd.read_csv(filepath_or_buffer=z_path, sep="\t")
display(z_df.head(5))

Unnamed: 0,Sequence name,VW_001,VW_002,VW_003,VW_004,VW_006,VW_007,VW_008,VW_009,VW_010,...,VW_390,VW_391,VW_392,VW_393,VW_394,VW_395,VW_397,VW_398,VW_399,VW_400
0,PV1_120184,0.68,2.61,-0.09,0.85,-0.16,1.36,1.11,-1.16,0.79,...,-0.1,0.23,-0.24,-0.23,0.16,-0.12,-0.11,-0.11,0.79,0.89
1,PV1_079687,-0.51,-0.31,-0.36,-0.41,0.11,-0.35,-1.14,-1.24,0.01,...,0.29,0.66,1.97,-0.06,-0.44,-0.99,-0.45,-0.49,-0.11,0.39
2,PV1_232363,-0.37,-0.09,-0.07,0.07,1.44,-0.8,0.03,-0.16,-0.75,...,-0.08,0.36,-0.82,0.64,-0.75,0.56,-0.27,0.59,-0.44,-0.13
3,PV1_170274,-0.16,0.39,0.24,1.25,3.68,0.82,1.03,0.34,1.51,...,1.16,1.21,0.63,0.99,-0.18,0.84,1.41,5.69,1.8,-0.37
4,PV1_202108,-0.19,-0.08,-0.17,-0.1,0.3,-0.54,-0.72,1.27,-0.57,...,-0.69,-0.88,0.28,0.21,-0.65,-0.66,-0.26,-0.3,-0.64,-0.66


In [3]:
def apply_threshold(df, is_epitope_z_min=20.0, is_epitope_min_subjects=4, not_epitope_z_max=10.0,
                    not_epitope_max_subjects=None, subject_prefix=None):
    """Label every row of ``df`` as definite epitope, uncertain, or not epitope.

    Per-subject columns are the columns whose name starts with ``subject_prefix``.
    Their values are coerced to numbers; anything non-numeric becomes NaN. A NaN
    is neither ``>= is_epitope_z_min`` nor ``< not_epitope_z_max``, so it counts
    as neither reactive nor nonreactive. Under the default "all subjects" rule a
    row containing a NaN therefore can never be labelled "Not epitope".

    Parameters
    ----------
    df : pandas.DataFrame
        Full table, including the per-subject columns. It is not modified.
    is_epitope_z_min : float
        A subject is reactive for a row when its value is >= this threshold.
    is_epitope_min_subjects : int
        Minimum number of reactive subjects for a row to be "Def epitope".
    not_epitope_z_max : float
        A subject is nonreactive for a row when its value is < this threshold.
    not_epitope_max_subjects : int or None
        If None, a row is "Not epitope" only when every subject is nonreactive.
        Otherwise (despite the name) this is the *minimum* number of nonreactive
        subjects required, and the row must also have no reactive subject.
    subject_prefix : str or None
        Column-name prefix identifying subject columns. None falls back to the
        notebook-level ``VW_PREFIX`` constant.

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns plus three int8 indicator columns,
        "Def epitope", "Uncertain" and "Not epitope". Exactly one of them is 1
        in every row.

    Raises
    ------
    ValueError
        If no column starts with the subject prefix. Without this guard every
        row would satisfy "all 0 subjects are nonreactive" and be silently
        labelled "Not epitope".
    """
    prefix = VW_PREFIX if subject_prefix is None else subject_prefix
    subject_cols = [col for col in df.columns if isinstance(col, str) and col.startswith(prefix)]
    if not subject_cols:
        raise ValueError(f"no subject columns starting with {prefix!r} found in df")

    subjects = df.loc[:, subject_cols].apply(pd.to_numeric, errors="coerce")
    arr = subjects.to_numpy()
    num_subjects = arr.shape[1]

    reactive_counts = (arr >= is_epitope_z_min).sum(axis=1)
    nonreactive_counts = (arr < not_epitope_z_max).sum(axis=1)

    def_is = reactive_counts >= is_epitope_min_subjects
    if not_epitope_max_subjects is not None:
        # Manually chosen non-epitope threshold: enough nonreactive subjects and
        # not a single reactive one.
        def_not = (nonreactive_counts >= not_epitope_max_subjects) & (reactive_counts == 0)
    else:
        # Default: every subject is nonreactive.
        def_not = nonreactive_counts == num_subjects

    # Keep the three labels mutually exclusive. If the thresholds overlap
    # (is_epitope_z_min < not_epitope_z_max) or is_epitope_min_subjects <= 0, a
    # row could otherwise be flagged both ways; definite epitope wins.
    def_not = def_not & ~def_is
    uncertain = ~(def_is | def_not)

    return df.assign(**{"Def epitope": def_is.astype("int8"),
                        "Uncertain": uncertain.astype("int8"),
                        "Not epitope": def_not.astype("int8")})

In [4]:
# apply_threshold only reads its input and builds its result with
# DataFrame.assign, which returns a new frame, so z_df is passed directly
# instead of duplicating the whole table with a defensive .copy().
z_classified = apply_threshold(z_df)
z_classified.shape

(244000, 394)

In [None]:
# Sanity check: every row must carry exactly one of the three labels.
label_cols = ["Def epitope", "Uncertain", "Not epitope"]
labels_per_row = z_classified[label_cols].sum(axis=1)
assert labels_per_row.eq(1).all()

# Number of rows in each class.
label_totals = z_classified[label_cols].sum()
print("Counts:",
      "Def", label_totals["Def epitope"],
      "Uncertain", label_totals["Uncertain"],
      "Not", label_totals["Not epitope"])

In [None]:
# Identifier plus the three label columns; rows where any of them equals 1.
cols = ["Sequence name", "Def epitope", "Uncertain", "Not epitope"]
labelled = z_classified[cols]
any_hits_mask = (labelled == 1).any(axis="columns")
any_hits = labelled[any_hits_mask]
any_hits.head(10)

In [None]:
# Rows flagged as definite epitopes.
def_hits_mask = z_classified["Def epitope"] == 1
def_hits = z_classified.loc[def_hits_mask, cols]
def_hits.head(10)

In [None]:
# Rows flagged as uncertain.
uncertain_hits_mask = z_classified["Uncertain"] == 1
uncertain_hits = z_classified.loc[uncertain_hits_mask, cols]
uncertain_hits.head(10)

In [None]:
# Rows flagged as not an epitope.
not_hits_mask = z_classified["Not epitope"] == 1
not_hits = z_classified.loc[not_hits_mask, cols]
not_hits.head(10)

In [5]:
# Drop all subject columns and keep only the identifier plus the three
# classification flags, renaming the identifier to "CodeName" so it matches the
# metadata key. Built as a non-mutating chain (no inplace=True, no full-frame
# copy); the wide subject columns are dropped first so they are never copied.
subject_cols = [col for col in z_classified.columns if col.startswith(VW_PREFIX)]
z_class = (
    z_classified
    .drop(columns=subject_cols)
    .rename(columns={"Sequence name": "CodeName"})
)
display(z_class.head(10))
z_class.shape

Unnamed: 0,CodeName,Def epitope,Uncertain,Not epitope
0,PV1_120184,0,1,0
1,PV1_079687,0,0,1
2,PV1_232363,0,0,1
3,PV1_170274,0,0,1
4,PV1_202108,0,0,1
5,PV1_117921,0,1,0
6,PV1_103152,0,0,1
7,PV1_085139,0,0,1
8,PV1_215912,0,1,0
9,PV1_184327,0,1,0


(244000, 4)

In [6]:
# Read the tab-separated peptide metadata table and preview its first rows.
meta_path = Path("../data/PV1_meta_fixed.tsv")
meta_df = pd.read_csv(filepath_or_buffer=meta_path, sep="\t")
display(meta_df.iloc[:10])

Unnamed: 0,CodeName,Category,SpeciesID,Species,Protein,AlignStart,AlignStop,FullName,Peptide,Encoding
0,PV1_000673,SetCover,130310.0,Human mastadenovirus D,MNTLTSVVLLSLLVAFSQAGIINLNVLWGINLTLVGPLDLPVTWYD...,157.0,187.0,"ID=A0A2Z5WIK7_ADE08 AC=A0A2Z5WIK7 OXX=31545,13...",FNHTCNIQNLTLLFVNLTHNGAYIGYTKDG,TTCAACCATACTTGCAACATTCAGAACCTGACCCTGCTGTTCGTAA...
1,PV1_000698,SetCover,129951.0,Human mastadenovirus C,METRGRRPAALQHQQDQPQAHPGQRAARSAPLHRDPDYADEDPAPV...,0.0,30.0,"ID=PKG1_ADE02 AC=P03272 OXX=10515,129951,10509...",METRGRRPAALQHQQDQPQAHPGQRAARSA,ATGGAAACACGTGGTCGTCGTCCGGCTGCGTTACAGCACCAACAAG...
2,PV1_000524,SetCover,130310.0,Human mastadenovirus D,METRGRRPCPFQHQQDESQAHPCKRPARCPPLHRDGDHAHADPETL...,378.0,408.0,"ID=T1ULG6_9ADEN AC=T1ULG6 OXX=130310,130310,10...",HAQRTCYDWIIYNTTPEHEAMQWCYLHPRD,CATGCTCAGCGCACTTGCTACGACTGGATTATCTACAACACAACAC...
3,PV1_000864,SetCover,129951.0,Human mastadenovirus C,MESVEKKDSLTAPSEFATTASTDAANAPTTFPVEAPPLEEEEVIIE...,14.0,44.0,"ID=Q6VGU3_ADE05 AC=Q6VGU3;Q2KS05 OXX=28285,129...",EFATTASTDAANAPTTFPVEAPPLEEEEVI,GAATTTGCAACTACTGCTTCCACCGACGCGGCAAACGCTCCGACTA...
4,PV1_000566,SetCover,108098.0,Human mastadenovirus B,MEQQQAPDPAMRAALQSQPSGINSSDDWTQAMQRIMALTTRNPEAF...,95.0,125.0,"ID=T1UK08_9ADEN AC=T1UK08 OXX=108098,108098,10...",YNALLERVARYNSSNVQTNLDRMVTDVREA,TACAACGCACTGCTGGAACGCGTAGCACGCTATAACTCATCTAACG...
5,PV1_000933,SetCover,129951.0,Human mastadenovirus C,MTTSGVPFGMTLRPTRSRLSRRTPYSRDRLPPFETETRATILEDHP...,25.0,55.0,"ID=E1U5Q6_ADE06 AC=E1U5Q6 OXX=10534,129951,105...",SRDRLPPFETETRATILEDHPLLPECNTLT,TCCCGTGACCGTCTACCGCCGTTCGAAACTGAAACTCGTGCAACCA...
6,PV1_000588,SetCover,130310.0,Human mastadenovirus D,MSHGDSAELARLRHLDHCRRLRCFARESCGLIYFEFPEEHPNGPAH...,3.0,33.0,"ID=B9A5S0_9ADEN AC=B9A5S0 OXX=28278,130310,105...",GDSAELARLRHLDHCRRLRCFARESCGLIY,GGTGATTCTGCGGAACTGGCTCGTCTTCGCCACTTAGATCACTGCC...
7,PV1_000994,SetCover,129951.0,Human mastadenovirus C,MESVEKKDSLTAPSEFATTASTDAANAPTTFPVEAPPLEEEEVIIE...,122.0,152.0,"ID=Q6VGU3_ADE05 AC=Q6VGU3;Q2KS05 OXX=28285,129...",IADVSLAYERHLFSPRVPPKRQENGTCEPN,ATCGCTGATGTATCACTGGCTTACGAACGCCACTTATTCTCTCCGC...
8,PV1_000174,SetCover,129951.0,Human mastadenovirus C,MATPSMMPQWSYMHISGQDASEYLSPGLVQFARATETYFSLNNKFR...,719.0,749.0,"ID=CAPSH_ADE05 AC=P04133 OXX=28285,129951,1050...",KVAITFDSSVSWPGNDRLLTPNEFEIKRSV,AAAGTAGCTATCACCTTCGACTCCTCCGTTTCTTGGCCGGGTAACG...
9,PV1_000044,SetCover,108098.0,Human mastadenovirus B,MATPSMMPQWAYMHIAGQDASEYLSPGLVQLARATDTYFSMGNKFR...,826.0,856.0,"ID=CAPSH_ADE07 AC=P36851 OXX=10519,108098,1050...",YPANYPYPLIGTTAVKSVTQKKFLCDRTMW,TATCCTGCTAACTACCCGTACCCGCTGATCGGTACCACCGCTGTAA...


In [7]:
# Left join on "CodeName": every metadata row is kept and gains the three flags.
# NOTE(review): nothing here checks that CodeName is unique in z_class or that
# every metadata row finds a match -- consider validate="many_to_one" and a
# null check on the flag columns; confirm against the data first.
classification = z_class[["CodeName", "Def epitope", "Uncertain", "Not epitope"]]
merged = pd.merge(meta_df, classification, how="left", on="CodeName")
display(merged.head(10))

Unnamed: 0,CodeName,Category,SpeciesID,Species,Protein,AlignStart,AlignStop,FullName,Peptide,Encoding,Def epitope,Uncertain,Not epitope
0,PV1_000673,SetCover,130310.0,Human mastadenovirus D,MNTLTSVVLLSLLVAFSQAGIINLNVLWGINLTLVGPLDLPVTWYD...,157.0,187.0,"ID=A0A2Z5WIK7_ADE08 AC=A0A2Z5WIK7 OXX=31545,13...",FNHTCNIQNLTLLFVNLTHNGAYIGYTKDG,TTCAACCATACTTGCAACATTCAGAACCTGACCCTGCTGTTCGTAA...,0,0,1
1,PV1_000698,SetCover,129951.0,Human mastadenovirus C,METRGRRPAALQHQQDQPQAHPGQRAARSAPLHRDPDYADEDPAPV...,0.0,30.0,"ID=PKG1_ADE02 AC=P03272 OXX=10515,129951,10509...",METRGRRPAALQHQQDQPQAHPGQRAARSA,ATGGAAACACGTGGTCGTCGTCCGGCTGCGTTACAGCACCAACAAG...,0,0,1
2,PV1_000524,SetCover,130310.0,Human mastadenovirus D,METRGRRPCPFQHQQDESQAHPCKRPARCPPLHRDGDHAHADPETL...,378.0,408.0,"ID=T1ULG6_9ADEN AC=T1ULG6 OXX=130310,130310,10...",HAQRTCYDWIIYNTTPEHEAMQWCYLHPRD,CATGCTCAGCGCACTTGCTACGACTGGATTATCTACAACACAACAC...,0,1,0
3,PV1_000864,SetCover,129951.0,Human mastadenovirus C,MESVEKKDSLTAPSEFATTASTDAANAPTTFPVEAPPLEEEEVIIE...,14.0,44.0,"ID=Q6VGU3_ADE05 AC=Q6VGU3;Q2KS05 OXX=28285,129...",EFATTASTDAANAPTTFPVEAPPLEEEEVI,GAATTTGCAACTACTGCTTCCACCGACGCGGCAAACGCTCCGACTA...,1,0,0
4,PV1_000566,SetCover,108098.0,Human mastadenovirus B,MEQQQAPDPAMRAALQSQPSGINSSDDWTQAMQRIMALTTRNPEAF...,95.0,125.0,"ID=T1UK08_9ADEN AC=T1UK08 OXX=108098,108098,10...",YNALLERVARYNSSNVQTNLDRMVTDVREA,TACAACGCACTGCTGGAACGCGTAGCACGCTATAACTCATCTAACG...,0,1,0
5,PV1_000933,SetCover,129951.0,Human mastadenovirus C,MTTSGVPFGMTLRPTRSRLSRRTPYSRDRLPPFETETRATILEDHP...,25.0,55.0,"ID=E1U5Q6_ADE06 AC=E1U5Q6 OXX=10534,129951,105...",SRDRLPPFETETRATILEDHPLLPECNTLT,TCCCGTGACCGTCTACCGCCGTTCGAAACTGAAACTCGTGCAACCA...,0,1,0
6,PV1_000588,SetCover,130310.0,Human mastadenovirus D,MSHGDSAELARLRHLDHCRRLRCFARESCGLIYFEFPEEHPNGPAH...,3.0,33.0,"ID=B9A5S0_9ADEN AC=B9A5S0 OXX=28278,130310,105...",GDSAELARLRHLDHCRRLRCFARESCGLIY,GGTGATTCTGCGGAACTGGCTCGTCTTCGCCACTTAGATCACTGCC...,0,0,1
7,PV1_000994,SetCover,129951.0,Human mastadenovirus C,MESVEKKDSLTAPSEFATTASTDAANAPTTFPVEAPPLEEEEVIIE...,122.0,152.0,"ID=Q6VGU3_ADE05 AC=Q6VGU3;Q2KS05 OXX=28285,129...",IADVSLAYERHLFSPRVPPKRQENGTCEPN,ATCGCTGATGTATCACTGGCTTACGAACGCCACTTATTCTCTCCGC...,0,0,1
8,PV1_000174,SetCover,129951.0,Human mastadenovirus C,MATPSMMPQWSYMHISGQDASEYLSPGLVQFARATETYFSLNNKFR...,719.0,749.0,"ID=CAPSH_ADE05 AC=P04133 OXX=28285,129951,1050...",KVAITFDSSVSWPGNDRLLTPNEFEIKRSV,AAAGTAGCTATCACCTTCGACTCCTCCGTTTCTTGGCCGGGTAACG...,0,0,1
9,PV1_000044,SetCover,108098.0,Human mastadenovirus B,MATPSMMPQWAYMHIAGQDASEYLSPGLVQLARATDTYFSMGNKFR...,826.0,856.0,"ID=CAPSH_ADE07 AC=P36851 OXX=10519,108098,1050...",YPANYPYPLIGTTAVKSVTQKKFLCDRTMW,TATCCTGCTAACTACCCGTACCCGCTGATCGGTACCACCGCTGTAA...,0,1,0


In [11]:
# File name encodes the thresholds used above:
#   20  = min z-score to be an epitope
#   4   = min # of subjects >= 20
#   10  = max z-score to not be an epitope
#   all = # of subjects < 10
merged_path = Path("../data/PV1_input_data_20_4_10_all.tsv")
merged.to_csv(path_or_buf=merged_path, index=False, sep="\t")