In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Column-name prefix used by apply_threshold to pick out the per-subject columns.
VW_PREFIX = "VW_"

In [None]:
# Lift pandas' display truncation: show every row, every column, and full cell contents.
for _display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(_display_option, None)

In [None]:
# Resolve the data file to an absolute path (relative to the current working directory),
# read it as a tab-separated table, and preview the first five rows.
z_path = Path("../data/SHERC_combined_wSB_4-24-24_Z-HDI95_avg_round.tsv").resolve()
z_df = pd.read_csv(filepath_or_buffer=z_path, delimiter="\t")
display(z_df.head(n=5))

In [None]:
def apply_threshold(df, is_epitope_z_min=20.0, is_epitope_min_subjects=4, not_epitope_z_max=10.0, not_epitope_max_subjects=None, subject_prefix=None):
    """Label every row of ``df`` as definite epitope, non-epitope, or uncertain.

    The per-subject columns are the columns whose name starts with
    ``subject_prefix``. Their values are coerced to numbers; entries that
    cannot be parsed become NaN. A NaN compares false against both cutoffs,
    so it counts as neither reactive nor nonreactive.

    Parameters
    ----------
    df : pandas.DataFrame
        The full table, including the per-subject columns. It is not modified.
    is_epitope_z_min : float
        A subject is reactive for a row when its value is ``>=`` this cutoff.
    is_epitope_min_subjects : int
        Minimum number of reactive subjects for a row to be "Def epitope".
    not_epitope_z_max : float
        A subject is nonreactive for a row when its value is ``<`` this cutoff.
    not_epitope_max_subjects : int or None
        Despite its name this is a *minimum*: the number of nonreactive
        subjects a row needs to be "Not epitope". When None, every subject
        must be nonreactive (so a row with any NaN cannot qualify).
    subject_prefix : str or None
        Column-name prefix identifying subject columns. None falls back to
        the module-level ``VW_PREFIX``.

    Returns
    -------
    pandas.DataFrame
        A new frame with three int8 indicator columns added ("Def epitope",
        "Uncertain", "Not epitope"). Exactly one of them is 1 in each row.

    Raises
    ------
    ValueError
        If no column name starts with the prefix. Without this guard the
        default settings would label every row "Not epitope".
    """
    prefix = VW_PREFIX if subject_prefix is None else subject_prefix
    # Non-string column labels cannot carry the prefix; skip them instead of crashing.
    subject_cols = [col for col in df.columns if isinstance(col, str) and col.startswith(prefix)]
    if not subject_cols:
        raise ValueError(f"no columns starting with {prefix!r} found; cannot classify rows")

    subjects = df.loc[:, subject_cols].apply(pd.to_numeric, errors="coerce")
    num_subjects = subjects.shape[1]
    arr = subjects.to_numpy()

    # Per-row tallies; NaN fails both comparisons and is therefore counted in neither.
    reactive_counts = (arr >= is_epitope_z_min).sum(axis=1)
    nonreactive_counts = (arr < not_epitope_z_max).sum(axis=1)

    def_is = reactive_counts >= is_epitope_min_subjects

    if not_epitope_max_subjects is not None:
        # Caller-chosen minimum number of nonreactive subjects.
        enough_nonreactive = nonreactive_counts >= not_epitope_max_subjects
    else:
        # Default: every subject must be nonreactive.
        enough_nonreactive = nonreactive_counts == num_subjects

    # A non-epitope row may not contain any reactive subject, and "Def epitope"
    # takes precedence, so the three labels stay mutually exclusive even when
    # the two Z cutoffs overlap or is_epitope_min_subjects is <= 0.
    def_not = enough_nonreactive & (reactive_counts == 0) & ~def_is
    uncertain = ~(def_is | def_not)

    return df.assign(**{"Def epitope": def_is.astype("int8"),
                        "Uncertain": uncertain.astype("int8"),
                        "Not epitope": def_not.astype("int8")})

In [None]:
# Classify every row of the loaded table. With not_epitope_max_subjects=300 a row is
# "Not epitope" only if at least 300 subjects fall below the non-epitope Z cutoff
# and no subject reaches the epitope cutoff.
z_classified = apply_threshold(
    df=z_df.copy(),
    not_epitope_max_subjects=300,
)

In [None]:
# Sanity check: each row must carry exactly one of the three labels.
assert z_classified[["Def epitope", "Uncertain", "Not epitope"]].sum(axis=1).eq(1).all()
# Report how many rows landed in each class.
print(
    "Counts:",
    *(
        item
        for tag, column in (("Def", "Def epitope"), ("Uncertain", "Uncertain"), ("Not", "Not epitope"))
        for item in (tag, z_classified[column].sum())
    ),
)

In [None]:
# Columns kept in the inspection tables: the sequence identifier plus the three label flags.
cols = ["Sequence name", "Def epitope", "Uncertain", "Not epitope"]
# Rows where at least one of the listed columns equals 1.
any_hits_mask = (z_classified[cols] == 1).any(axis="columns")
any_hits = z_classified[cols].loc[any_hits_mask]
any_hits.head(n=10)

In [None]:
# Rows flagged "Def epitope", restricted to the inspection columns.
def_hits_mask = (z_classified[["Def epitope"]] == 1).any(axis="columns")
def_hits = z_classified[cols].loc[def_hits_mask]
def_hits.head(n=10)

In [None]:
# Rows flagged "Uncertain", restricted to the inspection columns.
uncertain_hits_mask = (z_classified[["Uncertain"]] == 1).any(axis="columns")
uncertain_hits = z_classified[cols].loc[uncertain_hits_mask]
uncertain_hits.head(n=10)

In [None]:
# Rows flagged "Not epitope", restricted to the inspection columns.
not_hits_mask = (z_classified[["Not epitope"]] == 1).any(axis="columns")
not_hits = z_classified[cols].loc[not_hits_mask]
not_hits.head(n=10)