# Modeling Candidate Selection for Epitope Conformation Studies

## Objective
Select antibody candidates (H+L chain combinations) that show:
- **HIGH** binding to `A/Hong Kong/1/1968` (HK68)
- **HIGH** binding to `1JPLm414_T_G3_PAPRE`
- **LOW/NO** binding to `1JPL_WT_Avi`

This differential binding profile identifies antibodies suitable for modeling mAb–antigen complexes, so that differences in epitope conformation between the 1JPL variants can be studied.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Let pandas display up to 20 columns before truncating wide tables in the output.
pd.set_option('display.max_columns', 20)

In [None]:
# Column names referenced by the cells below.
# Binding-signal columns, one per antigen named in the Objective:
hk68_col = 'A/Hong Kong/1/1968'
t_g3_col = '1JPLm414_T_G3_PAPRE'
wt_col = '1JPL_WT_Avi'
# Antibody chain sequence columns (heavy / light):
heavy_col, light_col = 'VDJ_AA_H', 'VJ_AA_K'

# Read the cleaned dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../data/raw/cleaned_data.csv')

print(f"Total samples: {len(df)}")

## Find Candidates: HIGH HK68 + HIGH T_G3 + LOW WT

In [None]:
# Selection criteria, expressed as one boolean mask per requirement:
#   - WT below 20                            -> low/no binding
#   - T_G3 above 5000                        -> high binding
#   - HK68 above 500                         -> high binding
#   - light chain longer than 10 characters  -> a light chain sequence is present
# Rows with a missing value in any of these columns compare as False and are dropped.
low_wt = df[wt_col].lt(20)
high_t_g3 = df[t_g3_col].gt(5000)
high_hk68 = df[hk68_col].gt(500)
has_light_chain = df[light_col].str.len().gt(10)

# Keep the rows that satisfy every criterion, strongest T_G3 binders first.
candidates = (
    df.loc[low_wt & high_t_g3 & high_hk68 & has_light_chain]
    .copy()
    .sort_values(t_g3_col, ascending=False)
)

# Report how many rows survived and show their binding values.
summary_cols = ['Sample', t_g3_col, wt_col, hk68_col]
print(f"Candidates found: {len(candidates)}\n")
print(candidates[summary_cols].to_string(index=False))

In [None]:
# Scatter plot of WT binding (x) against T_G3 binding (y),
# with the selected candidates highlighted and labelled.
fig, ax = plt.subplots(figsize=(10, 7))

# Background layer: every sample in the dataset
ax.scatter(
    df[wt_col],
    df[t_g3_col],
    c='lightgray',
    alpha=0.4,
    s=20,
    label='All samples',
)

# Foreground layer: the selected candidates
cand_x = candidates[wt_col]
cand_y = candidates[t_g3_col]
ax.scatter(
    cand_x,
    cand_y,
    c='blue',
    s=100,
    edgecolors='black',
    linewidths=1,
    label='Selected candidates',
)

# Put each candidate's sample name next to its marker
for sample_name, x, y in zip(candidates['Sample'], cand_x, cand_y):
    ax.annotate(
        sample_name,
        (x, y),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=9,
    )

# Titles and axis labels
ax.set_title('Candidate Selection: T_G3 vs WT Binding', fontsize=14)
ax.set_xlabel('1JPL_WT_Avi (want LOW)', fontsize=12)
ax.set_ylabel('1JPLm414_T_G3_PAPRE (want HIGH)', fontsize=12)
ax.legend()

plt.tight_layout()
plt.show()

## Top 3 Candidates Summary (Ranked by T_G3 Binding)

In [None]:
# Summarise the first three rows of `candidates` (the strongest T_G3 binders,
# provided `candidates` is still sorted by T_G3 in descending order).
top3 = candidates.head(3)


def _chain_length_label(sequence):
    """Return '<n> aa' for a sequence string, or 'missing' for a non-string cell.

    pandas stores an absent sequence as NaN (a float), and calling len() on it
    raises TypeError. The candidate filter only checks the light chain, so the
    heavy chain cell may legitimately be empty.
    """
    if isinstance(sequence, str):
        return f"{len(sequence)} aa"
    return "missing"


print("Top 3 Candidates for AF3 Modeling:\n")
for _, row in top3.iterrows():
    print(f"{row['Sample']}:")
    print(f"  T_G3: {row[t_g3_col]:,.0f}")
    print(f"  WT: {row[wt_col]:.1f}")
    print(f"  HK68: {row[hk68_col]:,.0f}")
    # Report chain lengths without crashing when a sequence is absent.
    print(f"  H chain: {_chain_length_label(row[heavy_col])}")
    print(f"  L chain: {_chain_length_label(row[light_col])}")
    print()