In [None]:
# Installation
!pip install -U caltable --index-url https://jellyroll.cs.tulane.edu/pypi/simple/
# or
!pip install ./caltable-1.0.1.zip

In [1]:
import caltable as ct
import os
import io
import pandas as pd
from pathlib import Path
# Load the workbench from its definition file; the cells below look up
# workflows on it by name.
workbench_file = 'apl.workbench.json'
workbench = ct.WorkBench.load(workbench_file)

In [12]:
# Setup input and output path here
path = './data'  # The path to the folder storing your PDB files
store_path = './output'  # Where to store the outputs

# Collect the input structures. The list is sorted because glob order depends
# on the filesystem; sorting keeps the row order of the table reproducible.
pdb_files = sorted(Path(path).glob('*.pdb'))
if not pdb_files:
    # Fail here with a clear message instead of running the workflows on an
    # empty table.
    raise FileNotFoundError(f"No '*.pdb' files found in {Path(path).resolve()}")

# One row per PDB file; the 'path' column holds the file location.
table = ct.DataTable([{'path': str(file)} for file in pdb_files])

# Change parameter here (each assignment sets the value for every row)
table[:, 'chain'] = 'A'  # Select chain
table[:, 'sampler'] = 'exhaustive'  # Choose sampling method
table[:, 'sconf_weight'] = 1.0  # Select entropy factor
table[:, 'mer_size'] = 15  # Set mer size for both MHC prediction and APL prediction
table[:, 'hop'] = 7  # Set hop size (mer overlap) for both MHC prediction and APL prediction

In [None]:
# Run this cell to compute APL only (no MHC prediction).
# The table returned by the workflow replaces `table`.
apl_only_workflow = workbench['apl-only']
table = apl_only_workflow(table)

[0.1s] ✓ Task (APL) Antigen Processing Likelihood Finished.

In [13]:
# Run this cell to compute both MHC and APL.
# Alleles are given as a single string, separated by ','.
selected_alleles = 'HLA-DRB1*03:01,HLA-DRB1*07:01,HLA-DRB1*15:01'
table[:, 'alleles'] = selected_alleles

# The table returned by the workflow replaces `table`.
apl_mhc_workflow = workbench['apl-mhc']
table = apl_mhc_workflow(table)

[0.1s] ✓ Task Weighted Combine APL And MHC Finished.

In [15]:
# Display the table with its current columns (inputs, parameters and results).
table

Unnamed: 0,path,chain,sampler,sconf_weight,mer_size,hop,alleles,pdb_id,pdb,corex,sasa,bfactor,sequence,blast,entropy,residue_likelihood,peptide_likelihood,aggregate,mhc,combined
0,data\5jhw.pdb,A,exhaustive,1.0,15,7,"string:HLA-DRB1*03:01,H...(44)",5jhw,PDB:932 lines,"COREX (ln(kf)) Values:[-10.76504924339339, -10...","SASA Values:[1.3160278728943418, 0.77842180516...","B-Factor Values:[77.93, 65.02, 61.81]...(111)",Protein Amio Acid Sequence:NLGLDCDEHSSESRCC......,FASTA Sequence:>UniRef50_K7F502...(64300),"numarray:[0.6059239568702086, 0.95122507874089...","Residue Likelihood:[0.0, 0.1033110698885027, 0...","Peptide Likelihood:[0.08115424039233941, 0.0, ...",Residue Level Aggregated Score:[-0.12601320768...,"string:{""Peptide"":{""0"":...(775)","JSON File:{""Peptide"":{""0"":...(811)"
1,data\6cdb.pdb,A,exhaustive,1.0,15,7,"string:HLA-DRB1*03:01,H...(44)",6cdb,PDB:821 lines,"COREX (ln(kf)) Values:[-5.000873006361232, -5....","SASA Values:[1.2696183006775639, 0.84657581949...","B-Factor Values:[79.72, 72.64, 64.68]...(97)",Protein Amio Acid Sequence:SEINTDTLERVTEIFK......,FASTA Sequence:>UniRef50_Q2FWB0...(54949),"numarray:[0.5629006142273706, 0.73932470404379...","Residue Likelihood:[0.0, 0.0, 0.0]...(97)","Peptide Likelihood:[0.0, 0.16078940049011084, ...",Residue Level Aggregated Score:[-1.33188812909...,"string:{""Peptide"":{""0"":...(660)","JSON File:{""Peptide"":{""0"":...(897)"


In [14]:
# Run this to store the outputs.
# For every row of `table`, the files below are written to <store_path>/<pdb_id>/:
#   <pdb_id>_residue.xlsx   one row per residue: residue letter, bfactor, sasa,
#                           corex, entropy, aggregate and residue likelihood
#   <pdb_id>.fasta          the 'blast' cell
#   <pdb_id>.pdb            the 'pdb' cell
#   <pdb_id>_mhc.xlsx       the 'mhc' JSON loaded as a sheet
#   <pdb_id>_combined.xlsx  the 'combined' JSON plus an 'APL' column taken
#                           from 'peptide_likelihood'
for row in range(len(table)):
    _id = table[row, 'pdb_id'].value
    residue_data = {
        'residue': list(table[row, 'sequence'].value),  # one letter per residue
        'bfactor': table[row, 'bfactor'].value,
        'sasa': table[row, 'sasa'].value,
        'corex': table[row, 'corex'].value,
        'entropy': table[row, 'entropy'].value,
        'aggregate': table[row, 'aggregate'].value,
        'likelihood': table[row, 'residue_likelihood'].value
    }
    try:
        _df = pd.DataFrame(residue_data)
    except ValueError as err:
        # pandas only reports "All arrays must be of the same length"; add the
        # structure id and the per-column lengths so the mismatch can be found.
        lengths = {
            name: (len(values) if hasattr(values, '__len__') else None)
            for name, values in residue_data.items()
        }
        raise ValueError(
            f'{_id}: cannot build the residue table, column lengths are {lengths}'
        ) from err

    _base = os.path.join(store_path, _id)
    os.makedirs(_base, exist_ok=True)
    _df.to_excel(os.path.join(_base, f'{_id}_residue.xlsx'))
    # Explicit encoding so the written text does not depend on the platform default.
    with open(os.path.join(_base, f'{_id}.fasta'), 'w', encoding='utf-8') as f:
        f.write(table[row, 'blast'].value)
    with open(os.path.join(_base, f'{_id}.pdb'), 'w', encoding='utf-8') as f:
        f.write(table[row, 'pdb'].value)

    # NOTE(review): the two checks below test the object returned by
    # table[row, col], not its .value -- confirm that a missing result is
    # really returned as None.
    if table[row, 'mhc'] is not None:
        # Read this row's result (previously row 0 was read for every structure).
        _df = pd.read_json(io.StringIO(table[row, 'mhc'].value))
        _df.to_excel(os.path.join(_base, f'{_id}_mhc.xlsx'))
    if table[row, 'combined'] is not None:
        # Read this row's result so it lines up with this row's peptide likelihoods.
        _df = pd.read_json(io.StringIO(table[row, 'combined'].value))
        _df['APL'] = table[row, 'peptide_likelihood'].value
        _df.to_excel(os.path.join(_base, f'{_id}_combined.xlsx'))

ValueError: All arrays must be of the same length

In [5]:
# More parameters for customization: displaying the 'apl-mhc' workflow lists
# every parameter it accepts (with defaults) and the values it returns.
workbench['apl-mhc']

### APL-MHC (Local Files)  

Run APL, MHC, and bind them for local PDB files.  
  
#### Parameters  
- **path**: (string:**string**)=`None`; The path to the target file; (`None`)   
- **chain**: (string:**PDB Chain IDs**)_[OPTIONAL]_=`A`; The selected protein chain IDs.; (`[A-Za-z0-9]+(,[A-Za-z0-9]+)*`) The protein chain IDs, separated by `,`, with no blank characters.  
- **window_size**: (number:**float>1**)_[OPTIONAL]_=`10`; The protein folding unit size. Also, the number of partition schemes.; (`{'min': 1}`) The float number that is greater than 1.  
- **min_size**: (number:**float>1**)_[OPTIONAL]_=`4`; The minimum protein folding unit size.; (`{'min': 1}`) The float number that is greater than 1.  
- **samples**: (number:**float>1**)_[OPTIONAL]_=`10000`; (Ignore for exhaustive sampling) The sample number for each partition scheme. Total sample number=samples*window_size.; (`{'min': 1}`) The float number that is greater than 1.  
- **sampler**: (string:**COREX Sampler**)_[OPTIONAL]_=`exhaustive`; The COREX states sampler; (`(exhaustive|montecarlo|adaptive)`) (exhaustive|montecarlo|adaptive) The COREX micro-states sampler, which can be the exhaustive enumeration, Monte Carlo, or Adaptive Monte Carlo sampler.  
- **threshold**: (number:**float>0**)_[OPTIONAL]_=`0.75`; (Ignore for exhaustive sampling) The threshold for the sampler.; (`{'min': 0}`) The float number that is greater than 0.  
- **sconf_weight**: (number:**float>0**)_[OPTIONAL]_=`1.0`; Entropy factor.; (`{'min': 0}`) The float number that is greater than 0.  
- **base_fraction**: (number:**float>0**)_[OPTIONAL]_=`1.0`; The base fraction used to sum all COREX (ln_kf) values.; (`{'min': 0}`) The float number that is greater than 0.  
- **probe_radius**: (number:**float>1**)_[OPTIONAL]_=`1.4`; The probe radius for SASA in A.; (`{'min': 1}`) The float number that is greater than 1.  
- **n_points**: (number:**float>1**)_[OPTIONAL]_=`1000`; The number of test points in Shrake & Rupley algorithm for SASA.; (`{'min': 1}`) The float number that is greater than 1.  
- **algorithm**: (string:**SASA Algorithm**)_[OPTIONAL]_=`ShrakeRupley`; The SASA algorithms.; (`(ShrakeRupley|LeeRichards)`) (ShrakeRupley|LeeRichards) The SASA Algorithm that could be ShrakeRupley or LeeRichards.  
- **n_slices**: (number:**float>1**)_[OPTIONAL]_=`20`; Get the number of slices per atom in Lee & Richards algorithm.; (`{'min': 1}`) The float number that is greater than 1.  
- **record**: (string:**PDB record**)_[OPTIONAL]_=`ATOM`; The PDB record for B-Factor extraction.; (`(ATOM|HETATM)`) (ATOM|HETATM) PDB record names which could be ATOM or HETATM  
- **db**: (string:**BLAST Databases**)_[OPTIONAL]_=`uniref50`; The BLAST database could be `uniref50`; (`(uniref50)`) The BLAST database could be `uniref50`  
- **expect_value**: (number:**float>0**)_[OPTIONAL]_=`10`; The expect threshold sets the maximum e-value threshold for hits to be reported. Lower values make the search more stringent.; (`{'min': 0}`) The float number that is greater than 0.  
- **word_size**: (number:**float>0**)_[OPTIONAL]_=`3`; This is the size of initial words or seed matches used in the search. Smaller values increase sensitivity but can slow down the search.; (`{'min': 0}`) The float number that is greater than 0.  
- **max_target_seqs**: (number:**float>1**)_[OPTIONAL]_=`500`; Specifies the maximum number of aligned sequences to return. Increasing this will yield more hits.; (`{'min': 1}`) The float number that is greater than 1.  
- **matrix**: (string:**BLAST Matrix**)_[OPTIONAL]_=`BLOSUM62`; Different matrices can affect the sensitivity for detecting homologous sequences.; (`(BLOSUM45|BLOSUM50|BLOSUM62|BLOSUM80|BLOSUM90|PAM30|PAM70|PAM250)`)  Different matrices can affect the sensitivity for detecting homologous sequences.  
- **mer_size**: (number:**float>1**)_[OPTIONAL]_=`15`; The size of each mer of the sequence.; (`{'min': 1}`) The float number that is greater than 1.  
- **hop**: (number:**float>1**)_[OPTIONAL]_=`7`; The size of each hop of mers.; (`{'min': 1}`) The float number that is greater than 1.  
- **flank_size**: (number:**float>1**)_[OPTIONAL]_=`20`; The flank size of APL.; (`{'min': 1}`) The float number that is greater than 1.  
- **loop_size**: (number:**float>1**)_[OPTIONAL]_=`21`; The loop size of APL.; (`{'min': 1}`) The float number that is greater than 1.  
- **w_entropy**: (number:**float>0**)_[OPTIONAL]_=`0.3474973544973545`; The weight for entropy.; (`{'min': 0}`) The float number that is greater than 0.  
- **w_bfactor**: (number:**float>0**)_[OPTIONAL]_=`0.1643121693121693`; The weight for B-factor.; (`{'min': 0}`) The float number that is greater than 0.  
- **w_corex**: (number:**float>0**)_[OPTIONAL]_=`0.2651851851851852`; The weight for COREX.; (`{'min': 0}`) The float number that is greater than 0.  
- **w_sasa**: (number:**float>0**)_[OPTIONAL]_=`0.22300529100529098`; The weight for SASA.; (`{'min': 0}`) The float number that is greater than 0.  
- **alleles**: (string:**Protein Amino Acid Sequence**)_[OPTIONAL]_=`HLA-DRB1*03:01`; The alleles for this sequence, separated by `,`.; (`None`) The protein amino acid sequence  
- **method**: (string:**The IEDB MHC-II Methods**)_[OPTIONAL]_=`recommended`; The method used to compute MHC-II binding.; (`(recommended|ann|consensus|netmhccons|netmhcpan|netmhcstabpan|pickpocket|smm|smmpmbec)`) The IEDB MHC-II prediction methods.  
- **w_apl**: (number:**float>0**)_[OPTIONAL]_=`0.5`; The weight for APL. The weight for MHC will be `1-w_apl`.; (`{'min': 0}`) The float number that is greater than 0.  
- **apl_threshold**: (number:**float>=0**)_[OPTIONAL]_=`0`; The threshold for APL. If the value is smaller than this threshold, it will be ignored.; (`{'min': -0.0001}`) The float number that is greater than or equal to 0.  
- **mhc_threshold**: (number:**float>=0**)_[OPTIONAL]_=`0`; The threshold for MHC. If the value is smaller than this threshold, it will be ignored.; (`{'min': -0.0001}`) The float number that is greater than or equal to 0.  
#### Returns  
- **pdb**: (string:**PDB File**)=`None`; The output PDB file that only contains selected chains.; (`None`) The protein PDB file  
- **pdb_id**: (string:**string**)=`None`; The file name; (`None`)   
- **chain**: (string:**PDB Chain IDs**)=`None`; The chains contained in the PDB files.; (`[A-Za-z0-9]+(,[A-Za-z0-9]+)*`) The protein chain IDs, separated by `,`, with no blank characters.  
- **corex**: (numarray:**COREX (ln(kf)) Values**)=`None`; The COREX values. The order is the same order as the PDB.; (`None`) COREX Values in Sorted Chain ID Order  
- **sasa**: (numarray:**SASA Values**)=`None`; The solvent accessible surface area. The order is the same order as the PDB.; (`None`) SASA Values in Sorted Chain ID Order  
- **bfactor**: (numarray:**B-Factor Values**)=`None`; The B-Factor. The order is the same order as the PDB.; (`None`) B-Factor values in given PDB file atom orders  
- **sequence**: (string:**Protein Amino Acid Sequence**)=`None`; The protein amino acid sequence. The order is the same order as the PDB.; (`[ACDEFGHIKLMNPQRSTVWY?]+`) The protein amino acid sequence  
- **blast**: (string:**FASTA Sequence**)=`None`; The BLAST output in FASTA format.; (`None`) The FASTA sequence file  
- **entropy**: (numarray:**Sequence Entropy Values**)=`None`; The sequence entropy of the given sequence based on the alignments.; (`None`) Sequence Entropy Values in Sorted Chain ID Order  
- **residue_likelihood**: (numarray:**Residue Likelihood**)=`None`; Residue Level Likelihood.; (`None`) Residue Level Likelihood  
- **peptide_likelihood**: (numarray:**Peptide Likelihood**)=`None`; Peptide Level Likelihood.; (`None`) Peptide Level Likelihood  
- **aggregate**: (numarray:**Residue Level Aggregated Score**)=`None`; Residue Level Aggregated Score.; (`None`) Residue Level Aggregated Score  
- **mhc**: (string:**JSON File**)=`None`; The MHC-II binding outputs from IEDB following JSON formats; (`None`) The JSON format file  
- **combined**: (string:**JSON File**)=`None`; Combined APL-MHC values for each MHC class.; (`None`) The JSON format file  
