In [1]:
import os
import sys


In [2]:
# Dataset selection: `dataset_focus` names the overall dataset and `dataset_id`
# optionally names a sub-variant of it. Together they determine the datatype
# string and the directory that the generated data files are written to.
dataset_focus = "pig"
dataset_id = "ge2"
# NOTE(review): if dataset_id is ever set to None this evaluates to "pig_None";
# confirm whether the bare focus name is what downstream code expects in that case.
datatype_str = f"{dataset_focus}_{dataset_id}"
if dataset_id is not None:
    dataset_dir = f"./{dataset_focus}/{dataset_id}/"
else:
    dataset_dir = f"./{dataset_focus}/"

# dataset_dir can be nested two levels deep (./<focus>/<id>/). os.mkdir only
# creates the final path component and raises FileNotFoundError when ./<focus>/
# does not exist yet, so create all missing parents. exist_ok makes the cell
# safe to re-run.
os.makedirs(dataset_dir, exist_ok=True)

In [3]:
import data_prep as dp

# Raw per-round CDR3 fasta files, passed positionally to the loader below.
raw_fasta_names = (
    "b3_cdr3.fasta",
    "n1_cdr3.fasta",
    "np1_cdr3.fasta",
    "np2_cdr3.fasta",
    "np3_cdr3.fasta",
)

# Read the raw files from in_dir; outputs go to the dataset directory.
# NOTE(review): in_dir is an absolute, machine-specific path.
# NOTE(review): "pid_cdr3_lengths" looks like a typo for "pig_cdr3_lengths" - confirm
# before renaming, since it presumably determines an output file name.
pig_df = dp.process_raw_fasta_files(
    *raw_fasta_names,
    in_dir="/mnt/D1/phage_display_analysis/pig_tissue/fasta files/",
    out_dir=dataset_dir,
    violin_out="pid_cdr3_lengths",
    input_format="fasta",
)

Observed Characters: ['G', 'Q', 'W', 'D', '*', 'P', 'T', 'V', 'C', 'M', 'N', 'I', 'A', 'Y', 'H', 'F', 'E', 'K', 'S', 'L', 'R', 'X']


In [4]:
# Shorten the round labels: drop the "_cdr3" suffix carried over from the raw file names.
round_dict = {
    "b3_cdr3": "b3",
    "n1_cdr3": "n1",
    "np1_cdr3": "np1",
    "np2_cdr3": "np2",
    "np3_cdr3": "np3",
}
# Only values in the "round" column are remapped; pig_df is modified in place.
pig_df.replace(to_replace={"round": round_dict}, inplace=True)

In [None]:
# Datatype definition
# -------------------
# A datatype describes the basics of a dataset. One datatype is specified per
# group of related fasta files. Its fields are:
#
#   focus                -> short string naming the overall dataset in use
#   molecule             -> kind of sequence data; protein, dna and rna are currently supported
#   id                   -> short string used ONLY to distinguish datasets that were
#                           clustered with different methods (clustered data only)
#   process              -> how gaps were added to each dataset; used to name the
#                           directory (clustered data only)
#   clusters             -> number of clusters in each data file (1 if unclustered)
#   cluster_indices      -> sequence-length range placed in each cluster. Ranges are
#                           inclusive, so [12, 16] covers lengths 12 through 16.
#                           One range is required per cluster.
#   gap_position_indices -> index at which gaps are inserted into every sequence shorter
#                           than the maximum length (-1 appends the gaps at the end)
#
# NOTE(review): no other cell shown here references this dict; presumably it must be
# made known to rbm_torch for `submit.py -d pig_ge2` to resolve it - confirm.
pig_ge2_datatype = {
    "focus": "pig",
    "molecule": "protein",
    "id": "ge2",
    "process": "gaps_end",
    "clusters": 2,
    "gap_position_indices": [-1, -1],
    "cluster_indices": [[12, 22], [35, 45]],
}

In [5]:
# "*" and "X" both appear in the observed characters of the raw data; map each to the gap symbol "-".
chars_replace = {
    "*": "-",
    "X": "-",
}
# Creates the data files in the target directory.
dp.prepare_data_files(
    datatype_str,
    pig_df,
    target_dir=dataset_dir,
    character_conversion=chars_replace,
    remove_chars=None,
)

In [12]:
# Put the parent directory on the import path so the rbm_torch package can be imported.
sys.path.append("../")
import rbm_torch.utils as utils

# One fasta file per (cluster, round): every "_c1" file first, then every "_c2" file.
fasta_files = [
    f"{x}_c{cluster}.fasta"
    for cluster in (1, 2)
    for x in ["b3", "n1", "np1", "np2", "np3"]
]

for fasta_file in fasta_files:
    # File name without its extension, e.g. "b3_c1"; reused for both output names.
    stem = fasta_file.split('.')[0]
    seqs, affs, chars, q = utils.fasta_read(dataset_dir + fasta_file, "protein")
    # Standardize the affinities read from the file; the plot is written next to the data.
    naffs = dp.standardize_affinities(affs, out_plot=dataset_dir + stem)
    # Write the standardized values as the "<stem>_st" weight file.
    dp.make_weight_file(f"{dataset_dir}{stem}_st", naffs, "st")

Process Time 0.6348481178283691
Process Time 1.2309179306030273
Process Time 1.8820669651031494
Process Time 0.5857419967651367
Process Time 1.336568832397461
Process Time 0.5483031272888184
Process Time 0.923987865447998
Process Time 0.8785216808319092
Process Time 0.2807133197784424
Process Time 0.5355024337768555


In [14]:
# Now let's make our input files.
# First print submit.py's command-line help to see which arguments the
# Slurm-file generator requires (this cell does not generate anything itself).
!python ../rbm_torch/submit.py -h

usage: submit.py [-h] -d DATATYPE -r ROUND -p PARTITION -q QUEUE -m MODEL -e
                 EPOCHS -g GPUS [--wdir [WDIR]] [--precision PRECISION]
                 [-c [C]] [-w [W]] [--walltime [WALLTIME]] [-a [ACCOUNT]]
                 [--email [EMAIL]] [--error [ERROR]]

Generate Slurm Files for pytorch RBM and CRBM

optional arguments:
  -h, --help            show this help message and exit
  --wdir [WDIR]         Manually Set working directory, Usually handled
                        internally.
  --precision PRECISION
                        Set precision of the model, single or double
  -c [C]                Number of CPU cores to use. Default is 6.
  -w [W]                Weight File name to use to weight model training. Must
                        be in same directory as the sequence files.
                        Alternatively can be 'fasta' or None
  --walltime [WALLTIME]
                        Set wall time for training
  -a [ACCOUNT], --account [ACCO

In [17]:
import subprocess as sp

# Directory that contains submit.py. It is passed to each child process as its
# working directory rather than calling os.chdir(), so the notebook's own working
# directory (and every relative path such as dataset_dir) stays valid when cells
# are re-run.
submit_dir = "../rbm_torch/"

# One submission per (cluster, round): all "_c1" rounds first, then all "_c2" rounds.
prefixes = ["b3", "n1", "np1", "np2", "np3"]
cluster_prefixes = [x + "_c1" for x in prefixes]
cluster_prefixes += [x + "_c2" for x in prefixes]
# Weight file name that belongs to each round.
weights = [x + "_st.json" for x in cluster_prefixes]

failed_rounds = []
for round_name, weight_file in zip(cluster_prefixes, weights):
    # Argument list instead of an interpolated shell string: no shell parsing or
    # quoting issues, same command line as before.
    cmd = [
        "python", "submit.py",
        "-d", "pig_ge2",
        "-r", round_name,
        "-p", "wzhengpu1",
        "-q", "wildfire",
        "-m", "crbm",
        "-e", "200",
        "-g", "2",
        "-w", weight_file,
    ]
    result = sp.run(cmd, cwd=submit_dir)
    # Keep going so every round is still attempted, but remember failures
    # instead of silently discarding the exit code.
    if result.returncode != 0:
        failed_rounds.append(round_name)

if failed_rounds:
    raise RuntimeError(f"submit.py exited with a non-zero status for rounds: {', '.join(failed_rounds)}")