In [1]:
import os
import sys

## First let's look at our dataset and determine how it should be split up

In [2]:
# Show the current working directory; the relative paths used below ("./exo/", "../") are resolved against it
os.getcwd()

'/home/jonah/PycharmProjects/phage_display_ML/datasets'

In [2]:
## Let's make a directory within datasets to store all our files
# Choose a short string 3-5 characters to denote this particular dataset
# For this one, I chose "exo" for the exosome data.
# Make sure to set "focus" in datatype as the same string

dataset_focus = "exo"
dataset_dir = f"./{dataset_focus}/"

# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.isdir() test
os.makedirs(dataset_dir, exist_ok=True)

In [3]:
# Let's import our data_prep tools
import data_prep as dp

In [4]:
# Parse the raw "caris"-formatted input into exo_df; outputs go to dataset_dir
exo_df = dp.process_raw_fasta_files(
    "exosome.csv",
    in_dir="/mnt/D1/caris_exosome/",
    out_dir=dataset_dir,
    input_format="caris",
    violin_out="exo_data_lengths",
)

Observed Characters: ['G', 'C', 'A', 'T']


In [None]:
## Looking at the above graph + the length report in our out directory we see this

In [8]:
!cat "./exo/exosome_len_report.txt" | head -n 80 | tail -n 20

Removed 0 Repeat Sequences
Length: 25 Number of Sequences 94
Length: 26 Number of Sequences 151
Length: 27 Number of Sequences 194
Length: 28 Number of Sequences 299
Length: 29 Number of Sequences 447
Length: 30 Number of Sequences 697
Length: 31 Number of Sequences 1283
Length: 32 Number of Sequences 2285
Length: 33 Number of Sequences 5066
Length: 34 Number of Sequences 27616
Length: 35 Number of Sequences 1214996
Length: 36 Number of Sequences 757742
Length: 37 Number of Sequences 728110
Length: 38 Number of Sequences 679551


## We see that the most common sequence length is 35. But we can get more than 2x the data by extending our range from length 35 to length 38. To make the data uniform in length, we will add gaps to the end of all sequences of length 35 to 37 so that every sequence has length 38

In [5]:
# With the length range chosen, we can define the datatype.
#
# A datatype holds the basic description of the data; one datatype is shared by a group of related fasta files.
#   focus                -> short string naming the overall dataset in use
#   molecule             -> kind of sequence data; protein, dna, and rna are currently supported
#   id                   -> short string, ONLY for datasets with different clustering methods (CLUSTERS ONLY)
#   process              -> how gaps were added to each dataset, used to name the directory (CLUSTERS ONLY)
#   clusters             -> number of clusters in each data file (1 if no clusters)
#   cluster_indices      -> inclusive length range per cluster, e.g. [12, 16] covers lengths 12 through 16;
#                           one entry is required for every cluster
#   gap_position_indices -> index at which gaps are inserted into sequences shorter than the maximum length
#                           (-1 means the gaps are appended at the end)

exo_datatype = {
    "focus": "exo",
    "molecule": "dna",
    "id": None,
    "process": None,
    "clusters": 1,
    "gap_position_indices": [-1],
    "cluster_indices": [[35, 38]],
}

## Before we do anything else, we need to copy our datatype to phage_display_ML/rbm_torch/analysis/global_info.py
## Also make sure to add the new datatype to the datatype_list in the same file.

## Next we need to process the raw files and make our own fasta files with our preferred formatting

In [6]:
# Write the formatted data files for the "exo" dataset into dataset_dir.
# NOTE(review): chars_to_remove / chars_replace are built here but never passed on;
# the call below uses remove_chars=None, so no character filtering is requested from this cell.
# (The letters look like IUPAC nucleotide ambiguity codes; the raw data above only reported G, C, A, T.)
chars_to_remove = ["W", "D", "V", "M", "B", "R", "K", "Y", "H", "S"]
# Maps every character in chars_to_remove to the gap symbol "-" (currently unused)
chars_replace = {x: "-" for x in chars_to_remove}
dp.prepare_data_files("exo", exo_df, target_dir=dataset_dir, remove_chars=None) # Creates datafiles in target directory

## Now we have generated a data file that we can use for training our RBM or CRBM

In [8]:
# Sanity check: read the generated fasta back and look at which characters it contains
fasta_path = f"{dataset_dir}exosome.fasta"
seqs, chars = dp.fasta_read(fasta_path)

In [9]:
# Show the characters returned by dp.fasta_read for the generated fasta file
print(chars)

['C', 'G', '-', 'A', 'U']


In [1]:
!head -n 20 "./exo/exosome.fasta"

>seq0-216997
TTACGGGTCTTCAAAGATTGGTCGTCGCGTGGGCACA-
>seq1-102269
TTCAGTGGGTTAATGAGAACGGAGTCATAGCAAGTA--
>seq2-17991
ATCGGGCCACTGTTTCGATCGCCTGGCCGTGCGGGTGA
>seq3-14612
CTGAGACTATCAAATCACGCGATGCGGCTAGTGCAGA-
>seq4-5890
GTCAAGAACGCCGGAGGTCGCATCAATAACATGGTA--
>seq5-4263
GGATTTAGAACGACCAAAGGCCAAAAAGAGCTGGA---
>seq6-4037
GCGGTCGTTCAGGCGGAACCCGATAGGCTAATTGATGA
>seq7-3474
TTCCACATAGTGAGTAATTGTCTTTGGGCCCGACGTGA
>seq8-3035
GATGATATCGGGCTTTTGTACTATTACGCACACCG---
>seq9-1261
TTTCCCAGTGTGGGTCGCTAAAGTGCCTGATAGGCA--


# Our Last Step is to generate a dataset file, which will inform our models about the location of the data as well as other important details

In [7]:
sys.path.append("../")
import rbm_torch.analysis.global_info as gi

In [12]:
# Confirm the working directory before writing to the relative path "./dataset_files/"
os.getcwd()

'/home/jonah/PycharmProjects/phage_display_ML/datasets'

In [8]:
# Generate the dataset file for the "exo" datatype under ./dataset_files/
gi.generate_dataset_file(
    ["exosome.fasta"],
    gi.supported_datatypes["exo"],
    destination="./dataset_files/",
)

In [9]:
!cat "./dataset_files/exo.json"

{"data_files": {"1": ["exosome.fasta"]}, "rounds": {"1": ["exosome"]}, "model_names": {"weights": {"1": ["exosome_w"]}, "equal": {"1": ["exosome"]}}, "local_model_dir": {"rbm": "/mnt/D1/globus/exo_trained_rbms/", "crbm": "/mnt/D1/globus/exo_trained_crbms/"}, "data_dir": "../../datasets/exo/", "server_model_dir": {"rbm": "datasets/exo/trained_rbms/", "crbm": "datasets/exo/trained_crbms/"}, "molecule": "dna", "configkey": {"1": "exo"}, "clusters": 1}

In [None]:
# We're all set to run our models now, except for creating default configs for each dataset
# Here is an example one for crbm. It should be appended to crbm_configs.py and added to all_configs
# NOTE: `seed` is not defined anywhere in this notebook; this cell is a template and only
# evaluates where `seed` exists (per the inline comment, in the config file).

exo_default_config = {"fasta_file": "",
          "v_num": 38, # number of visible units; matches the maximum sequence length (38) chosen in the datatype
          "q": 5, # number of states per position; presumably A, C, G, T plus the gap character -- confirm
          "molecule": "dna",
          "epochs": 100, # gets overwritten by the training script anyway
          "seed": seed, # this is defined in the config file
          "batch_size": 10000, # can be raised or lowered depending on memory usage
          "mc_moves": 4,
          "lr": 0.006,
          "lr_final": None, # automatically set as lr * 1e-2
          "decay_after": 0.75,
          "loss_type": "free_energy",
          "sample_type": "gibbs",
          "sequence_weights": None,
          "optimizer": "AdamW",
          "weight_decay": 0.001,  # l2 norm on all parameters
          "l1_2": 25.0,
          "lf": 5.0,
          "ld": 10.0,
          "data_worker_num": 4
          }


In [10]:
# To choose the convolution topology we use a helper imported from rbm_torch.utils.

# suggest_conv_size lists every convolution that fully samples all visible units
# on the conv transpose for a given data size
from rbm_torch.utils import suggest_conv_size

# The CRBM takes a one-hot encoded input of size (B x V x Q)
visible_num, q_states = 38, 5  # V, Q
input_shape = (visible_num, q_states)
suggest_conv_size(input_shape, padding_max=2, dilation_max=1, stride_max=2)

Finding Whole Convolutions for Input with 38 inputs:
Whole Convolution Found: Kernel: 1, Stride: 1, Dilation: 1, Padding: 0
Whole Convolution Found: Kernel: 1, Stride: 1, Dilation: 1, Padding: 1
Whole Convolution Found: Kernel: 1, Stride: 1, Dilation: 1, Padding: 2
Whole Convolution Found: Kernel: 2, Stride: 1, Dilation: 1, Padding: 0
Whole Convolution Found: Kernel: 2, Stride: 2, Dilation: 1, Padding: 0
Whole Convolution Found: Kernel: 2, Stride: 1, Dilation: 1, Padding: 1
Whole Convolution Found: Kernel: 2, Stride: 2, Dilation: 1, Padding: 1
Whole Convolution Found: Kernel: 2, Stride: 1, Dilation: 1, Padding: 2
Whole Convolution Found: Kernel: 2, Stride: 2, Dilation: 1, Padding: 2
Whole Convolution Found: Kernel: 3, Stride: 1, Dilation: 1, Padding: 0
Whole Convolution Found: Kernel: 3, Stride: 1, Dilation: 1, Padding: 1
Whole Convolution Found: Kernel: 3, Stride: 1, Dilation: 1, Padding: 2
Whole Convolution Found: Kernel: 4, Stride: 1, Dilation: 1, Padding: 0
Whole Convolution Found:

## My current line of thinking is that having a dilation > 1 or a stride > 1 will introduce some position specific effects.

In [None]:
## Idea: the kernel size sets the size of the motif/pattern a convolutional filter can capture,
## so longer kernels are included to pick up longer-range patterns in the sequences.

# Other strides and dilations are possible, but I think they only take away from the interpretability
# of the convolutional filters, and they can leave visible units unsampled on the convolution transpose.
# Likewise, a hidden layer whose kernel size equals the number of visible units is somewhat equivalent
# to an RBM, if not exactly (I haven't verified); it introduces a positional dependence into that hidden layer.

# Kernel lengths used here: 7, 13, 19, 25, 31
# Motif Finding:  Local Features-------Global Features
# Hidden layer names/keys can be anything usable as a dictionary key
# Model outputs are the average of each hidden layer with a set weight
exo_default_config["convolution_topology"] = {
    f"hidden{kernel_len}": {
        "number": filter_count,
        "kernel": (kernel_len, exo_default_config["q"]),
        "stride": (1, 1),
        "padding": (0, 0),
        "dilation": (1, 1),
        "output_padding": (0, 0),
        "weight": 1.0,
    }
    # (kernel length, number of filters) for each hidden layer, shortest to longest
    for kernel_len, filter_count in ((7, 10), (13, 10), (19, 15), (25, 15), (31, 15))
}

In [None]:
### COPY THE ABOVE CELL TO CRBM CONFIGS AS WELL

In [None]:
# Let's create a submission script for a slurm system using the script submit.py

# From the rbm_torch directory I ran
# NOTE(review): "-d ribo" below does not match this notebook's dataset focus ("exo");
# the submit.py commands further down use "-d exo", so this string looks stale -- confirm before reuse.
"python submit.py -d ribo -r all -p wzhengpu1 -q wildfire -m crbm -e 200 -g 2 --precision double"

# Use python submit.py -h for help!

In [1]:
import rbm_torch.utils as utils
import data_prep as dp
import numpy as np

In [5]:
## Standardize the affinities of every fasta file and write an "enriched" fasta per input file
## that keeps only sequences whose standardized affinity is strictly above the file's minimum.
## The per-file minimum (min_vals) and standardized affinities (all_affinities) are kept
## for building the negative-weight file in the next cell.
## NOTE: the standardized ("st") weight file call is commented out below, so it is not produced here.
all_fasta_files = ["exosome.fasta"]
all_affinities = []
min_vals = []
for fasta_file in all_fasta_files:
    file_stem = fasta_file.split('.')[0]
    seqs, affs, chars, q = utils.fasta_read(f"./exo/{fasta_file}", "dna", threads=12)
    stand_affs = dp.standardize_affinities(affs, out_plot=f"./exo/{file_stem}_stand_affs")
    min_val = min(stand_affs)
    min_vals.append(min_val)
    all_affinities.append(stand_affs)
    # dp.make_weight_file(f"./exo/{file_stem}_st_weights", stand_affs, "st")

    # Keep (sequence, affinity) pairs above the minimum; a separate name is used so that
    # stand_affs keeps holding the affinities instead of being rebound to a list of tuples
    enriched = [(seq, aff) for seq, aff in zip(seqs, stand_affs) if min_val < aff]
    if not enriched:
        # zip(*[]) cannot be unpacked into two names, so skip files with nothing above the minimum
        print(f"No sequences above the minimum affinity in {fasta_file}; enriched fasta not written")
        continue
    nseqs, naffs = zip(*enriched)
    # Output name is derived from the input file ("exosome.fasta" -> "exosome_enriched.fasta")
    # so additional input files do not overwrite each other's output
    dp.write_fasta(nseqs, naffs, f"./exo/{file_stem}_enriched.fasta")



Process Time 2.7722091674804688


In [4]:
# Build the negative-weight ("nw") file for each fasta file, using the per-file
# minimum standardized affinity as the threshold
thresholds = min_vals
for file_idx, fasta_file in enumerate(all_fasta_files):
    file_stem = fasta_file.split('.')[0]
    negated_weights = dp.negate_affinites(
        all_affinities[file_idx],
        thresholds[file_idx],
        out_plot=f"./exo/{file_stem}_nw_affs",
        negative_factor=100.,
    )
    dp.make_weight_file(f"./exo/{file_stem}_nw_weights", negated_weights, "nw")

In [8]:
# Re-import os (already imported in the first cell; needed again only if the kernel was restarted)
import os
# Confirm the working directory before regenerating the dataset file with relative paths
os.getcwd()

'/home/jonah/PycharmProjects/phage_display_ML/datasets'

In [9]:
## The enriched fasta must also be listed in our dataset file, so regenerate it with both data files

sys.path.append("../")
import rbm_torch.analysis.global_info as gi

gi.generate_dataset_file(
    ["exosome.fasta", "exosome_enriched.fasta"],
    gi.supported_datatypes["exo"],
    destination="./dataset_files/",
)

In [None]:
#### Now we just need to generate the submission script
# run from phage_display_ML/rbm_torch/
!python submit.py -d exo -r exosome_enriched -w fasta -p wzhengpu1 -q wildfire -g 2 -e 200 -c 8 --precision single -m crbm


## Creating Submission Scripts for HPC Slurm Jobs using submit.py script

In [None]:
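# run from phage_display_ML/rbm_torch/
# NOTE: the exosome_st_weights.json run below needs the standardized ("st") weight file, whose dp.make_weight_file(..., "st") call is commented out in the weight-file cell; enable that call first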
!python submit.py -d exo -r exosome -w exosome_nw_weights.json -p wzhengpu1 -q wildfire -g 2 -e 200 -c 8 --precision single -m crbm
!python submit.py -d exo -r exosome -w exosome_st_weights.json -p wzhengpu1 -q wildfire -g 2 -e 200 -c 8 --precision single -m crbm
