## Setup:

In [178]:
%cd /shared/EvolvePro/

/shared/EvolvePro


In [179]:
%%capture

!pip install pandas numpy scikit-learn scikit-learn-extra xgboost matplotlib seaborn biopython scipy torch fair-esm
!mkdir -p /shared/content/output

% conda activate evolvepro

## Process

In [180]:
from evolvepro.src.process import generate_wt, generate_single_aa_mutants
generate_wt('EVQLVESGGGLVQPGGSLRLSCAASGRTLSSYAMGWFRQAPGKEREFVAAIRWSGNTLYYADSVKGRFTISGDNAKNTVYLQMNSLRAEDTAVYYCAARTRAYDGCPSYTAGPCYHYWGQGTMVTVSS', output_file='/shared/content/output/r200_WT.fasta')
generate_single_aa_mutants('/shared/content/output/r200_WT.fasta', output_file='/shared/content/output/r200.fasta')

Number of mutants: 2433


In [181]:
from evolvepro.src.process import suggest_initial_mutants
suggest_initial_mutants('/shared/content/output/r200.fasta', num_mutants=11, random_seed=42)


Suggested 11 mutants for testing:
1. A24C
2. G112P
3. A33E
4. Y59C
5. G112C
6. D104S
7. N77W
8. T57G
9. P14E
10. M123C
11. R19I


In [182]:
%%bash
# sudo apt-get install seqtk -y
mv /shared/content/output/r200.fasta /shared/content/output/r200.fasta_bk
seqtk subseq /shared/content/output/r200.fasta_bk /shared/content/output/r200.list > /shared/content/output/r200.fasta

[E::stk_subseq] failed to read the list of regions in file '/shared/content/output/r200.list'


CalledProcessError: Command 'b'# sudo apt-get install seqtk -y\nmv /shared/content/output/r200.fasta /shared/content/output/r200.fasta_bk\nseqtk subseq /shared/content/output/r200.fasta_bk /shared/content/output/r200.list > /shared/content/output/r200.fasta\n'' returned non-zero exit status 1.

## PLM

In [None]:
#基于sbatch提交任务
!sinfo

In [None]:
%%bash
# sbatch -p defaultgpu -N 1 <<EOF
# sbatch -p gpu-queue-high -N 1 <<EOF
sbatch -p gpu-queue-high-p5 -N 1 <<EOF
#!/bin/bash
#SBATCH --job-name=esm
#SBATCH --cpus-per-task=8
#SBATCH --mem=200gb
#SBATCH --output=/shared/content/output/%j_stdout.log
#SBATCH --error=/shared/content/output/%j_error.log

#source ~/.bashrc
# source /home/ubuntu/anaconda3/etc/profile.d/conda.sh
# conda activate evolvepro

# 设置多线程环境变量
export OMP_NUM_THREADS=8
export MKL_NUM_THREADS=8
export NUMBA_NUM_THREADS=8

repr_layers=48

# remote old data
rm -rf /shared/content/output/r200_esm2_t48_15B_UR50D/*

conda run -n evolvepro python /shared/EvolvePro/evolvepro/plm/esm/extract.py \
    esm2_t48_15B_UR50D /shared/content/output/r200.fasta \
    /shared/content/output/r200_esm2_t48_15B_UR50D \
    --toks_per_batch 128 --include mean \
    --concatenate_dir /shared/content/output
EOF

# conda run -n plm python /shared/EvolvePro/evolvepro/plm/esm/extract.py \
#     esm2_t36_3B_UR50D /shared/content/output/r200.fasta \
#     /shared/content/output/esm2_t36_3B_UR50D \
#     --toks_per_batch 64 --include mean \
#     --concatenate_dir /shared/content/output
# EOF

In [None]:
#任务查看 - It takes around [5 - 6] mins 

In [None]:
!squeue
!squeue --noheader --format="%i" --name=esm

In [None]:
%%bash
export ESM_JOB=$(squeue --noheader --format="%i" --name=esm)
echo $ESM_JOB > /tmp/esm_job_id.txt
squeue
scontrol show job $ESM_JOB
echo $ESM_JOB


In [None]:
## 检查队列，一直等到JOB 运行完毕，即 -- 没有JOB,跑在P5单卡[NVIDIA H100]机器上，大概需5分钟
!squeue
!squeue --noheader --format="%i" --name=esm

In [None]:
%%bash

export ESM_JOB=$(cat /tmp/esm_job_id.txt)
echo ${ESM_JOB}

In [None]:
%%bash

ESM_JOB=$(cat /tmp/esm_job_id.txt)
echo ${ESM_JOB}
echo "stdio log ---->:"
cat /shared/content/output/${ESM_JOB}_stdout.log
echo ""
echo "error log ---->:"
cat /shared/content/output/${ESM_JOB}_error.log

## Run EVOLVEpro

In [None]:
%%bash
cat > /shared/content/run.py << 'EOF'
from evolvepro.src.evolve import evolve_experimental

protein_name = 'r200'
embeddings_base_path = '/shared/content/output'
embeddings_file_name = 'r200_esm2_t48_15B_UR50D.csv'
# embeddings_file_name = 'c143_esm2_t36_3B_UR50D.csv'

round_base_path = '/shared/EvolvePro/colab/rounds_data'
wt_fasta_path = "/shared/content/output/r200_WT.fasta"
number_of_variants = 11
output_dir = '/shared/content/output/'
rename_WT = False

round_name = 'Round1'
round_file_names = ['r200_Round1.xlsx']

this_round_variants, df_test, df_sorted_all = evolve_experimental(
    protein_name,
    round_name,
    embeddings_base_path,
    embeddings_file_name,
    round_base_path,
    round_file_names,
    wt_fasta_path,
    rename_WT,
    number_of_variants,
    output_dir
)

round_name = 'Round2'
round_file_names = ['c143_Round1.xlsx', 'c143_Round2.xlsx']

this_round_variants, df_test, df_sorted_all = evolve_experimental(
    protein_name,
    round_name,
    embeddings_base_path,
    embeddings_file_name,
    round_base_path,
    round_file_names,
    wt_fasta_path,
    rename_WT,
    number_of_variants,
    output_dir
)

round_name = 'Round3'
round_file_names = ['c143_Round1.xlsx', 'c143_Round2.xlsx', 'c143_Round3.xlsx']

this_round_variants, df_test, df_sorted_all = evolve_experimental(
    protein_name,
    round_name,
    embeddings_base_path,
    embeddings_file_name,
    round_base_path,
    round_file_names,
    wt_fasta_path,
    rename_WT,
    number_of_variants,
    output_dir
)


round_name = 'Round4'
round_file_names = ['c143_Round1.xlsx', 'c143_Round2.xlsx', 'c143_Round3.xlsx', 'c143_Round4.xlsx']

this_round_variants, df_test, df_sorted_all = evolve_experimental(
    protein_name,
    round_name,
    embeddings_base_path,
    embeddings_file_name,
    round_base_path,
    round_file_names,
    wt_fasta_path,
    rename_WT,
    number_of_variants,
    output_dir
)

EOF

In [None]:
%%bash
cat > /shared/content/run.sh << 'EOF'
#!/bin/bash
#SBATCH --job-name=esm-plm
##SBATCH --gres=gpu:a100:1
#SBATCH --cpus-per-task=8
#SBATCH --mem=200gb
#SBATCH --output=/shared/content/output/%j_stdout.log
#SBATCH --error=/shared/content/output/%j_error.log

source /home/ubuntu/anaconda3/etc/profile.d/conda.sh
conda activate evolvepro

export PYTHONPATH="/shared/EvolvePro:$PYTHONPATH"

# 设置多线程环境变量
export OMP_NUM_THREADS=8
export MKL_NUM_THREADS=8
export NUMBA_NUM_THREADS=8
python /shared/content/run.py
EOF

In [None]:
!sbatch -p gpu-queue-high-p5 -N 1 /shared/content/run.sh

In [None]:
!squeue -a

## Plot

In [None]:
from evolvepro.src.plot import read_exp_data, plot_variants_by_iteration
protein_name = 'c143'
embeddings_base_path = '/shared/content/output'
embeddings_file_name = 'c143_esm2_t48_15B_UR50D.csv'
# embeddings_file_name = 'c143_esm2_t36_3B_UR50D.csv'
number_of_variants = 11
output_dir = '/shared/content/output/'
rename_WT = False

round_base_path = '/shared/EvolvePro/colab/rounds_data'
round_file_names = ['c143_Round1.xlsx', 'c143_Round2.xlsx', 'c143_Round3.xlsx', 'c143_Round4.xlsx']
wt_fasta_path = "/shared/content/output/c143_WT.fasta"

In [None]:
df = read_exp_data(round_base_path, round_file_names, wt_fasta_path)
plot_variants_by_iteration(df, activity_column='activity', output_dir=output_dir, output_file="c143")

In [None]:
!cat /shared/EvolvePro/*_stdout.log

In [None]:


!squeue 