In [2]:
# Print the command-line usage of the ESM representation-extraction script.
!python src/esm/extract.py -h

usage: extract.py [-h] [--toks_per_batch TOKS_PER_BATCH]
                  [--repr_layers REPR_LAYERS [REPR_LAYERS ...]] --include
                  {mean,per_tok,bos,contacts}
                  [{mean,per_tok,bos,contacts} ...]
                  [--truncation_seq_length TRUNCATION_SEQ_LENGTH] [--nogpu]
                  [--concatenate_dir CONCATENATE_DIR]
                  model_location fasta_file output_dir

Extract per-token representations and model outputs for sequences in a FASTA
file

positional arguments:
  model_location        PyTorch model file OR name of pretrained model to
                        download (see README for models)
  fasta_file            FASTA file on which to extract representations
  output_dir            output directory for extracted representations

options:
  -h, --help            show this help message and exit
  --toks_per_batch TOKS_PER_BATCH
                        maximum batch size
  --repr_layers REPR_LAYERS [REPR_LAYERS ...]
                  

In [1]:
# Extract mean representations (--include mean) for data/p1450.fasta with the
# esm1b_t33_650M_UR50S model, writing to data/esm_embedings/P1450.
# NOTE(review): --concatenate_dir is an absolute, machine-specific path; it presumably
# points at ./data/esm_embedings under the project root — confirm and prefer a relative path.
!python src/esm/extract.py esm1b_t33_650M_UR50S data/p1450.fasta data/esm_embedings/P1450 --toks_per_batch 512 --include mean --concatenate_dir /data/home/maorunzegroup/Basepro/data/esm_embedings

download over
Transferred model to GPU
Read data/p1450.fasta with 3 sequences
Processing 1 of 2 batches (2 sequences)
Device: cuda:0
Processing 2 of 2 batches (1 sequences)
Device: cuda:0
Saved representations to data/esm_embedings/P1450
  file_data = torch.load(file_path)
Shape of concatenated DataFrame: (3, 1280)
Saved concatenated representations to /data/home/maorunzegroup/Basepro/data/esm_embedings/p1450_esm1b_t33_650M_UR50S.csv


### Round 0: random initial sample

In [1]:
import numpy as np
import torch
import pandas as pd
import os

In [2]:
def random_sample_csv(input_file_path,saved_file_path,sample_size=200):
    """
    Randomly sample rows from a fitness CSV and save them as the round-0 data set.

    The saved CSV has three columns: ``variant``, ``fitness`` and ``indices``,
    where ``indices`` is the 0-based row position of each sampled row in the
    input file.

    Sampling uses a private ``RandomState(42)``: the selection is reproducible
    and identical to seeding the global NumPy generator with 42, but the global
    random state is left untouched.

    Parameters:
    input_file_path (str): Path to input CSV file; must contain 'variant' and 'fitness' columns
    saved_file_path (str): Path to save the sampled CSV file (parent directory is created if missing)
    sample_size (int): Number of rows to sample (default: 200); capped at the number of rows in the file

    Returns:
    None. Failures are reported on stdout instead of being raised.
    """
    try:
        # Read the CSV file
        print(f"Reading file: {os.path.basename(input_file_path)}...")
        df = pd.read_csv(input_file_path)

        # Cap the sample size at the number of available rows
        if len(df) < sample_size:
            print(f"Warning: File has only {len(df)} rows, less than requested sample size {sample_size}")
            sample_size = len(df)

        # Fixed seed for reproducibility, held in a local generator so that
        # calling this function does not reseed the rest of the notebook.
        rng = np.random.RandomState(42)
        round0_indices = rng.choice(len(df), size=sample_size, replace=False)

        # Select by position so the result does not depend on the frame's index labels
        sampled_df = df[['variant', 'fitness']].iloc[round0_indices].copy()
        sampled_df['indices'] = round0_indices

        # Save sampled data, creating the destination directory when needed
        out_dir = os.path.dirname(saved_file_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        sampled_df.to_csv(saved_file_path, index=False)
        print(f"✓ Sampling complete! Saved to: {saved_file_path}")
        print(f"Original rows: {len(df)}, Sampled rows: {len(sampled_df)}")

    except Exception as e:
        # Deliberately best-effort: report the problem and return None
        print(f"Error: {str(e)}")
        print("Operation failed. Please check file path and format")


In [3]:
# Draw the 200-row round-0 sample from the full GB1 fitness table.
random_sample_csv('data/GB1/fitness.csv', 'rounds_data/GB1/GB1_round_0.csv', sample_size=200)

Reading file: fitness.csv...
✓ Sampling complete! Saved to: rounds_data/GB1/GB1_round_0.csv
Original rows: 149361, Sampled rows: 200


In [3]:
from src.model import run_directed_evolution

### Round 1: model-guided selection

In [4]:
# Experiment configuration for the GB1 run.
protein_name = 'GB1'
embeddings_base_path = 'data/GB1'  # directory containing the embeddings file
embeddings_file_name = 'ESM2_x.pt'
round_base_path = 'rounds_data/GB1'  # directory containing the per-round CSV files

number_of_variants = 90  # passed to run_directed_evolution; the recorded run selects 90 variants for the next round
output_dir = 'output'
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [5]:
# Settings for the round being generated.
round_base_path = 'rounds_data/GB1'  # NOTE(review): repeats the value already assigned in the configuration cell
round_name = 'round_1'
# Round CSV files to load (passed to load_round_data and run_directed_evolution);
# un-comment later rounds as they become available.
round_data_filenames = [
    'GB1_round_0.csv',
    # 'GB1_round_1.csv'
]

In [6]:
# Read the full fitness table, then keep a single-column frame of variant names
# (handed to run_directed_evolution as `all_variants`).
fitness = pd.read_csv('data/GB1/fitness.csv')
all_variants = fitness[['variant']].copy()

In [12]:
from typing import List, Dict, Any, Tuple, Union
import re   
def load_round_data(round_base_path: str, round_file_names: List[str], protein_name: str) -> List[pd.DataFrame]:
    """
    Load round data from CSV files in round order (round_0, round_1, ...).

    Only file names that start with ``protein_name`` and end with ``.csv`` are
    loaded. The round number is parsed from the file *name* (``round_<n>``), so
    directory components of ``round_base_path`` cannot influence the ordering.
    Files whose name carries no round number sort first.

    Parameters:
    round_base_path (str): Directory that contains the round CSV files
    round_file_names (list of str): Candidate file names inside ``round_base_path``
    protein_name (str): Name of the protein; used as the required file-name prefix

    Returns:
    list: One DataFrame per loaded CSV file, in ascending round order
    """
    def extract_round_number(file_path):
        """Return the round number encoded in the file name, or -1 if there is none."""
        # Match on the basename only: a parent directory such as "round_1/"
        # must not be mistaken for the file's own round number.
        match = re.search(r'round_(\d+)', os.path.basename(file_path))
        if match:
            return int(match.group(1))
        # No round information in the name: sort ahead of every numbered round
        return -1

    # Collect every matching file
    all_files = [
        os.path.join(round_base_path, file_name)
        for file_name in round_file_names
        if file_name.startswith(protein_name) and file_name.endswith('.csv')
    ]

    # Sort numerically by round (so round_10 comes after round_2)
    sorted_files = sorted(all_files, key=extract_round_number)

    # Load the data in round order
    all_round_data = []
    for file_path in sorted_files:
        df = pd.read_csv(file_path)
        all_round_data.append(df)
        print(f"已加载: {os.path.basename(file_path)} (轮次 {extract_round_number(file_path)})")

    return all_round_data
# Load the listed round files. The result is kept: the original cell reset
# `all_round_data` to [] on the next line, throwing away what was just loaded.
all_round_data = load_round_data(round_base_path, round_data_filenames, protein_name)

已加载: GB1_round_0.csv (轮次 0)


In [None]:

# Run one directed-evolution round using the 'xgboost' regression model.
# NOTE(review): the recorded output of this cell ends with
# "ValueError: not enough values to unpack (expected 3, got 2)" and the traceback is
# not shown, so it is unclear where the unpack fails — confirm how many values
# run_directed_evolution returns (TODO).
df_next_round, df_pre_all_sorted = run_directed_evolution(
    protein_name,
    round_name,
    embeddings_base_path,
    embeddings_file_name,
    round_base_path,
    round_data_filenames,
    number_of_variants,
    output_dir,
    regression_model='xgboost',
    all_variants=all_variants
)

Processing GB1 - round_1
Using device: cuda


  embeddings = torch.load(file_path, map_location=device)


Loaded embeddings from data/GB1/ESM2_x.pt with shape torch.Size([149361, 5120])
Embeddings loaded: torch.Size([149361, 5120])
已加载: GB1_round_0.csv (轮次 0)
torch.Size([200, 5120]) torch.Size([200]) (200,)
successfully select 90 new variants for next round:
       variant   fitness  indices
104538    WYAG  2.455739   104538
82283     YIAG  2.291723    82283
30244     WFAG  2.289244    30244
80659     YFAG  2.273627    80659
35767     TIAG  2.248921    35767
...        ...       ...      ...
115498    IGAG  1.632279   115498
7548      LGAG  1.628683     7548
20533     KVAG  1.624723    20533
10104     IVGG  1.621714    10104
77161     KICG  1.621399    77161

[90 rows x 3 columns]

Top 90 variants predicted by the modelf or next round: 90
       variant   fitness  indices
104538    WYAG  2.455739   104538
82283     YIAG  2.291723    82283
30244     WFAG  2.289244    30244
80659     YFAG  2.273627    80659
35767     TIAG  2.248921    35767
...        ...       ...      ...
115498    IGAG  1

ValueError: not enough values to unpack (expected 3, got 2)

In [131]:
# Re-read the GB1 fitness table and preview its first rows.
# NOTE(review): the same file is already loaded into `fitness` in an earlier cell.
fitness = pd.read_csv('data/GB1/fitness.csv')
fitness.head()



Unnamed: 0,variant,fitness
0,AMHG,0.0
1,QPEI,0.0
2,GMYW,0.0
3,KWNA,0.001791
4,QDRA,0.00473


In [132]:
from src.data import load_embeddings
# Load the embeddings onto the device chosen in the configuration cell rather than a
# hard-coded 'cuda', so the cell also works on CPU-only machines. str() keeps the
# argument a plain string ('cuda' / 'cpu'), the same type the call passed before.
embeddings = load_embeddings(embeddings_base_path, embeddings_file_name, device=str(device))

  embeddings = torch.load(file_path, map_location=device)


In [142]:
from typing import List, Dict, Any, Tuple, Union
import re

In [183]:
def load_round_data(round_base_path: str, protein_name: str,
                    round_file_names: Union[List[str], None] = None) -> List[pd.DataFrame]:
    """
    Load round data from CSV files in round order (round_0, round_1, ...).

    Only file names that start with ``protein_name`` and end with ``.csv`` are
    loaded. The round number is parsed from the file *name* (``round_<n>``), so
    directory components of ``round_base_path`` cannot influence the ordering.
    Files whose name carries no round number sort first.

    Parameters:
    round_base_path (str): Directory that contains the round CSV files
    protein_name (str): Name of the protein; used as the required file-name prefix
    round_file_names (list of str, optional): Candidate file names inside
        ``round_base_path``. When omitted, the notebook-level
        ``round_data_filenames`` list is used, which is what this function
        always read before the parameter existed.

    Returns:
    list: One DataFrame per loaded CSV file, in ascending round order
    """
    if round_file_names is None:
        # Backward-compatible fallback to the notebook global
        round_file_names = round_data_filenames

    def extract_round_number(file_path):
        """Return the round number encoded in the file name, or -1 if there is none."""
        # Match on the basename only: a parent directory such as "round_1/"
        # must not be mistaken for the file's own round number.
        match = re.search(r'round_(\d+)', os.path.basename(file_path))
        if match:
            return int(match.group(1))
        # No round information in the name: sort ahead of every numbered round
        return -1

    # Collect every matching file
    all_files = [
        os.path.join(round_base_path, file_name)
        for file_name in round_file_names
        if file_name.startswith(protein_name) and file_name.endswith('.csv')
    ]

    # Sort numerically by round (so round_10 comes after round_2)
    sorted_files = sorted(all_files, key=extract_round_number)

    # Load the data in round order
    all_round_data = []
    for file_path in sorted_files:
        df = pd.read_csv(file_path)
        all_round_data.append(df)
        print(f"已加载: {os.path.basename(file_path)} (轮次 {extract_round_number(file_path)})")

    return all_round_data

# Load the round files named in the notebook-level round_data_filenames list.
all_round_data = load_round_data(round_base_path, protein_name)


已加载: GB1_round_0.csv (轮次 0)
已加载: GB1_round_1.csv (轮次 1)


In [148]:
# Shape (rows, columns) of the second loaded round frame.
all_round_data[1].shape

(90, 3)

In [184]:
# Size of the first dimension of the embeddings tensor.
embeddings.shape[0]

149361

In [170]:
# Assemble the training set from every loaded round.
all_X = []
all_y = []
list_indices = []

for df in all_round_data:
    # The 'indices' column is used to pick each variant's row out of `embeddings`
    X_round = embeddings[df['indices'].values]
    y_round = torch.tensor(df['fitness'].values,dtype=torch.float32)
    round_indices = df['indices']

    all_X.append(X_round)
    all_y.append(y_round)
    list_indices.append(round_indices)
X_train = torch.cat(all_X, dim=0)
y_train = torch.cat(all_y, dim=0)

train_indices = pd.concat(list_indices, ignore_index=True)
# Every row that is NOT a training row. `i in series` tests the Series' index labels
# (0..len-1 after ignore_index=True), not its values, so the previous
# `i not in train_indices` loop excluded the wrong rows; compare against the values.
# setdiff1d also replaces the per-row Python membership loop with one vectorised call.
test_indices = np.setdiff1d(np.arange(len(fitness['variant'])), train_indices.values)

X_train = X_train.to(device)
y_train = y_train.to(device)

In [169]:
# Inspect the concatenated 'indices' values of all loaded rounds.
train_indices

0       96399
1       68091
2       87476
3        1108
4      129203
        ...  
285    115498
286      7548
287     20533
288     10104
289     77161
Name: indices, Length: 290, dtype: int64

In [None]:
# NOTE(review): dead cell — single-round variant of the training-set assembly,
# kept commented out; the multi-round loop covers this case. Consider deleting.
# X_train = embeddings[all_round_data[0]['indices'].values]
# y_train = torch.tensor(all_round_data[0]['fitness'].values,dtype=torch.float32)
# X_train = X_train.to(device)
# y_train = y_train.to(device)

In [78]:
import xgboost as xgb

In [None]:
# 5. Train the XGBoost regressor on the assembled training set
model = xgb.XGBRegressor(
    # objective and reproducibility
    objective='reg:squarederror',
    random_state=42,
    # tree construction
    tree_method='hist',
    device='cuda',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # L1 / L2 regularisation
    reg_alpha=0.1,
    reg_lambda=0.1,
)
model.fit(X_train, y_train)

# 6. Predict fitness for every row of the embedding matrix
all_predictions = model.predict(embeddings)

# 7. Pick out the predictions for the training rows and for the remaining rows
train_predictions = all_predictions[train_indices]
test_predictions = all_predictions[test_indices]

In [176]:
# Pair every variant with its predicted fitness.
df_pre_all = fitness[['variant']].assign(fitness=all_predictions)

In [177]:
# Rank all variants from highest to lowest predicted fitness and display the table.
df_pre_all_sorted = df_pre_all.sort_values('fitness', ascending=False)
df_pre_all_sorted

Unnamed: 0,variant,fitness
142720,YHAG,2.649550
104538,WYAG,2.458590
82283,YIAG,2.294217
30244,WFAG,2.293822
80659,YFAG,2.279350
...,...,...
3441,SRCH,-0.017356
18521,CLNH,-0.017850
108291,DVAK,-0.018296
21100,KLCH,-0.018333


In [178]:
number_variant = 90

# 1-2. Drop every variant whose row index already appears in the training rounds
filtered_df = df_pre_all_sorted.loc[~df_pre_all_sorted.index.isin(train_indices)]

# 3. Keep the top `number_variant` of the remaining predictions
selected_variants = filtered_df.iloc[:number_variant]

# 4. Copy the selection into its own frame and record each row's original index
result_df = selected_variants.loc[:, ['variant', 'fitness']].copy()
result_df['indices'] = selected_variants.index

# Report the selection
print(f"成功选择了 {len(selected_variants)} 个新的变异体:")
print(result_df)

# # 5. Save as a CSV file
# filepath = os.path.join(round_base_path, f"{protein_name}_{round_name}.csv")
# result_df.to_csv(filepath, index=False)

成功选择了 90 个新的变异体:
       variant   fitness  indices
130866    YDAG  2.034570   130866
32933     YEAG  2.018511    32933
24555     YTAG  2.017959    24555
113590    YNAG  2.013072   113590
87302     WDAG  2.003016    87302
...        ...       ...      ...
21808     ALIG  1.777380    21808
70299     GKPG  1.776510    70299
51900     WPVG  1.776355    51900
48704     ITAG  1.776309    48704
29855     EIKG  1.774183    29855

[90 rows x 3 columns]
