In [2]:
!pip install transformers[torch] accelerate -U

Collecting transformers[torch]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from

In [3]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, Dataset

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the training data.
TRAIN_CSV_URL = "https://raw.githubusercontent.com/Kevinzhn/AMP-BERT-Multilabel/main/treinamento"
data = pd.read_csv(TRAIN_CSV_URL)

# Custom dataset class
class ProteinDataset(Dataset):
    """Map-style dataset yielding one tokenized sequence and its label vector per row.

    Args:
        data: DataFrame with a 'Sequence' column and one column per label.
        tokenizer: Object exposing ``encode_plus(...)`` that returns a mapping with
            'input_ids' and 'attention_mask' tensors carrying a leading batch
            dimension of size 1 (as requested via ``return_tensors='pt'``).
        max_length: Length every sequence is padded or truncated to.
        label_columns: Names of the label columns, in output order. Defaults to
            ``DEFAULT_LABEL_COLUMNS``.

    Each item is a dict with keys 'input_ids', 'attention_mask' and 'labels';
    'labels' is a float tensor with one entry per label column.
    """

    DEFAULT_LABEL_COLUMNS = ('Antibacterial', 'Antiviral', 'Antiparasitic', 'Antifungal')

    def __init__(self, data, tokenizer, max_length, label_columns=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        if label_columns is None:
            label_columns = self.DEFAULT_LABEL_COLUMNS
        self.label_columns = list(label_columns)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        sequence = row['Sequence']
        labels = torch.tensor(row[self.label_columns].tolist(), dtype=torch.float)

        inputs = self.tokenizer.encode_plus(
            sequence,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,  # cut sequences longer than max_length
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence dimension whenever max_length == 1.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels,
        }

# Set up the BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
    # Each sequence can carry several of the four labels at once, so state the
    # multi-label objective explicitly instead of leaving the loss choice to be
    # inferred from the dtype of the labels passed at training time.
    problem_type="multi_label_classification",
)
model = model.to(device)  # Move the model to the selected device

# Training function
def train_model(model, train_dataloader, epochs, learning_rate, weight_decay=0.1, device=None):
    """Fine-tune ``model`` on ``train_dataloader`` with AdamW.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a scalar ``.loss``.
        train_dataloader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        epochs: Number of passes over ``train_dataloader``.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight-decay coefficient (default 0.1).
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has none), so the
            function does not depend on a module-level ``device`` variable.

    Returns:
        list: The summed batch loss of each epoch, in epoch order. The same
        value is printed at the end of every epoch.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # torch.optim.AdamW replaces the deprecated transformers.AdamW; note that
    # its default eps (1e-8) differs from the transformers one (1e-6).
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    epoch_losses = []
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {total_loss}")
        epoch_losses.append(total_loss)

    return epoch_losses

# Data preparation
max_length = 200
dataset = ProteinDataset(data=data, tokenizer=tokenizer, max_length=max_length)
train_dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

# Train the model
epochs, learning_rate = 30, 5e-5
train_model(
    model=model,
    train_dataloader=train_dataloader,
    epochs=epochs,
    learning_rate=learning_rate,
)

# Save the model weights and the tokenizer files side by side
save_dir = "saved_model/"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 40.65914925932884
Epoch 2, Loss: 31.846675664186478
Epoch 3, Loss: 28.047019854187965
Epoch 4, Loss: 23.953407660126686
Epoch 5, Loss: 20.977438747882843
Epoch 6, Loss: 18.23094718903303
Epoch 7, Loss: 15.746672481298447
Epoch 8, Loss: 13.797164686024189
Epoch 9, Loss: 10.998575739562511
Epoch 10, Loss: 8.974294036626816
Epoch 11, Loss: 7.6133390218019485
Epoch 12, Loss: 6.515602540224791
Epoch 13, Loss: 5.955614078789949
Epoch 14, Loss: 5.11618141643703
Epoch 15, Loss: 4.753808844834566
Epoch 16, Loss: 4.770174320787191
Epoch 17, Loss: 3.2380668204277754
Epoch 18, Loss: 3.2108397260308266
Epoch 19, Loss: 2.888435497879982
Epoch 20, Loss: 2.693475378677249
Epoch 21, Loss: 2.604971695691347
Epoch 22, Loss: 2.569940121844411
Epoch 23, Loss: 2.4752825633622706
Epoch 24, Loss: 2.528974026441574
Epoch 25, Loss: 2.126432911492884
Epoch 26, Loss: 1.7937334179878235
Epoch 27, Loss: 1.9123684712685645
Epoch 28, Loss: 1.8315021942835301
Epoch 29, Loss: 1.6879911159630865
Epoch 30,

('saved_model/tokenizer_config.json',
 'saved_model/special_tokens_map.json',
 'saved_model/vocab.txt',
 'saved_model/added_tokens.json')

In [7]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the fine-tuned model and its tokenizer from disk
model_path = "saved_model/"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)

# 定义函数来测试蛋白质序列
def test_protein_sequences(model, tokenizer, test_data):
    model.eval()  # 设置模型为评估模式

    results = []

    for index, row in test_data.iterrows():
        sequence = row['Sequence']
        labels = torch.tensor(row[['Antibacterial', 'Antiviral', 'Antiparasitic', 'Antifungal']].tolist(), dtype=torch.float)

        inputs = tokenizer.encode_plus(
            sequence,
            None,
            add_special_tokens=True,
            max_length=20,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        with torch.no_grad():
            input_ids = inputs['input_ids'].squeeze().to(device)
            attention_mask = inputs['attention_mask'].squeeze().to(device)
            outputs = model(input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))

        logits = outputs.logits
        probabilities = torch.sigmoid(logits).squeeze().tolist()

        # 将概率转换为二进制标签
        binary_labels = [1 if p >= 0.5 else 0 for p in probabilities]

        results.append({
            'Sequence': sequence,
            'Predicted_Antibacterial': binary_labels[0],
            'Predicted_Antiviral': binary_labels[1],
            'Predicted_Antiparasitic': binary_labels[2],
            'Predicted_Antifungal': binary_labels[3],
            'True_Antibacterial': int(labels[0]),
            'True_Antiviral': int(labels[1]),
            'True_Antiparasitic': int(labels[2]),
            'True_Antifungal': int(labels[3]),
        })

    return results

# Read the test-set CSV file
TEST_CSV_URL = "https://raw.githubusercontent.com/Kevinzhn/AMP-BERT-Multilabel/main/teste"
test_data = pd.read_csv(TEST_CSV_URL)

# Run the model over the test sequences
test_results = test_protein_sequences(model, tokenizer, test_data)

# Convert the results to a DataFrame and print it
results_df = pd.DataFrame(test_results)
print(results_df)

# Exact-match accuracy: a row counts as correct only when all four predicted
# labels equal the true labels.
label_names = ['Antibacterial', 'Antiviral', 'Antiparasitic', 'Antifungal']
predicted_matrix = results_df[[f'Predicted_{name}' for name in label_names]].to_numpy()
true_matrix = results_df[[f'True_{name}' for name in label_names]].to_numpy()
correct_predictions = (predicted_matrix == true_matrix).all(axis=1).sum()

total_predictions = len(test_data)
accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {accuracy:.2f}")


                              Sequence  Predicted_Antibacterial  \
0               AVAGEKLWLLPHLLKMLLTPTP                        1   
1                            RWRRKWWWW                        1   
2                            KIWWWWRKR                        1   
3                            RLKRWWKFL                        1   
4                            RRWWRWVVW                        1   
...                                ...                      ...   
1848       GRFKRFRKKFKKLFKKLSPVIPLLHLG                        1   
1849  ATCYCRTGRCATRESLSGVCEISGRLYRLCCR                        1   
1850         VKLIQIRIWIQYVTVLQMFSMKTKQ                        0   
1851    GLPCGETTCFTGKCYTPGCSCSYPICKKIN                        1   
1852     GLPVCGETCFGGTCNTPGCSCTWPICTRD                        1   

      Predicted_Antiviral  Predicted_Antiparasitic  Predicted_Antifungal  \
0                       0                        0                     0   
1                       0                  