In [1]:
from datasets import Dataset
import torch
from transformers import BertTokenizer
import os
import random

In [4]:
import os
import concurrent.futures
from datasets import Dataset

def load_data_parallel(file_path):
    """Return the first line of a UTF-8 encoded text file.

    Only the first line is read (including its trailing newline, if the file
    has one); any content after the first line break is ignored.

    Args:
        file_path: Path of the file to open.

    Returns:
        The first line as a ``str``, or ``''`` when the file is empty.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the first line is not valid UTF-8.
    """
    with open(file_path, encoding='utf-8') as review_file:
        first_line = review_file.readline()
    return first_line

def load_data(base_path):
    """Load every entry of ``base_path`` with a thread pool, in a stable order.

    Each directory entry is passed to ``load_data_parallel``. Entry names are
    sorted before submission and results are gathered in submission order, so
    the returned list has the same order on every run (collecting with
    ``as_completed`` would order it by whichever read happened to finish
    first).

    Args:
        base_path: Directory whose entries should be read.

    Returns:
        A list with one loaded value per entry that could be read, ordered by
        entry name. Entries whose read raised an exception are reported on
        stdout and left out of the list.

    Raises:
        OSError: If ``base_path`` itself cannot be listed.
    """
    paths = sorted(os.listdir(base_path))
    result = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit everything up front so the reads still overlap; only the
        # collection below is sequential.
        submitted = [
            (path, executor.submit(load_data_parallel, os.path.join(base_path, path)))
            for path in paths
        ]
        for path, future in submitted:
            try:
                result.append(future.result())
            except Exception as exc:
                # Best effort: one unreadable entry must not abort the whole load
                print(f"Error reading {path}: {exc}")
    return result

def get_dataset(base_path):
    """Build a randomly ordered sentiment dataset from ``pos``/``neg`` folders.

    Texts are loaded from ``<base_path>/pos`` and ``<base_path>/neg`` via
    ``load_data``; positive examples get label 1 and negative examples label 0.

    Args:
        base_path: Directory that contains the ``pos`` and ``neg`` sub-folders.

    Returns:
        The single subset returned by ``torch.utils.data.random_split`` when it
        is asked for one split as long as the whole dataset, i.e. all examples
        behind a randomly drawn index order. Items are the rows of a
        ``Dataset`` with the columns ``'texts'`` and ``'labels'``.
    """
    pos_data = load_data(os.path.join(base_path, 'pos'))
    neg_data = load_data(os.path.join(base_path, 'neg'))

    texts = pos_data + neg_data
    # Build the int64 label vector in one call rather than allocating one
    # single-element tensor per example and concatenating them afterwards.
    labels = torch.tensor([1] * len(pos_data) + [0] * len(neg_data), dtype=torch.long)

    dataset = Dataset.from_dict({'texts': texts, 'labels': labels})
    # A single split covering the full length is used purely to shuffle
    shuffled_dataset = torch.utils.data.random_split(dataset, [len(dataset)])[0]
    return shuffled_dataset

# Load the test and train splits. Each base path is expected to contain the
# 'pos' and 'neg' sub-folders that get_dataset() reads.
# NOTE: these are machine-specific absolute paths.
test_base_path = 'D:/5555/aclImdb/test'  # replace with the path to your data folder
test_dataset = get_dataset(test_base_path)

train_base_path = 'D:/5555/aclImdb/train'  # replace with the path to your data folder
train_dataset = get_dataset(train_base_path)

# Quick sanity check: show the first two examples of each split
print(train_dataset[0:2],test_dataset[0:2])

{'texts': ['This movie frequently extrapolates quantum mechanics to justify nonsensical ideas, capped by such statements like "we all create our own reality".<br /><br />Sorry, folks, reality is what true for all of us, not just the credulous.<br /><br />The idea that "anything\'s possible" doesn\'t hold water on closer examination: if anything\'s possible, contrary things are thus possible and so nothing\'s possible. This leads to postmodernistic nonsense, which is nothing less than an attempt to denigrate established truths so that all ideas, well-founded and stupid, are equal.<br /><br />To quote sci-fi writer Philip K. Dick, who put it so well, "Reality is that which, when you stop believing in it, doesn\'t go away."', 'Spoiler!! I love Branagh, love Helena Bonham-Carter, loved them together in "Mary Shelley\'s Frankenstein" - but THIS -<br /><br />I can understand an actor\'s desire to stretch, to avoid the romantic stereotype. Well, they did, but really - the script droned on, Bo

In [5]:


from transformers import BertForSequenceClassification


from transformers import BertConfig, BertModel

# Checkpoint identifier shared by the configuration and the weights below
model_name = "bert-base-cased"

# Fetch the configuration that belongs to this checkpoint
config = BertConfig.from_pretrained(model_name)

# Load the encoder weights with that configuration ...
pretrained = BertModel.from_pretrained(model_name, config=config)
# ... and place the encoder on the first GPU
pretrained = pretrained.to('cuda:0')



Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [6]:
import torch
class Model(torch.nn.Module):
    """Two-class linear head on top of a frozen, externally owned encoder.

    The encoder handed to the constructor is the one used in ``forward``; it
    is no longer looked up as a notebook global. It is run under
    ``torch.no_grad()``, so only the linear layer ``fc`` receives gradients.

    The encoder is intentionally kept out of the module registry, so this
    module's ``parameters()``, ``state_dict()``, ``train()``/``eval()`` and
    ``to()`` only affect ``fc``. Consequently the caller must place the encoder
    on the right device itself.

    Args:
        pretrained: Callable encoder accepting ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` keyword arguments and
            returning an object with a ``last_hidden_state`` attribute. If it
            exposes ``config.hidden_size`` that value sizes the head;
            otherwise 768 is assumed.
    """

    def __init__(self,pretrained):
        super().__init__()
        # object.__setattr__ bypasses nn.Module's attribute hook, so the
        # encoder is stored on the instance without being registered as a
        # sub-module (see class docstring for why).
        object.__setattr__(self, '_encoder', pretrained)
        hidden_size = getattr(getattr(pretrained, 'config', None), 'hidden_size', 768)
        self.fc = torch.nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw two-class logits (no softmax) for a batch.

        The three arguments are forwarded unchanged to the encoder; the head is
        applied to the hidden state at the first position along dimension 1.
        """
        # The encoder is frozen: no graph is recorded for it
        with torch.no_grad():
            out = self._encoder(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids)

        out = self.fc(out.last_hidden_state[:, 0])
        return out

# Build the classification head around the encoder loaded earlier
model=Model(pretrained)
# Move the model to the first GPU; as the cell's last expression, the returned
# module is also displayed as the cell output
model.to('cuda:0')

Model(
  (fc): Linear(in_features=768, out_features=2, bias=True)
)

In [5]:
# train_dataset.save_to_disk('./data/train_dataset')
# test_dataset.save_to_disk('./data/test_dataset')


In [7]:
from transformers import BertForSequenceClassification,BertTokenizer,TrainingArguments,BertConfig,BertModel

from datasets import Dataset
import json
import os
from torch.utils.data import DataLoader



from transformers import BertTokenizer

# Load the vocabulary and the tokenizer
token = BertTokenizer.from_pretrained('bert-base-cased')

# Last expression of the cell: display the tokenizer's repr
token




Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

BertTokenizer(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [8]:
def collate_fn(data):
    """Turn a list of dataset rows into the tensors of one batch.

    Args:
        data: Sequence of mappings, each with a ``'texts'`` entry (the text to
            encode) and a ``'labels'`` entry (its integer class).

    Returns:
        A tuple ``(input_ids, attention_mask, token_type_ids, labels)``. The
        first three come from the module-level tokenizer ``token``, called
        with truncation and padding to ``max_length=500``; ``labels`` is an
        int64 tensor with one entry per row.
    """
    sents = []
    targets = []
    for row in data:
        sents.append(row['texts'])
        targets.append(row['labels'])

    # Encode the texts
    encoded = token.batch_encode_plus(
        batch_text_or_text_pairs=sents,
        truncation=True,
        padding='max_length',
        max_length=500,
        return_tensors='pt',
        return_length=True,
    )

    label_tensor = torch.tensor(targets, dtype=torch.long)

    # input_ids: the token ids produced by the encoding
    # attention_mask: 0 at padded positions, 1 everywhere else
    return (
        encoded['input_ids'],
        encoded['attention_mask'],
        encoded['token_type_ids'],
        label_tensor,
    )

# Training data loader: batches of 120 rows assembled by collate_fn, shuffled,
# with the final incomplete batch dropped
loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=120,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)


In [9]:
# Check whether CUDA is available and report the visible GPUs
cuda_available = torch.cuda.is_available()

if not cuda_available:
    print("CUDA is not available on this system.")
else:
    # Number of GPU devices
    device_count = torch.cuda.device_count()
    print(f"CUDA is available with {device_count} device(s)!")
    for i in range(device_count):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")

CUDA is available with 1 device(s)!
Device 0: NVIDIA GeForce RTX 3060 Laptop GPU


In [11]:
import torch

# Make GPU 0 the current CUDA device
torch.cuda.set_device("cuda:0")

# torch.optim.AdamW is used in place of transformers.AdamW, which transformers
# deprecated in favour of the PyTorch optimizer. eps and weight_decay are
# passed explicitly with the defaults of the old transformers implementation,
# because PyTorch's own defaults (1e-8 and 0.01) are different.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, eps=1e-6, weight_decay=0.0)
criterion = torch.nn.CrossEntropyLoss().to("cuda:0")

model.train()

num_epochs = 10  # number of training epochs

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        # The collate function yields CPU tensors; move the batch to the GPU
        input_ids = input_ids.to("cuda:0")
        attention_mask = attention_mask.to("cuda:0")
        token_type_ids = token_type_ids.to("cuda:0")
        labels = labels.to("cuda:0")

        # Clear gradients left over from the previous step before the new pass
        optimizer.zero_grad()
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        # Every 10th iteration, report the loss and accuracy of the current batch
        if i % 10 == 0:
            preds = out.argmax(dim=1)
            accuracy = (preds == labels).sum().item() / len(labels)
            print(f"Iteration {i}, Loss: {loss.item()}, Accuracy: {accuracy}")


Epoch 1/10
Iteration 0, Loss: 0.6752526164054871, Accuracy: 0.6333333333333333
Iteration 10, Loss: 0.6776291131973267, Accuracy: 0.6666666666666666
Iteration 20, Loss: 0.6820762157440186, Accuracy: 0.5833333333333334
Iteration 30, Loss: 0.6709997057914734, Accuracy: 0.65
Iteration 40, Loss: 0.6770995259284973, Accuracy: 0.5916666666666667
Iteration 50, Loss: 0.6685202121734619, Accuracy: 0.6833333333333333
Iteration 60, Loss: 0.660487949848175, Accuracy: 0.6916666666666667
Iteration 70, Loss: 0.6623037457466125, Accuracy: 0.7083333333333334
Iteration 80, Loss: 0.6654340624809265, Accuracy: 0.65
Iteration 90, Loss: 0.6594411730766296, Accuracy: 0.7166666666666667
Iteration 100, Loss: 0.6651867628097534, Accuracy: 0.6166666666666667
Iteration 110, Loss: 0.6611458659172058, Accuracy: 0.6916666666666667
Iteration 120, Loss: 0.6537171602249146, Accuracy: 0.6666666666666666
Iteration 130, Loss: 0.6561301350593567, Accuracy: 0.6916666666666667
Iteration 140, Loss: 0.6437733173370361, Accuracy

In [15]:
#测试
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):

        if i == 100:
            break


        with torch.no_grad():
            input_ids = input_ids.to("cuda:0")
            attention_mask = attention_mask.to("cuda:0")
            token_type_ids = token_type_ids.to("cuda:0")
            labels = labels.to("cuda:0")
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


# Run the evaluation; test() prints the resulting accuracy
test()

0.77125
