In [1]:
from transformers import BertTokenizer,BertModel,BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import logging
logging.set_verbosity_error()
import pandas as pd
import numpy as np
import wandb

In [2]:
wandb.init(project='text_class_first_try',name = "32_batch_512_max_token_1e-5lr_500warmup_step")

[34m[1mwandb[0m: Currently logged in as: [33mjameschou159[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
df = pd.read_csv('./Dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24500 entries, 0 to 24499
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          24500 non-null  object
 1   sentiment   24500 non-null  int64 
 2   review      24500 non-null  object
 3   sep_review  24500 non-null  object
dtypes: int64(1), object(3)
memory usage: 765.8+ KB


In [4]:
x = list(df['review'])
y = list(df['sentiment'])
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size=0.2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encoding = tokenizer(x_train, truncation=True, padding=True, max_length=512)
test_encoding = tokenizer(x_test, truncation=True, padding=True, max_length=512)

#可以这样查看词典
vocab = tokenizer.vocab
print(vocab['hello'])

7592


In [5]:
# 数据集读取, 继承torch的Dataset类，方便后面用DataLoader封装数据集
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    #这里的idx是为了让后面的DataLoader成批处理成迭代器，按idx映射到对应数据
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item
    
    #数据集长度。通过len(这个实例对象)，可以查看长度
    def __len__(self):
        return len(self.labels)
#将数据集包装成torch的Dataset形式
train_dataset = NewsDataset(train_encoding, y_train)
test_dataset = NewsDataset(test_encoding, y_test)
# 单个读取到批量读取
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

batch = next(iter(train_loader))
print(batch)
print(batch['input_ids'].shape)


{'input_ids': tensor([[ 101, 2054, 2842,  ...,    0,    0,    0],
        [ 101, 1996, 3185,  ...,    0,    0,    0],
        [ 101, 2076, 1996,  ...,    0,    0,    0],
        ...,
        [ 101, 1037, 4086,  ...,    0,    0,    0],
        [ 101, 2023, 3185,  ...,    0,    0,    0],
        [ 101, 1045, 1005,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
        0, 0, 1, 0, 0, 1, 1, 1])}
torch.Size([32, 512])


dict_items([('a', 123), ('b', 56)])


wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)


In [6]:
class my_bert_model(nn.Module):
    def __init__(self, freeze_bert=False, hidden_size=768):
        super().__init__()
        config = BertConfig.from_pretrained('bert-base-uncased')
        config.update({'output_hidden_states':True})
        self.bert = BertModel.from_pretrained("bert-base-uncased",config=config)
        self.fc = nn.Linear(hidden_size*4, 2)
        
        #是否冻结bert，不让其参数更新
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask)
        all_hidden_states = torch.stack(outputs[2])  #因为输出的是所有层的输出，是元组保存的，所以转成矩阵
        concat_last_4layers = torch.cat((all_hidden_states[-1],   #取最后4层的输出
                                         all_hidden_states[-2], 
                                         all_hidden_states[-3], 
                                         all_hidden_states[-4]), dim=-1)
        
        cls_concat = concat_last_4layers[:,0,:]   #取 [CLS] 这个token对应的经过最后4层concat后的输出
        result = self.fc(cls_concat)
        
        return result
     


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device,'可用')

model = my_bert_model().to(device)
criterion = nn.CrossEntropyLoss().to(device)

# 优化方法
#过滤掉被冻结的参数，反向传播需要更新的参数
optim = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5)
total_steps = len(train_loader) * 1
scheduler = get_linear_schedule_with_warmup(optim, 
                                            num_warmup_steps = 500, # Default value in run_glue.py
                                            num_training_steps = total_steps)

cuda 可用




In [8]:
# 训练函数
def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        # 正向传播
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)                
        total_train_loss += loss.item()
           
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #梯度裁剪，防止梯度爆炸
        
        # 参数更新
        optim.step()
        scheduler.step()

        iter_num += 1
        if(iter_num % 100==0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))
        
    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/len(train_loader)))
    wandb.log({"average_training_loss":total_train_loss/len(train_loader)})

In [9]:
# 精度计算
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
    
def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    with torch.no_grad():
        for batch in test_dataloader:
            # 正常传播
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
        
            loss = criterion(outputs, labels)
            logits = outputs

            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)
        
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    wandb.log({"loss": total_eval_loss/len(test_dataloader), "accuracy": avg_val_accuracy})
    print("-------------------------------")

In [10]:
for epoch in range(10):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()

------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 0.6200, 16.31%
epoth: 0, iter_num: 200, loss: 0.4984, 32.63%
epoth: 0, iter_num: 300, loss: 0.1680, 48.94%
epoth: 0, iter_num: 400, loss: 0.1988, 65.25%
epoth: 0, iter_num: 500, loss: 0.2383, 81.57%
epoth: 0, iter_num: 600, loss: 0.1757, 97.88%
Epoch: 0, Average training loss: 0.3725
Accuracy: 0.9148
Average testing loss: 0.2086
-------------------------------
------------Epoch: 1 ----------------
epoth: 1, iter_num: 100, loss: 0.0726, 16.31%
epoth: 1, iter_num: 200, loss: 0.1693, 32.63%
epoth: 1, iter_num: 300, loss: 0.1462, 48.94%
epoth: 1, iter_num: 400, loss: 0.1667, 65.25%
epoth: 1, iter_num: 500, loss: 0.1547, 81.57%
epoth: 1, iter_num: 600, loss: 0.1870, 97.88%
Epoch: 1, Average training loss: 0.1794
Accuracy: 0.9162
Average testing loss: 0.2067
-------------------------------
------------Epoch: 2 ----------------
epoth: 2, iter_num: 100, loss: 0.1184, 16.31%
epoth: 2, iter_num: 200, loss: 0.1340, 32.63%
epoth:

NameError: name 'Inf' is not defined

In [22]:
for batch in test_dataloader:
    # 正常传播
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    print(input_ids.type(torch.int))

tensor([[  101, 26938, 15092,  ...,     0,     0,     0],
        [  101,  2023,  2003,  ...,     0,     0,     0],
        [  101,  2023,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  1045,  3191,  ...,     0,     0,     0],
        [  101,  2065,  2023,  ...,     0,     0,     0],
        [  101,  1045,  2387,  ...,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)
tensor([[ 101, 2008, 1005,  ...,    0,    0,    0],
        [ 101, 2034, 1997,  ...,    0,    0,    0],
        [ 101, 3340, 1024,  ...,    0,    0,    0],
        ...,
        [ 101, 7823, 4364,  ...,    0,    0,    0],
        [ 101, 8129, 1006,  ..., 1011, 2006,  102],
        [ 101, 2054, 2062,  ..., 1996, 3035,  102]], device='cuda:0',
       dtype=torch.int32)
tensor([[  101,  5650,  1006,  ...,     0,     0,     0],
        [  101, 10047,  7098,  ...,     0,     0,     0],
        [  101,  1045,  3130,  ...,     0,     0,     0],
        ...,
        [  101,  2761,  2170,  ...,  38

tensor([[ 101, 1045, 1005,  ...,    0,    0,    0],
        [ 101, 2023, 3185,  ...,    0,    0,    0],
        [ 101, 2021, 2065,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 1010,  ...,    0,    0,    0],
        [ 101, 9975, 1037,  ...,    0,    0,    0],
        [ 101, 1045, 2018,  ...,    0,    0,    0]], device='cuda:0',
       dtype=torch.int32)
tensor([[  101,  1045,  2031,  ...,     0,     0,     0],
        [  101,  2045,  2024,  ...,  2066,  2183,   102],
        [  101,  1045,  2064,  ...,     0,     0,     0],
        ...,
        [  101,  1037,  2146,  ...,  1005,  1049,   102],
        [  101, 11703, 22048,  ...,     0,     0,     0],
        [  101,  1045,  1005,  ...,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)
tensor([[  101,  2023,  2003,  ...,     0,     0,     0],
        [  101,  2023,  4443,  ...,     0,     0,     0],
        [  101,  2023,  2265,  ...,     0,     0,     0],
        ...,
        [  101, 14414,  2003,  ...,  19

tensor([[ 101, 2241, 2006,  ...,    0,    0,    0],
        [ 101, 1045, 2196,  ..., 2000, 2022,  102],
        [ 101, 2023, 3185,  ...,    0,    0,    0],
        ...,
        [ 101, 1045, 2387,  ...,    0,    0,    0],
        [ 101, 2023, 3185,  ...,    0,    0,    0],
        [ 101, 2116, 2111,  ...,    0,    0,    0]], device='cuda:0',
       dtype=torch.int32)
tensor([[  101,  1999, 11425,  ...,     0,     0,     0],
        [  101,  2023,  2001,  ...,     0,     0,     0],
        [  101, 25506,  2135,  ...,     0,     0,     0],
        ...,
        [  101,  2023,  3185,  ...,     0,     0,     0],
        [  101,  2387,  2023,  ...,     0,     0,     0],
        [  101,  2074,  2387,  ...,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)
tensor([[  101, 13870,  2175,  ...,     0,     0,     0],
        [  101,  1996,  2034,  ...,     0,     0,     0],
        [  101,  1045,  7078,  ...,     0,     0,     0],
        ...,
        [  101,  1045,  2428,  ...,    

       dtype=torch.int32)
tensor([[ 101, 2054, 1037,  ...,    0,    0,    0],
        [ 101, 2023, 3185,  ...,    0,    0,    0],
        [ 101, 2043, 1045,  ...,    0,    0,    0],
        ...,
        [ 101, 2664, 2178,  ...,    0,    0,    0],
        [ 101, 2035, 1996,  ...,    0,    0,    0],
        [ 101, 2178, 1997,  ...,    0,    0,    0]], device='cuda:0',
       dtype=torch.int32)
tensor([[  101,  1008,  1008,  ...,  1996,  3953,   102],
        [  101,  2198, 29246,  ...,  2019,  5793,   102],
        [  101,  1996,  2338,  ...,     0,     0,     0],
        ...,
        [  101,  6854,  2005,  ...,     0,     0,     0],
        [  101,  2034,  2125,  ...,     0,     0,     0],
        [  101,  2003,  2771,  ...,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)
tensor([[  101,  6195,  1005,  ...,     0,     0,     0],
        [  101,  3100,  1010,  ...,  1005,  1055,   102],
        [  101,  2004,  2019,  ...,     0,     0,     0],
        ...,
        [  10