<a href="https://colab.research.google.com/github/Gongmengjie/Sentiment_classification/blob/main/Sen_Bert_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers==4.5.0



In [None]:
import os
import random
from argparse import Namespace
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
from sklearn.model_selection import train_test_split
from torch.nn import LayerNorm
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset
from tqdm.auto import tqdm
from transformers import (BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup,
                BertLayer, AutoConfig)
# from torch.nn.   <- truncated import left in the original cell; it is a SyntaxError as written
#                     and no name used in this notebook depends on it, so it stays commented out.

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
def set_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU, plus all CUDA devices when
    CUDA is available) with the same value, and switches cuDNN to deterministic
    mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    # cuDNN: no autotuned kernel selection, deterministic algorithms only.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if torch.cuda.is_available():
        for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seeder(seed)


set_seed(0)

In [None]:
# Experiment configuration shared by the cells below.
args = Namespace()
args.num_class = 5        # number of target labels
args.num_epochs = 6
args.lr = 5e-5
args.batch_size = 32
args.data_dir = './data/dontpatronizeme_pcl.tsv'   # path of the TSV file that gets loaded
args.save_dict_path = './bertbasline/'             # where the trained model is written

In [None]:
# Tokenizer and configuration must come from the same checkpoint as the encoder
# loaded inside Model ("bert-base-uncased"); the cased checkpoint has a different
# vocabulary, so its config would not describe the model actually being trained.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class Model(nn.Module):
    """BERT encoder with an extra self-attention layer and a classification head.

    Pipeline of `forward`: BERT token states -> two parallel linear projections
    (summed) -> ReLU -> LayerNorm -> multi-head self-attention -> first-token
    pooling (dense + tanh) -> linear classifier producing `num_class` logits.
    Dropout (p=0.1) is applied between the stages.

    Args:
        arg: Configuration object; only `arg.num_class` is read.
    """

    def __init__(self, arg):
        super(Model, self).__init__()
        # Use the constructor argument rather than a notebook-level global so the
        # class works with whatever configuration the caller passes in.
        self.num_class = arg.num_class

        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # Width of the encoder's token vectors (768 for bert-base-uncased).
        self.hidden_size = self.bert.config.hidden_size

        self.fc1 = nn.Linear(self.hidden_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.hidden_size)
        self.layer_norm = LayerNorm(self.hidden_size)

        # batch_first=True makes the layer accept (batch, seq, hidden) tensors.
        self.attention = nn.MultiheadAttention(
            embed_dim=self.hidden_size,
            num_heads=6,
            batch_first=True,
        )

        self.fc = nn.Linear(self.hidden_size, self.num_class)

        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
        self.act1 = nn.ReLU()
        self.act2 = nn.Tanh()

        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Compute class logits for a batch.

        Args:
            x: Sequence whose first element is the `input_ids` tensor and whose
                second element is the matching `attention_mask` (non-zero for
                real tokens, 0 for padding). Further elements are ignored.

        Returns:
            Tensor of logits with one row per example and `num_class` columns.
        """
        input_ids, attention_mask = x[0], x[1]

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        encoder_out = self.dropout(outputs[0])  # per-token hidden states

        out = self.fc1(encoder_out) + self.fc2(encoder_out)
        out = self.act1(out)
        out = self.layer_norm(out)
        out = self.dropout(out)

        # nn.MultiheadAttention expects a boolean key_padding_mask in which True
        # marks keys to IGNORE. The tokenizer's attention mask is the opposite
        # (1 = keep), so padding positions are exactly those equal to 0.
        padding_mask = attention_mask == 0
        attended, _ = self.attention(
            query=out,
            key=out,
            value=out,
            key_padding_mask=padding_mask,
        )
        out = self.dropout(attended)

        # Pool by taking the first ([CLS]) token, then apply a same-width dense
        # layer followed by tanh.
        # Alternatives worth trying: concatenate every token vector
        # (out.view(out.size(0), -1), which changes the input width of self.dense)
        # or average over the sequence dimension (torch.mean(out, 1)).
        first_token_tensor = out[:, 0, :]
        pooled_output = self.act2(self.dense(first_token_tensor))

        out = self.dropout(pooled_output)
        return self.fc(out)
       

# Build the classifier, move its parameters to the selected device and show the
# layer structure.
model = Model(args)
model = model.to(device)

print(model)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      

In [None]:
def data_process(filename):
    """Load the tab-separated data file and build the (text, label) frame.

    The file is read without a header row. A summary of the raw frame and the
    rows containing missing values are printed, those rows are dropped, the six
    columns are named, and the keyword is prepended to the text.

    Args:
        filename: Path of the TSV file with six columns
            (par_id, art_id, keyword, country_code, text, label).

    Returns:
        DataFrame with columns `keyword_text` and `label`, keeping the row index
        of the source file. Its summary and label counts are printed as well.
    """
    raw = pd.read_csv(filename, sep='\t', header=None)

    # Show which rows are incomplete before discarding them.
    raw.info()
    has_missing = raw.isnull().any(axis=1)
    print(raw[has_missing])

    complete = raw.dropna()
    # Give the positional columns meaningful names.
    complete.columns = ['par_id', 'art_id', 'keyword', 'country_code', 'text', 'label']

    with_keyword = complete.assign(
        keyword_text=complete['keyword'] + ', ' + complete['text']
    )
    new_data = with_keyword[['keyword_text', 'label']]

    new_data.info()
    print(new_data['label'].value_counts())
    return new_data

In [None]:
def split(new_data):
    """Split the frame into train / validation / test parts (80% / 10% / 10%).

    Both cuts are stratified on the `label` column and use fixed random states,
    so the partition is reproducible.

    Args:
        new_data: DataFrame containing a `label` column.

    Returns:
        Tuple `(train_set, val_set, test_set)`.
    """

    def _stratified_cut(frame, held_out_fraction, seed):
        # One label-stratified split of `frame`.
        return train_test_split(
            frame,
            test_size=held_out_fraction,
            random_state=seed,
            stratify=frame['label'],
        )

    # Hold out 20% of the rows, then halve that hold-out into validation and test.
    train_set, held_out = _stratified_cut(new_data, 0.2, 0)
    val_set, test_set = _stratified_cut(held_out, 0.5, 1)

    return train_set, val_set, test_set

In [None]:
def myDataset(data_set, max_length=256):
    """Tokenize a data split and wrap it in a TensorDataset.

    Every text is encoded with the notebook-level `tokenizer`, with special
    tokens added, then truncated or padded to exactly `max_length` tokens.

    Args:
        data_set: DataFrame with a `keyword_text` column (strings) and a
            `label` column (integer class ids).
        max_length: Fixed sequence length after padding/truncation.

    Returns:
        TensorDataset yielding `(input_ids, attention_mask, label)` per example.
    """
    texts = data_set['keyword_text'].tolist()
    labels = data_set['label'].tolist()

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
    # `truncation=True` states explicitly what the tokenizer previously did only
    # after emitting a warning. Encoding the whole list at once returns tensors
    # that are already stacked along the batch dimension.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    labels = torch.tensor(labels)

    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], labels)

In [None]:
def load_data(args):
    """Read the data file and build train / dev / test DataLoaders.

    Args:
        args: Configuration object. `args.data_dir` is the path handed to
            `data_process`; `args.batch_size` sets the batch size of all three
            loaders (32 is used when the attribute is absent).

    Returns:
        Tuple `(train_loader, dev_loader, test_loader)`. Only the training
        loader shuffles its examples.
    """
    # Honour the configured batch size instead of a value hard-coded here.
    batch_size = getattr(args, "batch_size", 32)

    new_data = data_process(args.data_dir)
    train_set, dev_set, test_set = split(new_data)

    train_dataset = myDataset(train_set)
    dev_dataset = myDataset(dev_set)
    test_dataset = myDataset(test_set)

    # When the machine has enough memory, pin_memory=True can speed things up.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, dev_loader, test_loader

In [None]:
# Build the three DataLoaders from the data file configured in `args`.
train_loader, dev_loader, test_loader = load_data(args)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10469 entries, 0 to 10468
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       10469 non-null  int64 
 1   1       10469 non-null  object
 2   2       10469 non-null  object
 3   3       10469 non-null  object
 4   4       10468 non-null  object
 5   5       10469 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 490.9+ KB
         0           1        2   3    4  5
8639  8640  @@16852855  migrant  ke  NaN  0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10468 entries, 0 to 10468
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   keyword_text  10468 non-null  object
 1   label         10468 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 245.3+ KB
0    8528
1     947
3     458
4     391
2     144
Name: label, dtype: int64




In [None]:
# Adam over every model parameter, using the learning rate from `args`.
optimizer = torch.optim.Adam(model.parameters(), lr = args.lr) 

In [None]:
def train(args, model, optimizer, train_loader, dev_loader):
    """Fine-tune `model`, evaluate on the dev set after each epoch, then save it.

    Uses cross-entropy loss, gradient-norm clipping at 1.0 and a linear
    learning-rate decay without warmup over all training steps. Tensors are
    moved to the notebook-level `device`.

    Args:
        args: Configuration object. `args.num_epochs` is the number of epochs
            (5 when absent); `args.save_dict_path` is the output directory.
        model: Module called as `model(batch)` where `batch` is the list
            `[input_ids, attention_mask, labels]`; it returns class logits.
        optimizer: Optimizer already bound to the model's parameters.
        train_loader: DataLoader yielding `(input_ids, attention_mask, labels)`.
        dev_loader: DataLoader with the same layout, used for validation.

    Returns:
        None. Progress and metrics are printed; the weights are written to
        `args.save_dict_path`.
    """
    # BERT's usual recommendation is 3 epochs, but fine-tuning was unstable here,
    # so the configured value is somewhat larger.
    num_epochs = getattr(args, "num_epochs", 5)
    total_steps = len(train_loader) * num_epochs
    logging_step = 50
    validation = True
    # Linear decay helped in practice; other warmup settings are worth trying.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    for epoch in range(num_epochs):
        model.train()

        # Counts completed batches, so each logged average really covers
        # `logging_step` batches.
        step = 0
        train_loss = train_acc = 0.0

        for data in tqdm(train_loader):
            data = [i.to(device) for i in data]
            labels = data[2]

            optimizer.zero_grad()
            # Model input: input_ids, attention_mask. Model output: logits.
            output = model(data)
            loss = F.cross_entropy(output, labels)
            loss.backward()

            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Accumulate plain floats so no tensors are kept alive between steps.
            train_loss += loss.item()
            pred_class = torch.max(output, 1)[1]
            train_acc += (pred_class == labels).float().mean().item()

            step += 1
            if step % logging_step == 0:
                print(
                    f"Epoch {epoch + 1} | Step {step} | Train_loss = {train_loss / logging_step:.3f}, Train_acc = {train_acc / logging_step:.3f}"
                )
                train_loss = train_acc = 0.0

        if validation:
            print("Evaluating Dev Set ...")
            model.eval()
            dev_loss = 0.0
            label_chunks = []
            predict_chunks = []
            with torch.no_grad():
                for data in tqdm(dev_loader):
                    data = [i.to(device) for i in data]
                    output = model(data)
                    labels = data[2]
                    dev_loss += F.cross_entropy(output, labels).item()

                    label_chunks.append(labels.cpu().numpy())
                    predict_chunks.append(torch.max(output, 1)[1].cpu().numpy())

            # Join the per-batch arrays once instead of re-allocating per batch.
            labels_all = np.concatenate(label_chunks) if label_chunks else np.array([], dtype=int)
            predict_all = np.concatenate(predict_chunks) if predict_chunks else np.array([], dtype=int)

            dev_acc = metrics.accuracy_score(labels_all, predict_all)

            f1_score_micro = metrics.f1_score(labels_all, predict_all, average='micro')
            f1_score_macro = metrics.f1_score(labels_all, predict_all, average='macro')

            report = metrics.classification_report(labels_all, predict_all, digits=3)
            confusion = metrics.confusion_matrix(labels_all, predict_all)
            print(f"Validation | Epoch {epoch + 1} | loss = {dev_loss / len(dev_loader):.3f} | acc = {dev_acc:.3f}")
            print(f"F1 Score (Micro) = {f1_score_micro}")
            print(f"F1 Score (Macro) = {f1_score_macro}")
            print(report)
            print(confusion)

    print("Saving Model ...")
    if hasattr(model, "save_pretrained"):
        # Hugging Face models bring their own serializer.
        model.save_pretrained(args.save_dict_path)
    else:
        # A plain nn.Module has no save_pretrained(); store its state dict instead.
        os.makedirs(args.save_dict_path, exist_ok=True)
        torch.save(
            model.state_dict(),
            os.path.join(args.save_dict_path, "pytorch_model.bin"),
        )
    
    

In [None]:
# Run fine-tuning; dev-set metrics are printed after every epoch.
train(args, model, optimizer, train_loader, dev_loader)