<a href="https://colab.research.google.com/github/Gongmengjie/Sentiment_classification/blob/main/Sen_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers==4.5.0

Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 5.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 76.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 56.3 MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.5.0


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import torch
from transformers import BertModel, BertTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import torch.nn as nn
import torch.nn.functional as F
from argparse import Namespace
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
# Default to the CPU and switch to the GPU only when CUDA is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [None]:
# Load the tokenizer of the "bert-base-uncased" checkpoint (the same checkpoint name Model loads).
tokenizer =  BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
class Model(nn.Module):
    """BERT encoder with a small feed-forward classification head.

    The pooled BERT output is passed through ``fc1 -> ReLU -> dropout -> fc2``
    to produce one logit per class.

    Args:
        num_class: number of output classes (default 5).
        dropout: dropout probability applied between the two linear layers.
        pretrained_name: name or path of the BERT checkpoint to load.
    """

    def __init__(self, num_class=5, dropout=0.2, pretrained_name="bert-base-uncased"):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        # Read the width from the loaded checkpoint instead of hard-coding 768.
        self.hidden_size = self.bert.config.hidden_size
        self.num_class = num_class
        self.fc1 = nn.Linear(self.hidden_size, 256)
        self.fc2 = nn.Linear(256, self.num_class)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: indexable batch where ``x[0]`` is input_ids, ``x[1]`` is
               token_type_ids and ``x[2]`` is attention_mask (the order
               produced by ``myDataset.__getitem__``).

        Returns:
            Tensor of logits, one row per example and one column per class.
        """
        outputs = self.bert(input_ids=x[0], token_type_ids=x[1], attention_mask=x[2])
        pooled_out = outputs[1]
        # Without a non-linearity the two linear layers collapse into a single
        # linear map, so the hidden layer would add no capacity.
        out = F.relu(self.fc1(pooled_out))
        out = self.dropout(out)
        out = self.fc2(out)
        return out

# Instantiate the classifier and move its parameters to the selected device.
model = Model().to(device)

In [None]:
def data_process(filename):
    """Load the tab-separated corpus and build the (keyword_text, label) table.

    The file is read without a header row and is expected to have exactly six
    columns: par_id, art_id, keyword, country_code, text, label. Rows with a
    missing value in any column are printed and then dropped. The keyword is
    prepended to the text as ``"<keyword>, <text>"``.

    Args:
        filename: path to the TSV file.

    Returns:
        A new DataFrame with the columns ``keyword_text`` and ``label``.

    Raises:
        ValueError: if the file does not have exactly six columns.
    """
    raw = pd.read_csv(filename, delimiter='\t', header=None)

    # Inspect missing values (to judge whether they matter) before dropping them.
    raw.info()
    print(raw[raw.isnull().any(axis=1)])
    # dropna() returns a new frame, so the raw frame is left untouched.
    df = raw.dropna()

    # Give the positional columns meaningful names.
    df.columns = ['par_id', 'art_id', 'keyword', 'country_code', 'text', 'label']
    df['keyword_text'] = df['keyword'] + ', ' + df['text']

    # Return an independent copy so callers can modify it without
    # SettingWithCopyWarning or accidental aliasing of the intermediate frame.
    new_data = df[['keyword_text', 'label']].copy()
    new_data.info()
    print(new_data['label'].value_counts())
    return new_data

In [None]:
def split(new_data):
    """Stratified 80/10/10 split of ``new_data`` into train, validation and test.

    The frame is first split 80/20 (seed 0); the 20% hold-out is then halved
    (seed 1). Both splits are stratified on the ``label`` column.

    Returns:
        Tuple ``(train_set, val_set, test_set)``.
    """
    train_set, holdout = train_test_split(
        new_data, test_size=0.2, random_state=0, stratify=new_data['label']
    )
    val_set, test_set = train_test_split(
        holdout, test_size=0.5, random_state=1, stratify=holdout['label']
    )
    return train_set, val_set, test_set

In [None]:
class myDataset(Dataset):
    """Tokenises (text, label) rows into fixed-length tensors.

    Args:
        split: one of "train", "dev" or "test". For "test" the stored label is
            ignored and the placeholder ``UNLABELED`` (-1) is returned instead.
        data: 2-D array whose column 0 is the text and column 1 the label.
        tokenizer: callable returning a mapping with an ``'input_ids'`` list;
            it is called with ``truncation=True`` and ``max_length``.
        max_token_len: length every example is truncated/padded to.

    Raises:
        ValueError: if ``split`` is not a recognised split name.
    """

    SPLITS = ("train", "dev", "test")
    # Placeholder label for the unlabeled test split; torch.tensor(None) would raise.
    UNLABELED = -1

    def __init__(self, split, data, tokenizer, max_token_len=512):
        if split not in self.SPLITS:
            raise ValueError(f"split must be one of {self.SPLITS}, got {split!r}")
        self.split = split
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __getitem__(self, idx):
        """Return ``(input_ids, token_type_ids, attention_mask, label)`` tensors."""
        text = self.data[idx, 0]
        if self.split == "test":
            label = self.UNLABELED
        else:
            label = int(self.data[idx, 1])

        # Let the tokenizer truncate so that the closing special token is kept;
        # slicing the id list afterwards would cut it off for long texts.
        tokenized_text = self.tokenizer(
            text, truncation=True, max_length=self.max_token_len
        )
        # Defensive cap in case the tokenizer ignored the truncation request.
        input_ids = list(tokenized_text['input_ids'])[:self.max_token_len]

        input_ids, token_type_ids, attention_mask = self.padding(input_ids)

        return (
            torch.tensor(input_ids),
            torch.tensor(token_type_ids),
            torch.tensor(attention_mask),
            torch.tensor(label),
        )

    def __len__(self):
        return len(self.data)

    def padding(self, input_ids):
        """Right-pad ``input_ids`` with 0 up to ``max_token_len``.

        Returns:
            ``(input_ids, token_type_ids, attention_mask)`` as Python lists.
            token_type_ids is all zeros; attention_mask is 1 for the original
            tokens and 0 for the padding.
        """
        temp = input_ids
        padding_len = self.max_token_len - len(temp)
        input_ids = temp + [0] * padding_len
        token_type_ids = [0] * self.max_token_len
        attention_mask = [1] * len(temp) + [0] * padding_len

        return input_ids, token_type_ids, attention_mask

In [None]:
def load_data(filename, tokenizer):
    """Read ``filename`` and return ``(train_loader, dev_loader, test_loader)``.

    The file is cleaned with ``data_process``, partitioned with ``split``, and
    each partition is converted to a NumPy array and wrapped in a ``myDataset``.
    All loaders use batch size 8; only the training loader shuffles.
    """
    split_data = split(data_process(filename))
    train_arr, dev_arr, test_arr = (np.array(split_data[i]) for i in range(3))

    datasets = (
        (myDataset("train", train_arr, tokenizer), True),
        (myDataset("dev", dev_arr, tokenizer), False),
        (myDataset("test", test_arr, tokenizer), False),
    )

    # pin_memory=True can speed up host-to-device copies when the machine has enough memory.
    loaders = tuple(
        DataLoader(dataset, batch_size=8, shuffle=do_shuffle, pin_memory=True)
        for dataset, do_shuffle in datasets
    )
    return loaders

In [None]:
# Build the train/dev/test DataLoaders from the TSV corpus using the tokenizer loaded above.
train_loader, dev_loader, test_loader = load_data('./data/dontpatronizeme_pcl.tsv', tokenizer)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10469 entries, 0 to 10468
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       10469 non-null  int64 
 1   1       10469 non-null  object
 2   2       10469 non-null  object
 3   3       10469 non-null  object
 4   4       10468 non-null  object
 5   5       10469 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 490.9+ KB
         0           1        2   3    4  5
8639  8640  @@16852855  migrant  ke  NaN  0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10468 entries, 0 to 10468
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   keyword_text  10468 non-null  object
 1   label         10468 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 245.3+ KB
0    8528
1     947
3     458
4     391
2     144
Name: label, dtype: int64


In [None]:
# Model-level mitigation of class imbalance: focal loss
class Focal_loss(nn.Module):  # (intended to be switched in after some training)
    """Multi-class focal loss with per-class weights.

    Args:
        alpha: class weights. A list is used as given (its length must equal
            ``num_classes``); a scalar ``a`` expands to ``[a, 1-a, 1-a, ...]``.
        gamma: exponent applied to ``(1 - p_t)`` to down-weight easy examples.
        num_classes: number of classes.
        size_average: return the mean of the per-example losses when True,
            otherwise their sum.
    """

    def __init__(self, alpha=0.1, gamma=2, num_classes=5, size_average=True):
        super(Focal_loss, self).__init__()
        self.gamma = gamma
        self.size_average = size_average
        if not isinstance(alpha, list):
            assert alpha < 1
            # Scalar alpha: weight alpha for class 0 and 1 - alpha for every other class.
            weights = torch.zeros(num_classes)
            weights[0] += alpha
            weights[1:] += (1 - alpha)
            self.alpha = weights
        else:
            # List alpha: one explicit weight per class.
            assert len(alpha) == num_classes
            self.alpha = torch.Tensor(alpha)

    def forward(self, preds, labels):
        """Compute the loss.

        Args:
            preds: logits of shape [B, C] or [B, S, C] (flattened to [B*S, C]).
            labels: class indices of shape [B] or [B, S].

        Returns:
            Scalar tensor (mean or sum, depending on ``size_average``).
        """
        target = labels.view(-1, 1)                    # [B*S, 1]
        logits = preds.view(-1, preds.size(-1))        # [B*S, C]
        # Keep the class weights on the same device as the logits.
        self.alpha = self.alpha.to(preds.device)

        log_probs = F.log_softmax(logits, dim=1)
        # Pick the (log-)probability of the true class, as nll_loss would.
        pt = torch.exp(log_probs).gather(1, target)
        log_pt = log_probs.gather(1, target)
        class_weight = self.alpha.gather(0, target.view(-1))

        # (1 - p_t) ** gamma * -log(p_t)
        focal = -torch.mul(torch.pow((1 - pt), self.gamma), log_pt)
        weighted = torch.mul(class_weight, focal.t())

        return weighted.mean() if self.size_average else weighted.sum()


# Focal loss instance with the default alpha, gamma and num_classes.
loss_fuction = Focal_loss()

In [None]:
# Data-level mitigation of class imbalance: FGM adversarial perturbation of the embedding weights
class FGM():
    """Fast Gradient Method helper that perturbs embedding weights in place.

    Typical use inside a training step, after the clean ``loss.backward()``:
    ``attack()`` -> forward/backward on the perturbed weights -> ``restore()``.

    Args:
        model: module whose parameters are perturbed.
    """

    def __init__(self, model):
        self.model = model
        # name -> clean copy of every parameter currently perturbed
        self.backup = {}

    def attack(self, epsilon=1., emb_name='emb'):
        """Add ``epsilon * grad / ||grad||`` to each matching parameter.

        A parameter matches when it requires grad and ``emb_name`` is a
        substring of its name. Parameters with no gradient, or with a zero or
        non-finite gradient norm, are left untouched.
        """
        for name, param in self.model.named_parameters():
            if not (param.requires_grad and emb_name in name):
                continue
            # No backward pass has reached this parameter; nothing to follow.
            if param.grad is None:
                continue
            norm = torch.norm(param.grad)
            # A zero norm gives no direction; a NaN/inf norm would corrupt the weights.
            if norm == 0 or not torch.isfinite(norm):
                continue
            # Keep the first (clean) copy if attack() is called again before restore().
            if name not in self.backup:
                self.backup[name] = param.data.clone()
            r_at = epsilon * param.grad / norm
            param.data.add_(r_at)

    def restore(self, emb_name='emb'):
        """Put back every parameter saved by ``attack`` and clear the backup.

        ``emb_name`` is kept for interface compatibility; all backed-up
        parameters are restored regardless of its value.
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}



In [None]:
# Wrap the classifier in the FGM helper so its embedding weights can be perturbed and restored.
fgm = FGM(model)

In [None]:
# Fine-tuning BERT is normally done with a learning rate in the 2e-5..5e-5 range.
# The recorded run at lr=1e-4 predicted class 0 for essentially every dev example in every epoch.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
def _evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` with gradients disabled.

    Returns:
        ``(mean_batch_loss, labels, predictions)`` where the last two are 1-D
        NumPy arrays covering every example in the loader.
    """
    model.eval()
    total_loss = 0.0
    label_chunks, pred_chunks = [], []
    with torch.no_grad():
        for data in tqdm(data_loader):
            data = [i.to(device) for i in data]
            output = model(data)
            labels = data[3]
            total_loss += F.cross_entropy(output, labels).item()
            label_chunks.append(labels.cpu().numpy())
            pred_chunks.append(torch.max(output, 1)[1].cpu().numpy())

    if not label_chunks:
        empty = np.array([], dtype=int)
        return 0.0, empty, empty
    # One concatenate at the end instead of re-allocating with np.append per batch.
    return (
        total_loss / len(label_chunks),
        np.concatenate(label_chunks),
        np.concatenate(pred_chunks),
    )


def train(model, optimizer, train_loader, dev_loader, num_epochs=10, logging_step=200,
          validation=True, save_path='./save_dict/bert.ckpt', fgm=None):
    """Fine-tune ``model`` with cross-entropy and save its state dict at the end.

    Args:
        model: module called as ``model(batch)`` and returning class logits.
        optimizer: optimizer over ``model``'s parameters.
        train_loader: yields batches whose element 3 is the label tensor.
        dev_loader: same batch layout; evaluated after every epoch when
            ``validation`` is true.
        num_epochs: number of passes over ``train_loader``.
        logging_step: print running loss/accuracy every this many batches.
        validation: whether to evaluate on ``dev_loader`` after each epoch.
        save_path: file the final ``state_dict`` is written to; missing parent
            directories are created.
        fgm: optional object with ``attack()`` / ``restore()`` methods. When
            given, each step adds the gradient of a second, adversarially
            perturbed forward pass before the optimizer update.
    """
    # Use the device the model actually lives on rather than a notebook global.
    device = next(model.parameters()).device

    for epoch in range(num_epochs):
        model.train()
        train_loss = train_acc = 0.0

        # Count from 1 so that at step N exactly N batches have been accumulated
        # and the logged averages use the right denominator.
        for step, data in enumerate(tqdm(train_loader), start=1):
            data = [i.to(device) for i in data]
            labels = data[3]

            optimizer.zero_grad()
            output = model(data)
            loss = F.cross_entropy(output, labels)
            loss.backward()

            train_loss += loss.item()
            pred_class = torch.max(output, 1)[1]
            # .item() keeps the running sum a Python float instead of a device tensor.
            train_acc += (pred_class == labels).float().mean().item()

            if fgm is not None:
                # Adversarial pass: gradients accumulate on top of the clean ones.
                fgm.attack()
                F.cross_entropy(model(data), labels).backward()
                fgm.restore()

            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            if step % logging_step == 0:
                print(
                    f"Epoch {epoch + 1} | Step {step} | Train_loss = {train_loss / logging_step:.3f}, Train_acc = {train_acc / logging_step:.3f}"
                )
                train_loss = train_acc = 0.0

        if validation and dev_loader is not None:
            print("Evaluating Dev Set ...")
            dev_loss, labels_all, predict_all = _evaluate(model, dev_loader, device)

            dev_acc = metrics.accuracy_score(labels_all, predict_all)
            f1_score_micro = metrics.f1_score(labels_all, predict_all, average='micro')
            f1_score_macro = metrics.f1_score(labels_all, predict_all, average='macro')
            report = metrics.classification_report(labels_all, predict_all, digits=3)
            confusion = metrics.confusion_matrix(labels_all, predict_all)

            print(f"Validation | Epoch {epoch + 1} | loss = {dev_loss:.3f} | acc = {dev_acc:.3f}")
            print(f"F1 Score (Micro) = {f1_score_micro}")
            print(f"F1 Score (Macro) = {f1_score_macro}")
            print(report)
            print(confusion)

    print("Saving Model ...")
    model_save_path = Path(save_path)
    # Create the target directory first; torch.save fails if it does not exist.
    model_save_path.parent.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), str(model_save_path))

In [None]:
# Run the training loop on the train loader, using the dev loader for evaluation.
train(model, optimizer, train_loader, dev_loader)

  0%|          | 0/1047 [00:00<?, ?it/s]

Epoch 1 | Step 200 | Train_loss = 0.710, Train_acc = 0.809
Epoch 1 | Step 400 | Train_loss = 0.691, Train_acc = 0.824
Epoch 1 | Step 600 | Train_loss = 0.720, Train_acc = 0.822
Epoch 1 | Step 800 | Train_loss = 0.740, Train_acc = 0.813
Epoch 1 | Step 1000 | Train_loss = 0.772, Train_acc = 0.800
Evaluating Dev Set ...


  0%|          | 0/131 [00:00<?, ?it/s]

Validation | Epoch 1 | loss = 0.756 | acc = 0.815
F1 Score (Micro) = 0.8147086914995223
F1 Score (Macro) = 0.17967351237493417
              precision    recall  f1-score   support

           0      0.815     1.000     0.898       853
           1      0.000     0.000     0.000        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.000     0.000     0.000        39

    accuracy                          0.815      1047
   macro avg      0.163     0.200     0.180      1047
weighted avg      0.664     0.815     0.732      1047

[[853   0   0   0   0]
 [ 95   0   0   0   0]
 [ 14   0   0   0   0]
 [ 45   1   0   0   0]
 [ 39   0   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1047 [00:00<?, ?it/s]

Epoch 2 | Step 200 | Train_loss = 0.771, Train_acc = 0.803
Epoch 2 | Step 400 | Train_loss = 0.774, Train_acc = 0.804
Epoch 2 | Step 600 | Train_loss = 0.743, Train_acc = 0.814
Epoch 2 | Step 800 | Train_loss = 0.676, Train_acc = 0.832
Epoch 2 | Step 1000 | Train_loss = 0.739, Train_acc = 0.813
Evaluating Dev Set ...


  0%|          | 0/131 [00:00<?, ?it/s]

Validation | Epoch 2 | loss = 0.762 | acc = 0.815
F1 Score (Micro) = 0.8147086914995223
F1 Score (Macro) = 0.17957894736842103
              precision    recall  f1-score   support

           0      0.815     1.000     0.898       853
           1      0.000     0.000     0.000        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.000     0.000     0.000        39

    accuracy                          0.815      1047
   macro avg      0.163     0.200     0.180      1047
weighted avg      0.664     0.815     0.732      1047

[[853   0   0   0   0]
 [ 95   0   0   0   0]
 [ 14   0   0   0   0]
 [ 46   0   0   0   0]
 [ 39   0   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1047 [00:00<?, ?it/s]

Epoch 3 | Step 200 | Train_loss = 0.663, Train_acc = 0.829
Epoch 3 | Step 400 | Train_loss = 0.724, Train_acc = 0.813
Epoch 3 | Step 600 | Train_loss = 0.758, Train_acc = 0.803
Epoch 3 | Step 800 | Train_loss = 0.747, Train_acc = 0.809
Epoch 3 | Step 1000 | Train_loss = 0.692, Train_acc = 0.824
Evaluating Dev Set ...


  0%|          | 0/131 [00:00<?, ?it/s]

Validation | Epoch 3 | loss = 0.711 | acc = 0.815
F1 Score (Micro) = 0.8147086914995223
F1 Score (Macro) = 0.17957894736842103
              precision    recall  f1-score   support

           0      0.815     1.000     0.898       853
           1      0.000     0.000     0.000        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.000     0.000     0.000        39

    accuracy                          0.815      1047
   macro avg      0.163     0.200     0.180      1047
weighted avg      0.664     0.815     0.732      1047

[[853   0   0   0   0]
 [ 95   0   0   0   0]
 [ 14   0   0   0   0]
 [ 46   0   0   0   0]
 [ 39   0   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1047 [00:00<?, ?it/s]

Epoch 4 | Step 200 | Train_loss = 0.725, Train_acc = 0.809
Epoch 4 | Step 400 | Train_loss = 0.716, Train_acc = 0.814
Epoch 4 | Step 600 | Train_loss = 0.727, Train_acc = 0.812
Epoch 4 | Step 800 | Train_loss = 0.727, Train_acc = 0.812
Epoch 4 | Step 1000 | Train_loss = 0.691, Train_acc = 0.821
Evaluating Dev Set ...


  0%|          | 0/131 [00:00<?, ?it/s]

Validation | Epoch 4 | loss = 0.705 | acc = 0.815
F1 Score (Micro) = 0.8147086914995223
F1 Score (Macro) = 0.17957894736842103
              precision    recall  f1-score   support

           0      0.815     1.000     0.898       853
           1      0.000     0.000     0.000        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.000     0.000     0.000        39

    accuracy                          0.815      1047
   macro avg      0.163     0.200     0.180      1047
weighted avg      0.664     0.815     0.732      1047

[[853   0   0   0   0]
 [ 95   0   0   0   0]
 [ 14   0   0   0   0]
 [ 46   0   0   0   0]
 [ 39   0   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1047 [00:00<?, ?it/s]

Epoch 5 | Step 200 | Train_loss = 0.745, Train_acc = 0.796
Epoch 5 | Step 400 | Train_loss = 0.725, Train_acc = 0.814
Epoch 5 | Step 600 | Train_loss = 0.681, Train_acc = 0.827
Epoch 5 | Step 800 | Train_loss = 0.713, Train_acc = 0.817
Epoch 5 | Step 1000 | Train_loss = 0.726, Train_acc = 0.812
Evaluating Dev Set ...


  0%|          | 0/131 [00:00<?, ?it/s]

Validation | Epoch 5 | loss = 0.717 | acc = 0.815
F1 Score (Micro) = 0.8147086914995223
F1 Score (Macro) = 0.17957894736842103
              precision    recall  f1-score   support

           0      0.815     1.000     0.898       853
           1      0.000     0.000     0.000        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.000     0.000     0.000        39

    accuracy                          0.815      1047
   macro avg      0.163     0.200     0.180      1047
weighted avg      0.664     0.815     0.732      1047

[[853   0   0   0   0]
 [ 95   0   0   0   0]
 [ 14   0   0   0   0]
 [ 46   0   0   0   0]
 [ 39   0   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1047 [00:00<?, ?it/s]

Epoch 6 | Step 200 | Train_loss = 0.741, Train_acc = 0.801
Epoch 6 | Step 400 | Train_loss = 0.715, Train_acc = 0.813
Epoch 6 | Step 600 | Train_loss = 0.738, Train_acc = 0.812
Epoch 6 | Step 800 | Train_loss = 0.709, Train_acc = 0.821
Epoch 6 | Step 1000 | Train_loss = 0.688, Train_acc = 0.822
Evaluating Dev Set ...


  0%|          | 0/131 [00:00<?, ?it/s]

Validation | Epoch 6 | loss = 0.711 | acc = 0.815
F1 Score (Micro) = 0.8147086914995223
F1 Score (Macro) = 0.17957894736842103
              precision    recall  f1-score   support

           0      0.815     1.000     0.898       853
           1      0.000     0.000     0.000        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.000     0.000     0.000        39

    accuracy                          0.815      1047
   macro avg      0.163     0.200     0.180      1047
weighted avg      0.664     0.815     0.732      1047

[[853   0   0   0   0]
 [ 95   0   0   0   0]
 [ 14   0   0   0   0]
 [ 46   0   0   0   0]
 [ 39   0   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1047 [00:00<?, ?it/s]

KeyboardInterrupt: ignored