<a href="https://colab.research.google.com/github/Gongmengjie/Sentiment_classification/blob/main/three_labels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers==4.5.0

Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 5.9 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 55.3 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 70.1 MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.47 tokenizers-0.10.3 transformers-4.5.0


In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import torch
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
from sklearn.model_selection import train_test_split
import os
from torch.utils.data import TensorDataset
import random
from argparse import Namespace

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
def set_seed(seed):
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
set_seed(0)

In [5]:
args = Namespace(
    num_class = 5,
    num_epochs = 5,
    lr = 2e-5,
    batch_size = 32,
    data_dir = './data/dontpatronizeme_pcl.tsv',
    save_dict_path = './bertbasline/',
    
)

In [6]:
tokenizer =  RobertaTokenizer.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
class Model(nn.Module):
    def __init__(self, args):
        super(Model,self).__init__()
        self.hidden_size = 768
        self.num_class = args.num_class
        

        self.bert = RobertaModel.from_pretrained("roberta-base")
        self.fc1 = nn.Linear(self.hidden_size, 3)
        self.fc2 = nn.Linear(self.hidden_size, 3)
        self.fc3 = nn.Linear(3 * 2, 512)
        self.fc = nn.Linear(512, self.num_class)                 
        
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        outputs = self.bert(input_ids=x[0], attention_mask=x[1])
        pooler_out = outputs[1]
        out = self.dropout(pooler_out)

        out1 = self.fc1(out)
        out2 = self.fc2(out)
        loss1 = F.cross_entropy(out1, x[3])
        loss2 = F.cross_entropy(out2, x[4])

        out = torch.cat((out1, out2), -1)
        out = self.fc3(out)
        out = self.fc(out)
        loss0 = F.cross_entropy(out, x[2])

        loss = 0.5 * loss0 + 0.5 * (loss1 + loss2)

        return out, loss
       

model = Model(args).to(device)

print(model)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Model(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
def data_process(filename):

    data = pd.read_csv(filename, delimiter='\t', header=None)
    # 查缺失值(是否是重要信息)，并删除
    data.info()
    df = pd.DataFrame(data)
    print(df[df.isnull().T.any()])
    df.dropna(inplace=True)
    # 重新命名列

    df.columns=['par_id', 'art_id', 'keyword', 'country_code', 'text', 'label']
    df['keyword_text'] = df['keyword'] + ', '+ df['text']
    
    new_data = df[['keyword_text', 'label']]
    new_data.info()
    print(new_data['label'].value_counts())

    new_data = np.array(new_data).tolist()

    for i in range(len(new_data)):
      if new_data[i][1] == 0:
        new_data[i].extend([0, 0])
      if new_data[i][1] == 1:
        new_data[i].extend([0, 1])
      if new_data[i][1] == 2:
        new_data[i].extend([1, 1])
      if new_data[i][1] == 3:
        new_data[i].extend([1, 2])
      if new_data[i][1] == 4:
        new_data[i].extend([2, 2])
    
    new_data = pd.DataFrame(new_data, columns=['keyword_text', 'label', 'label1', 'label2'])

    return new_data
      




In [9]:
def split(new_data):
    train_set, x = train_test_split(
        new_data, 
        stratify=new_data['label'],
        test_size=0.2, 
        random_state=0,
        )
    val_set, test_set = train_test_split(
        x, 
        stratify=x['label'],
        test_size=0.5, 
        random_state=1,
        )
    
    return train_set, val_set, test_set

In [10]:
def myDataset(data_set):
  texts = data_set['keyword_text'].tolist()
  labels = data_set['label'].tolist()
  label1 = data_set['label1'].tolist()
  label2 = data_set['label2'].tolist()
  input_ids = []
  attention_masks = []
  for text in texts:
      encoded_dict = tokenizer.encode_plus(
                    text,                     
                    add_special_tokens = True,
                    max_length = 256,        
                    pad_to_max_length = True,
                    return_attention_mask = True,
                    return_tensors = 'pt',   
      )
      
     
      input_ids.append(encoded_dict['input_ids'])
      attention_masks.append(encoded_dict['attention_mask'])


  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  labels = torch.tensor(labels)
  label1 = torch.tensor(label1)
  label2 = torch.tensor(label2)

  dataset = TensorDataset(input_ids, attention_masks, labels, label1, label2)

  return dataset

In [11]:
def load_data(args):

    new_data = data_process(args.data_dir)
    split_data = split(new_data)
  
    train_set, dev_set, test_set = split_data

    train_dataset = myDataset(train_set)
    dev_dataset = myDataset(dev_set)
    test_dataset = myDataset(test_set)
    # 计算机内存充足时pin_memory = True，可加快计算速度
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    return train_loader, dev_loader, test_loader

In [12]:
train_loader, dev_loader, test_loader = load_data(args)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10469 entries, 0 to 10468
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       10469 non-null  int64 
 1   1       10469 non-null  object
 2   2       10469 non-null  object
 3   3       10469 non-null  object
 4   4       10468 non-null  object
 5   5       10469 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 490.9+ KB
         0           1        2   3    4  5
8639  8640  @@16852855  migrant  ke  NaN  0


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10468 entries, 0 to 10468
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   keyword_text  10468 non-null  object
 1   label         10468 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 245.3+ KB
0    8528
1     947
3     458
4     391
2     144
Name: label, dtype: int64




In [13]:
optimizer = AdamW(model.parameters(), lr=args.lr, eps=1e-8)

In [14]:
def train(args, model, optimizer, train_loader, dev_loader):
    
    num_epochs = args.num_epochs   # bert建议是3，但使用bert微调，结果不稳定，适当增大epochs
    total_steps = len(train_loader)* num_epochs
    logging_step = 50
    validation = True
    learning_rate = args.lr
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps = 0,
        num_training_steps = total_steps,
        )  # 确实有效果，尝试其他warmup
    for epoch in range(num_epochs):
        model.train()
      
        step = 1
        train_loss = train_acc = 0

        for data in tqdm(train_loader):

            data = [i.to(device) for i in data]
            # 模型输入: input_ids,  attention_mask
            # 模型输出: logits,
            output, loss = model(data)
            labels = data[2]
            optimizer.zero_grad()
            
            train_loss += loss.item()
            loss.backward()

            pred_class = torch.max(output, 1)[1]
            train_acc += (pred_class == labels).float().mean()

            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step() 

            step += 1
            # optimizer.param_groups[0]["lr"] -= learning_rate / (total_steps)
            if step % logging_step == 0:

                print(
                    f"Epoch {epoch + 1} | Step {step} | Train_loss = {train_loss / logging_step:.3f}, Train_acc = {train_acc / logging_step:.3f}"
                )
                train_loss = train_acc = 0
  
        if validation:
            print("Evaluating Dev Set ...")
            model.eval()
            dev_loss = 0
            predict_all = np.array([], dtype=int)
            labels_all = np.array([], dtype=int)
            with torch.no_grad():
    
                for data in tqdm(dev_loader):
                    data = [i.to(device) for i in data]
                    output, loss = model(data)
                    labels = data[2]
                
                    dev_loss += loss.item()

                    labels = data[2].data.cpu().numpy()
                    pred_class = torch.max(output.data, 1)[1].cpu().numpy()
                    
                    labels_all = np.append(labels_all, labels)
                    predict_all = np.append(predict_all, pred_class)

            dev_acc = metrics.accuracy_score(labels_all, predict_all)

            f1_score_micro = metrics.f1_score(labels_all, predict_all, average='micro')
            f1_score_macro = metrics.f1_score(labels_all, predict_all, average='macro')

            report = metrics.classification_report(labels_all, predict_all, digits=3)
            confusion = metrics.confusion_matrix(labels_all, predict_all)
            print(f"Validation | Epoch {epoch + 1} | loss = {dev_loss / len(dev_loader):.3f} | acc = {dev_acc:.3f}")
            print(f"F1 Score (Micro) = {f1_score_micro}")
            print(f"F1 Score (Macro) = {f1_score_macro}")
            print(report)
            print(confusion)
    if not os.path.exists(args.save_dict_path):
        os.makedirs(args.save_dict_path)

    print("Saving Model ...")
    torch.save(model.save_dict(), args.save_dict_path)

In [15]:
train(args, model, optimizer, train_loader, dev_loader)

  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 1 | Step 50 | Train_loss = 1.065, Train_acc = 0.779
Epoch 1 | Step 100 | Train_loss = 0.830, Train_acc = 0.797
Epoch 1 | Step 150 | Train_loss = 0.742, Train_acc = 0.812
Epoch 1 | Step 200 | Train_loss = 0.649, Train_acc = 0.825
Epoch 1 | Step 250 | Train_loss = 0.668, Train_acc = 0.812
Evaluating Dev Set ...


  0%|          | 0/33 [00:00<?, ?it/s]

Validation | Epoch 1 | loss = 0.624 | acc = 0.817
F1 Score (Micro) = 0.8166189111747851
F1 Score (Macro) = 0.21559356913183284
              precision    recall  f1-score   support

           0      0.838     0.995     0.910       853
           1      0.100     0.032     0.048        95
           2      0.000     0.000     0.000        14
           3      0.750     0.065     0.120        46
           4      0.000     0.000     0.000        39

    accuracy                          0.817      1047
   macro avg      0.338     0.218     0.216      1047
weighted avg      0.725     0.817     0.751      1047

[[849   4   0   0   0]
 [ 92   3   0   0   0]
 [ 13   1   0   0   0]
 [ 32  11   0   3   0]
 [ 27  11   0   1   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 2 | Step 50 | Train_loss = 0.535, Train_acc = 0.818
Epoch 2 | Step 100 | Train_loss = 0.577, Train_acc = 0.828
Epoch 2 | Step 150 | Train_loss = 0.558, Train_acc = 0.834
Epoch 2 | Step 200 | Train_loss = 0.582, Train_acc = 0.826
Epoch 2 | Step 250 | Train_loss = 0.568, Train_acc = 0.826
Evaluating Dev Set ...


  0%|          | 0/33 [00:00<?, ?it/s]

Validation | Epoch 2 | loss = 0.636 | acc = 0.813
F1 Score (Micro) = 0.8127984718242598
F1 Score (Macro) = 0.2979154109120505
              precision    recall  f1-score   support

           0      0.910     0.951     0.930       853
           1      0.250     0.147     0.185        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.260     0.667     0.374        39

    accuracy                          0.813      1047
   macro avg      0.284     0.353     0.298      1047
weighted avg      0.774     0.813     0.788      1047

[[811  24   0   0  18]
 [ 54  14   0   0  27]
 [  7   3   0   0   4]
 [ 11  10   0   0  25]
 [  8   5   0   0  26]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 3 | Step 50 | Train_loss = 0.459, Train_acc = 0.829
Epoch 3 | Step 100 | Train_loss = 0.500, Train_acc = 0.831
Epoch 3 | Step 150 | Train_loss = 0.457, Train_acc = 0.852
Epoch 3 | Step 200 | Train_loss = 0.445, Train_acc = 0.846
Epoch 3 | Step 250 | Train_loss = 0.431, Train_acc = 0.859
Evaluating Dev Set ...


  0%|          | 0/33 [00:00<?, ?it/s]

Validation | Epoch 3 | loss = 0.615 | acc = 0.818
F1 Score (Micro) = 0.8175740210124164
F1 Score (Macro) = 0.30306097500982954
              precision    recall  f1-score   support

           0      0.903     0.959     0.930       853
           1      0.234     0.158     0.189        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.299     0.590     0.397        39

    accuracy                          0.818      1047
   macro avg      0.287     0.341     0.303      1047
weighted avg      0.768     0.818     0.790      1047

[[818  22   0   0  13]
 [ 61  15   0   0  19]
 [  6   5   0   0   3]
 [ 14  13   0   0  19]
 [  7   9   0   0  23]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 4 | Step 50 | Train_loss = 0.380, Train_acc = 0.839
Epoch 4 | Step 100 | Train_loss = 0.403, Train_acc = 0.856
Epoch 4 | Step 150 | Train_loss = 0.337, Train_acc = 0.881
Epoch 4 | Step 200 | Train_loss = 0.351, Train_acc = 0.870
Epoch 4 | Step 250 | Train_loss = 0.346, Train_acc = 0.874
Evaluating Dev Set ...


  0%|          | 0/33 [00:00<?, ?it/s]

Validation | Epoch 4 | loss = 0.656 | acc = 0.817
F1 Score (Micro) = 0.8166189111747851
F1 Score (Macro) = 0.3243142803244671
              precision    recall  f1-score   support

           0      0.912     0.944     0.927       853
           1      0.305     0.305     0.305        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.304     0.538     0.389        39

    accuracy                          0.817      1047
   macro avg      0.304     0.357     0.324      1047
weighted avg      0.782     0.817     0.798      1047

[[805  33   0   0  15]
 [ 53  29   0   0  13]
 [  6   6   0   0   2]
 [ 12  16   0   0  18]
 [  7  11   0   0  21]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/262 [00:00<?, ?it/s]

Epoch 5 | Step 50 | Train_loss = 0.286, Train_acc = 0.867
Epoch 5 | Step 100 | Train_loss = 0.313, Train_acc = 0.884
Epoch 5 | Step 150 | Train_loss = 0.330, Train_acc = 0.880
Epoch 5 | Step 200 | Train_loss = 0.287, Train_acc = 0.887
Epoch 5 | Step 250 | Train_loss = 0.283, Train_acc = 0.888
Evaluating Dev Set ...


  0%|          | 0/33 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation | Epoch 5 | loss = 0.683 | acc = 0.816
F1 Score (Micro) = 0.8156638013371538
F1 Score (Macro) = 0.3186499796296164
              precision    recall  f1-score   support

           0      0.912     0.946     0.929       853
           1      0.281     0.263     0.272        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.301     0.564     0.393        39

    accuracy                          0.816      1047
   macro avg      0.299     0.355     0.319      1047
weighted avg      0.780     0.816     0.796      1047

[[807  33   0   0  13]
 [ 55  25   0   0  15]
 [  6   6   0   0   2]
 [ 13  12   0   0  21]
 [  4  13   0   0  22]]
Saving Model ...


AttributeError: ignored

In [None]:
def test(test_loader):

    result = []
    model.load_state_dict(torch.load(args.save_path))
    model.eval()
    with torch.no_grad():
        for data in tqdm(test_loader):
            data = [i.to(device) for i in data]

            output = model(data)
            pred_class = torch.max(output, dim=1)[1]
            result.append(pred_class)

    result_file = "./result.csv"
    with open(result_file, 'w') as f:
        f.write("ID,Classification\n")
        for i, res in enumerate(result):
            f.write(f"{i},{res}\n")