<a href="https://colab.research.google.com/github/Gongmengjie/Sentiment_classification/blob/main/longformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers==4.5.0

Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 7.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 55.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 54.5 MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.47 tokenizers-0.10.3 transformers-4.5.0


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import torch
from transformers import (BertModel, BertTokenizer, RobertaModel, RobertaTokenizer, 
                 LongformerConfig, LongformerModel, LongformerTokenizer, AdamW, get_linear_schedule_with_warmup)
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
from sklearn.model_selection import train_test_split
import os
from torch.utils.data import TensorDataset
import random
from argparse import Namespace

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus the CUDA generators when CUDA is available), and puts cuDNN in
    deterministic mode with the benchmark auto-tuner switched off.

    Args:
        seed: Integer seed shared by all generators.
    """
    # cuDNN: request deterministic kernels and disable benchmark-based selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed(42)

In [None]:
# Experiment configuration read by the cells below.
args = Namespace()
args.num_class = 5        # size of the main classification head
args.num_epochs = 10      # passes over the training set
args.lr = 2e-5            # learning rate handed to the optimizer
args.batch_size = 3       # samples per DataLoader batch
args.data_dir = './data/dontpatronizeme_pcl.tsv'   # tab-separated input file
args.save_dict_path = './bertbasline/'             # location used when saving the model

In [None]:
# tokenizer =  RobertaTokenizer.from_pretrained("roberta-base")

In [None]:
# Load the tokenizer of the 'allenai/longformer-base-4096' checkpoint (its files are downloaded on first use).
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Smoke test: encode one sentence and pad it to a fixed length of 1000 tokens
# so the resulting tensor shapes can be inspected in the following cells.
text = 'This is the configuration class to store the configuration of a LongformerModel.'
encoded_dict = tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    max_length=1000,
    # `padding` / `truncation` replace the deprecated `pad_to_max_length` flag and
    # make the truncation strategy explicit instead of relying on the implicit default.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Scratch check: a 1 x 3 integer mask whose first entry is 1 and the rest are 0.
alist = torch.zeros((1, 3), dtype=torch.int64)
alist[0, 0] = 1
alist.size()

torch.Size([1, 3])

In [None]:
# Shape of the padded token-id tensor produced by the example encoding.
encoded_dict["input_ids"].shape

torch.Size([1, 1000])

In [None]:
# Build a Longformer directly from a config object (no pretrained weights are loaded)
# and print the resulting configuration for inspection.
attention_window = [256] * 12  # one local-attention window size per encoder layer
# Pass the list defined above rather than a separate literal so the two cannot drift apart.
config = LongformerConfig(attention_window=attention_window)
model = LongformerModel(config)
print(model.config)

LongformerConfig {
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.5.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [None]:
class Model(nn.Module):
    """Longformer encoder with one main and two auxiliary classification heads.

    The encoder's pooled output feeds two 3-way auxiliary heads (``fc1``, ``fc2``).
    Their logits are concatenated, projected to 768 features (``fc3``), passed
    through dropout and ReLU, and mapped to ``args.num_class`` logits (``fc``).
    """

    def __init__(self, args):
        """Create the encoder and the heads.

        Args:
            args: Configuration object; only ``args.num_class`` is read.
        """
        super().__init__()
        self.hidden_size = 768
        self.num_class = args.num_class

        # self.bert = RobertaModel.from_pretrained("roberta-base")
        self.bert = LongformerModel.from_pretrained('allenai/longformer-base-4096')

        # The layers are created in a fixed order so that seeded initialisation stays reproducible.
        self.fc1 = nn.Linear(self.hidden_size, 3)      # auxiliary head 1
        self.fc2 = nn.Linear(self.hidden_size, 3)      # auxiliary head 2
        self.fc3 = nn.Linear(3 * 2, 768)               # projects the concatenated auxiliary logits
        self.fc = nn.Linear(768, self.num_class)       # main head
        self.act = nn.ReLU()

        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Run the model on one batch and compute the combined loss.

        Args:
            x: Indexable batch where ``x[0]`` is input_ids, ``x[1]`` the attention
                mask, ``x[3]`` the main labels and ``x[4]`` / ``x[5]`` the two
                auxiliary labels. ``x[2]`` (global attention mask) is not used.

        Returns:
            Tuple ``(logits, loss)`` with the main-head logits and the scalar
            ``0.6 * main_loss + 0.4 * (aux_loss1 + aux_loss2)``.
        """
        encoder_outputs = self.bert(input_ids=x[0], attention_mask=x[1])  # global_attention_mask=x[2]
        pooled = encoder_outputs[1]

        # Auxiliary logits from the pooled representation.
        aux_logits1 = self.fc1(pooled)
        aux_logits2 = self.fc2(pooled)

        # Main logits are computed from the auxiliary logits only.
        merged = torch.cat([aux_logits1, aux_logits2], dim=-1)
        out = self.fc(self.act(self.dropout(self.fc3(merged))))

        loss1 = F.cross_entropy(aux_logits1, x[4])
        loss2 = F.cross_entropy(aux_logits2, x[5])
        loss0 = F.cross_entropy(out, x[3])
        loss = 0.6 * loss0 + 0.4 * (loss1 + loss2)

        return out, loss
       

# Instantiate the classifier and move it to the selected device.
# `.to(device)` is the only placement needed: an additional unconditional
# `model.cuda()` raises on machines without CUDA even though `device` is "cpu" there.
model = Model(args).to(device)

In [None]:
def data_process(filename):
    """Load the tab-separated corpus and attach two auxiliary label columns.

    The file is read without a header row, rows containing any missing value
    are dropped, and the six columns are named par_id, art_id, keyword,
    country_code, text and label. The model input is ``keyword + ', ' + text``.

    Each main label is paired with two auxiliary labels:
    0 -> (0, 0), 1 -> (0, 1), 2 -> (1, 1), 3 -> (1, 2), 4 -> (2, 2).

    Args:
        filename: Path of the TSV file.

    Returns:
        DataFrame with columns keyword_text, label, label1 and label2.
    """
    # Auxiliary (label1, label2) pair attached to every main label value.
    aux_pairs = {0: [0, 0], 1: [0, 1], 2: [1, 1], 3: [1, 2], 4: [2, 2]}

    # Read the raw table and discard incomplete rows before naming the columns.
    frame = pd.read_csv(filename, delimiter='\t', header=None).dropna()
    frame.columns = ['par_id', 'art_id', 'keyword', 'country_code', 'text', 'label']

    keyword_text = (frame['keyword'] + ', ' + frame['text']).tolist()
    labels = frame['label'].tolist()

    # A label outside the table contributes no auxiliary values to its row.
    rows = [
        [text, label, *aux_pairs.get(label, [])]
        for text, label in zip(keyword_text, labels)
    ]

    return pd.DataFrame(rows, columns=['keyword_text', 'label', 'label1', 'label2'])
      




In [None]:
def split(new_data):
    """Split the data into train / validation / test parts, stratified on 'label'.

    20% of the rows are held out first (random_state=0); that hold-out is then
    halved into validation and test sets (random_state=1).

    Args:
        new_data: DataFrame containing a 'label' column.

    Returns:
        Tuple ``(train_set, val_set, test_set)``.
    """
    def stratified(frame, fraction, seed):
        # One stratified split that keeps the label distribution in both parts.
        return train_test_split(
            frame,
            stratify=frame['label'],
            test_size=fraction,
            random_state=seed,
        )

    train_set, holdout = stratified(new_data, 0.2, 0)
    val_set, test_set = stratified(holdout, 0.5, 1)

    return train_set, val_set, test_set

In [None]:
def myDataset(data_set, max_length=2000):
    """Tokenize a split and pack it into a TensorDataset.

    Args:
        data_set: DataFrame with columns keyword_text, label, label1 and label2.
        max_length: Number of tokens every sequence is padded / truncated to.

    Returns:
        TensorDataset whose items are
        ``(input_ids, attention_mask, global_attention_mask, label, label1, label2)``.
    """
    texts = data_set['keyword_text'].tolist()

    # Encode the whole split in one call. `padding` / `truncation` replace the
    # deprecated `pad_to_max_length` flag and make the truncation explicit.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    # Global attention on the first token only. The mask is built with the same
    # shape as input_ids (previously it was fixed at 512 columns regardless of
    # the padded sequence length, so it could not be fed to the encoder).
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1

    labels = torch.tensor(data_set['label'].tolist())
    label1 = torch.tensor(data_set['label1'].tolist())
    label2 = torch.tensor(data_set['label2'].tolist())

    return TensorDataset(input_ids, attention_masks, global_attention_mask, labels, label1, label2)

In [None]:
def load_data(args):
    """Read the corpus, split it and wrap each part in a DataLoader.

    Args:
        args: Configuration object; ``args.data_dir`` and ``args.batch_size`` are read.

    Returns:
        Tuple ``(train_loader, dev_loader, test_loader)``. Only the training
        loader shuffles its samples.
    """
    # Tokenize the three splits in train, dev, test order.
    datasets = [myDataset(part) for part in split(data_process(args.data_dir))]

    # With ample host memory, pin_memory=True can speed things up (not enabled here).
    shuffle_flags = (True, False, False)
    train_loader, dev_loader, test_loader = (
        DataLoader(dataset, batch_size=args.batch_size, shuffle=flag)
        for dataset, flag in zip(datasets, shuffle_flags)
    )

    return train_loader, dev_loader, test_loader

In [None]:
# Build the train / dev / test DataLoaders from the file configured in `args.data_dir`.
train_loader, dev_loader, test_loader = load_data(args)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# AdamW over all model parameters, using the single learning rate `args.lr`.
optimizer = AdamW(model.parameters(), lr=args.lr, eps=1e-8)

In [None]:
def train(args, model, optimizer, train_loader, dev_loader):
    """Fine-tune `model`, evaluate on the dev set after every epoch, then save the weights.

    Args:
        args: Configuration object; ``num_epochs`` and ``save_dict_path`` are read.
        model: Module whose forward returns ``(logits, loss)`` for a batch.
        optimizer: Optimizer already bound to the model's parameters.
        train_loader: DataLoader yielding training batches (labels at index 3).
        dev_loader: DataLoader yielding validation batches.

    Side effects:
        Prints running training metrics every 500 steps and a validation report
        per epoch. After the last epoch the state dict is written to
        ``<args.save_dict_path>/model.pt``.
    """
    # BERT's usual recommendation is 3 epochs, but fine-tuning is unstable,
    # so the number of epochs is raised somewhat.
    num_epochs = args.num_epochs
    total_steps = len(train_loader) * num_epochs
    logging_step = 500
    validation = True
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )  # The schedule does help; other warm-up settings are worth trying.

    for epoch in range(num_epochs):
        model.train()

        # Start at 0 so that exactly `logging_step` batches have been accumulated
        # whenever the running averages are printed.
        step = 0
        train_loss = train_acc = 0.0

        for data in tqdm(train_loader):
            data = [i.to(device) for i in data]
            labels = data[3]

            # Model input: input_ids, attention_mask (plus labels for the loss).
            # Model output: logits and the combined loss.
            optimizer.zero_grad()
            output, loss = model(data)
            loss.backward()

            # nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Accumulate Python floats so no device tensors are kept alive between steps.
            train_loss += loss.item()
            pred_class = torch.max(output, 1)[1]
            train_acc += (pred_class == labels).float().mean().item()

            step += 1
            if step % logging_step == 0:
                print(
                    f"Epoch {epoch + 1} | Step {step} | Train_loss = {train_loss / logging_step:.3f}, Train_acc = {train_acc / logging_step:.3f}"
                )
                train_loss = train_acc = 0.0

        if validation:
            print("Evaluating Dev Set ...")
            model.eval()
            dev_loss = 0.0
            # Plain lists avoid re-allocating a NumPy array on every batch.
            predict_all = []
            labels_all = []
            with torch.no_grad():
                for data in tqdm(dev_loader):
                    data = [i.to(device) for i in data]
                    output, loss = model(data)

                    dev_loss += loss.item()

                    labels_all.extend(data[3].cpu().tolist())
                    predict_all.extend(torch.max(output, 1)[1].cpu().tolist())

            dev_acc = metrics.accuracy_score(labels_all, predict_all)

            f1_score_micro = metrics.f1_score(labels_all, predict_all, average='micro')
            f1_score_macro = metrics.f1_score(labels_all, predict_all, average='macro')

            report = metrics.classification_report(labels_all, predict_all, digits=3)
            confusion = metrics.confusion_matrix(labels_all, predict_all)
            print(f"Validation | Epoch {epoch + 1} | loss = {dev_loss / len(dev_loader):.3f} | acc = {dev_acc:.3f}")
            print(f"F1 Score (Micro) = {f1_score_micro}")
            print(f"F1 Score (Macro) = {f1_score_macro}")
            print(report)
            print(confusion)

    # `save_dict_path` is a directory, so the weights go into a file inside it.
    os.makedirs(args.save_dict_path, exist_ok=True)
    checkpoint_file = os.path.join(args.save_dict_path, "model.pt")

    print("Saving Model ...")
    # nn.Module exposes `state_dict()`; there is no `save_dict()` method.
    torch.save(model.state_dict(), checkpoint_file)

In [None]:
# Run the training loop; running training metrics and per-epoch validation reports are printed below.
train(args, model, optimizer, train_loader, dev_loader)

  0%|          | 0/2792 [00:00<?, ?it/s]

Epoch 1 | Step 500 | Train_loss = 0.925, Train_acc = 0.787
Epoch 1 | Step 1000 | Train_loss = 0.747, Train_acc = 0.832
Epoch 1 | Step 1500 | Train_loss = 0.876, Train_acc = 0.802
Epoch 1 | Step 2000 | Train_loss = 0.847, Train_acc = 0.805
Epoch 1 | Step 2500 | Train_loss = 0.736, Train_acc = 0.839
Evaluating Dev Set ...


  0%|          | 0/349 [00:00<?, ?it/s]

Validation | Epoch 1 | loss = 0.821 | acc = 0.815
F1 Score (Micro) = 0.8147086914995223
F1 Score (Macro) = 0.17957894736842103
              precision    recall  f1-score   support

           0      0.815     1.000     0.898       853
           1      0.000     0.000     0.000        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.000     0.000     0.000        39

    accuracy                          0.815      1047
   macro avg      0.163     0.200     0.180      1047
weighted avg      0.664     0.815     0.732      1047

[[853   0   0   0   0]
 [ 95   0   0   0   0]
 [ 14   0   0   0   0]
 [ 46   0   0   0   0]
 [ 39   0   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/2792 [00:00<?, ?it/s]

Epoch 2 | Step 500 | Train_loss = 0.845, Train_acc = 0.809
Epoch 2 | Step 1000 | Train_loss = 0.794, Train_acc = 0.816
Epoch 2 | Step 1500 | Train_loss = 0.854, Train_acc = 0.808
Epoch 2 | Step 2000 | Train_loss = 0.807, Train_acc = 0.821
Epoch 2 | Step 2500 | Train_loss = 0.808, Train_acc = 0.820
Evaluating Dev Set ...


  0%|          | 0/349 [00:00<?, ?it/s]

Validation | Epoch 2 | loss = 0.824 | acc = 0.815
F1 Score (Micro) = 0.8147086914995223
F1 Score (Macro) = 0.17957894736842103
              precision    recall  f1-score   support

           0      0.815     1.000     0.898       853
           1      0.000     0.000     0.000        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.000     0.000     0.000        39

    accuracy                          0.815      1047
   macro avg      0.163     0.200     0.180      1047
weighted avg      0.664     0.815     0.732      1047

[[853   0   0   0   0]
 [ 95   0   0   0   0]
 [ 14   0   0   0   0]
 [ 46   0   0   0   0]
 [ 39   0   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/2792 [00:00<?, ?it/s]

Epoch 3 | Step 500 | Train_loss = 0.765, Train_acc = 0.835
Epoch 3 | Step 1000 | Train_loss = 0.897, Train_acc = 0.793
Epoch 3 | Step 1500 | Train_loss = 0.860, Train_acc = 0.798
Epoch 3 | Step 2000 | Train_loss = 0.816, Train_acc = 0.819
Epoch 3 | Step 2500 | Train_loss = 0.812, Train_acc = 0.817
Evaluating Dev Set ...


  0%|          | 0/349 [00:00<?, ?it/s]

Validation | Epoch 3 | loss = 0.816 | acc = 0.815
F1 Score (Micro) = 0.8147086914995223
F1 Score (Macro) = 0.17957894736842103
              precision    recall  f1-score   support

           0      0.815     1.000     0.898       853
           1      0.000     0.000     0.000        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.000     0.000     0.000        39

    accuracy                          0.815      1047
   macro avg      0.163     0.200     0.180      1047
weighted avg      0.664     0.815     0.732      1047

[[853   0   0   0   0]
 [ 95   0   0   0   0]
 [ 14   0   0   0   0]
 [ 46   0   0   0   0]
 [ 39   0   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/2792 [00:00<?, ?it/s]

Epoch 4 | Step 500 | Train_loss = 0.775, Train_acc = 0.825
Epoch 4 | Step 1000 | Train_loss = 0.791, Train_acc = 0.825
Epoch 4 | Step 1500 | Train_loss = 0.852, Train_acc = 0.809
Epoch 4 | Step 2000 | Train_loss = 0.856, Train_acc = 0.805
Epoch 4 | Step 2500 | Train_loss = 0.837, Train_acc = 0.807
Evaluating Dev Set ...


  0%|          | 0/349 [00:00<?, ?it/s]

Validation | Epoch 4 | loss = 0.816 | acc = 0.815
F1 Score (Micro) = 0.8147086914995223
F1 Score (Macro) = 0.17957894736842103
              precision    recall  f1-score   support

           0      0.815     1.000     0.898       853
           1      0.000     0.000     0.000        95
           2      0.000     0.000     0.000        14
           3      0.000     0.000     0.000        46
           4      0.000     0.000     0.000        39

    accuracy                          0.815      1047
   macro avg      0.163     0.200     0.180      1047
weighted avg      0.664     0.815     0.732      1047

[[853   0   0   0   0]
 [ 95   0   0   0   0]
 [ 14   0   0   0   0]
 [ 46   0   0   0   0]
 [ 39   0   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/2792 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
def test(test_loader, result_file="./result.csv"):
    """Predict a class for every test sample and write the predictions to a CSV file.

    Uses the notebook-level `model`, `args` and `device`. The weights are loaded
    from ``args.save_path`` when that attribute exists, otherwise from
    ``<args.save_dict_path>/model.pt``.

    Args:
        test_loader: DataLoader yielding batches in the format the model expects.
        result_file: Destination CSV with the columns ``ID,Classification``.
    """
    checkpoint_file = getattr(args, "save_path", None) or os.path.join(args.save_dict_path, "model.pt")
    # map_location lets weights saved on a GPU be restored on a CPU-only machine.
    model.load_state_dict(torch.load(checkpoint_file, map_location=device))
    model.eval()

    result = []
    with torch.no_grad():
        for data in tqdm(test_loader):
            data = [i.to(device) for i in data]

            # The model returns (logits, loss); only the logits are needed here.
            output, _ = model(data)
            pred_class = torch.max(output, dim=1)[1]
            # Flatten the batch so the file holds one integer prediction per sample.
            result.extend(pred_class.cpu().tolist())

    with open(result_file, 'w') as f:
        f.write("ID,Classification\n")
        for i, res in enumerate(result):
            f.write(f"{i},{res}\n")

In [None]:
# For every label 0-4: the number of texts longer than 510 characters (len() of the
# string, not a token count) and the fraction of that label's samples they make up.
new_data = np.array(data_process(args.data_dir))

label_values = [0, 1, 2, 3, 4]
totals = dict.fromkeys(label_values, 0)
long_texts = dict.fromkeys(label_values, 0)

# A single pass over the rows collects both counters for all labels.
for row in new_data:
    label = row[1]
    if label in totals:
        totals[label] += 1
        if len(row[0]) > 510:
            long_texts[label] += 1

# A label with no samples reports a ratio of 0.0 instead of raising ZeroDivisionError.
res = [
    (long_texts[j], long_texts[j] / totals[j] if totals[j] else 0.0)
    for j in label_values
]

res