## 导入环境

In [1]:
# ! pip install numpy pandas torch transformers progressbar tqdm

In [2]:
from pathlib import Path
from datetime import datetime
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from progressbar import ProgressBar, Percentage, Bar, Timer, ETA, FileTransferSpeed
from transformers import BertTokenizer
from longformer.longformer import LongformerConfig, LongformerForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


## 模型训练

### 加载预训练模型

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device('cpu')

In [3]:
# model_path = Path("../../models/distilbert-base-uncased")
# tokenizer = BertTokenizer.from_pretrained(model_path)
# config = LongformerConfig.from_pretrained(model_path)
# config.problem_type = "multi_label_classification"
# config.num_labels = 6
# model = BertForSequenceClassification.from_pretrained(model_path, config=config)
# model.to(device)

In [4]:
model_path = Path("../../models/longformer_zh")
tokenizer = BertTokenizer.from_pretrained(model_path)
# config = LongformerConfig.from_pretrained(model_path)
config = LongformerConfig.from_json_file(model_path /"config.json")
# choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
# 'n2': for regular n2 attantion
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention
config.attention_mode = 'sliding_chunks'
# If Use singe label
# config.problem_type = "single_label_classification" If Use singe label
config.problem_type = "multi_label_classification"
config.num_labels = 6
model = LongformerForSequenceClassification.from_pretrained(model_path, config=config)
model.to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LongformerTokenizer'. 
The class this function is called from is 'BertTokenizer'.


----------------- LongformerConfig {
  "_name_or_path": "..\\..\\models\\longformer_zh",
  "architectures": [
    "LongformerModel"
  ],
  "attention_dilation": null,
  "attention_mode": "sliding_chunks",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "autoregressive": false,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "directionality": "bidi",
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at ..\..\models\longformer_zh and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LongformerForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_features=768, out_features=76

### 加载数据集

In [5]:
import pandas as pd 


def load_dataset(train_path, train_size=0.9):
    df = pd.read_csv(train_path)[:1000]
    df['label'] = df[df.columns[2:]].values.tolist()
    new_df = df[['comment_text', 'label']].copy()
    print(f'df: {new_df.head()}')
    train_data = new_df.sample(frac=train_size, random_state=200)
    val_data = new_df.drop(train_data.index)


    train_data.reset_index(drop=True, inplace=True)
    val_data.reset_index(drop=True, inplace=True)
    
    print(f"FULL Dataset: {new_df.shape}")
    print(f"TRAIN Dataset: {train_data.shape}")
    print(f"VALIDATION Dataset: {val_data.shape}")

    return train_data, val_data

# train_path = "../data/MultilabelSequenceClassification/toxic-comment-classification/train.csv.zip"
# _train_data, _validation_data = load_dataset(train_path=train_path)

In [6]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=config.max_position_embeddings):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['comment_text']
        self.targets = self.data.label if "label" in  dataframe.columns else None
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer(
                                text,
                                None,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length',
                                return_token_type_ids=True,
                                truncation=True,
                                return_tensors='pt'
                               ).to(device)
        inputs_single = {}
        inputs_single['input_ids'] = inputs['input_ids'][0]
        inputs_single['attention_mask'] = inputs['attention_mask'][0]
        inputs_single['token_type_ids'] = inputs['token_type_ids'][0]
        if self.targets is not None:
            targets = torch.tensor(self.targets[index], dtype=torch.float).to(device)
        else:
            targets = torch.tensor([])
        return inputs_single, targets

# train_dataset = CustomDataset(_train_data, tokenizer)

### 测试模型

In [7]:
def test_model(train_path, batch_size=16):
    train_data, validation_data = load_dataset(train_path)
    train_dataset = CustomDataset(train_data, tokenizer)
    validation_dataset = CustomDataset(validation_data, tokenizer )
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(validation_dataset, batch_size=batch_size)
    
    for batch, data in enumerate(val_loader):
        print(batch, data)
        inputs, labels = data
        if batch == 1:
            break
    outputs = model(**inputs, labels=labels)
    print(outputs)

# test_model(train_path = "../data/MultilabelSequenceClassification/toxic-comment-classification/train.csv.zip")

In [8]:
# break

### 训练模型

In [9]:
# The Training Loop    
def train_one_epoch(train_loader, epoch_index, tb_writer=None):
    # we’ll be using simple stochastic gradient descent with momentum

    

    optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)
    running_loss = 0.
    last_loss = 0.
    print("train_loader", len(train_loader))
    # Add progress bar
    loop = tqdm(train_loader)

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(loop):
        
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(**inputs, labels=labels)
        # Gain loss
        loss = outputs.loss
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data 
        running_loss += loss.item()
        avg_loss = running_loss / (i + 1)
    
        # Finally, it reports the average per-batch loss for the last 1000 batches, for comparison with a validation run
        if (tb_writer) is not None  and (i % 1000 == 999):
            last_loss = running_loss / 1000 # loss per batch
            print('batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

        # Update progress bar
        loop.set_description(f"Epoch [{epoch_index}]") #
        loop.set_postfix(train_loss=avg_loss, accuracy=torch.rand(1).item()) 
        

    return avg_loss


In [10]:
def train_model(train_path, max_len=1024, batch_size=16, epochs=10):
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
    best_vloss = 1_000_000.

    train_data, validation_data = load_dataset(train_path, train_size=0.9)
    train_dataset = CustomDataset(train_data, tokenizer, max_len=max_len)
    validation_dataset = CustomDataset(validation_data, tokenizer, max_len=max_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size)

    for epoch in range(1, epochs+1):
        print(f'EPOCH: {epoch}')
        # Make sure gradient tracking is on, and do a pass over the data
        model.train(True)
        avg_loss = train_one_epoch(train_loader, epoch, writer)

        widgets = ['Validation Progress: ', Percentage(), ' ', Bar('#'),' ', Timer(),
                ' ', ETA(), ' ', FileTransferSpeed()]
        pbar = ProgressBar(widgets=widgets, maxval=len(validation_loader)).start()
        # Validate the model. We don't need gradients on to do reporting
        with torch.no_grad():
            running_vloss = 0.0
            for batch, vdata in enumerate(validation_loader):
                vinputs, vlabels = vdata
                voutputs = model(**vinputs, labels=vlabels)
                vloss = voutputs.loss
                running_vloss += vloss
                pbar.update(batch + 1)
            avg_vloss = running_vloss / (batch + 1)
        print(f'LOSS train {avg_loss} valid {avg_vloss}')
        pbar.finish()
        
        # Log the running loss averaged per batch
        # for both training and validation
        writer.add_scalars('Training vs. Validation Loss',
                           { 'Training' : avg_loss, 'Validation' : avg_vloss },
                        epoch)
        writer.flush()

        # Track best performance, and save the model's state
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            model_path = f'test_model/model_{timestamp}_{best_vloss}.pt'
            torch.save(model.state_dict(), model_path)
        torch.cuda.empty_cache()



In [11]:
train_path = "../data/MultilabelSequenceClassification/toxic-comment-classification/train.csv.zip"

EPOCHS = 2
MAX_LEN = 1024
BATCH_SIZE = 8
train_model(train_path, max_len=MAX_LEN, batch_size=BATCH_SIZE, epochs=EPOCHS)

df:                                         comment_text               label
0  Explanation\nWhy the edits made under my usern...  [0, 0, 0, 0, 0, 0]
1  D'aww! He matches this background colour I'm s...  [0, 0, 0, 0, 0, 0]
2  Hey man, I'm really not trying to edit war. It...  [0, 0, 0, 0, 0, 0]
3  "\nMore\nI can't make any real suggestions on ...  [0, 0, 0, 0, 0, 0]
4  You, sir, are my hero. Any chance you remember...  [0, 0, 0, 0, 0, 0]
FULL Dataset: (1000, 2)
TRAIN Dataset: (900, 2)
VALIDATION Dataset: (100, 2)
EPOCH: 1
train_loader 113


Epoch [1]: 100%|██████████| 113/113 [3:18:43<00:00, 105.52s/it, accuracy=0.687, train_loss=0.221]   
Validation Progress: 100% |###| Elapsed Time: 0:06:16 Time: 0:06:16   0.03  B/s


LOSS train 0.22053214769184062 valid 0.23097021877765656
EPOCH: 2
train_loader 113


Epoch [2]: 100%|██████████| 113/113 [3:08:02<00:00, 99.84s/it, accuracy=0.0786, train_loss=0.129] 
Validation Progress: 100% |###| Elapsed Time: 0:06:03 Time: 0:06:03   0.04  B/s


LOSS train 0.12926062842649697 valid 0.1975124478340149


In [3]:
import pandas as pd

df =  pd.read_csv('../data/MultilabelSequenceClassification/chinese_dataset/train.zip')
df.head()

Unnamed: 0,label,content
0,组织关系-裁员,雀巢裁员4000人：时代抛弃你时，连招呼都不会打！
1,组织关系-裁员,美国“未来为”子公司大幅度裁员，这是为什么呢？任正非正式回应
2,组织关系-裁员,这一全球巨头“凉凉”“捅刀”华为后裁员5000现市值缩水800亿
3,组织关系-裁员,被证实将再裁员1800人AT&T在为落后的经营模式买单
4,组织关系-裁员,又一网约车巨头倒下：三个月裁员835名员工，滴滴又该何去何从


In [12]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
test_path = "../data/MultilabelSequenceClassification/toxic-comment-classification/test.csv.zip"
test_data = pd.read_csv(test_path)
test_dataset = CustomDataset(test_data, tokenizer, max_len=MAX_LEN)
print(f"TEST Dataset: {test_data.shape}")
test_loader = DataLoader(test_dataset, batch_size=16)  

TEST Dataset: (153164, 2)
