In [None]:
# Mount Google Drive inside the Colab runtime so that files stored under
# /content/gdrive/ can be accessed by the cells below.
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
import os
# Switch the working directory to the project folder on Drive: the relative
# paths used later in the notebook ('../data/...', '../../models/...', 'runs/...')
# are resolved against this directory.
os.chdir('/content/gdrive/MyDrive/Colab Notebooks/sequence_classification/longformer_zh')

In [None]:
# List the contents of the current working directory. A bare `ls` is not
# Python; it relies on IPython resolving it as a shell shortcut.
ls

classification.py  longformer_suquence_classification.ipynb  runs/
__init__.py        predict.py                                test.ipynb
longformer/        README.md
longformer.ipynb   requirements.txt


## 导入环境

In [None]:
! pip install numpy pandas torch transformers progressbar tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from pathlib import Path
from datetime import datetime
import pandas as pd
import json
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from progressbar import ProgressBar, Percentage, Bar, Timer, ETA, FileTransferSpeed
from transformers import BertTokenizer
from longformer.longformer import LongformerConfig, LongformerForSequenceClassification

## 构建数据集

### 数据清洗

In [None]:
# Read the raw training table, blank out missing values, and turn the
# '|'-separated label string of every row into a list of label names.
fp = '../data/MultilabelSequenceClassification/chinese_dataset/train.zip'
train_df = (
    pd.read_csv(fp)
    .fillna(value='')
    .assign(label=lambda frame: frame['label'].str.split('|'))
)
# Show a few random rows as a sanity check.
train_df.sample(5)

### 标签id

In [None]:

def id2label(label, label_path="../data/MultilabelSequenceClassification/chinese_dataset/label.json"):
    """Build the label vocabulary and save it as JSON.

    Args:
        label: pandas Series whose cells are lists of label names.
        label_path: file that receives the JSON dump of the id -> label
            mapping. Defaults to the dataset's ``label.json``.

    Returns:
        Tuple ``(id_to_label, label_to_id)``: a dict mapping consecutive
        integer ids (in order of first appearance) to label names, and its
        inverse.
    """
    # One row per individual label, keeping only the first occurrence of each.
    unique_labels = label.explode().drop_duplicates()
    id_to_label = dict(enumerate(unique_labels))
    label_to_id = {name: idx for idx, name in id_to_label.items()}

    # ensure_ascii=False keeps non-ASCII label names readable in the file.
    with open(label_path, "w", encoding="utf-8") as f:
        json.dump(id_to_label, f, ensure_ascii=False, indent=2)
    return id_to_label, label_to_id
    
# Build both mappings from the training labels.
# NOTE: this rebinds the name `id2label` from the function to the returned
# dict, so the function cannot be called again until its definition is re-run.
id2label, label2id = id2label(train_df.label)
# Number of distinct labels found in the training data.
len(id2label)

In [None]:
def label_ids(label):
    """Encode one sample's labels as a multi-hot list.

    Args:
        label: iterable of label names for a single sample.

    Returns:
        A list with one slot per entry of the global ``id2label``; slots whose
        label occurs in ``label`` are 1, the rest 0. Names that are missing
        from the global ``label2id`` are ignored.
    """
    multi_hot = [0] * len(id2label)
    for position in map(label2id.get, label):
        if position is None:
            # Unknown label name: nothing to mark.
            continue
        multi_hot[position] = 1
    return multi_hot

# Attach a multi-hot vector to every row, computed from its list of labels.
train_df['label_ids'] = train_df.label.apply(label_ids)
# Persist the processed table; this is the file the training cells read back.
# NOTE(review): the DataFrame index is written as well (no index=False) —
# confirm that this extra column is intended.
train_df.to_csv('../data/MultilabelSequenceClassification/chinese_dataset/train_dataset.zip')
train_df.head()

## 模型训练

### 加载预训练模型

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Uncomment to force CPU execution:
# device = torch.device('cpu')

In [None]:
# model_path = Path("../../models/distilbert-base-uncased")
# tokenizer = BertTokenizer.from_pretrained(model_path)
# config = LongformerConfig.from_pretrained(model_path)
# config.problem_type = "multi_label_classification"
# config.num_labels = 6
# model = BertForSequenceClassification.from_pretrained(model_path, config=config)
# model.to(device)

In [None]:
# Local directory the tokenizer, config.json and model weights are loaded from.
model_path = Path("../../models/longformer_zh")
tokenizer = BertTokenizer.from_pretrained(model_path)
# config = LongformerConfig.from_pretrained(model_path)
config = LongformerConfig.from_json_file(model_path /"config.json")
# Choose the attention mode: 'n2', 'tvm' or 'sliding_chunks'
# 'n2': regular n^2 attention
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention
config.attention_mode = 'sliding_chunks'
# For single-label classification use this instead:
# config.problem_type = "single_label_classification"
config.problem_type = "multi_label_classification"
# NOTE(review): hard-coded label count; presumably this has to match the size
# of the label vocabulary (len(id2label)) — confirm.
config.num_labels = 65
model = LongformerForSequenceClassification.from_pretrained(model_path, config=config)
# Move the model to the selected device (as the last expression of the cell,
# this also displays the model).
model.to(device)

### 加载数据集

In [None]:
import pandas as pd 
from ast import literal_eval

def load_dataset(train_path, train_size=0.9, max_rows=1000, random_state=200):
    """Load the processed training table and split it into train/validation.

    Args:
        train_path: CSV (optionally compressed) with at least the columns
            'content' and 'label_ids'; 'label_ids' holds the string form of a
            Python list.
        train_size: fraction of the rows that goes into the training split.
        max_rows: only the first ``max_rows`` rows of the file are used. The
            default (1000) keeps the previous behaviour; pass ``None`` to use
            the whole file.
        random_state: seed for the random train/validation split.

    Returns:
        Tuple ``(train_data, val_data)`` of DataFrames with the columns
        'content' and 'label' (a Python list per row), each with a fresh
        0..n-1 index.
    """
    df = pd.read_csv(train_path)
    if max_rows is not None:
        df = df.iloc[:max_rows]

    # Keep only what training needs and parse the stringified label lists.
    new_df = (
        df[['content', 'label_ids']]
        .rename(columns={'label_ids': 'label'})
        .assign(label=lambda frame: frame['label'].apply(literal_eval))
    )
    print(f'df: {new_df.head()}')

    # Random split: sampled rows form the training set, the rest is validation.
    sampled = new_df.sample(frac=train_size, random_state=random_state)
    val_data = new_df.drop(sampled.index).reset_index(drop=True)
    train_data = sampled.reset_index(drop=True)

    print(f"FULL Dataset: {new_df.shape}")
    print(f"TRAIN Dataset: {train_data.shape}")
    print(f"VALIDATION Dataset: {val_data.shape}")

    return train_data, val_data

# train_path = "../data/MultilabelSequenceClassification/chinese_dataset/train_dataset.zip"
# _train_data, _validation_data = load_dataset(train_path=train_path)

In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes the 'content' column of a DataFrame.

    Each item is a pair ``(inputs, targets)``: ``inputs`` is a dict with the
    'input_ids', 'attention_mask' and 'token_type_ids' tensors of one sample,
    ``targets`` is a float tensor built from the row's 'label' value (an empty
    tensor when the frame has no 'label' column).

    NOTE: tensors are moved to the module-level ``device`` inside
    ``__getitem__``; presumably this requires the DataLoader to run without
    worker processes when that device is CUDA — confirm before raising
    ``num_workers``.
    """

    def __init__(self, dataframe, tokenizer, max_len=config.max_position_embeddings):
        """
        Args:
            dataframe: DataFrame with a 'content' column and, optionally, a
                'label' column.
            tokenizer: callable tokenizer; its result must support ``.to(device)``
                and expose the three keys listed in the class docstring.
            max_len: length every sample is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['content']
        self.targets = dataframe['label'] if "label" in dataframe.columns else None
        self.max_len = max_len

    def __len__(self):
        """Number of samples."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize and return the sample at position ``index``."""
        # Positional lookup: DataLoader indices run 0..len-1, which only matches
        # the DataFrame's own index labels when that index was reset.
        raw_text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(raw_text.split())

        inputs = self.tokenizer(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_tensors='pt',
        ).to(device)
        # The tokenizer returns a batch of size one; strip that batch dimension.
        inputs_single = {
            key: inputs[key][0]
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }

        if self.targets is not None:
            targets = torch.tensor(self.targets.iloc[index], dtype=torch.float, device=device)
        else:
            # No labels available: return an empty placeholder on the same device.
            targets = torch.tensor([], device=device)
        return inputs_single, targets

# train_dataset = CustomDataset(_train_data, tokenizer)

### 测试模型

In [None]:
def test_model(train_path, batch_size=16):
    """Smoke-test the data pipeline and the model on validation data.

    Loads the dataset, prints up to the first two validation batches and runs
    one forward pass (with labels) on the last batch printed, then prints the
    model output.

    Args:
        train_path: path handed to ``load_dataset``.
        batch_size: batch size of the validation loader.

    Raises:
        ValueError: if the validation split yields no batch.
    """
    _, validation_data = load_dataset(train_path)
    validation_dataset = CustomDataset(validation_data, tokenizer)
    val_loader = DataLoader(validation_dataset, batch_size=batch_size)

    inputs = labels = None
    for batch, data in enumerate(val_loader):
        print(batch, data)
        inputs, labels = data
        if batch == 1:
            break
    if inputs is None:
        raise ValueError("validation split is empty; nothing to feed the model")

    # Only a forward pass is needed here, so skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs, labels=labels)
    print(outputs)

# test_model(train_path="../data/MultilabelSequenceClassification/chinese_dataset/train_dataset.zip")

In [None]:
# break

### 训练模型

In [None]:
# The Training Loop    
def train_one_epoch(train_loader, epoch_index, tb_writer=None, optimizer=None, log_every=1000):
    """Run one training pass of the global ``model`` over ``train_loader``.

    Args:
        train_loader: iterable of ``(inputs, labels)`` batches, where ``inputs``
            is a dict of keyword arguments for the model.
        epoch_index: epoch number; used for the progress-bar caption and for
            the TensorBoard step.
        tb_writer: optional TensorBoard writer; when given, the mean loss of
            every ``log_every`` batches is logged under 'Loss/train'.
        optimizer: optimizer to use. When ``None`` a new Adam optimizer
            (lr=1e-5) is created for this call, which means its internal state
            starts fresh; pass a persistent optimizer to keep that state
            across epochs.
        log_every: number of batches per TensorBoard report.

    Returns:
        Mean loss per batch over the whole epoch (0.0 for an empty loader).
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

    epoch_loss = 0.   # sum of batch losses over the whole epoch
    window_loss = 0.  # sum of batch losses since the last TensorBoard report
    avg_loss = 0.
    print("train_loader", len(train_loader))
    # Progress bar over the batches.
    loop = tqdm(train_loader)

    for i, data in enumerate(loop):
        # Every batch is an (inputs, labels) pair.
        inputs, labels = data

        # Gradients accumulate by default, so clear them for every batch.
        optimizer.zero_grad()

        # Forward pass; the model computes the loss itself when given labels.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Update the weights.
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        window_loss += batch_loss
        # The epoch average uses its own accumulator so that resetting the
        # reporting window below cannot distort it.
        avg_loss = epoch_loss / (i + 1)

        # Report the mean loss of the last `log_every` batches.
        if tb_writer is not None and (i % log_every == log_every - 1):
            last_loss = window_loss / log_every
            print('batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            window_loss = 0.

        # Update the progress bar (only metrics that are actually computed).
        loop.set_description(f"Epoch [{epoch_index}]")
        loop.set_postfix(train_loss=avg_loss)

    return avg_loss


In [None]:
def train_model(train_path, max_len=1024, batch_size=16, epochs=10):
    """Train the global ``model`` and checkpoint the best validation epoch.

    For every epoch: one training pass (``train_one_epoch``), one validation
    pass without gradients, TensorBoard logging of both mean losses, and a
    checkpoint of the model's state dict whenever the validation loss improves.

    NOTE: ``train_one_epoch`` is called without an optimizer, so by default it
    creates a new one on every call and optimizer state is not carried over
    between epochs.

    Args:
        train_path: path handed to ``load_dataset``.
        max_len: token length every sample is padded/truncated to.
        batch_size: batch size of both data loaders.
        epochs: number of epochs to run.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
    best_vloss = 1_000_000.

    # torch.save does not create missing directories.
    checkpoint_dir = Path('test_model')
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    train_data, validation_data = load_dataset(train_path, train_size=0.9)
    train_dataset = CustomDataset(train_data, tokenizer, max_len=max_len)
    validation_dataset = CustomDataset(validation_data, tokenizer, max_len=max_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size)

    try:
        for epoch in range(1, epochs + 1):
            print(f'EPOCH: {epoch}')
            # Training mode (gradient tracking, dropout active) for the pass over the data.
            model.train(True)
            avg_loss = train_one_epoch(train_loader, epoch, writer)

            widgets = ['Validation Progress: ', Percentage(), ' ', Bar('#'), ' ', Timer(),
                       ' ', ETA(), ' ', FileTransferSpeed()]
            pbar = ProgressBar(widgets=widgets, maxval=len(validation_loader)).start()

            # Evaluation mode so that layers such as dropout behave deterministically;
            # model.train(True) above switches back at the start of the next epoch.
            model.eval()
            running_vloss = 0.0
            n_batches = 0
            with torch.no_grad():
                for batch, vdata in enumerate(validation_loader):
                    vinputs, vlabels = vdata
                    voutputs = model(**vinputs, labels=vlabels)
                    # .item() keeps the accumulator a plain float instead of a tensor.
                    running_vloss += voutputs.loss.item()
                    n_batches = batch + 1
                    pbar.update(n_batches)
            # Without validation batches there is no meaningful loss; NaN never
            # compares as an improvement, so no checkpoint is written.
            avg_vloss = running_vloss / n_batches if n_batches else float('nan')
            print(f'LOSS train {avg_loss} valid {avg_vloss}')
            pbar.finish()

            # Log the per-batch mean loss of both phases.
            writer.add_scalars('Training vs. Validation Loss',
                               {'Training': avg_loss, 'Validation': avg_vloss},
                               epoch)
            writer.flush()

            # Keep the weights of the best epoch so far.
            if avg_vloss < best_vloss:
                best_vloss = avg_vloss
                checkpoint_path = checkpoint_dir / f'model_{timestamp}_{best_vloss}.pt'
                torch.save(model.state_dict(), str(checkpoint_path))
            torch.cuda.empty_cache()
    finally:
        # Release the event file even when training is interrupted.
        writer.close()



In [None]:
# Processed training table written by the label-encoding cell earlier in the notebook.
train_path = "../data/MultilabelSequenceClassification/chinese_dataset/train_dataset.zip"

# Settings for this training run.
EPOCHS = 2
MAX_LEN = 1024
BATCH_SIZE = 4
train_model(train_path, max_len=MAX_LEN, batch_size=BATCH_SIZE, epochs=EPOCHS)

In [None]:
import pandas as pd

# Reload the raw training table and the label mapping for inspection.
df =  pd.read_csv('../data/MultilabelSequenceClassification/chinese_dataset/train.zip')
# Not the last expression of the cell, so this preview is not displayed.
df.head()
# NOTE(review): this reads 'label.zip', while the label-building function in
# this notebook writes 'label.json' — confirm which file is meant.
df_label = pd.read_json('../data/MultilabelSequenceClassification/chinese_dataset/label.zip', orient='index')
df_label.head()

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError; presumably it
# was left here on purpose to stop "Run All" before the cell below — confirm.
break

In [None]:
# Build a DataLoader over the test set.
# NOTE(review): CustomDataset reads a 'content' column; confirm that this CSV
# (from a different dataset folder than the training data) provides one.
test_path = "../data/MultilabelSequenceClassification/toxic-comment-classification/test.csv.zip"
test_data = pd.read_csv(test_path)
# MAX_LEN comes from the training configuration cell.
test_dataset = CustomDataset(test_data, tokenizer, max_len=MAX_LEN)
print(f"TEST Dataset: {test_data.shape}")
test_loader = DataLoader(test_dataset, batch_size=16)  