In [1]:
! pip install datasets
! pip install accelerate

Collecting accelerate
  Using cached accelerate-0.30.1-py3-none-any.whl (302 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cufft_cu12-1

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import jieba
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
from gensim.models import Word2Vec
from torch.nn.utils.rnn import pad_sequence
from gensim.corpora.dictionary import Dictionary
from sklearn.model_selection import train_test_split
import warnings
from sklearn.preprocessing import LabelEncoder
warnings.simplefilter('ignore')
from sklearn.metrics import accuracy_score

In [3]:
data_dir = 'https://mirror.coggle.club/dataset/coggle-competition/'
train_data = pd.read_csv(data_dir + 'intent-classify/train.csv', sep='\t', header=None)
test_data = pd.read_csv(data_dir + 'intent-classify/test.csv', sep='\t', header=None)
cn_stopwords = pd.read_csv('https://mirror.coggle.club/stopwords/baidu_stopwords.txt', header=None)[0].values

le = LabelEncoder()
train_data[1] = le.fit_transform(train_data[1])

train_data['text'] = train_data[0]
train_data['label'] = train_data[1]
train_data.drop(columns=[0, 1], inplace=True)

test_data['text'] = test_data[0]
test_data.drop(columns=[0], inplace=True)

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, AutoModelForMaskedLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold

In [19]:
class CFG:
    n_splits = 5
    seed = 42
    max_length = 20
    lr = 1e-5
    train_batch_size = 4
    eval_batch_size = 8
    train_epochs = 10
    weight_decay = 0.01
    warmup_ratio = 0.0
    num_labels = 12

In [6]:
class Tokenize(object):
    def __init__(self, train, valid, tokenizer):
        self.tokenizer = tokenizer
        self.train = train
        self.valid = valid

    def get_dataset(self, df):
        ds = Dataset.from_dict({
                'text': [ft for ft in df['text']],
                'label': [s for s in df['label']],
            })
        return ds

    def tokenize_function(self, example):
        tokenized_inputs = self.tokenizer(
            example['text'], truncation=True, max_length=CFG.max_length
        )
        return tokenized_inputs

    def __call__(self):
        train_ds = self.get_dataset(self.train)
        valid_ds = self.get_dataset(self.valid)

        tokenized_train = train_ds.map(
            self.tokenize_function, batched=True
        )
        tokenized_valid = valid_ds.map(
            self.tokenize_function, batched=True
        )

        return tokenized_train, tokenized_valid, self.tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

In [8]:
skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
for i, (_, val_index) in enumerate(skf.split(train_data, train_data["label"])):
    train_data.loc[val_index, "fold"] = i

In [20]:
training_args = TrainingArguments(
    output_dir=f'output_v{1}',
    fp16=True,
    learning_rate=CFG.lr,
    per_device_train_batch_size=CFG.train_batch_size,
    per_device_eval_batch_size=CFG.eval_batch_size,
    num_train_epochs=CFG.train_epochs,
    weight_decay=CFG.weight_decay,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to='none',
    warmup_ratio=CFG.warmup_ratio,
    lr_scheduler_type='linear', # "cosine" or "linear" or "constant"
    optim='adamw_torch',
    logging_first_step=True,
)

In [10]:
def compute_metrics(eval_pred):

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    results = {
        'accuracy': accuracy
    }
    return results

In [None]:
for fold in range(len(train_data['fold'].unique())):
        print(f"fold{fold + 1}:")

        train = train_data[train_data['fold'] != fold]
        valid = train_data[train_data['fold'] == fold].copy()
        tokenize = Tokenize(train, valid, tokenizer)
        tokenized_train, tokenized_valid, _ = tokenize()

        config = AutoConfig.from_pretrained("bert-base-chinese")
        config.num_labels = CFG.num_labels

        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
        model = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", config=config)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_valid,
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        y_true = valid['label'].values
        predictions = trainer.predict(tokenized_valid).predictions
        predictions = predictions.argmax(axis=1)
        accuracy = accuracy_score(y_true, predictions)
        print(f'Fold {fold + 1} Accuracy: {accuracy}')

fold1:


Map:   0%|          | 0/9680 [00:00<?, ? examples/s]

Map:   0%|          | 0/2420 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3417,0.280989,0.935124
2,0.2363,0.327618,0.937603
3,0.1849,0.339374,0.942562
4,0.0638,0.371817,0.944628
5,0.0449,0.39451,0.946281
6,0.0352,0.432436,0.944215
7,0.0115,0.471534,0.942562
8,0.0071,0.499431,0.941322
9,0.0052,0.478413,0.947934
10,0.0001,0.481619,0.946694


Fold 1 Accuracy: 0.9351239669421487
fold2:


Map:   0%|          | 0/9680 [00:00<?, ? examples/s]

Map:   0%|          | 0/2420 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2884,0.316379,0.923554
2,0.2436,0.384549,0.926446
3,0.136,0.349254,0.940496
4,0.1235,0.499708,0.929339
