In [3]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [4]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Load data
data_dir = 'https://mirror.coggle.club/dataset/coggle-competition/'
train_data = pd.read_csv(data_dir + 'intent-classify/train.csv', sep='\t', header=None)
test_data = pd.read_csv(data_dir + 'intent-classify/test.csv', sep='\t', header=None)

train_data[1], lbl = pd.factorize(train_data[1])

# Split data
x_train, x_test, train_label, test_label = train_test_split(
    train_data[0].values, train_data[1].values, test_size=0.2, stratify=train_data[1].values
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

# Encode data
train_encoding = tokenizer(list(x_train), truncation=True, padding=True, max_length=30)
test_encoding = tokenizer(list(x_test), truncation=True, padding=True, max_length=30)

# Dataset class
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = NewsDataset(train_encoding, train_label)
test_dataset = NewsDataset(test_encoding, test_label)

# Accuracy calculation
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Load model
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=12)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optim = AdamW(model.parameters(), lr=1e-5)

# Training function
def train(model, train_loader, epoch):
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        iter_num += 1
        if iter_num % 100 == 0:
            print(f"Epoch: {epoch}, Iteration: {iter_num}, Loss: {loss.item():.4f}, {iter_num/total_iter*100:.2f}%")
    print(f"Epoch: {epoch}, Average training loss: {total_train_loss/len(train_loader):.4f}")

# Validation function
def validation(model, val_dataloader):
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in val_dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            logits = outputs[1]
            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)
    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    print(f"Accuracy: {avg_val_accuracy:.4f}")
    print(f"Average testing loss: {total_eval_loss/len(val_dataloader):.4f}")
    print("-------------------------------")

# K-Fold Cross Validation
kf = KFold(n_splits=5)
fold = 0
for train_idx, val_idx in kf.split(train_data[0].values, train_data[1].values):
    print(f"Fold {fold}")
    train_text = train_data[0].iloc[train_idx]
    val_text = train_data[0].iloc[val_idx]
    train_label = train_data[1].iloc[train_idx].values
    val_label = train_data[1].iloc[val_idx].values

    train_encoding = tokenizer(list(train_text), truncation=True, padding=True, max_length=30)
    val_encoding = tokenizer(list(val_text), truncation=True, padding=True, max_length=30)

    train_dataset = NewsDataset(train_encoding, train_label)
    val_dataset = NewsDataset(val_encoding, val_label)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

    model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=12)
    model.to(device)

    optim = AdamW(model.parameters(), lr=1e-5)
    total_steps = len(train_loader) * 1
    scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(5):
        train(model, train_loader, epoch)
        validation(model, val_dataloader)

    torch.save(model.state_dict(), f'model_{fold}.pt')
    fold += 1

# Test data encoding
test_encoding = tokenizer(list(test_data[0]), truncation=True, padding=True, max_length=30)
test_dataset = NewsDataset(test_encoding, [0] * len(test_data))
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Prediction function
def prediction(model, test_dataloader):
    model.eval()
    pred = []
    for batch in test_dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs[1]
            logits = logits.detach().cpu().numpy()
            pred.append(logits)
    return np.vstack(pred)

# Model ensemble prediction
pred = np.zeros((len(test_data), 12))
for path in ['model_0.pt', 'model_1.pt', 'model_2.pt', 'model_3.pt', 'model_4.pt']:
    model.load_state_dict(torch.load(path))
    pred += prediction(model, test_dataloader)

# Save predictions
pd.DataFrame({
    'ID': range(1, len(test_data) + 1),
    'Target': [lbl[x] for x in pred.argmax(1)],
}).to_csv('nlp_submit.csv', index=None)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Iteration: 100, Loss: 0.9321, 16.53%
Epoch: 0, Iteration: 200, Loss: 0.4917, 33.06%
Epoch: 0, Iteration: 300, Loss: 0.2872, 49.59%
Epoch: 0, Iteration: 400, Loss: 0.2804, 66.12%
Epoch: 0, Iteration: 500, Loss: 0.1549, 82.64%
Epoch: 0, Iteration: 600, Loss: 0.6048, 99.17%
Epoch: 0, Average training loss: 0.6682
Accuracy: 0.9782
Average testing loss: 0.0983
-------------------------------
Epoch: 1, Iteration: 100, Loss: 0.0663, 16.53%
Epoch: 1, Iteration: 200, Loss: 0.2800, 33.06%
Epoch: 1, Iteration: 300, Loss: 0.2356, 49.59%
Epoch: 1, Iteration: 400, Loss: 0.0378, 66.12%
Epoch: 1, Iteration: 500, Loss: 0.1495, 82.64%
Epoch: 1, Iteration: 600, Loss: 0.0222, 99.17%
Epoch: 1, Average training loss: 0.2138
Accuracy: 0.9823
Average testing loss: 0.0736
-------------------------------
Epoch: 2, Iteration: 100, Loss: 0.0078, 16.53%
Epoch: 2, Iteration: 200, Loss: 0.2759, 33.06%
Epoch: 2, Iteration: 300, Loss: 0.0077, 49.59%
Epoch: 2, Iteration: 400, Loss: 0.0111, 66.12%
Epoch: 2, It

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Iteration: 100, Loss: 1.0920, 16.53%
Epoch: 0, Iteration: 200, Loss: 0.5357, 33.06%
Epoch: 0, Iteration: 300, Loss: 0.3096, 49.59%
Epoch: 0, Iteration: 400, Loss: 0.5786, 66.12%
Epoch: 0, Iteration: 500, Loss: 0.1322, 82.64%
Epoch: 0, Iteration: 600, Loss: 0.4417, 99.17%
Epoch: 0, Average training loss: 0.6247
Accuracy: 0.9741
Average testing loss: 0.1037
-------------------------------
Epoch: 1, Iteration: 100, Loss: 0.4638, 16.53%
Epoch: 1, Iteration: 200, Loss: 0.0842, 33.06%
Epoch: 1, Iteration: 300, Loss: 0.4485, 49.59%
Epoch: 1, Iteration: 400, Loss: 0.1434, 66.12%
Epoch: 1, Iteration: 500, Loss: 0.0967, 82.64%
Epoch: 1, Iteration: 600, Loss: 0.7682, 99.17%
Epoch: 1, Average training loss: 0.2027
Accuracy: 0.9770
Average testing loss: 0.0890
-------------------------------
Epoch: 2, Iteration: 100, Loss: 0.1822, 16.53%
Epoch: 2, Iteration: 200, Loss: 0.1379, 33.06%
Epoch: 2, Iteration: 300, Loss: 0.0321, 49.59%
Epoch: 2, Iteration: 400, Loss: 0.0104, 66.12%
Epoch: 2, It

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Iteration: 100, Loss: 1.2159, 16.53%
Epoch: 0, Iteration: 200, Loss: 0.5786, 33.06%
Epoch: 0, Iteration: 300, Loss: 0.8468, 49.59%
Epoch: 0, Iteration: 400, Loss: 0.4800, 66.12%
Epoch: 0, Iteration: 500, Loss: 0.3524, 82.64%
Epoch: 0, Iteration: 600, Loss: 0.5096, 99.17%
Epoch: 0, Average training loss: 0.6871
Accuracy: 0.9757
Average testing loss: 0.1115
-------------------------------
Epoch: 1, Iteration: 100, Loss: 0.4830, 16.53%
Epoch: 1, Iteration: 200, Loss: 0.1798, 33.06%
Epoch: 1, Iteration: 300, Loss: 0.4542, 49.59%
Epoch: 1, Iteration: 400, Loss: 0.5498, 66.12%
Epoch: 1, Iteration: 500, Loss: 0.1848, 82.64%
Epoch: 1, Iteration: 600, Loss: 0.0700, 99.17%
Epoch: 1, Average training loss: 0.2179
Accuracy: 0.9790
Average testing loss: 0.0879
-------------------------------
Epoch: 2, Iteration: 100, Loss: 0.0823, 16.53%
Epoch: 2, Iteration: 200, Loss: 0.0085, 33.06%
Epoch: 2, Iteration: 300, Loss: 0.0553, 49.59%
Epoch: 2, Iteration: 400, Loss: 0.0064, 66.12%
Epoch: 2, It

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Iteration: 100, Loss: 1.3313, 16.53%
Epoch: 0, Iteration: 200, Loss: 0.9005, 33.06%
Epoch: 0, Iteration: 300, Loss: 0.1865, 49.59%
Epoch: 0, Iteration: 400, Loss: 0.4677, 66.12%
Epoch: 0, Iteration: 500, Loss: 0.5219, 82.64%
Epoch: 0, Iteration: 600, Loss: 0.4911, 99.17%
Epoch: 0, Average training loss: 0.6324
Accuracy: 0.9400
Average testing loss: 0.2154
-------------------------------
Epoch: 1, Iteration: 100, Loss: 0.1297, 16.53%
Epoch: 1, Iteration: 200, Loss: 0.0392, 33.06%
Epoch: 1, Iteration: 300, Loss: 0.0543, 49.59%
Epoch: 1, Iteration: 400, Loss: 0.0194, 66.12%
Epoch: 1, Iteration: 500, Loss: 0.0228, 82.64%
Epoch: 1, Iteration: 600, Loss: 0.0723, 99.17%
Epoch: 1, Average training loss: 0.1890
Accuracy: 0.9531
Average testing loss: 0.1839
-------------------------------
Epoch: 2, Iteration: 100, Loss: 0.0150, 16.53%
Epoch: 2, Iteration: 200, Loss: 1.0649, 33.06%
Epoch: 2, Iteration: 300, Loss: 0.0058, 49.59%
Epoch: 2, Iteration: 400, Loss: 0.0123, 66.12%
Epoch: 2, It

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Iteration: 100, Loss: 1.0555, 16.53%
Epoch: 0, Iteration: 200, Loss: 0.5458, 33.06%
Epoch: 0, Iteration: 300, Loss: 0.0874, 49.59%
Epoch: 0, Iteration: 400, Loss: 0.1243, 66.12%
Epoch: 0, Iteration: 500, Loss: 0.0220, 82.64%
Epoch: 0, Iteration: 600, Loss: 0.2244, 99.17%
Epoch: 0, Average training loss: 0.4817
Accuracy: 0.7471
Average testing loss: 0.9137
-------------------------------
Epoch: 1, Iteration: 100, Loss: 0.0834, 16.53%
Epoch: 1, Iteration: 200, Loss: 0.0099, 33.06%
Epoch: 1, Iteration: 300, Loss: 0.7936, 49.59%
Epoch: 1, Iteration: 400, Loss: 0.0073, 66.12%
Epoch: 1, Iteration: 500, Loss: 0.0077, 82.64%
Epoch: 1, Iteration: 600, Loss: 0.0050, 99.17%
Epoch: 1, Average training loss: 0.1000
Accuracy: 0.8010
Average testing loss: 0.7950
-------------------------------
Epoch: 2, Iteration: 100, Loss: 0.4277, 16.53%
Epoch: 2, Iteration: 200, Loss: 0.0033, 33.06%
Epoch: 2, Iteration: 300, Loss: 0.0041, 49.59%
Epoch: 2, Iteration: 400, Loss: 0.1232, 66.12%
Epoch: 2, It

In [5]:
from sklearn.metrics import classification_report

def generate_classification_report(model, val_dataloader, device):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = logits.argmax(dim=1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    # 生成分类报告
    lbl = ['Label_' + str(i) for i in range(12)]  # 替换成实际的标签名
    report = classification_report(all_labels, all_preds, target_names=lbl)
    print(report)

# 加载模型和验证集数据
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=12)
model.load_state_dict(torch.load('model_0.pt'))  # 替换成实际的模型路径
model.to(device)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)  # 替换成实际的验证集数据加载器

# 生成分类报告
generate_classification_report(model, val_dataloader, device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


              precision    recall  f1-score   support

     Label_0       1.00      0.99      1.00       177
     Label_1       0.96      0.99      0.98       237
     Label_2       0.99      0.99      0.99       297
     Label_3       0.93      1.00      0.96       265
     Label_4       1.00      0.94      0.97       229
     Label_5       1.00      1.00      1.00       174
     Label_6       1.00      1.00      1.00       185
     Label_7       1.00      1.00      1.00       206
     Label_8       1.00      1.00      1.00       178
     Label_9       0.93      0.97      0.95       153
    Label_10       0.96      0.90      0.93       140
    Label_11       0.99      0.92      0.95       179

    accuracy                           0.98      2420
   macro avg       0.98      0.98      0.98      2420
weighted avg       0.98      0.98      0.98      2420

