In [2]:
import warnings
warnings.filterwarnings("ignore")

import jieba
import time
import pandas as pd
import numpy as np
import os
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
import wandb
from argparse import Namespace

def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def load_data(train_file, test_file):
    train_data = pd.read_csv(train_file, sep='\t', header=None)
    test_data = pd.read_csv(test_file, sep='\t', header=None)
    train_data[1], lbl = pd.factorize(train_data[1])
    return train_data, test_data, lbl

def coustom_data_iter(texts, labels):
    for x, y in zip(texts, labels):
        yield x, y

def build_vocab(data_iter, tokenizer):
    def yield_tokens(data_iter):
        for text, _ in data_iter:
            yield tokenizer(text)
    vocab = build_vocab_from_iterator(yield_tokens(data_iter), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    return vocab

def text_pipeline(text, vocab, tokenizer):
    return vocab(tokenizer(text))

def collate_batch(batch, vocab, tokenizer, device, max_len=40):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text, vocab, tokenizer), dtype=torch.int64)
        processed_text = F.pad(processed_text, pad=(0, max_len - len(processed_text)), mode='constant', value=0)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = pad_sequence(text_list, batch_first=True)
    return label_list.to(device), text_list.to(device)

class textCNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, kernel_wins, num_class=12, dropout_rate=0):
        super(textCNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, emb_dim, (w, emb_dim)) for w in kernel_wins])
        self.dropout = nn.Dropout(dropout_rate)  # 添加Dropout层
        self.fc = nn.Linear(len(kernel_wins)*emb_dim, num_class)

    def forward(self, x):
        emb_x = self.embed(x)
        emb_x = emb_x.unsqueeze(1)
        con_x = [conv(emb_x) for conv in self.convs]
        pool_x = [F.max_pool1d(x.squeeze(-1), x.size()[2]) for x in con_x]
        fc_x = torch.cat(pool_x, dim=1)
        fc_x = fc_x.squeeze(-1)
        fc_x = self.dropout(fc_x)  # 在全连接层之前应用Dropout
        logit = self.fc(fc_x)
        return logit


def train(dataloader, model, criterion, optimizer, device):
    model.train()
    total_acc, total_loss, total_count = 0, 0, 0
    for label, text in dataloader:
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_loss += loss.item()
        total_count += label.size(0)
    return total_acc / total_count, total_loss / total_count

def evaluate(dataloader, model, criterion, device):
    model.eval()
    total_acc, total_loss, total_count = 0, 0, 0
    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_loss += loss.item()
            total_count += label.size(0)
    return total_acc / total_count, total_loss / total_count

def predict(dataloader, model, device, lbl):
    model.eval()
    test_pred = []
    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text).argmax(1)
            test_pred += list(predicted_label.cpu().numpy())
    return [lbl[x] for x in test_pred]

def run(args):
    # Initialize wandb
    wandb.init(project=args.project_name, config=args)

    # Load data
    train_data, test_data, lbl = load_data(args.train_file, args.test_file)

    # Tokenizer
    tokenizer = jieba.lcut

    # Build vocabulary
    train_iter = coustom_data_iter(train_data[0].values[:], train_data[1].values[:])
    vocab = build_vocab(train_iter, tokenizer)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data preparation
    train_iter = coustom_data_iter(train_data[0].values[:], train_data[1].values[:])
    train_dataset = to_map_style_dataset(train_iter)

    num_train = int(len(train_dataset) * args.split)
    split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

    train_dataloader = DataLoader(split_train_, batch_size=args.batch_size, shuffle=True, collate_fn=lambda batch: collate_batch(batch, vocab, tokenizer, device))
    valid_dataloader = DataLoader(split_valid_, batch_size=args.batch_size, shuffle=True, collate_fn=lambda batch: collate_batch(batch, vocab, tokenizer, device))

    # Model initialization
    num_class = len(lbl)
    vocab_size = len(vocab)
    model = textCNN(vocab_size, args.embedding_dim, args.kernel_wins, num_class).to(device)

    # Hyperparameters
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
    best_accu = 0

    # Training and evaluation
    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train_acc, train_loss = train(train_dataloader, model, criterion, optimizer, device)
        valid_acc, valid_loss = evaluate(valid_dataloader, model, criterion, device)

        wandb.log({
            "epoch": epoch,
            "train_accuracy": train_acc,
            "train_loss": train_loss,
            "valid_accuracy": valid_acc,
            "valid_loss": valid_loss,
        })

        if valid_acc > best_accu:
            best_accu = valid_acc
            torch.save(model.state_dict(), 'TextCNN.pth')

        print(f'| end of epoch {epoch:3d} | time: {time.time() - epoch_start_time:5.2f}s | '
              f'train accuracy {train_acc:8.3f} | valid accuracy {valid_acc:8.3f} | valid loss {valid_loss:8.3f}')

        scheduler.step()

    # Testing
    test_iter = coustom_data_iter(test_data[0].values[:], [0] * len(test_data))
    test_dataset = to_map_style_dataset(test_iter)
    test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=lambda batch: collate_batch(batch, vocab, tokenizer, device))

    test_pred = predict(test_dataloader, model, device, lbl)

    pd.DataFrame({
        'ID': range(1, len(test_pred) + 1),
        'Target': test_pred,
    }).to_csv('textcnn.csv', index=None)
    wandb.finish()


seed_everything(42)
# Define parameters
args = Namespace(
    train_file='https://mirror.coggle.club/dataset/coggle-competition/intent-classify/train.csv',
    test_file='https://mirror.coggle.club/dataset/coggle-competition/intent-classify/test.csv',
    embedding_dim=100,
    hidden_dim=64,
    kernel_wins=[3, 4, 5],
    batch_size=32,
    epochs=30,
    lr=0.0001,
    step_size=5,
    gamma=0.5,
    split=0.8,
    project_name='intent-recognition'
)

run(args)

VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
train_accuracy,▁▅▇▇██████████████
train_loss,█▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_accuracy,▁▅▆▇██████████████
valid_loss,█▃▂▁▁▁▂▂▂▃▃▃▃▃▃▃▃▄

0,1
epoch,18.0
train_accuracy,0.9999
train_loss,1e-05
valid_accuracy,0.88388
valid_loss,0.01711


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112587257391877, max=1.0…

| end of epoch   1 | time:  2.11s | train accuracy    0.497 | valid accuracy    0.648 | valid loss    0.049
| end of epoch   2 | time:  2.17s | train accuracy    0.719 | valid accuracy    0.728 | valid loss    0.035
| end of epoch   3 | time:  2.16s | train accuracy    0.780 | valid accuracy    0.762 | valid loss    0.028
| end of epoch   4 | time:  2.25s | train accuracy    0.818 | valid accuracy    0.787 | valid loss    0.024
| end of epoch   5 | time:  2.13s | train accuracy    0.849 | valid accuracy    0.803 | valid loss    0.022
| end of epoch   6 | time:  2.24s | train accuracy    0.873 | valid accuracy    0.812 | valid loss    0.021
| end of epoch   7 | time:  2.22s | train accuracy    0.884 | valid accuracy    0.821 | valid loss    0.020
| end of epoch   8 | time:  2.37s | train accuracy    0.893 | valid accuracy    0.826 | valid loss    0.019
| end of epoch   9 | time:  2.46s | train accuracy    0.903 | valid accuracy    0.832 | valid loss    0.019
| end of epoch  10 | time:  

VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_accuracy,▁▄▅▆▆▇▇▇▇▇▇███████████████████
train_loss,█▅▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_accuracy,▁▄▅▆▆▇▇▇██████████████████████
valid_loss,█▅▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,30.0
train_accuracy,0.95651
train_loss,0.00728
valid_accuracy,0.84587
valid_loss,0.0163


In [3]:
import torch
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader

def generate_classification_report(model_path, valid_dataloader, model, lbl, device):
    # 加载模型
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for label, text in valid_dataloader:
            predicted_label = model(text).argmax(1)
            all_labels.extend(label.cpu().numpy())
            all_preds.extend(predicted_label.cpu().numpy())

    # 生成分类报告
    report = classification_report(all_labels, all_preds, target_names=lbl)
    print(report)

# 设置参数
model_path = 'TextCNN.pth'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载数据
train_data, _, lbl = load_data(args.train_file, args.test_file)
tokenizer = jieba.lcut
vocab = build_vocab(coustom_data_iter(train_data[0].values[:], train_data[1].values[:]), tokenizer)

# 数据准备
train_iter = coustom_data_iter(train_data[0].values[:], train_data[1].values[:])
train_dataset = to_map_style_dataset(train_iter)
num_train = int(len(train_dataset) * args.split)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

valid_dataloader = DataLoader(split_valid_, batch_size=args.batch_size, shuffle=True, collate_fn=lambda batch: collate_batch(batch, vocab, tokenizer, device))

# 定义模型
num_class = len(lbl)
vocab_size = len(vocab)
model = textCNN(vocab_size, args.embedding_dim, args.kernel_wins, num_class).to(device)

# 生成分类报告
generate_classification_report(model_path, valid_dataloader, model, lbl, device)

                       precision    recall  f1-score   support

         Travel-Query       0.95      0.99      0.97       246
           Music-Play       0.93      0.91      0.92       256
        FilmTele-Play       0.86      0.93      0.90       261
           Video-Play       0.93      0.95      0.94       264
         Radio-Listen       0.92      0.95      0.94       244
HomeAppliance-Control       0.96      0.97      0.96       245
        Weather-Query       0.93      0.95      0.94       248
         Alarm-Update       0.99      0.98      0.98       288
       Calendar-Query       0.98      0.99      0.98       217
       TVProgram-Play       0.88      0.60      0.71        62
           Audio-Play       1.00      0.61      0.76        49
                Other       0.79      0.55      0.65        40

             accuracy                           0.93      2420
            macro avg       0.93      0.86      0.89      2420
         weighted avg       0.93      0.93      0.93 