In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import jieba
import time
import pandas as pd
import numpy as np
import wandb
import argparse
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def load_data(train_file, test_file):
    train_data = pd.read_csv(train_file, sep='\t', header=None)
    test_data = pd.read_csv(test_file, sep='\t', header=None)
    train_data[1], lbl = pd.factorize(train_data[1])
    return train_data, test_data, lbl

def custom_data_iter(texts, labels):
    for x, y in zip(texts, labels):
        yield x, y

def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield jieba.lcut(text)

def build_vocab(data_iter):
    vocab = build_vocab_from_iterator(yield_tokens(data_iter), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    return vocab

def text_pipeline(x, vocab): 
    return vocab(jieba.lcut(x))

def collate_batch(batch, vocab, device):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text, vocab), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))

    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

class FastText(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(FastText, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

def train(dataloader, model, criterion, optimizer, device):
    model.train()
    total_acc, total_count = 0, 0

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)

    return total_acc / total_count, loss.item()

def evaluate(dataloader, model, criterion, device):
    model.eval()
    total_acc, total_count = 0, 0
    total_loss = 0.0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()
    return total_acc / total_count, total_loss / len(dataloader)

def predict(dataloader, model, device):
    model.eval()
    test_pred = []
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets).argmax(1)
            test_pred += list(predicted_label.cpu().numpy())
    return test_pred

def run(args):
    # Initialize wandb
    wandb.init(project=args.project_name, config=args)

    # Load data
    train_data, test_data, lbl = load_data(args.train_file, args.test_file)

    # Build vocabulary
    train_iter = custom_data_iter(train_data[0].values[:], train_data[1].values[:])
    vocab = build_vocab(train_iter)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data preparation
    train_iter = custom_data_iter(train_data[0].values[:], train_data[1].values[:])
    train_dataset = to_map_style_dataset(train_iter)

    num_train = int(len(train_dataset) * args.split)
    split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

    train_dataloader = DataLoader(split_train_, batch_size=args.batch_size, shuffle=True, collate_fn=lambda batch: collate_batch(batch, vocab, device))
    valid_dataloader = DataLoader(split_valid_, batch_size=args.batch_size, shuffle=True, collate_fn=lambda batch: collate_batch(batch, vocab, device))

    num_class = len(lbl)
    vocab_size = len(vocab)
    model = FastText(vocab_size, args.embedding_size, num_class).to(device)

    # Hyperparameters
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=args.gamma)
    total_accu = None
    best_accu = 0

    # Log hyperparameters with wandb
    wandb.config.update({
        "epochs": args.epochs,
        "learning_rate": args.learning_rate,
        "batch_size": args.batch_size,
        "embedding_size": args.embedding_size
    })

    # Training loop
    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train_acc, train_loss = train(train_dataloader, model, criterion, optimizer, device)
        accu_val, val_loss = evaluate(valid_dataloader, model, criterion, device)

        # Log metrics with wandb
        wandb.log({
            "epoch": epoch,
            "train_accuracy": train_acc,
            "train_loss": train_loss,
            "valid_accuracy": accu_val,
            "valid_loss": val_loss
        })

        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val

        if accu_val > best_accu:
            best_accu = accu_val
            torch.save(model.state_dict(), args.model_path)
            wandb.save(args.model_path)

        print('| end of epoch {:3d} | time: {:5.2f}s | valid accuracy {:8.3f} | valid loss {:8.3f}'.format(epoch, time.time() - epoch_start_time, accu_val, val_loss))

    # Test set predictions
    test_iter = custom_data_iter(test_data[0].values[:], [0] * len(test_data))
    test_dataset = to_map_style_dataset(test_iter)
    test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=lambda batch: collate_batch(batch, vocab, device))

    test_pred = predict(test_dataloader, model, device)
    test_pred = [lbl[x] for x in test_pred]

    pd.DataFrame({
        'ID': range(1, len(test_pred) + 1),
        'Target': test_pred,
    }).to_csv('fasttext.csv', index=None)

    # Mark the run as finished
    wandb.finish()

# Create a Namespace object for Jupyter Notebook
seed_everything(42)
args = argparse.Namespace(
    train_file='https://mirror.coggle.club/dataset/coggle-competition/intent-classify/train.csv',
    test_file='https://mirror.coggle.club/dataset/coggle-competition/intent-classify/test.csv',
    batch_size=32,
    split=0.8,
    embedding_size=100,
    epochs=30,
    learning_rate=1,
    gamma=0.5,
    model_path='FastText.pth',
    project_name='intent-recognition',
)

# Run the training and evaluation process
run(args)

[34m[1mwandb[0m: Currently logged in as: [33mmoguw[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112448122973243, max=1.0…

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.759 seconds.
Prefix dict has been built successfully.


| end of epoch   1 | time:  1.95s | valid accuracy    0.655 | valid loss    1.514
| end of epoch   2 | time:  1.68s | valid accuracy    0.760 | valid loss    1.026
| end of epoch   3 | time:  1.56s | valid accuracy    0.802 | valid loss    0.805
| end of epoch   4 | time:  1.60s | valid accuracy    0.833 | valid loss    0.682
| end of epoch   5 | time:  1.57s | valid accuracy    0.842 | valid loss    0.602
| end of epoch   6 | time:  1.74s | valid accuracy    0.857 | valid loss    0.546
| end of epoch   7 | time:  1.56s | valid accuracy    0.864 | valid loss    0.507
| end of epoch   8 | time:  1.49s | valid accuracy    0.868 | valid loss    0.480
| end of epoch   9 | time:  1.65s | valid accuracy    0.874 | valid loss    0.455
| end of epoch  10 | time:  1.56s | valid accuracy    0.879 | valid loss    0.432
| end of epoch  11 | time:  1.63s | valid accuracy    0.882 | valid loss    0.418
| end of epoch  12 | time:  1.65s | valid accuracy    0.886 | valid loss    0.406
| end of epoch  

VBox(children=(Label(value='4.270 MB of 4.270 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_accuracy,▁▅▆▆▆▇▇▇▇▇▇▇██████████████████
train_loss,█▅▄▄▃▂▂▃▂▂▂▁▂▂▁▂▂▂▂▁▁▁▁▁▁▃▁▁▁▁
valid_accuracy,▁▄▅▆▆▇▇▇▇▇████████████████████
valid_loss,█▅▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,30.0
train_accuracy,0.96023
train_loss,0.1046
valid_accuracy,0.89669
valid_loss,0.36012


In [2]:
import torch
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader

def generate_classification_report(model_path, valid_dataloader, model, lbl, device):
    # 加载模型
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(valid_dataloader):
            predicted_label = model(text, offsets).argmax(1)
            all_labels.extend(label.cpu().numpy())
            all_preds.extend(predicted_label.cpu().numpy())

    # 生成分类报告
    report = classification_report(all_labels, all_preds, target_names=lbl)
    print(report)

# 设置参数
model_path = 'FastText.pth'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载数据
train_data, _, lbl = load_data(args.train_file, args.test_file)
tokenizer = jieba.lcut
vocab = build_vocab(custom_data_iter(train_data[0].values[:], train_data[1].values[:]))

# 数据准备
train_iter = custom_data_iter(train_data[0].values[:], train_data[1].values[:])
train_dataset = to_map_style_dataset(train_iter)
num_train = int(len(train_dataset) * args.split)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

valid_dataloader = DataLoader(split_valid_, batch_size=args.batch_size, shuffle=True, collate_fn=lambda batch: collate_batch(batch, vocab, device))

# 定义模型
num_class = len(lbl)
vocab_size = len(vocab)
model = FastText(vocab_size, args.embedding_size, num_class)

# 生成分类报告
generate_classification_report(model_path, valid_dataloader, model, lbl, device)


                       precision    recall  f1-score   support

         Travel-Query       0.95      0.99      0.97       262
           Music-Play       0.95      0.95      0.95       261
        FilmTele-Play       0.87      0.94      0.90       249
           Video-Play       0.96      0.94      0.95       258
         Radio-Listen       0.95      0.96      0.95       246
HomeAppliance-Control       0.97      0.97      0.97       252
        Weather-Query       0.98      0.98      0.98       261
         Alarm-Update       0.98      0.94      0.96       265
       Calendar-Query       0.98      0.99      0.98       219
       TVProgram-Play       0.84      0.71      0.77        52
           Audio-Play       0.83      0.74      0.79        47
                Other       0.73      0.62      0.67        48

             accuracy                           0.95      2420
            macro avg       0.92      0.90      0.90      2420
         weighted avg       0.94      0.95      0.94 