In [None]:
# 导入库
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import numpy as np
import re
from collections import Counter
from tqdm import tqdm
import random

# 3. 设置随机种子保证可重复性
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# 4. 设备配置
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 5. 自定义分词器和数据集类
def cooking_tokenizer(text):
    """针对烹饪数据集的专用分词器"""
    text = re.sub(r'[^\w\s]', ' ', text.lower())  # 移除非字母数字字符并转为小写
    tokens = text.split()
    # 合并常见烹饪术语
    cooking_terms = ['stir fry', 'baking powder', 'olive oil']
    for term in cooking_terms:
        text = text.replace(term, term.replace(' ', '_'))
    return text.split()

class CookingDataset(Dataset):
    def __init__(self, filepath, max_len=200, min_word_count=5):
        self.texts = []
        self.labels = []
        self.max_len = max_len
        self.word2idx = {}
        self.label2idx = {}

        # 读取数据并统计词频
        self._load_data(filepath)

        # 构建词汇表（过滤低频词）
        self._build_vocab(min_word_count)

        # 构建标签映射
        self._build_label_map()

        # 打印数据集统计信息
        print(f"\n数据集统计:")
        print(f"- 总样本数: {len(self.texts)}")
        print(f"- 词汇表大小: {len(self.word2idx)}")
        print(f"- 类别数量: {len(self.label2idx)}")
        print(f"- 最长文本长度: {max(len(t.split()) for t in self.texts)}")
        print(f"- 示例标签分布: {Counter(self.labels).most_common(5)}")

    def _load_data(self, filepath):
        """加载数据并初步处理"""
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    parts = line.strip().split(' ', 1)
                    if len(parts) == 2:
                        label = parts[0].replace('__label__', '')
                        text = parts[1]
                        self.texts.append(text)
                        self.labels.append(label)

    def _build_vocab(self, min_count):
        """构建词汇表"""
        word_counts = Counter()
        for text in self.texts:
            word_counts.update(cooking_tokenizer(text))

        # 过滤低频词
        vocab = [word for word, count in word_counts.items() if count >= min_count]

        # 添加特殊token
        self.word2idx = {'<pad>': 0, '<unk>': 1}
        for idx, word in enumerate(vocab, start=2):
            self.word2idx[word] = idx

    def _build_label_map(self):
        """构建标签映射"""
        unique_labels = sorted(list(set(self.labels)))
        self.label2idx = {label: idx for idx, label in enumerate(unique_labels)}
        self.idx2label = {idx: label for label, idx in self.label2idx.items()}

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # 文本转token id
        tokens = cooking_tokenizer(text)
        token_ids = [self.word2idx.get(token, self.word2idx['<unk>']) for token in tokens][:self.max_len]

        # 填充或截断
        if len(token_ids) < self.max_len:
            token_ids += [self.word2idx['<pad>']] * (self.max_len - len(token_ids))
        else:
            token_ids = token_ids[:self.max_len]

        return torch.tensor(token_ids), torch.tensor(self.label2idx[label])

# 6. 创建数据集
dataset = CookingDataset('/content/cooking.stackexchange.txt', max_len=150)

# 7. 分割数据集
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# 8. 创建数据加载器
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# 9. 模型定义
class CookingClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, hidden_dim=128, n_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # 双向GRU比LSTM在烹饪数据集上表现更好
        self.gru = nn.GRU(embed_dim, hidden_dim, n_layers,
                         batch_first=True, bidirectional=True, dropout=0.3)

        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(0.5)

        # 初始化权重
        self._init_weights()

    def _init_weights(self):
        """初始化权重"""
        for name, param in self.named_parameters():
            if 'weight' in name:
                if 'embedding' in name:
                    nn.init.uniform_(param, -0.1, 0.1)
                elif 'gru' in name:
                    if 'weight_ih' in name:
                        nn.init.xavier_uniform_(param)
                    elif 'weight_hh' in name:
                        nn.init.orthogonal_(param)
                elif 'fc' in name:
                    nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0.0)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))  # [batch, seq, embed]

        gru_out, _ = self.gru(embedded)  # [batch, seq, hidden*2]

        # 使用注意力机制聚合序列信息
        weights = torch.softmax(torch.mean(gru_out, dim=2), dim=1)
        weighted = torch.bmm(gru_out.transpose(1, 2), weights.unsqueeze(2)).squeeze(2)

        return self.fc(self.dropout(weighted))

# 10. 初始化模型
model = CookingClassifier(
    vocab_size=len(dataset.word2idx),
    embed_dim=128,
    num_classes=len(dataset.label2idx),
    hidden_dim=128,
    n_layers=2
).to(device)

# 11. 训练函数
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss, total_correct = 0, 0

    for inputs, labels in tqdm(loader, desc="Training"):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()

        # 梯度裁剪防止爆炸
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total_correct += (predicted == labels).sum().item()

    return total_loss / len(loader), total_correct / len(loader.dataset)

# 12. 评估函数
def evaluate(model, loader, criterion):
    model.eval()
    total_loss, total_correct = 0, 0

    with torch.no_grad():
        for inputs, labels in tqdm(loader, desc="Evaluating"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_correct += (predicted == labels).sum().item()

    return total_loss / len(loader), total_correct / len(loader.dataset)

# 13. 训练配置
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.5)

# 14. 训练循环
num_epochs = 15
best_acc = 0

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_loader, criterion)

    scheduler.step(val_acc)

    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # 保存最佳模型
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"New best model saved with accuracy {best_acc:.4f}")

# 15. 测试函数
def predict(text, model, dataset, device='cuda'):
    model.eval()
    tokens = cooking_tokenizer(text)
    token_ids = [dataset.word2idx.get(token, dataset.word2idx['<unk>']) for token in tokens][:dataset.max_len]

    if len(token_ids) < dataset.max_len:
        token_ids += [dataset.word2idx['<pad>']] * (dataset.max_len - len(token_ids))
    else:
        token_ids = token_ids[:dataset.max_len]

    with torch.no_grad():
        inputs = torch.tensor([token_ids]).to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        return dataset.idx2label[predicted.item()]

# 16. 测试示例
test_questions = [
    "How to stop Xanthan Gum from clumping?",
    "Can you pure without a food processor?",
    "Why did my soufflé collapse?",
    "How to make authentic Italian pasta sauce?",
    "What's the difference between baking powder and baking soda?"
]

print("\n测试预测结果:")
for question in test_questions:
    pred = predict(question, model, dataset, device)
    print(f"Q: {question}\nA: Predicted category -> {pred}\n")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

Using device: cuda

数据集统计:
- 总样本数: 15404
- 词汇表大小: 3046
- 类别数量: 533
- 最长文本长度: 32
- 示例标签分布: [('baking', 1423), ('food-safety', 1152), ('substitutions', 810), ('equipment', 735), ('bread', 403)]

Epoch 1/15


Training: 100%|██████████| 193/193 [00:04<00:00, 42.95it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 135.71it/s]


Train Loss: 4.8278 | Train Acc: 0.0807
Val Loss: 4.6731 | Val Acc: 0.0893
New best model saved with accuracy 0.0893

Epoch 2/15


Training: 100%|██████████| 193/193 [00:03<00:00, 57.89it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 138.59it/s]


Train Loss: 4.4716 | Train Acc: 0.1091
Val Loss: 4.3374 | Val Acc: 0.1207
New best model saved with accuracy 0.1207

Epoch 3/15


Training: 100%|██████████| 193/193 [00:03<00:00, 57.82it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 140.77it/s]


Train Loss: 4.2044 | Train Acc: 0.1567
Val Loss: 4.1213 | Val Acc: 0.1970
New best model saved with accuracy 0.1970

Epoch 4/15


Training: 100%|██████████| 193/193 [00:03<00:00, 54.45it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 106.74it/s]


Train Loss: 3.9460 | Train Acc: 0.2226
Val Loss: 3.8726 | Val Acc: 0.2639
New best model saved with accuracy 0.2639

Epoch 5/15


Training: 100%|██████████| 193/193 [00:03<00:00, 57.29it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 137.84it/s]


Train Loss: 3.6763 | Train Acc: 0.2822
Val Loss: 3.6691 | Val Acc: 0.3132
New best model saved with accuracy 0.3132

Epoch 6/15


Training: 100%|██████████| 193/193 [00:03<00:00, 57.31it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 138.96it/s]


Train Loss: 3.4331 | Train Acc: 0.3191
Val Loss: 3.5354 | Val Acc: 0.3356
New best model saved with accuracy 0.3356

Epoch 7/15


Training: 100%|██████████| 193/193 [00:03<00:00, 51.64it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 68.21it/s]


Train Loss: 3.2163 | Train Acc: 0.3479
Val Loss: 3.4141 | Val Acc: 0.3557
New best model saved with accuracy 0.3557

Epoch 8/15


Training: 100%|██████████| 193/193 [00:03<00:00, 55.27it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 136.93it/s]


Train Loss: 3.0149 | Train Acc: 0.3803
Val Loss: 3.2931 | Val Acc: 0.3781
New best model saved with accuracy 0.3781

Epoch 9/15


Training: 100%|██████████| 193/193 [00:03<00:00, 57.06it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 137.87it/s]


Train Loss: 2.8151 | Train Acc: 0.4130
Val Loss: 3.2308 | Val Acc: 0.4002
New best model saved with accuracy 0.4002

Epoch 10/15


Training: 100%|██████████| 193/193 [00:03<00:00, 55.02it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 106.67it/s]


Train Loss: 2.6530 | Train Acc: 0.4463
Val Loss: 3.1670 | Val Acc: 0.4125
New best model saved with accuracy 0.4125

Epoch 11/15


Training: 100%|██████████| 193/193 [00:03<00:00, 54.86it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 138.01it/s]


Train Loss: 2.4979 | Train Acc: 0.4695
Val Loss: 3.1791 | Val Acc: 0.4158
New best model saved with accuracy 0.4158

Epoch 12/15


Training: 100%|██████████| 193/193 [00:03<00:00, 56.58it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 137.70it/s]


Train Loss: 2.3589 | Train Acc: 0.4867
Val Loss: 3.1223 | Val Acc: 0.4310
New best model saved with accuracy 0.4310

Epoch 13/15


Training: 100%|██████████| 193/193 [00:03<00:00, 54.54it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 87.31it/s]


Train Loss: 2.2301 | Train Acc: 0.5107
Val Loss: 3.1356 | Val Acc: 0.4314
New best model saved with accuracy 0.4314

Epoch 14/15


Training: 100%|██████████| 193/193 [00:03<00:00, 48.88it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 137.87it/s]


Train Loss: 2.1358 | Train Acc: 0.5235
Val Loss: 3.1432 | Val Acc: 0.4421
New best model saved with accuracy 0.4421

Epoch 15/15


Training: 100%|██████████| 193/193 [00:03<00:00, 55.83it/s]
Evaluating: 100%|██████████| 49/49 [00:00<00:00, 136.35it/s]

Train Loss: 2.0167 | Train Acc: 0.5419
Val Loss: 3.1965 | Val Acc: 0.4333

测试预测结果:
Q: How long should I bake chicken at 350°F?
A: Predicted category -> food-safety

Q: What's the best substitute for eggs in vegan baking?
A: Predicted category -> substitutions

Q: Why did my soufflé collapse?
A: Predicted category -> refrigerator

Q: How to make authentic Italian pasta sauce?
A: Predicted category -> pasta

Q: What's the difference between baking powder and baking soda?
A: Predicted category -> baking






In [None]:
test_questions = [
    "How to stop Xanthan Gum from clumping?",
    "Can you pure without a food processor?",
    "What's the difference between gazpacho and normal soups?",
    "How to make authentic Italian pasta sauce?",
    "Would ground popcorn meal differ from regular corn meal?"
]

print("\n测试预测结果:")
for question in test_questions:
    pred = predict(question, model, dataset, device)
    print(f"Q: {question}\nA: Predicted category -> {pred}\n")


测试预测结果:
Q: How to stop Xanthan Gum from clumping?
A: Predicted category -> rice

Q: Can you pure without a food processor?
A: Predicted category -> equipment

Q: What's the difference between gazpacho and normal soups?
A: Predicted category -> soup

Q: How to make authentic Italian pasta sauce?
A: Predicted category -> pasta

Q: Would ground popcorn meal differ from regular corn meal?
A: Predicted category -> juice

