In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

    
from tokenizers import Tokenizer, models



# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# =====================
# 参数配置
# =====================
import torch

PARAMS = {
    "MODEL_TYPE": "decoder-only",   # "encoder-decoder" 或 "decoder-only"
    "EMBED_DIM": 128,
    "HIDDEN_DIM": 256,
    "NUM_LAYERS": 2,
    "NUM_HEADS": 4,
    "DROPOUT": 0.1,
    "LR": 1e-3,
    "BATCH_SIZE": 64,
    "EPOCHS": 5,
    "MAX_SEQ_LEN": 64,
    "TOKENIZER_TYPE": "bpe"  # 可选"char","bpe"或”word“ "pretrained-bpe" 

}

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 日志保存文件
RESULTS_FILE = "/kaggle/working/results.csv"

In [3]:
# =====================
# 数据加载 & 预处理
# =====================
import os
from torch.utils.data import Dataset, DataLoader



# Tokenizer 类型: "word" / "char" / "bpe"
TOKENIZER_TYPE = PARAMS["TOKENIZER_TYPE"]


DATA_DIR = "wiki"  
TRAIN_FILE = os.path.join(DATA_DIR, "wiki.train.tokens")
VALID_FILE = os.path.join(DATA_DIR, "wiki.valid.tokens")
TEST_FILE  = os.path.join(DATA_DIR, "wiki.test.tokens")

def load_file(path):
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

train_texts = load_file(TRAIN_FILE)
valid_texts = load_file(VALID_FILE)
test_texts  = load_file(TEST_FILE)

# ========== Tokenizer 定义 ==========
if TOKENIZER_TYPE == "word":
    def tokenize(texts):
        return [line.split() for line in texts]
    train_tokens = tokenize(train_texts)
    valid_tokens = tokenize(valid_texts)
    test_tokens  = tokenize(test_texts)

    vocab = {"<pad>":0, "<unk>":1}
    for line in train_tokens:
        for tok in line:
            if tok not in vocab:
                vocab[tok] = len(vocab)

elif TOKENIZER_TYPE == "char":
    def tokenize(texts):
        return [list(line) for line in texts]
    train_tokens = tokenize(train_texts)
    valid_tokens = tokenize(valid_texts)
    test_tokens  = tokenize(test_texts)

    vocab = {"<pad>":0, "<unk>":1}
    for line in train_tokens:
        for tok in line:
            if tok not in vocab:
                vocab[tok] = len(vocab)

elif TOKENIZER_TYPE == "bpe":
    from tokenizers import ByteLevelBPETokenizer

    # 训练 BPE
    bpe_tokenizer = ByteLevelBPETokenizer()
    bpe_tokenizer.train(files=[TRAIN_FILE], vocab_size=10000, min_frequency=2,
                        special_tokens=["<pad>", "<unk>"])
    bpe_tokenizer.save_model("./bpe")
    bpe_tokenizer = ByteLevelBPETokenizer(
        "./bpe/vocab.json",
        "./bpe/merges.txt"
    )

    train_tokens = train_texts  # 保留原始字符串
    valid_tokens = valid_texts
    test_tokens  = test_texts

    vocab = bpe_tokenizer.get_vocab()

elif PARAMS["TOKENIZER_TYPE"] == "pretrained-bpe":
    from tokenizers import Tokenizer, models
    tokenizer = Tokenizer(models.BPE.from_file(
        vocab="tokenizer/vocab.json",
        merges="tokenizer/merges.txt"
    ))

    train_tokens = train_texts
    valid_tokens = valid_texts
    test_tokens  = test_texts

    vocab = tokenizer.get_vocab()

     # ===== 手动补充特殊符号 =====
    if "<pad>" not in vocab:
        vocab["<pad>"] = len(vocab)
    if "<unk>" not in vocab:
        vocab["<unk>"] = len(vocab)
        
else:
    raise ValueError("TOKENIZER_TYPE must be 'word', 'char', or 'bpe'")

PAD_IDX = vocab["<pad>"]
UNK_IDX = vocab["<unk>"]
VOCAB_SIZE = len(vocab)

# ========== 编码函数 ==========
def encode(tokens, vocab, max_len):
    """word/char 模式"""
    ids = [vocab.get(t, UNK_IDX) for t in tokens]
    ids = ids[:max_len]
    ids += [PAD_IDX] * (max_len - len(ids))
    return ids

def encode_bpe(text, tokenizer, max_len):
    """BPE / Pretrained-BPE 模式"""
    ids = tokenizer.encode(text).ids
    ids = ids[:max_len]
    pad_id = tokenizer.token_to_id("<pad>") if "<pad>" in tokenizer.get_vocab() else PAD_IDX
    ids += [pad_id] * (max_len - len(ids))
    return ids

# ========== Dataset ==========
class TextDataset(Dataset):
    def __init__(self, data, vocab, max_len, mode="word", tokenizer=None):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len
        self.mode = mode
        self.tokenizer = tokenizer  # 只有 bpe / pretrained-bpe 模式需要

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.mode in ["word", "char"]:
            tokens = self.data[idx]
            x = encode(tokens[:-1], self.vocab, self.max_len)
            y = encode(tokens[1:], self.vocab, self.max_len)
        else:  # bpe / pretrained-bpe
            text = self.data[idx]
            x = encode_bpe(text[:-1], self.tokenizer, self.max_len)
            y = encode_bpe(text[1:], self.tokenizer, self.max_len)
        return torch.tensor(x), torch.tensor(y)


if TOKENIZER_TYPE == "pretrained-bpe":
    tok = tokenizer
elif TOKENIZER_TYPE == "bpe":
    tok = bpe_tokenizer
else:
    tok = None  # word / char 不需要传

train_dataset = TextDataset(train_tokens, vocab, PARAMS["MAX_SEQ_LEN"], TOKENIZER_TYPE, tokenizer=tok)
valid_dataset = TextDataset(valid_tokens, vocab, PARAMS["MAX_SEQ_LEN"], TOKENIZER_TYPE, tokenizer=tok)
test_dataset  = TextDataset(test_tokens, vocab, PARAMS["MAX_SEQ_LEN"], TOKENIZER_TYPE, tokenizer=tok)


train_loader = DataLoader(train_dataset, batch_size=PARAMS["BATCH_SIZE"], shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=PARAMS["BATCH_SIZE"])
test_loader  = DataLoader(test_dataset,  batch_size=PARAMS["BATCH_SIZE"])







In [4]:
import os, json, time
import matplotlib.pyplot as plt

def create_experiment_dir(params):
    """创建实验目录并保存参数"""
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    exp_name = f"{params['MODEL_TYPE']}_lr{params['LR']}_drop{params['DROPOUT']}_layers{params['NUM_LAYERS']}_{timestamp}"
    exp_dir = os.path.join("results", exp_name)
    os.makedirs(exp_dir, exist_ok=True)

    # 保存参数配置
    with open(os.path.join(exp_dir, "params.json"), "w") as f:
        json.dump(params, f, indent=4)

    return exp_dir


In [5]:
# =====================
# 模型定义
# =====================
import torch.nn as nn

class TransformerModel(nn.Module):
    def __init__(self, model_type, vocab_size, embed_dim, hidden_dim, num_layers, num_heads, dropout, max_len):
        super().__init__()
        self.model_type = model_type
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        self.pos_encoding = nn.Embedding(max_len, embed_dim)

        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, src, tgt=None):
        seq_len = src.size(1)
        pos = torch.arange(0, seq_len, device=src.device).unsqueeze(0)
        src_embed = self.embedding(src) + self.pos_encoding(pos)

        if self.model_type == "encoder-decoder":
            memory = self.encoder(src_embed.transpose(0,1))  # (S,N,E)
            if tgt is not None:
                tgt_seq_len = tgt.size(1)
                tgt_pos = torch.arange(0, tgt_seq_len, device=tgt.device).unsqueeze(0)
                tgt_embed = self.embedding(tgt) + self.pos_encoding(tgt_pos)
                output = self.decoder(tgt_embed.transpose(0,1), memory)
            else:
                output = memory
        else:  # decoder-only
            output = self.encoder(src_embed.transpose(0,1))

        logits = self.fc(output.transpose(0,1))
        return logits


In [6]:
# =====================
# 训练 & 验证 & 测试
# =====================
import torch.optim as optim
from torch.nn.functional import cross_entropy

# def train_model(model, train_loader, valid_loader, test_loader, params):
#     optimizer = optim.Adam(model.parameters(), lr=params["LR"])
#     best_valid_loss = float("inf")

#     results = {"train_loss": [], "train_acc": [], "valid_loss": [], "valid_acc": []}

#     for epoch in range(params["EPOCHS"]):
#         # ---------- Train ----------
#         model.train()
#         total_loss, total_correct, total_count = 0, 0, 0
#         for x, y in train_loader:
#             x, y = x.to(DEVICE), y.to(DEVICE)
#             optimizer.zero_grad()
#             output = model(x, x if params["MODEL_TYPE"]=="encoder-decoder" else None)
#             loss = cross_entropy(output.view(-1, VOCAB_SIZE), y.view(-1), ignore_index=PAD_IDX)
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()
#             total_correct += (output.argmax(-1) == y).sum().item()
#             total_count += y.numel()
#         train_loss = total_loss / len(train_loader)
#         train_acc  = total_correct / total_count

#         # ---------- Valid ----------
#         model.eval()
#         total_loss, total_correct, total_count = 0, 0, 0
#         with torch.no_grad():
#             for x, y in valid_loader:
#                 x, y = x.to(DEVICE), y.to(DEVICE)
#                 output = model(x, x if params["MODEL_TYPE"]=="encoder-decoder" else None)
#                 loss = cross_entropy(output.view(-1, VOCAB_SIZE), y.view(-1), ignore_index=PAD_IDX)
#                 total_loss += loss.item()
#                 total_correct += (output.argmax(-1) == y).sum().item()
#                 total_count += y.numel()
#         valid_loss = total_loss / len(valid_loader)
#         valid_acc  = total_correct / total_count

#         print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Acc={train_acc:.4f}, Valid Loss={valid_loss:.4f}, Acc={valid_acc:.4f}")

#         results["train_loss"].append(train_loss)
#         results["train_acc"].append(train_acc)
#         results["valid_loss"].append(valid_loss)
#         results["valid_acc"].append(valid_acc)

#     # ---------- Test ----------
#     model.eval()
#     total_loss, total_correct, total_count = 0, 0, 0
#     with torch.no_grad():
#         for x, y in test_loader:
#             x, y = x.to(DEVICE), y.to(DEVICE)
#             output = model(x, x if params["MODEL_TYPE"]=="encoder-decoder" else None)
#             loss = cross_entropy(output.view(-1, VOCAB_SIZE), y.view(-1), ignore_index=PAD_IDX)
#             total_loss += loss.item()
#             total_correct += (output.argmax(-1) == y).sum().item()
#             total_count += y.numel()
#     test_loss = total_loss / len(test_loader)
#     test_acc  = total_correct / total_count

#     print(f"Test: Loss={test_loss:.4f}, Acc={test_acc:.4f}")
#     return results, (test_loss, test_acc)

def train_model(model, train_loader, valid_loader, test_loader, params):
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=params["LR"])

    # === 新增：创建实验目录 ===
    exp_dir = create_experiment_dir(params)
    log_file = open(os.path.join(exp_dir, "train_log.txt"), "w")

    results = {"train_loss": [], "train_acc": [], "valid_loss": [], "valid_acc": []}

    for epoch in range(1, params["EPOCHS"]+1):
        # ---------- Train ----------
        model.train()
        total_loss, total_correct, total_count = 0, 0, 0
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            output = model(x, x if params["MODEL_TYPE"]=="encoder-decoder" else None)
            loss = cross_entropy(output.view(-1, VOCAB_SIZE), y.view(-1), ignore_index=PAD_IDX)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_correct += (output.argmax(-1) == y).sum().item()
            total_count += y.numel()

        train_loss = total_loss / len(train_loader)
        train_acc = total_correct / total_count
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)

        # ---------- Valid ----------
        model.eval()
        total_loss, total_correct, total_count = 0, 0, 0
        with torch.no_grad():
            for x, y in valid_loader:
                x, y = x.to(DEVICE), y.to(DEVICE)
                output = model(x, x if params["MODEL_TYPE"]=="encoder-decoder" else None)
                loss = cross_entropy(output.view(-1, VOCAB_SIZE), y.view(-1), ignore_index=PAD_IDX)
                total_loss += loss.item()
                total_correct += (output.argmax(-1) == y).sum().item()
                total_count += y.numel()

        valid_loss = total_loss / len(valid_loader)
        valid_acc = total_correct / total_count
        results["valid_loss"].append(valid_loss)
        results["valid_acc"].append(valid_acc)

        log_str = f"Epoch {epoch}: Train Loss={train_loss:.4f}, Acc={train_acc:.4f}, Valid Loss={valid_loss:.4f}, Acc={valid_acc:.4f}"
        print(log_str)
        log_file.write(log_str + "\n")

    # ---------- Test ----------
    total_loss, total_correct, total_count = 0, 0, 0
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            output = model(x, x if params["MODEL_TYPE"]=="encoder-decoder" else None)
            loss = cross_entropy(output.view(-1, VOCAB_SIZE), y.view(-1), ignore_index=PAD_IDX)
            total_loss += loss.item()
            total_correct += (output.argmax(-1) == y).sum().item()
            total_count += y.numel()

    test_loss = total_loss / len(test_loader)
    test_acc  = total_correct / total_count
    test_result = {"test_loss": test_loss, "test_acc": test_acc}

    test_str = f"Test: Loss={test_loss:.4f}, Acc={test_acc:.4f}"
    print(test_str)
    log_file.write(test_str + "\n")
    log_file.close()

    # === 保存 results.json ===
    import json
    with open(os.path.join(exp_dir, "results.json"), "w") as f:
        json.dump({"train_valid": results, "test": test_result}, f, indent=4)

    # === 保存曲线图 ===
    plt.figure()
    plt.plot(results["train_loss"], label="Train Loss")
    plt.plot(results["valid_loss"], label="Valid Loss")
    plt.legend()
    plt.title("Loss Curve")
    plt.savefig(os.path.join(exp_dir, "loss_curve.png"))

    plt.figure()
    plt.plot(results["train_acc"], label="Train Acc")
    plt.plot(results["valid_acc"], label="Valid Acc")
    plt.legend()
    plt.title("Accuracy Curve")
    plt.savefig(os.path.join(exp_dir, "acc_curve.png"))

    print(f"✅ 实验结果已保存到 {exp_dir}")

    return results, test_result



In [7]:
import matplotlib.pyplot as plt

def plot_results(results):
    epochs = range(1, len(results["train_loss"]) + 1)

    # Loss
    plt.figure(figsize=(10,4))
    plt.subplot(1,2,1)
    plt.plot(epochs, results["train_loss"], label="Train Loss")
    plt.plot(epochs, results["valid_loss"], label="Valid Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Loss over Epochs")
    plt.legend()

    # Accuracy
    plt.subplot(1,2,2)
    plt.plot(epochs, results["train_acc"], label="Train Acc")
    plt.plot(epochs, results["valid_acc"], label="Valid Acc")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title("Accuracy over Epochs")
    plt.legend()

    plt.tight_layout()
    plt.show()




In [8]:
# =====================
# 主程序
# =====================
model = TransformerModel(
    model_type=PARAMS["MODEL_TYPE"],
    vocab_size=VOCAB_SIZE,
    embed_dim=PARAMS["EMBED_DIM"],
    hidden_dim=PARAMS["HIDDEN_DIM"],
    num_layers=PARAMS["NUM_LAYERS"],
    num_heads=PARAMS["NUM_HEADS"],
    dropout=PARAMS["DROPOUT"],
    max_len=PARAMS["MAX_SEQ_LEN"]
).to(DEVICE)

results, test_result = train_model(model, train_loader, valid_loader, test_loader, PARAMS)
plot_results(results)




Epoch 1: Train Loss=4.8493, Acc=0.2533, Valid Loss=1.3031, Acc=0.5641
Epoch 2: Train Loss=0.6913, Acc=0.6200, Valid Loss=0.2028, Acc=0.6771


KeyboardInterrupt: 

In [None]:
print("DEBUG results type:", type(results), results)
