In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# =====================
# 导入依赖
# =====================
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import torch.nn.functional as F



# =====================
# Configuration (experiment variables, used like macros)
# =====================

# Data paths
TRAIN_PATH = "/kaggle/input/train-tsv/new_train.tsv"
TEST_PATH = "/kaggle/input/test-tsv/new_test.tsv"

# Preprocessing
MAX_LEN = 100       # maximum sentence length, in tokens
VOCAB_MIN_FREQ = 1  # minimum token frequency for the vocabulary (not used yet; kept for extension)

# Model hyper-parameters
EMBED_DIM = 100
USE_GLOVE = True
GLOVE_PATH = "/kaggle/input/glove-100d/wiki_giga_2024_100_MFT20_vectors_seed_2024_alpha_0.75_eta_0.05.050_combined.txt"
KERNEL_SIZES = [3, 4, 5]
NUM_CHANNELS = 50
DROPOUT = 0.5
MODEL_TYPE = "DeepCNN"  # one of: "CNN", "RNN", "TRANSFORMER", "DeepCNN"

# Training hyper-parameters
NUM_CLASSES = 5
BATCH_SIZE = 32
NUM_EPOCHS = 15
LEARNING_RATE = 2e-4
LOSS_FUNCTION = "CrossEntropy"  # one of: "CrossEntropy", "MSE"
OPTIMIZER = "Adam"              # one of: "Adam", "SGD"

# Device: use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"



# =====================
# 1. 数据读取与预处理
# =====================
def preprocess(text):
    """Normalise a raw sentence and split it into tokens.

    The text is lower-cased, every character that is not ``a-z`` or
    whitespace is removed, and the remainder is split on whitespace.

    Args:
        text: Raw sentence. Non-string values (for example the float NaN that
            pandas produces for an empty cell, or ``None``) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        list[str]: The surviving tokens; an empty list when nothing is left.
    """
    if not isinstance(text, str):
        # Missing cells are not strings, so there is nothing to tokenise.
        return []
    return re.sub(r"[^a-z\s]", "", text.lower()).split()

# Read both tab-separated files with explicit column names, then attach a
# "tokens" column holding the preprocessed token list for each row.
train_df, test_df = (
    pd.read_csv(path, sep="\t", names=["text", "label"])
    for path in (TRAIN_PATH, TEST_PATH)
)
for frame in (train_df, test_df):
    frame["tokens"] = frame["text"].apply(preprocess)

# Build the vocabulary from the training tokens only. Ids 0 and 1 are reserved
# for padding and unknown words; other ids follow first-occurrence order.
vocab = {"<PAD>": 0, "<UNK>": 1}
all_tokens = [tok for tokens in train_df["tokens"] for tok in tokens]
for tok in all_tokens:
    # setdefault leaves an existing entry alone, so each word keeps its first id.
    vocab.setdefault(tok, len(vocab))
vocab_size = len(vocab)

# tokens -> ids
def tokens_to_ids(tokens, vocab, max_len=MAX_LEN):
    """Convert tokens to vocabulary ids with a fixed output length.

    Tokens missing from *vocab* map to id 1 (``<UNK>``). The id list is cut to
    at most *max_len* entries and right-padded with id 0 (``<PAD>``).
    """
    ids = [vocab.get(tok, 1) for tok in tokens][:max_len]
    padding = [0] * (max_len - len(ids))
    return ids + padding

# Encode every tokenised sentence as a fixed-length list of ids.
train_ids = [tokens_to_ids(sentence, vocab) for sentence in train_df["tokens"]]
test_ids = [tokens_to_ids(sentence, vocab) for sentence in test_df["tokens"]]

# Hold out 20% of the training rows for validation (seeded split).
train_ids, valid_ids, train_labels, valid_labels = train_test_split(
    train_ids,
    train_df["label"],
    test_size=0.2,
    random_state=42,
)

# Pair each id matrix with its label vector.
train_dataset = TensorDataset(
    torch.tensor(train_ids), torch.tensor(train_labels.to_numpy())
)
valid_dataset = TensorDataset(
    torch.tensor(valid_ids), torch.tensor(valid_labels.to_numpy())
)
test_dataset = TensorDataset(
    torch.tensor(test_ids), torch.tensor(test_df["label"].to_numpy())
)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# =====================
# 2. 模型定义
# =====================

# (Optional) GloVe initialisation
def load_glove_embeddings(glove_path, vocab, embed_dim):
    """Build an embedding matrix for *vocab*, filling rows from a GloVe text file.

    Every row starts as Gaussian noise scaled by 0.6, row 0 is set to zeros
    (the caller's vocabulary reserves id 0 for ``<PAD>``), and each vocabulary
    word that appears in the file has its row replaced by the file's vector.

    Args:
        glove_path: Path to a UTF-8 text file with one ``word v1 ... vN`` entry
            per line.
        vocab: Mapping from word to row index.
        embed_dim: Expected vector width. Lines that do not carry exactly this
            many values (blank lines, header lines, malformed rows) are skipped.

    Returns:
        torch.Tensor: Float tensor of shape ``(len(vocab), embed_dim)``.
    """
    embeddings = torch.randn(len(vocab), embed_dim) * 0.6
    embeddings[0] = torch.zeros(embed_dim)  # <PAD> uses the zero vector

    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            # One word plus embed_dim values; a blank line yields an empty
            # list and must be rejected before parts[0] is touched.
            if len(parts) != embed_dim + 1:
                continue
            index = vocab.get(parts[0])
            if index is None:
                continue
            embeddings[index] = torch.tensor([float(x) for x in parts[1:]])
    return embeddings


# CNN
class TextCNN(nn.Module):
    """Text classifier with parallel 1-D convolutions and max-over-time pooling.

    One ``Conv1d`` branch is created per entry of the module-level
    ``KERNEL_SIZES``; each branch emits ``NUM_CHANNELS`` feature maps that are
    max-pooled over the sequence axis, concatenated, passed through dropout and
    fed to a linear layer.

    Args:
        vocab_size: Number of embedding rows. Only used when ``USE_GLOVE`` is
            false; otherwise the table size comes from the global ``vocab``.
        embed_dim: Embedding width (also the convolution input channels).
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        # Build the embedding table exactly once, from GloVe or from scratch.
        if USE_GLOVE:
            glove_matrix = load_glove_embeddings(GLOVE_PATH, vocab, embed_dim)
            self.embedding = nn.Embedding.from_pretrained(
                glove_matrix, freeze=False, padding_idx=0
            )
        else:
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=NUM_CHANNELS, kernel_size=k)
            for k in KERNEL_SIZES
        ])
        self.dropout = nn.Dropout(DROPOUT)
        self.fc = nn.Linear(NUM_CHANNELS * len(KERNEL_SIZES), num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for token ids x of shape (batch, seq_len)."""
        x = self.embedding(x)           # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)          # (batch, embed_dim, seq_len), as Conv1d expects
        conv_outs = [torch.relu(conv(x)) for conv in self.convs]
        # Max over the sequence axis keeps the strongest response per channel.
        pooled = [out.max(dim=2).values for out in conv_outs]
        features = torch.cat(pooled, dim=1)
        return self.fc(self.dropout(features))

# RNN (simple LSTM)
class TextRNN(nn.Module):
    """Bidirectional single-layer LSTM classifier.

    The final hidden states of the two directions are concatenated and fed to
    a linear layer.

    Args:
        vocab_size: Number of embedding rows. Only used when ``USE_GLOVE`` is
            false; otherwise the table size comes from the global ``vocab``.
        embed_dim: Embedding width (LSTM input size).
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        # Build the embedding table exactly once, from GloVe or from scratch.
        if USE_GLOVE:
            glove_matrix = load_glove_embeddings(GLOVE_PATH, vocab, embed_dim)
            self.embedding = nn.Embedding.from_pretrained(
                glove_matrix, freeze=False, padding_idx=0
            )
        else:
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, hidden_size=128, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(128 * 2, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for token ids x of shape (batch, seq_len)."""
        x = self.embedding(x)
        # NOTE(review): sequences are not packed, so if inputs are right-padded
        # the forward direction's final state also covers the pad positions.
        _, (h, _) = self.rnn(x)
        features = torch.cat((h[-2], h[-1]), dim=1)  # concatenate both directions
        return self.fc(features)

# Transformer (simple encoder-only variant)
class TextTransformer(nn.Module):
    """Transformer-encoder classifier with mean pooling over the sequence.

    Args:
        vocab_size: Number of embedding rows. Only used when ``USE_GLOVE`` is
            false; otherwise the table size comes from the global ``vocab``.
        embed_dim: Embedding width, used as the encoder ``d_model``.
        num_classes: Number of output logits.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, num_heads=4, num_layers=2):
        super().__init__()
        # Build the embedding table exactly once, from GloVe or from scratch.
        if USE_GLOVE:
            glove_matrix = load_glove_embeddings(GLOVE_PATH, vocab, embed_dim)
            self.embedding = nn.Embedding.from_pretrained(
                glove_matrix, freeze=False, padding_idx=0
            )
        else:
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for token ids x of shape (batch, seq_len)."""
        # NOTE(review): no positional encoding and no padding mask are applied,
        # so the mean below averages over every position, pads included.
        x = self.embedding(x)                # (batch, seq_len, embed_dim)
        x = x.permute(1, 0, 2)               # (seq_len, batch, embed_dim)
        out = self.transformer(x)            # (seq_len, batch, embed_dim)
        features = out.mean(dim=0)           # average pooling over the sequence
        return self.fc(features)

class DeepCNN(nn.Module):
    """Deeper text CNN: per kernel size, two stacked convolutions plus pooling.

    Each branch is ``Conv1d -> ReLU -> Conv1d -> ReLU -> MaxPool1d(2)`` followed
    by a global max over the remaining positions. Branch outputs are
    concatenated, passed through dropout and fed to a linear layer.

    Args:
        vocab_size: Number of embedding rows (ignored when
            *pretrained_embeddings* is given).
        embed_dim: Embedding width; must match the pretrained matrix width.
        num_classes: Number of output logits.
        pretrained_embeddings: Optional ``(vocab, embed_dim)`` matrix as a
            tensor, NumPy array or nested list. It is copied, so fine-tuning
            never modifies the caller's data.
        kernel_sizes: Convolution widths, one branch per entry.
        num_channels: Feature maps per convolution.
        dropout: Dropout probability applied before the linear layer.

    Raises:
        ValueError: If *kernel_sizes* is empty.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, pretrained_embeddings=None,
                 kernel_sizes=(3, 4, 5), num_channels=100, dropout=0.5):
        super().__init__()

        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel size")

        # Embedding layer. Index 0 is the padding id, as in the other models
        # of this file, so its row receives no gradient updates.
        if pretrained_embeddings is not None:
            if isinstance(pretrained_embeddings, torch.Tensor):
                # torch.tensor() on a tensor warns; clone explicitly instead.
                weights = pretrained_embeddings.detach().clone().to(torch.float)
            else:
                weights = torch.tensor(pretrained_embeddings, dtype=torch.float)
            self.embedding = nn.Embedding.from_pretrained(
                weights,
                freeze=False,  # allow fine-tuning
                padding_idx=0,
            )
        else:
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Two stacked convolutions per kernel size (extra depth), then a
        # stride-2 max pool that down-samples the sequence axis.
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(embed_dim, num_channels, kernel_size=ks, padding=ks // 2),
                nn.ReLU(),
                nn.Conv1d(num_channels, num_channels, kernel_size=ks, padding=ks // 2),
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=2),
            )
            for ks in kernel_sizes
        ])

        # Fully connected classification layer
        self.fc = nn.Linear(num_channels * len(kernel_sizes), num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for token ids x of shape (batch, seq_len)."""
        embedded = self.embedding(x)           # (batch, seq_len, embed_dim)
        embedded = embedded.permute(0, 2, 1)   # (batch, embed_dim, seq_len) for Conv1d

        # Each branch yields (batch, num_channels, ~seq_len // 2); the global
        # max pool collapses that to a fixed-size (batch, num_channels) vector.
        conv_outs = [
            F.adaptive_max_pool1d(conv(embedded), 1).squeeze(-1)
            for conv in self.convs
        ]

        out = torch.cat(conv_outs, dim=1)
        out = self.dropout(out)
        return self.fc(out)

# Model selection
if MODEL_TYPE == "CNN":
    model = TextCNN(vocab_size, EMBED_DIM, NUM_CLASSES)
elif MODEL_TYPE == "RNN":
    model = TextRNN(vocab_size, EMBED_DIM, NUM_CLASSES)
elif MODEL_TYPE == "TRANSFORMER":
    model = TextTransformer(vocab_size, EMBED_DIM, NUM_CLASSES)
elif MODEL_TYPE == "DeepCNN":
    model = DeepCNN(
        vocab_size=vocab_size,
        embed_dim=EMBED_DIM,
        num_classes=NUM_CLASSES,
        # Honour USE_GLOVE here too, as the other three models do internally.
        pretrained_embeddings=(
            load_glove_embeddings(GLOVE_PATH, vocab, EMBED_DIM) if USE_GLOVE else None
        ),
        kernel_sizes=KERNEL_SIZES,
        # NOTE(review): kept at 100 although NUM_CHANNELS is 50 — confirm which is intended.
        num_channels=100,
        dropout=DROPOUT,
    )
else:
    raise ValueError("Unknown MODEL_TYPE")

# Single device transfer for whichever model was built.
model = model.to(DEVICE)

# =====================
# 3. Loss function & optimizer
# =====================
if LOSS_FUNCTION == "CrossEntropy":
    criterion = nn.CrossEntropyLoss()
elif LOSS_FUNCTION == "MSE":
    criterion = nn.MSELoss()
else:
    # Fail here instead of with a NameError on `criterion` once training starts.
    raise ValueError(f"Unknown LOSS_FUNCTION: {LOSS_FUNCTION!r}")

if OPTIMIZER == "Adam":
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
elif OPTIMIZER == "SGD":
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
else:
    # Same reasoning as above: reject a mistyped config value immediately.
    raise ValueError(f"Unknown OPTIMIZER: {OPTIMIZER!r}")

import matplotlib.pyplot as plt

# =====================
# 4. Training & validation (curve data is recorded)
# =====================
# One independent list per tracked metric; each epoch appends one value.
history = {
    metric: []
    for metric in ("train_loss", "train_acc", "val_loss", "val_acc")
}

def evaluate(loader, criterion=None):
    """Evaluate the module-level ``model`` on every batch of *loader*.

    Args:
        loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Optional loss module. When ``None`` no loss is computed and
            the returned loss is 0.

    Returns:
        tuple: ``(accuracy, average_loss)``. Accuracy is the fraction of
        correct predictions over all samples; the loss is averaged per batch.
        Both are 0 when *loader* yields no batches.
    """
    model.eval()
    correct, total, total_loss = 0, 0, 0.0
    num_batches = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            preds = model(x)
            if criterion is not None:
                # MSE compares against one-hot targets; other losses take class indices.
                if LOSS_FUNCTION == "MSE":
                    target = F.one_hot(y, num_classes=NUM_CLASSES).float()
                else:
                    target = y
                total_loss += criterion(preds, target).item()
            correct += (preds.argmax(dim=1) == y).sum().item()
            total += y.size(0)
            num_batches += 1
    # Guard both divisions so an empty loader reports zeros instead of crashing.
    acc = correct / total if total else 0.0
    avg_loss = total_loss / num_batches if (criterion is not None and num_batches) else 0
    return acc, avg_loss

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for x, y in train_loader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)

        # MSE is computed against one-hot targets; otherwise class indices are used.
        target = F.one_hot(y, num_classes=NUM_CLASSES).float() if LOSS_FUNCTION == "MSE" else y

        optimizer.zero_grad()
        preds = model(x)
        loss = criterion(preds, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (preds.argmax(dim=1) == y).sum().item()
        total += y.size(0)

    # Epoch-level training metrics: loss averaged per batch, accuracy per sample.
    train_loss = total_loss / len(train_loader)
    train_acc = correct / total

    # Validation metrics for the same epoch.
    val_acc, val_loss = evaluate(valid_loader, criterion)

    # Record everything for the curves plotted afterwards.
    for key, value in (
        ("train_loss", train_loss),
        ("train_acc", train_acc),
        ("val_loss", val_loss),
        ("val_acc", val_acc),
    ):
        history[key].append(value)

    print(
        f"Epoch {epoch+1}/{NUM_EPOCHS} | "
        f"Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | "
        f"Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}"
    )

# =====================
# 5. Plot the training curves
# =====================
plt.figure(figsize=(12, 5))

# (subplot position, history key suffix, legend suffix, axis/title name):
# the left panel shows loss, the right panel accuracy.
for position, suffix, short_name, long_name in (
    (1, "loss", "Loss", "Loss"),
    (2, "acc", "Acc", "Accuracy"),
):
    plt.subplot(1, 2, position)
    plt.plot(history[f"train_{suffix}"], label=f"Train {short_name}")
    plt.plot(history[f"val_{suffix}"], label=f"Val {short_name}")
    plt.xlabel("Epoch")
    plt.ylabel(long_name)
    plt.title(f"Training & Validation {long_name}")
    plt.legend()

plt.show()

# =====================
# 6. Test-set results
# =====================
# evaluate() returns (accuracy, loss), in that order.
test_acc, test_loss = evaluate(loader=test_loader, criterion=criterion)
print(f"Final Test Loss={test_loss:.4f}, Test Acc={test_acc:.4f}")


In [None]:
import torch

# ===== GloVe loader =====
def load_glove_vocab(path, embed_dim):
    """Collect the words of a GloVe text file whose vectors have the expected width.

    Args:
        path: Path to a UTF-8 text file with one ``word v1 ... vN`` entry per line.
        embed_dim: Expected number of vector components. Lines with a
            different count (blank lines, header lines, malformed rows) are
            skipped.

    Returns:
        dict: Maps every accepted word to 1; intended for membership tests.
    """
    glove = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            # One word plus embed_dim values; a blank line yields an empty
            # list and must be rejected before parts[0] is touched.
            if len(parts) != embed_dim + 1:
                continue
            glove[parts[0]] = 1
    print(f"GloVe 文件中共有 {len(glove)} 个词")
    return glove

# ===== Check out-of-vocabulary (OOV) coverage =====
glove_vocab = load_glove_vocab(GLOVE_PATH, EMBED_DIM)

# Count vocabulary entries that have a GloVe vector; the rest are OOV.
found = sum(1 for word in vocab if word in glove_vocab)
not_found = len(vocab) - found

print(f"词表大小: {len(vocab)}")
print(f"在 GloVe 中找到的词: {found}")
print(f"没找到 (OOV) 的词: {not_found}")
print(f"OOV率: {not_found / len(vocab):.2%}")


In [None]:
from collections import Counter
# Label frequencies in the training frame, and the accuracy obtained by
# always predicting the most frequent label.
counts = Counter(train_df["label"])
print("label counts:", counts)
label_frequencies = counts.values()
most_common_acc = max(label_frequencies) / sum(label_frequencies)
print("Majority class accuracy (baseline):", most_common_acc)


In [None]:
from collections import Counter

# Collect the model's predicted class for every validation sample and show
# how often each class is predicted.
model.eval()
all_preds = []
with torch.no_grad():
    for inputs, _labels in valid_loader:
        logits = model(inputs.to(DEVICE))
        all_preds.extend(logits.argmax(1).cpu().numpy())

print("Predicted label distribution:", Counter(all_preds))


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Bag-of-words + logistic-regression baseline.
# Split the raw texts first so the vectorizer's vocabulary is learned from the
# training portion only; fitting it on all rows would let validation text
# influence the features (data leakage).
y = train_df["label"]
text_train, text_valid, yt, yv = train_test_split(
    train_df["text"], y, test_size=0.2, random_state=42
)

cv = CountVectorizer(max_features=5000)
Xt = cv.fit_transform(text_train)
Xv = cv.transform(text_valid)

lr = LogisticRegression(max_iter=2000).fit(Xt, yt)
print("BoW Logistic acc:", lr.score(Xv, yv))


In [None]:
# -------------- Use BERT, pre-trained on a large text corpus --------------
# Observation: it does outperform all three of the models implemented earlier.
import torch
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# ==============
# Parameters
# ==============
TRAIN_PATH = "/kaggle/input/train-tsv/new_train.tsv"
TEST_PATH = "/kaggle/input/test-tsv/new_test.tsv"
MAX_LEN = 100
BATCH_SIZE = 32
NUM_EPOCHS = 3
LEARNING_RATE = 1e-5
NUM_CLASSES = 5

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# ==============
# 1. Load data
# ==============
train_df, test_df = (
    pd.read_csv(path, sep="\t", names=["text", "label"])
    for path in (TRAIN_PATH, TEST_PATH)
)

# Train / validation split: 80% / 20%, seeded
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_df["text"],
    train_df["label"],
    test_size=0.2,
    random_state=42,
)

# ==============
# 2. 数据集类
# ==============
class BertTextDataset(Dataset):
    """Dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Iterable of raw texts; each item is converted with ``str()``.
        labels: Iterable of labels, one per text; each is converted with ``int()``.
        tokenizer: Callable with the Hugging Face tokenizer call signature.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If *texts* and *labels* differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=MAX_LEN):
        # Materialise as lists so items are addressed by position, whatever
        # index the incoming sequence carried.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels differ in length: {len(self.texts)} != {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors."""
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Drop only the leading batch axis added by return_tensors="pt"; a
        # bare squeeze() would also remove the sequence axis when max_len == 1.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# ==============
# 3. Model & DataLoaders
# ==============
# Tokenizer and model weights are both loaded from the same local directory.
_BERT_DIR = "/kaggle/input/bert-test"
tokenizer = BertTokenizer.from_pretrained(_BERT_DIR)
model = BertForSequenceClassification.from_pretrained(_BERT_DIR, num_labels=NUM_CLASSES)
model = model.to(DEVICE)

train_dataset = BertTextDataset(train_texts, train_labels, tokenizer)
valid_dataset = BertTextDataset(valid_texts, valid_labels, tokenizer)
test_dataset = BertTextDataset(test_df["text"], test_df["label"], tokenizer)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# ==============
# 4. Training function
# ==============
def train_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over *loader* and return the mean per-batch loss."""
    model.train()
    running_loss = 0
    for batch in tqdm(loader, desc="Training"):
        # Move the three tensors the model consumes onto the target device.
        moved = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        }

        optimizer.zero_grad()
        outputs = model(
            moved["input_ids"],
            attention_mask=moved["attention_mask"],
            labels=moved["labels"],
        )
        batch_loss = outputs.loss
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(loader)

# ==============
# 5. Evaluation function
# ==============
def eval_epoch(model, loader, device):
    """Evaluate *model* on *loader*; return ``(mean per-batch loss, accuracy)``."""
    model.eval()
    predicted, expected = [], []
    running_loss = 0
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            moved = {
                key: batch[key].to(device)
                for key in ("input_ids", "attention_mask", "labels")
            }

            outputs = model(
                moved["input_ids"],
                attention_mask=moved["attention_mask"],
                labels=moved["labels"],
            )

            running_loss += outputs.loss.item()
            # The class with the largest logit is the prediction.
            predicted.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            expected.extend(moved["labels"].cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    return running_loss / len(loader), accuracy

# ==============
# 6. Main training loop
# ==============
for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    train_loss = train_epoch(model, train_loader, optimizer, DEVICE)
    val_loss, val_acc = eval_epoch(model, valid_loader, DEVICE)

    # Report both splits for this epoch.
    print(
        f"Train Loss: {train_loss:.4f}\n"
        f"Val   Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}"
    )

# ==============
# 7. Final test
# ==============
# eval_epoch() returns (loss, accuracy), in that order.
test_loss, test_acc = eval_epoch(model=model, loader=test_loader, device=DEVICE)
print(f"\nFinal Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")
