# 第8章: ニューラルネット

第7章で取り組んだポジネガ分類を題材として、ニューラルネットワークで分類モデルを実装する。なお、この章ではPyTorchやTensorFlow、JAXなどの深層学習フレームワークを活用せよ。

## 70. 単語埋め込みの読み込み
事前学習済み単語埋め込みを活用し、$|V| \times d_{emb}$ の単語埋め込み行列$\pmb{E}$を作成せよ。ここで、$|V|$は単語埋め込みの語彙数、$d_{emb}$は単語埋め込みの次元数である。ただし、単語埋め込み行列の先頭の行ベクトル$\pmb{E}_{0,:}$は、将来的にパディング（`<PAD>`）トークンの埋め込みベクトルとして用いたいので、ゼロベクトルとして予約せよ。ゆえに、$\pmb{E}$の2行目以降に事前学習済み単語埋め込みを読み込むことになる。

もし、Google Newsデータセットの[学習済み単語ベクトル](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing)（300万単語・フレーズ、300次元）を全て読み込んだ場合、$|V|=3000001, d_{emb}=300$になるはずである（ただ、300万単語の中には、殆ど用いられない稀な単語も含まれるので、語彙を削減した方がメモリの節約になる）。

また、単語埋め込み行列の構築と同時に、単語埋め込み行列の各行のインデックス番号（トークンID）と、単語（トークン）への双方向の対応付けを保持せよ。

In [1]:
from gensim.models import KeyedVectors
import numpy as np

model = KeyedVectors.load_word2vec_format(
    "../第6章：単語ベクトル/GoogleNews-vectors-negative300.bin", binary=True
)

vocab_size = len(model.key_to_index)
embedding_dim = model.vector_size

# 単語埋め込み行列を作成（語彙数+1 × 次元数）
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))

# 単語からIDへの辞書とIDから単語への辞書を作成
word_to_id = {"<PAD>": 0}  # パディングトークンのIDは0
id_to_word = {0: "<PAD>"}


for i, word in enumerate(model.key_to_index, start=1):
    embedding_matrix[i] = model[word]
    word_to_id[word] = i
    id_to_word[i] = word

print(f"単語埋め込み行列の形状: {embedding_matrix.shape}")
print(f"語彙数: {len(word_to_id)}")
print(f"埋め込み次元数: {embedding_matrix.shape[1]}")

print("\n最初の5単語:")
for i in range(1, 6):
    word = id_to_word[i]
    print(f"ID: {i}, 単語: {word}, ベクトル: {embedding_matrix[i][:5]}...")

単語埋め込み行列の形状: (3000001, 300)
語彙数: 3000001
埋め込み次元数: 300

最初の5単語:
ID: 1, 単語: </s>, ベクトル: [ 0.00112915 -0.00089645  0.00031853  0.00153351  0.00110626]...
ID: 2, 単語: in, ベクトル: [0.0703125  0.08691406 0.08789062 0.0625     0.06933594]...
ID: 3, 単語: for, ベクトル: [-0.01177979 -0.04736328  0.04467773  0.06347656 -0.01818848]...
ID: 4, 単語: that, ベクトル: [-0.01574707 -0.02832031  0.08349609  0.05029297 -0.11035156]...
ID: 5, 単語: is, ベクトル: [ 0.00704956 -0.07324219  0.171875    0.02258301 -0.1328125 ]...


## 71. データセットの読み込み

[General Language Understanding Evaluation (GLUE)](https://gluebenchmark.com/) ベンチマークで配布されている[Stanford Sentiment Treebank (SST)](https://dl.fbaipublicfiles.com/glue/data/SST-2.zip) をダウンロードし、訓練セット（train.tsv）と開発セット（dev.tsv）のテキストと極性ラベルと読み込み、全てのテキストをトークンID列に変換せよ。このとき、単語埋め込みの語彙でカバーされていない単語は無視し、トークン列に含めないことにせよ。また、テキストの全トークンが単語埋め込みの語彙に含まれておらず、空のトークン列となってしまう事例は、訓練セットおよび開発セットから削除せよ（このため、第7章の実験で得られた正解率と比較できなくなることに注意せよ）。

事例の表現方法は任意でよいが、例えば"contains no wit , only labored gags"がネガティブに分類される事例は、次のような辞書オブジェクトで表現すればよい。

```
{'text': 'contains no wit , only labored gags',
 'label': tensor([0.]),
 'input_ids': tensor([ 3475,    87, 15888,    90, 27695, 42637])}
```

この例では、`text`はテキスト、`label`は分類ラベル（ポジティブなら`tensor([1.])`、ネガティブなら`tensor([0.])`）、`input_ids`はテキストのトークン列をID列で表現している。

In [None]:
import pandas as pd
import torch


def load_data(df, text_col_name, label_col_name):
    dict_list = []
    for text, label in zip(df[text_col_name], df[label_col_name]):
        input_ids = [word_to_id[token] for token in text.split() if token in word_to_id]
        if len(input_ids) > 0:
            dict = {
                "text": text,
                "label": torch.tensor([float(label)]),
                "input_ids": torch.tensor(input_ids),
            }
            dict_list.append(dict)
    return dict_list


df_train = pd.read_csv("../第7章：機械学習/SST-2/train.tsv", sep="\t")
df_dev = pd.read_csv("../第7章：機械学習/SST-2/dev.tsv", sep="\t")
train_data = load_data(df_train, "sentence", "label")
dev_data = load_data(df_dev, "sentence", "label")

print(f"train: {len(train_data)}, dev: {len(dev_data)}")
train_data[1]

train: 66650, dev: 872


{'text': 'contains no wit , only labored gags ',
 'label': tensor([0.]),
 'input_ids': tensor([ 3475,    87, 15888,    90, 27695, 42637])}

## 72. Bag of wordsモデルの構築

単語埋め込みの平均ベクトルでテキストの特徴ベクトルを表現し、重みベクトルとの内積でポジティブ及びネガティブを分類するニューラルネットワーク（ロジスティック回帰モデル）を設計せよ。

In [3]:
import torch
import torch.nn as nn


class BoWClassifier(nn.Module):
    def __init__(self, embedding_matrix):
        super().__init__()

        num_embeddings, embedding_dim = embedding_matrix.shape

        # 通常の nn.Embedding はランダムなベクトルから始まるが、from_pretrainedを使うことで、すでにWord2Vecから作ったベクトルを使う
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=True,  # 訓練中にこのベクトルを更新しない
        )

        self.linear = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids):
        """
        input_ids : 形状 (batch, seq_len) または (seq_len, )
        """
        # 単語ID → 単語ベクトル
        embeds = self.embedding(input_ids)

        # 単語ベクトルの平均
        if len(embeds.shape) == 2:
            mean_embeds = embeds.mean(dim=0)  # 1文のとき  embeds.shape = (seq_len, embedding_dim)
        else:
            mean_embeds = embeds.mean(dim=1)  # ミニバッチの場合  embeds.shape = (batch, seq_len, embedding_dim)

        # 線形変換（ロジスティック回帰）
        logits = self.linear(mean_embeds)  # スカラー

        return logits  # このまま loss 関数に渡せる

## 73. モデルの学習

問題72で設計したモデルの重みベクトルを訓練セット上で学習せよ。ただし、学習中は単語埋め込み行列の値を固定せよ（単語埋め込み行列のファインチューニングは行わない）。また、学習時に損失値を表示するなど、学習の進捗状況をモニタリングできるようにせよ。

In [None]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm


def load_data(df, text_col_name, label_col_name):
    dict_list = []
    for text, label in zip(df[text_col_name], df[label_col_name]):
        input_ids = [word_to_id[token] for token in text.split() if token in word_to_id]
        if len(input_ids) > 0:
            dict = {
                "text": text,
                "label": torch.tensor([float(label)]),
                "input_ids": torch.tensor(input_ids),
            }
            dict_list.append(dict)
    return dict_list


class BoWClassifier(nn.Module):
    def __init__(self, embedding_matrix):
        super().__init__()

        num_embeddings, embedding_dim = embedding_matrix.shape

        # 通常の nn.Embedding はランダムなベクトルから始まるが、from_pretrainedを使うことで、すでにWord2Vecから作ったベクトルを使う
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=True,  # 訓練中にこのベクトルを更新しない
        )

        self.linear = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids):
        """
        input_ids : 形状 (batch, seq_len) または (seq_len, )
        """
        # 単語ID → 単語ベクトル
        embeds = self.embedding(input_ids)

        # 単語ベクトルの平均
        if len(embeds.shape) == 2:
            mean_embeds = embeds.mean(dim=0)  # 1文のとき  embeds.shape = (seq_len, embedding_dim)
        else:
            mean_embeds = embeds.mean(dim=1)  # ミニバッチの場合  embeds.shape = (batch, seq_len, embedding_dim)

        # 線形変換（ロジスティック回帰）
        logits = self.linear(mean_embeds)  # スカラー

        return logits  # このまま loss 関数に渡せる


# 学習関数（非バッチ処理）
def train_model(model, train_data, dev_data, criterion, optimizer, device):
    train_losses = []
    model.train()
    for sample in tqdm(train_data, desc="Training", leave=True):
        input_ids = sample["input_ids"].unsqueeze(0).to(device)  # (1, seq_len)
        label = sample["label"].to(device)  # (1, )

        optimizer.zero_grad()  # 勾配初期化
        logits = model(input_ids)  # 順伝播
        loss = criterion(logits.view(-1), label)  # 損失計算
        loss.backward()  # 逆伝播
        optimizer.step()  # パラメータ更新
        train_losses.append(loss.item())

    dev_losses = []
    model.eval()
    with torch.no_grad():
        for sample in tqdm(dev_data, desc="Validation", leave=True):
            input_ids = sample["input_ids"].unsqueeze(0).to(device)
            label = sample["label"].to(device)

            logits = model(input_ids)
            loss = criterion(logits.view(-1), label)
            dev_losses.append(loss.item())

    train_loss = np.mean(train_losses)
    dev_loss = np.mean(dev_losses)

    return train_loss, dev_loss



# 単語埋め込みの読み込み
model = KeyedVectors.load_word2vec_format("../第6章：単語ベクトル/GoogleNews-vectors-negative300.bin", binary=True)
vocab_size = len(model.key_to_index)
embedding_dim = model.vector_size
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
word_to_id = {"<PAD>": 0} 
id_to_word = {0: "<PAD>"}

for i, word in enumerate(model.key_to_index, start=1):
    embedding_matrix[i] = model[word]
    word_to_id[word] = i
    id_to_word[i] = word

# データセット読み込み
df_train = pd.read_csv("../第7章：機械学習/SST-2/train.tsv", sep="\t")
df_dev = pd.read_csv("../第7章：機械学習/SST-2/dev.tsv", sep="\t")
train_data = load_data(df_train, "sentence", "label")
dev_data = load_data(df_dev, "sentence", "label")

# ハイパーパラメータ・デバイス設定
epochs = 10
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# モデル・損失関数・最適化手法の定義
model = BoWClassifier(embedding_matrix).to(device)
criterion = nn.BCEWithLogitsLoss()
parameters = model.linear.parameters()  # 単語埋め込みはfreeze=Trueなので、線形層だけが学習対象
optimizer = optim.Adam(parameters, lr=learning_rate)

# 学習
for epoch in range(epochs):
    print(f"epoch{epoch+1}")
    train_loss, dev_loss = train_model(model, train_data, dev_data, criterion, optimizer, device)
    print(f"---> Train loss: {train_loss:.4f}, Dev loss: {dev_loss:.4f}\n")

# モデル保存
save_path = "model/bow_classifier_73.pth"
torch.save(model.state_dict(), save_path)
print(f"モデルを保存しました: {save_path}")

epoch1


Training: 100%|██████████| 66650/66650 [01:17<00:00, 857.75it/s] 
Validation: 100%|██████████| 872/872 [00:00<00:00, 5911.04it/s]


---> Train loss: 0.4026, Dev loss: 0.4591

epoch2


Training: 100%|██████████| 66650/66650 [01:24<00:00, 790.28it/s] 
Validation: 100%|██████████| 872/872 [00:00<00:00, 6246.14it/s]


---> Train loss: 0.3726, Dev loss: 0.4528

epoch3


Training: 100%|██████████| 66650/66650 [01:19<00:00, 843.64it/s] 
Validation: 100%|██████████| 872/872 [00:00<00:00, 5527.34it/s]


---> Train loss: 0.3701, Dev loss: 0.4511

epoch4


Training: 100%|██████████| 66650/66650 [01:18<00:00, 845.70it/s] 
Validation: 100%|██████████| 872/872 [00:00<00:00, 5414.50it/s]


---> Train loss: 0.3692, Dev loss: 0.4505

epoch5


Training: 100%|██████████| 66650/66650 [01:14<00:00, 900.48it/s] 
Validation: 100%|██████████| 872/872 [00:00<00:00, 5723.03it/s]


---> Train loss: 0.3688, Dev loss: 0.4502

epoch6


Training: 100%|██████████| 66650/66650 [01:19<00:00, 838.22it/s] 
Validation: 100%|██████████| 872/872 [00:00<00:00, 6063.48it/s]


---> Train loss: 0.3686, Dev loss: 0.4500

epoch7


Training: 100%|██████████| 66650/66650 [01:18<00:00, 853.56it/s] 
Validation: 100%|██████████| 872/872 [00:00<00:00, 5699.00it/s]


---> Train loss: 0.3685, Dev loss: 0.4499

epoch8


Training: 100%|██████████| 66650/66650 [01:19<00:00, 840.79it/s] 
Validation: 100%|██████████| 872/872 [00:00<00:00, 6120.65it/s]


---> Train loss: 0.3684, Dev loss: 0.4498

epoch9


Training: 100%|██████████| 66650/66650 [01:11<00:00, 936.39it/s] 
Validation: 100%|██████████| 872/872 [00:00<00:00, 5635.83it/s]


---> Train loss: 0.3684, Dev loss: 0.4497

epoch10


Training: 100%|██████████| 66650/66650 [01:16<00:00, 869.55it/s] 
Validation: 100%|██████████| 872/872 [00:00<00:00, 6151.57it/s]


---> Train loss: 0.3683, Dev loss: 0.4497

モデルを保存しました: model/bow_classifier_73.pth


## 74. モデルの評価

問題73で学習したモデルの開発セットにおける正解率を求めよ。

In [None]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score


def load_data(df, text_col_name, label_col_name):
    dict_list = []
    for text, label in zip(df[text_col_name], df[label_col_name]):
        input_ids = [word_to_id[token] for token in text.split() if token in word_to_id]
        if len(input_ids) > 0:
            dict = {
                "text": text,
                "label": torch.tensor([float(label)]),
                "input_ids": torch.tensor(input_ids),
            }
            dict_list.append(dict)
    return dict_list


class BoWClassifier(nn.Module):
    def __init__(self, embedding_matrix):
        super().__init__()

        num_embeddings, embedding_dim = embedding_matrix.shape

        # 通常の nn.Embedding はランダムなベクトルから始まるが、from_pretrainedを使うことで、すでにWord2Vecから作ったベクトルを使う
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=True,  # 訓練中にこのベクトルを更新しない
        )

        self.linear = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids):
        """
        input_ids : 形状 (batch, seq_len) または (seq_len, )
        """
        # 単語ID → 単語ベクトル
        embeds = self.embedding(input_ids)

        # 単語ベクトルの平均
        if len(embeds.shape) == 2:
            mean_embeds = embeds.mean(dim=0)  # 1文のとき  embeds.shape = (seq_len, embedding_dim)
        else:
            mean_embeds = embeds.mean(dim=1)  # ミニバッチの場合  embeds.shape = (batch, seq_len, embedding_dim)

        # 線形変換（ロジスティック回帰）
        logits = self.linear(mean_embeds)  # スカラー

        return logits  # このまま loss 関数に渡せる


def eval_model(model, eval_data, device):
    pred_labels = []
    gold_labels = []
    model.eval()
    with torch.no_grad():
        for sample in eval_data:
            input_ids = sample["input_ids"].unsqueeze(0).to(device)
            label = sample["label"].to(device)
            logits = model(input_ids)
            prob = torch.sigmoid(logits)
            pred = (prob >= 0.5).float()
            pred_labels.append(pred.item())
            gold_labels.append(label.item())

    return accuracy_score(gold_labels, pred_labels)



# 単語埋め込みの読み込み
model = KeyedVectors.load_word2vec_format("../第6章：単語ベクトル/GoogleNews-vectors-negative300.bin", binary=True)
vocab_size = len(model.key_to_index)
embedding_dim = model.vector_size
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
word_to_id = {"<PAD>": 0} 
id_to_word = {0: "<PAD>"}

for i, word in enumerate(model.key_to_index, start=1):
    embedding_matrix[i] = model[word]
    word_to_id[word] = i
    id_to_word[i] = word

# データセット読み込み
df_train = pd.read_csv("../第7章：機械学習/SST-2/train.tsv", sep="\t")
df_dev = pd.read_csv("../第7章：機械学習/SST-2/dev.tsv", sep="\t")
train_data = load_data(df_train, "sentence", "label")
dev_data = load_data(df_dev, "sentence", "label")

# モデルの読み込み
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BoWClassifier(embedding_matrix)
model.load_state_dict(torch.load("model/bow_classifier_73.pth"))
model = model.to(device)

# 評価
accuracy = eval_model(model, dev_data, device)
print(f"開発セットにおける正解率：{accuracy:.4f}")

開発セットにおける正解率：0.7982


## 75. パディング

複数の事例が与えられたとき、これらをまとめて一つのテンソル・オブジェクトで表現する関数`collate`を実装せよ。与えられた複数の事例のトークン列の長さが異なるときは、トークン列の長さが最も長いものに揃え、0番のトークンIDでパディングをせよ。さらに、トークン列の長さが長いものから順に、事例を並び替えよ。

例えば、訓練データセットの冒頭の4事例が次のように表されているとき、

```
[{'text': 'hide new secretions from the parental units',
  'label': tensor([0.]),
  'input_ids': tensor([  5785,     66, 113845,     18,     12,  15095,   1594])},
 {'text': 'contains no wit , only labored gags',
  'label': tensor([0.]),
  'input_ids': tensor([ 3475,    87, 15888,    90, 27695, 42637])},
 {'text': 'that loves its characters and communicates something rather beautiful about human nature',
  'label': tensor([1.]),
  'input_ids': tensor([    4,  5053,    45,  3305, 31647,   348,   904,  2815,    47,  1276,  1964])},
 {'text': 'remains utterly satisfied to remain the same throughout',
  'label': tensor([0.]),
  'input_ids': tensor([  987, 14528,  4941,   873,    12,   208,   898])}]
```

`collate`関数を通した結果は以下のようになることが想定される。

```
{'input_ids': tensor([
    [     4,   5053,     45,   3305,  31647,    348,    904,   2815,     47,   1276,   1964],
    [  5785,     66, 113845,     18,     12,  15095,   1594,      0,      0,      0,      0],
    [   987,  14528,   4941,    873,     12,    208,    898,      0,      0,      0,      0],
    [  3475,     87,  15888,     90,  27695,  42637,      0,      0,      0,      0,      0]]),
 'label': tensor([
    [1.],
    [0.],
    [0.],
    [0.]])}
```


In [None]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader


def load_data(df, text_col_name, label_col_name):
    dict_list = []
    for text, label in zip(df[text_col_name], df[label_col_name]):
        input_ids = [word_to_id[token] for token in text.split() if token in word_to_id]
        if len(input_ids) > 0:
            dict = {
                "text": text,
                "label": torch.tensor([float(label)]),
                "input_ids": torch.tensor(input_ids),
            }
            dict_list.append(dict)
    return dict_list


def collate_fn(batch):
    # 各サンプルのinput_idsの長さを取得
    lengths = [len(sample["input_ids"]) for sample in batch]

    # 長さの降順でソートし、lengthsにおけるインデックスを記録
    sorted_indices = np.argsort(lengths)[::-1]

    sorted_batch = [batch[i] for i in sorted_indices]

    input_ids_list = [sample["input_ids"] for sample in sorted_batch]
    labels = [sample["label"] for sample in sorted_batch]

    # 最長の長さ
    max_len = max(lengths)

    # パディング：すべてのシーケンスをmax_lenに揃える（PAD=0）
    padded_input_ids = []
    for ids in input_ids_list:
        pad_len = max_len - len(ids)
        padded = torch.cat([ids, torch.zeros(pad_len, dtype=torch.long)])
        padded_input_ids.append(padded)

    # Tensorにまとめる
    input_ids_tensor = torch.stack(padded_input_ids)  # (batch, max_len)
    labels_tensor = torch.stack(labels)  # (batch, 1)

    return {"input_ids": input_ids_tensor, "label": labels_tensor}



# 単語埋め込みの読み込み
model = KeyedVectors.load_word2vec_format("../第6章：単語ベクトル/GoogleNews-vectors-negative300.bin", binary=True)
vocab_size = len(model.key_to_index)
embedding_dim = model.vector_size
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
word_to_id = {"<PAD>": 0} 
id_to_word = {0: "<PAD>"}

for i, word in enumerate(model.key_to_index, start=1):
    embedding_matrix[i] = model[word]
    word_to_id[word] = i
    id_to_word[i] = word
    
# データセット読み込み
df_train = pd.read_csv("../第7章：機械学習/SST-2/train.tsv", sep="\t")
train_data = load_data(df_train, "sentence", "label")

# DataLoaderの作成
train_loader = DataLoader(train_data, batch_size=4, shuffle=True, collate_fn=collate_fn)

# バッチを1つ取得
for batch in train_loader:
    print(batch)
    break

{'input_ids': tensor([[    12,  23441,    156, 337301,  19596,   1814,    412,  23017],
        [    12,    693,      5,    133,    254,    336,   1194,      0],
        [ 17848,  35389,    638,      0,      0,      0,      0,      0],
        [ 12063,      0,      0,      0,      0,      0,      0,      0]]), 'label': tensor([[0.],
        [0.],
        [1.],
        [1.]])}


## 76. ミニバッチ学習

問題75のパディングの処理を活用して、ミニバッチでモデルを学習せよ。また、学習したモデルの開発セットにおける正解率を求めよ。

In [None]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score


def load_data(df, text_col_name, label_col_name):
    dict_list = []
    for text, label in zip(df[text_col_name], df[label_col_name]):
        input_ids = [word_to_id[token] for token in text.split() if token in word_to_id]
        if len(input_ids) > 0:
            dict = {
                "text": text,
                "label": torch.tensor([float(label)]),
                "input_ids": torch.tensor(input_ids),
            }
            dict_list.append(dict)
    return dict_list


class BoWClassifier(nn.Module):
    def __init__(self, embedding_matrix):
        super().__init__()

        num_embeddings, embedding_dim = embedding_matrix.shape

        # 通常の nn.Embedding はランダムなベクトルから始まるが、from_pretrainedを使うことで、すでにWord2Vecから作ったベクトルを使う
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=True,  # 訓練中にこのベクトルを更新しない
        )

        self.linear = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids):
        """
        input_ids : 形状 (batch, seq_len) または (seq_len, )
        """
        # 単語ID → 単語ベクトル
        embeds = self.embedding(input_ids)

        # 単語ベクトルの平均
        if len(embeds.shape) == 2:
            mean_embeds = embeds.mean(dim=0)  # 1文のとき  embeds.shape = (seq_len, embedding_dim)
        else:
            mean_embeds = embeds.mean(dim=1)  # ミニバッチの場合  embeds.shape = (batch, seq_len, embedding_dim)

        # 線形変換（ロジスティック回帰）
        logits = self.linear(mean_embeds)  # スカラー

        return logits  # このまま loss 関数に渡せる


def collate_fn(batch):
    # 各サンプルのinput_idsの長さを取得
    lengths = [len(sample["input_ids"]) for sample in batch]

    # 長さの降順でソートし、lengthsにおけるインデックスを記録
    sorted_indices = np.argsort(lengths)[::-1]

    sorted_batch = [batch[i] for i in sorted_indices]

    input_ids_list = [sample["input_ids"] for sample in sorted_batch]
    labels = [sample["label"] for sample in sorted_batch]

    # 最長の長さ
    max_len = max(lengths)

    # パディング：すべてのシーケンスをmax_lenに揃える（PAD=0）
    padded_input_ids = []
    for ids in input_ids_list:
        pad_len = max_len - len(ids)
        padded = torch.cat([ids, torch.zeros(pad_len, dtype=torch.long)])
        padded_input_ids.append(padded)

    # Tensorにまとめる
    input_ids_tensor = torch.stack(padded_input_ids)  # (batch, max_len)
    labels_tensor = torch.stack(labels)  # (batch, 1)

    return {"input_ids": input_ids_tensor, "label": labels_tensor}


# 学習関数
def train_batch_model(model, train_loader, dev_loader, criterion, optimizer, device):
    model.train()
    train_losses = []

    for batch in tqdm(train_loader, desc="Training", leave=True):
        input_ids = batch["input_ids"].to(device)  # (batch, seq_len)
        labels = batch["label"].to(device)  # (batch, 1)

        optimizer.zero_grad()
        logits = model(input_ids)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    model.eval()
    dev_losses = []
    with torch.no_grad():
        for batch in tqdm(dev_loader, desc="Validation", leave=True):
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids)
            loss = criterion(logits, labels)
            dev_losses.append(loss.item())

    return np.mean(train_losses), np.mean(dev_losses)


# 評価関数
def eval_batch_model(model, data_loader, device):
    model.eval()
    pred_labels, gold_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)
            logits = model(input_ids)

            prob = torch.sigmoid(logits)
            pred = (prob >= 0.5).float()

            pred_labels.extend(pred.cpu().squeeze(1).tolist())
            gold_labels.extend(labels.cpu().squeeze(1).tolist())

    return accuracy_score(gold_labels, pred_labels)




# 単語埋め込みの読み込み
model = KeyedVectors.load_word2vec_format("../第6章：単語ベクトル/GoogleNews-vectors-negative300.bin", binary=True)
vocab_size = len(model.key_to_index)
embedding_dim = model.vector_size
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
word_to_id = {"<PAD>": 0} 
id_to_word = {0: "<PAD>"}

for i, word in enumerate(model.key_to_index, start=1):
    embedding_matrix[i] = model[word]
    word_to_id[word] = i
    id_to_word[i] = word

# データセット読み込み
df_train = pd.read_csv("../第7章：機械学習/SST-2/train.tsv", sep="\t")
df_dev = pd.read_csv("../第7章：機械学習/SST-2/dev.tsv", sep="\t")
train_data = load_data(df_train, "sentence", "label")
dev_data = load_data(df_dev, "sentence", "label")

# ハイパーパラメータと初期設定
batch_size = 64
epochs = 50
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# モデル・損失関数・最適化手法
model = BoWClassifier(embedding_matrix).to(device)
criterion = nn.BCEWithLogitsLoss()
parameters = model.linear.parameters()  # 単語埋め込みはfreeze=Trueなので、線形層だけが学習対象
optimizer = optim.Adam(parameters, lr=learning_rate)

# DataLoaderの作成
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# 学習
for epoch in range(epochs):
    print(f"epoch{epoch+1}")
    train_loss, dev_loss = train_batch_model(
        model, train_loader, dev_loader, criterion, optimizer, device
    )

    train_acc = eval_batch_model(model, train_loader, device)
    dev_acc = eval_batch_model(model, dev_loader, device)

    print(
        f"---> Train loss: {train_loss:.4f}, Train acc: {train_acc:.4f},\n"
        f"---> Dev   loss: {dev_loss:.4f}, Dev   acc: {dev_acc:.4f}\n"
    )

# モデル保存
save_path = "model/bow_classifier_76.pth"
torch.save(model.state_dict(), save_path)
print(f"\nモデルを保存しました: {save_path}")

# 評価
model.load_state_dict(torch.load(save_path))
model = model.to(device)
accuracy = eval_batch_model(model, dev_loader, device)
print(f"開発セットにおける正解率：{accuracy:.4f}")

epoch1


Training: 100%|██████████| 1042/1042 [00:01<00:00, 591.87it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 617.86it/s]


---> Train loss: 0.6446, Train acc: 0.6928,
---> Dev   loss: 0.6107, Dev   acc: 0.7018

epoch2


Training: 100%|██████████| 1042/1042 [00:01<00:00, 598.65it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1177.61it/s]


---> Train loss: 0.5821, Train acc: 0.7562,
---> Dev   loss: 0.5642, Dev   acc: 0.7592

epoch3


Training: 100%|██████████| 1042/1042 [00:01<00:00, 575.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1179.26it/s]


---> Train loss: 0.5422, Train acc: 0.7894,
---> Dev   loss: 0.5354, Dev   acc: 0.7741

epoch4


Training: 100%|██████████| 1042/1042 [00:01<00:00, 545.74it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 603.01it/s]


---> Train loss: 0.5129, Train acc: 0.8061,
---> Dev   loss: 0.5175, Dev   acc: 0.7764

epoch5


Training: 100%|██████████| 1042/1042 [00:01<00:00, 590.63it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1125.02it/s]


---> Train loss: 0.4916, Train acc: 0.8173,
---> Dev   loss: 0.5034, Dev   acc: 0.7764

epoch6


Training: 100%|██████████| 1042/1042 [00:01<00:00, 591.97it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1116.57it/s]


---> Train loss: 0.4754, Train acc: 0.8212,
---> Dev   loss: 0.4966, Dev   acc: 0.7775

epoch7


Training: 100%|██████████| 1042/1042 [00:01<00:00, 584.67it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1145.74it/s]


---> Train loss: 0.4631, Train acc: 0.8266,
---> Dev   loss: 0.4887, Dev   acc: 0.7833

epoch8


Training: 100%|██████████| 1042/1042 [00:01<00:00, 599.65it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1228.30it/s]


---> Train loss: 0.4529, Train acc: 0.8283,
---> Dev   loss: 0.4859, Dev   acc: 0.7798

epoch9


Training: 100%|██████████| 1042/1042 [00:01<00:00, 580.45it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1228.33it/s]


---> Train loss: 0.4449, Train acc: 0.8305,
---> Dev   loss: 0.4836, Dev   acc: 0.7821

epoch10


Training: 100%|██████████| 1042/1042 [00:01<00:00, 578.19it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1235.90it/s]


---> Train loss: 0.4389, Train acc: 0.8307,
---> Dev   loss: 0.4838, Dev   acc: 0.7844

epoch11


Training: 100%|██████████| 1042/1042 [00:01<00:00, 574.59it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1205.43it/s]


---> Train loss: 0.4335, Train acc: 0.8330,
---> Dev   loss: 0.4807, Dev   acc: 0.7890

epoch12


Training: 100%|██████████| 1042/1042 [00:01<00:00, 559.51it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1201.39it/s]


---> Train loss: 0.4290, Train acc: 0.8340,
---> Dev   loss: 0.4795, Dev   acc: 0.7856

epoch13


Training: 100%|██████████| 1042/1042 [00:01<00:00, 588.61it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 712.26it/s]


---> Train loss: 0.4250, Train acc: 0.8345,
---> Dev   loss: 0.4803, Dev   acc: 0.7856

epoch14


Training: 100%|██████████| 1042/1042 [00:01<00:00, 568.19it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 531.37it/s]


---> Train loss: 0.4215, Train acc: 0.8354,
---> Dev   loss: 0.4805, Dev   acc: 0.7844

epoch15


Training: 100%|██████████| 1042/1042 [00:01<00:00, 578.52it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1184.81it/s]


---> Train loss: 0.4194, Train acc: 0.8362,
---> Dev   loss: 0.4783, Dev   acc: 0.7856

epoch16


Training: 100%|██████████| 1042/1042 [00:01<00:00, 578.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 501.66it/s]


---> Train loss: 0.4158, Train acc: 0.8363,
---> Dev   loss: 0.4795, Dev   acc: 0.7878

epoch17


Training: 100%|██████████| 1042/1042 [00:01<00:00, 572.95it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 551.50it/s]


---> Train loss: 0.4147, Train acc: 0.8376,
---> Dev   loss: 0.4797, Dev   acc: 0.7878

epoch18


Training: 100%|██████████| 1042/1042 [00:01<00:00, 586.48it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1178.06it/s]


---> Train loss: 0.4128, Train acc: 0.8378,
---> Dev   loss: 0.4803, Dev   acc: 0.7878

epoch19


Training: 100%|██████████| 1042/1042 [00:01<00:00, 601.82it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1192.51it/s]


---> Train loss: 0.4105, Train acc: 0.8383,
---> Dev   loss: 0.4801, Dev   acc: 0.7890

epoch20


Training: 100%|██████████| 1042/1042 [00:01<00:00, 595.02it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1231.86it/s]


---> Train loss: 0.4092, Train acc: 0.8388,
---> Dev   loss: 0.4805, Dev   acc: 0.7878

epoch21


Training: 100%|██████████| 1042/1042 [00:01<00:00, 611.08it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1206.30it/s]


---> Train loss: 0.4073, Train acc: 0.8390,
---> Dev   loss: 0.4795, Dev   acc: 0.7890

epoch22


Training: 100%|██████████| 1042/1042 [00:01<00:00, 584.48it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1043.95it/s]


---> Train loss: 0.4066, Train acc: 0.8393,
---> Dev   loss: 0.4800, Dev   acc: 0.7890

epoch23


Training: 100%|██████████| 1042/1042 [00:01<00:00, 585.45it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 547.78it/s]


---> Train loss: 0.4048, Train acc: 0.8400,
---> Dev   loss: 0.4801, Dev   acc: 0.7890

epoch24


Training: 100%|██████████| 1042/1042 [00:01<00:00, 588.19it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1211.75it/s]


---> Train loss: 0.4031, Train acc: 0.8404,
---> Dev   loss: 0.4808, Dev   acc: 0.7901

epoch25


Training: 100%|██████████| 1042/1042 [00:01<00:00, 592.73it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1198.37it/s]


---> Train loss: 0.4031, Train acc: 0.8405,
---> Dev   loss: 0.4800, Dev   acc: 0.7924

epoch26


Training: 100%|██████████| 1042/1042 [00:01<00:00, 605.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 568.52it/s]


---> Train loss: 0.4018, Train acc: 0.8403,
---> Dev   loss: 0.4800, Dev   acc: 0.7924

epoch27


Training: 100%|██████████| 1042/1042 [00:01<00:00, 582.98it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 927.83it/s]


---> Train loss: 0.4006, Train acc: 0.8404,
---> Dev   loss: 0.4831, Dev   acc: 0.7913

epoch28


Training: 100%|██████████| 1042/1042 [00:01<00:00, 677.03it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1190.02it/s]


---> Train loss: 0.4007, Train acc: 0.8412,
---> Dev   loss: 0.4813, Dev   acc: 0.7901

epoch29


Training: 100%|██████████| 1042/1042 [00:01<00:00, 605.49it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1225.38it/s]


---> Train loss: 0.3995, Train acc: 0.8421,
---> Dev   loss: 0.4813, Dev   acc: 0.7901

epoch30


Training: 100%|██████████| 1042/1042 [00:01<00:00, 576.38it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1183.69it/s]


---> Train loss: 0.3978, Train acc: 0.8408,
---> Dev   loss: 0.4829, Dev   acc: 0.7936

epoch31


Training: 100%|██████████| 1042/1042 [00:01<00:00, 599.43it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1170.78it/s]


---> Train loss: 0.3973, Train acc: 0.8419,
---> Dev   loss: 0.4823, Dev   acc: 0.7913

epoch32


Training: 100%|██████████| 1042/1042 [00:01<00:00, 615.44it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 593.15it/s]


---> Train loss: 0.3973, Train acc: 0.8421,
---> Dev   loss: 0.4830, Dev   acc: 0.7936

epoch33


Training: 100%|██████████| 1042/1042 [00:01<00:00, 590.19it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 635.80it/s]


---> Train loss: 0.3963, Train acc: 0.8421,
---> Dev   loss: 0.4844, Dev   acc: 0.7924

epoch34


Training: 100%|██████████| 1042/1042 [00:01<00:00, 575.33it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1201.76it/s]


---> Train loss: 0.3961, Train acc: 0.8426,
---> Dev   loss: 0.4835, Dev   acc: 0.7936

epoch35


Training: 100%|██████████| 1042/1042 [00:01<00:00, 598.29it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 808.77it/s]


---> Train loss: 0.3959, Train acc: 0.8425,
---> Dev   loss: 0.4836, Dev   acc: 0.7936

epoch36


Training: 100%|██████████| 1042/1042 [00:01<00:00, 587.63it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1205.38it/s]


---> Train loss: 0.3949, Train acc: 0.8424,
---> Dev   loss: 0.4846, Dev   acc: 0.7924

epoch37


Training: 100%|██████████| 1042/1042 [00:01<00:00, 598.37it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1198.10it/s]


---> Train loss: 0.3939, Train acc: 0.8433,
---> Dev   loss: 0.4831, Dev   acc: 0.7959

epoch38


Training: 100%|██████████| 1042/1042 [00:01<00:00, 644.08it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1198.23it/s]


---> Train loss: 0.3942, Train acc: 0.8426,
---> Dev   loss: 0.4855, Dev   acc: 0.7947

epoch39


Training: 100%|██████████| 1042/1042 [00:01<00:00, 607.41it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1141.75it/s]


---> Train loss: 0.3938, Train acc: 0.8432,
---> Dev   loss: 0.4842, Dev   acc: 0.7947

epoch40


Training: 100%|██████████| 1042/1042 [00:01<00:00, 750.94it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1124.56it/s]


---> Train loss: 0.3929, Train acc: 0.8434,
---> Dev   loss: 0.4837, Dev   acc: 0.7947

epoch41


Training: 100%|██████████| 1042/1042 [00:01<00:00, 585.22it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1173.70it/s]


---> Train loss: 0.3925, Train acc: 0.8427,
---> Dev   loss: 0.4857, Dev   acc: 0.7947

epoch42


Training: 100%|██████████| 1042/1042 [00:01<00:00, 583.78it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1016.73it/s]


---> Train loss: 0.3919, Train acc: 0.8429,
---> Dev   loss: 0.4866, Dev   acc: 0.7959

epoch43


Training: 100%|██████████| 1042/1042 [00:01<00:00, 657.93it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1046.22it/s]


---> Train loss: 0.3922, Train acc: 0.8436,
---> Dev   loss: 0.4847, Dev   acc: 0.7982

epoch44


Training: 100%|██████████| 1042/1042 [00:01<00:00, 586.34it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1190.91it/s]


---> Train loss: 0.3922, Train acc: 0.8441,
---> Dev   loss: 0.4851, Dev   acc: 0.7993

epoch45


Training: 100%|██████████| 1042/1042 [00:01<00:00, 578.44it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1207.99it/s]


---> Train loss: 0.3918, Train acc: 0.8435,
---> Dev   loss: 0.4854, Dev   acc: 0.7993

epoch46


Training: 100%|██████████| 1042/1042 [00:01<00:00, 583.97it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 949.90it/s]


---> Train loss: 0.3910, Train acc: 0.8438,
---> Dev   loss: 0.4854, Dev   acc: 0.7993

epoch47


Training: 100%|██████████| 1042/1042 [00:01<00:00, 595.61it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 647.28it/s]


---> Train loss: 0.3909, Train acc: 0.8442,
---> Dev   loss: 0.4844, Dev   acc: 0.7970

epoch48


Training: 100%|██████████| 1042/1042 [00:01<00:00, 587.35it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 872.50it/s]


---> Train loss: 0.3902, Train acc: 0.8428,
---> Dev   loss: 0.4877, Dev   acc: 0.8016

epoch49


Training: 100%|██████████| 1042/1042 [00:01<00:00, 594.28it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 979.14it/s]


---> Train loss: 0.3896, Train acc: 0.8438,
---> Dev   loss: 0.4864, Dev   acc: 0.8005

epoch50


Training: 100%|██████████| 1042/1042 [00:01<00:00, 578.27it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1211.58it/s]


---> Train loss: 0.3896, Train acc: 0.8438,
---> Dev   loss: 0.4860, Dev   acc: 0.8016


モデルを保存しました: model/bow_classifier_76.pth
開発セットにおける正解率：0.8016


## 77. GPU上での学習

問題76のモデル学習をGPU上で実行せよ。また、学習したモデルの開発セットにおける正解率を求めよ。

In [None]:
# 問題76と同じ

## 78. 単語埋め込みのファインチューニング

問題77の学習において、単語埋め込みのパラメータも同時に更新するファインチューニングを導入せよ。また、学習したモデルの開発セットにおける正解率を求めよ。

In [None]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score


def load_data(df, text_col_name, label_col_name):
    dict_list = []
    for text, label in zip(df[text_col_name], df[label_col_name]):
        input_ids = [word_to_id[token] for token in text.split() if token in word_to_id]
        if len(input_ids) > 0:
            dict = {
                "text": text,
                "label": torch.tensor([float(label)]),
                "input_ids": torch.tensor(input_ids),
            }
            dict_list.append(dict)
    return dict_list


class BoWClassifier(nn.Module):
    def __init__(self, embedding_matrix):
        super().__init__()

        num_embeddings, embedding_dim = embedding_matrix.shape

        # 通常の nn.Embedding はランダムなベクトルから始まるが、from_pretrainedを使うことで、すでにWord2Vecから作ったベクトルを使う
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=False  # 単語埋め込みのパラメータも学習
        )

        self.linear = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids):
        """
        input_ids : 形状 (batch, seq_len) または (seq_len, )
        """
        # 単語ID → 単語ベクトル
        embeds = self.embedding(input_ids)

        # 単語ベクトルの平均
        if len(embeds.shape) == 2:
            mean_embeds = embeds.mean(dim=0)  # 1文のとき  embeds.shape = (seq_len, embedding_dim)
        else:
            mean_embeds = embeds.mean(dim=1)  # ミニバッチの場合  embeds.shape = (batch, seq_len, embedding_dim)

        # 線形変換（ロジスティック回帰）
        logits = self.linear(mean_embeds)  # スカラー

        return logits  # このまま loss 関数に渡せる
    

def collate_fn(batch):
    # 各サンプルのinput_idsの長さを取得
    lengths = [len(sample["input_ids"]) for sample in batch]

    # 長さの降順でソートし、lengthsにおけるインデックスを記録
    sorted_indices = np.argsort(lengths)[::-1]

    sorted_batch = [batch[i] for i in sorted_indices]

    input_ids_list = [sample["input_ids"] for sample in sorted_batch]
    labels = [sample["label"] for sample in sorted_batch]

    # 最長の長さ
    max_len = max(lengths)

    # パディング：すべてのシーケンスをmax_lenに揃える（PAD=0）
    padded_input_ids = []
    for ids in input_ids_list:
        pad_len = max_len - len(ids)
        padded = torch.cat([ids, torch.zeros(pad_len, dtype=torch.long)])
        padded_input_ids.append(padded)

    # Tensorにまとめる
    input_ids_tensor = torch.stack(padded_input_ids)  # (batch, max_len)
    labels_tensor = torch.stack(labels)  # (batch, 1)

    return {"input_ids": input_ids_tensor, "label": labels_tensor}


# 学習関数
def train_batch_model(model, train_loader, dev_loader, criterion, optimizer, device):
    model.train()
    train_losses = []

    for batch in tqdm(train_loader, desc="Training", leave=True):
        input_ids = batch["input_ids"].to(device)  # (batch, seq_len)
        labels = batch["label"].to(device)  # (batch, 1)

        optimizer.zero_grad()
        logits = model(input_ids)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    model.eval()
    dev_losses = []
    with torch.no_grad():
        for batch in tqdm(dev_loader, desc="Validation", leave=True):
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids)
            loss = criterion(logits, labels)
            dev_losses.append(loss.item())

    return np.mean(train_losses), np.mean(dev_losses)


# 評価関数
def eval_batch_model(model, data_loader, device):
    model.eval()
    pred_labels, gold_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)
            logits = model(input_ids)

            prob = torch.sigmoid(logits)
            pred = (prob >= 0.5).float()

            pred_labels.extend(pred.cpu().squeeze(1).tolist())
            gold_labels.extend(labels.cpu().squeeze(1).tolist())

    return accuracy_score(gold_labels, pred_labels)




# 単語埋め込みの読み込み
model = KeyedVectors.load_word2vec_format("../第6章：単語ベクトル/GoogleNews-vectors-negative300.bin", binary=True)
vocab_size = len(model.key_to_index)
embedding_dim = model.vector_size
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
word_to_id = {"<PAD>": 0} 
id_to_word = {0: "<PAD>"}

for i, word in enumerate(model.key_to_index, start=1):
    embedding_matrix[i] = model[word]
    word_to_id[word] = i
    id_to_word[i] = word

# データセット読み込み
df_train = pd.read_csv("../第7章：機械学習/SST-2/train.tsv", sep="\t")
df_dev = pd.read_csv("../第7章：機械学習/SST-2/dev.tsv", sep="\t")
train_data = load_data(df_train, "sentence", "label")
dev_data = load_data(df_dev, "sentence", "label")

# ハイパーパラメータと初期設定
batch_size = 64
epochs = 10
learning_rate = 1e-4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# モデル・損失関数・最適化手法
model = BoWClassifier(embedding_matrix).to(device)
criterion = nn.BCEWithLogitsLoss()
parameters=model.parameters()  # 単語埋め込みのパラメータも学習
optimizer = optim.Adam(parameters, lr=learning_rate)

# DataLoaderの作成
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# 学習
for epoch in range(epochs):
    print(f"epoch{epoch+1}")
    train_loss, dev_loss = train_batch_model(
        model, train_loader, dev_loader, criterion, optimizer, device
    )

    train_acc = eval_batch_model(model, train_loader, device)
    dev_acc = eval_batch_model(model, dev_loader, device)

    print(
        f"---> Train loss: {train_loss:.4f}, Train acc: {train_acc:.4f},\n"
        f"---> Dev   loss: {dev_loss:.4f}, Dev   acc: {dev_acc:.4f}\n"
    )

# モデル保存
save_path = "model/bow_classifier_76.pth"
torch.save(model.state_dict(), save_path)
print(f"\nモデルを保存しました: {save_path}")

# 評価
model.load_state_dict(torch.load(save_path))
model = model.to(device)
accuracy = eval_batch_model(model, dev_loader, device)
print(f"開発セットにおける正解率：{accuracy:.4f}")

epoch1


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.82it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1150.68it/s]


---> Train loss: 0.6744, Train acc: 0.6007,
---> Dev   loss: 0.6676, Dev   acc: 0.5734

epoch2


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.86it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1198.67it/s]


---> Train loss: 0.6298, Train acc: 0.7062,
---> Dev   loss: 0.6060, Dev   acc: 0.7133

epoch3


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.86it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1104.28it/s]


---> Train loss: 0.5599, Train acc: 0.7848,
---> Dev   loss: 0.5394, Dev   acc: 0.7557

epoch4


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.86it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1210.40it/s]


---> Train loss: 0.4901, Train acc: 0.8348,
---> Dev   loss: 0.4905, Dev   acc: 0.7867

epoch5


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.86it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1212.35it/s]


---> Train loss: 0.4353, Train acc: 0.8518,
---> Dev   loss: 0.4621, Dev   acc: 0.7936

epoch6


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.86it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1228.30it/s]


---> Train loss: 0.3900, Train acc: 0.8801,
---> Dev   loss: 0.4412, Dev   acc: 0.8050

epoch7


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.86it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1206.45it/s]


---> Train loss: 0.3566, Train acc: 0.8876,
---> Dev   loss: 0.4337, Dev   acc: 0.8131

epoch8


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.86it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1201.49it/s]


---> Train loss: 0.3300, Train acc: 0.8899,
---> Dev   loss: 0.4344, Dev   acc: 0.8108

epoch9


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.86it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1034.48it/s]


---> Train loss: 0.3073, Train acc: 0.8993,
---> Dev   loss: 0.4337, Dev   acc: 0.8142

epoch10


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.86it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1199.50it/s]


---> Train loss: 0.2886, Train acc: 0.9049,
---> Dev   loss: 0.4373, Dev   acc: 0.8119


モデルを保存しました: model/bow_classifier_76.pth
開発セットにおける正解率：0.8119


## 79. アーキテクチャの変更

ニューラルネットワークのアーキテクチャを自由に変更し、モデルを学習せよ。また、学習したモデルの開発セットにおける正解率を求めよ。例えば、テキストの特徴ベクトル（単語埋め込みの平均ベクトル）に対して多層のニューラルネットワークを通したり、畳み込みニューラルネットワーク（CNN; Convolutional Neural Network）や再帰型ニューラルネットワーク（RNN; Recurrent Neural Network）などのモデルの学習に挑戦するとよい。

In [1]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score


def load_data(df, text_col_name, label_col_name):
    dict_list = []
    for text, label in zip(df[text_col_name], df[label_col_name]):
        input_ids = [word_to_id[token] for token in text.split() if token in word_to_id]
        if len(input_ids) > 0:
            dict = {
                "text": text,
                "label": torch.tensor([float(label)]),
                "input_ids": torch.tensor(input_ids),
            }
            dict_list.append(dict)
    return dict_list

# 多層のニューラルネットワーク
class MLPClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=100, dropout=0.5):
        super().__init__()

        num_embeddings, embedding_dim = embedding_matrix.shape

        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=False  
        )

        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1)
        )


    def forward(self, input_ids):
        """
        input_ids : 形状 (batch, seq_len) または (seq_len, )
        """
        # 単語ID → 単語ベクトル
        embeds = self.embedding(input_ids)

        # 単語ベクトルの平均
        if len(embeds.shape) == 2:
            mean_embeds = embeds.mean(dim=0)  # 1文のとき  embeds.shape = (seq_len, embedding_dim)
        else:
            mean_embeds = embeds.mean(dim=1)  # ミニバッチの場合  embeds.shape = (batch, seq_len, embedding_dim)

        # 線形変換（ロジスティック回帰）
        logits = self.mlp(mean_embeds)  # スカラー

        return logits  # このまま loss 関数に渡せる
    

def collate_fn(batch):
    # 各サンプルのinput_idsの長さを取得
    lengths = [len(sample["input_ids"]) for sample in batch]

    # 長さの降順でソートし、lengthsにおけるインデックスを記録
    sorted_indices = np.argsort(lengths)[::-1]

    sorted_batch = [batch[i] for i in sorted_indices]

    input_ids_list = [sample["input_ids"] for sample in sorted_batch]
    labels = [sample["label"] for sample in sorted_batch]

    # 最長の長さ
    max_len = max(lengths)

    # パディング：すべてのシーケンスをmax_lenに揃える（PAD=0）
    padded_input_ids = []
    for ids in input_ids_list:
        pad_len = max_len - len(ids)
        padded = torch.cat([ids, torch.zeros(pad_len, dtype=torch.long)])
        padded_input_ids.append(padded)

    # Tensorにまとめる
    input_ids_tensor = torch.stack(padded_input_ids)  # (batch, max_len)
    labels_tensor = torch.stack(labels)  # (batch, 1)

    return {"input_ids": input_ids_tensor, "label": labels_tensor}


# 学習関数
def train_batch_model(model, train_loader, dev_loader, criterion, optimizer, device):
    model.train()
    train_losses = []

    for batch in tqdm(train_loader, desc="Training", leave=True):
        input_ids = batch["input_ids"].to(device)  # (batch, seq_len)
        labels = batch["label"].to(device)  # (batch, 1)

        optimizer.zero_grad()
        logits = model(input_ids)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    model.eval()
    dev_losses = []
    with torch.no_grad():
        for batch in tqdm(dev_loader, desc="Validation", leave=True):
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids)
            loss = criterion(logits, labels)
            dev_losses.append(loss.item())

    return np.mean(train_losses), np.mean(dev_losses)


# 評価関数
def eval_batch_model(model, data_loader, device):
    model.eval()
    pred_labels, gold_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)
            logits = model(input_ids)

            prob = torch.sigmoid(logits)
            pred = (prob >= 0.5).float()

            pred_labels.extend(pred.cpu().squeeze(1).tolist())
            gold_labels.extend(labels.cpu().squeeze(1).tolist())

    return accuracy_score(gold_labels, pred_labels)




# 単語埋め込みの読み込み
model = KeyedVectors.load_word2vec_format("../第6章：単語ベクトル/GoogleNews-vectors-negative300.bin", binary=True)
vocab_size = len(model.key_to_index)
embedding_dim = model.vector_size
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
word_to_id = {"<PAD>": 0} 
id_to_word = {0: "<PAD>"}

for i, word in enumerate(model.key_to_index, start=1):
    embedding_matrix[i] = model[word]
    word_to_id[word] = i
    id_to_word[i] = word

# データセット読み込み
df_train = pd.read_csv("../第7章：機械学習/SST-2/train.tsv", sep="\t")
df_dev = pd.read_csv("../第7章：機械学習/SST-2/dev.tsv", sep="\t")
train_data = load_data(df_train, "sentence", "label")
dev_data = load_data(df_dev, "sentence", "label")

# ハイパーパラメータと初期設定
batch_size = 64
epochs = 20
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# モデル・損失関数・最適化手法
model = MLPClassifier(embedding_matrix).to(device)
criterion = nn.BCEWithLogitsLoss()
parameters=model.parameters()  # 単語埋め込みのパラメータも学習
optimizer = optim.Adam(parameters, lr=learning_rate)

# DataLoaderの作成
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# 学習
for epoch in range(epochs):
    print(f"epoch{epoch+1}")
    train_loss, dev_loss = train_batch_model(
        model, train_loader, dev_loader, criterion, optimizer, device
    )

    train_acc = eval_batch_model(model, train_loader, device)
    dev_acc = eval_batch_model(model, dev_loader, device)

    print(
        f"---> Train loss: {train_loss:.4f}, Train acc: {train_acc:.4f},\n"
        f"---> Dev   loss: {dev_loss:.4f}, Dev   acc: {dev_acc:.4f}\n"
    )

# モデル保存
save_path = "model/bow_classifier_79_MLP.pth"
torch.save(model.state_dict(), save_path)
print(f"\nモデルを保存しました: {save_path}")

# 評価
model.load_state_dict(torch.load(save_path))
model = model.to(device)
accuracy = eval_batch_model(model, dev_loader, device)
print(f"開発セットにおける正解率：{accuracy:.4f}")

epoch1


Training: 100%|██████████| 1042/1042 [02:15<00:00,  7.71it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1053.03it/s]


---> Train loss: 0.6920, Train acc: 0.5585,
---> Dev   loss: 0.6899, Dev   acc: 0.5092

epoch2


Training: 100%|██████████| 1042/1042 [02:14<00:00,  7.75it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1033.08it/s]


---> Train loss: 0.6845, Train acc: 0.5651,
---> Dev   loss: 0.6846, Dev   acc: 0.5149

epoch3


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1100.48it/s]


---> Train loss: 0.6759, Train acc: 0.5829,
---> Dev   loss: 0.6770, Dev   acc: 0.5447

epoch4


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.84it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1096.06it/s]


---> Train loss: 0.6656, Train acc: 0.6052,
---> Dev   loss: 0.6666, Dev   acc: 0.5883

epoch5


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1076.98it/s]


---> Train loss: 0.6541, Train acc: 0.6247,
---> Dev   loss: 0.6538, Dev   acc: 0.6284

epoch6


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.84it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 984.12it/s]


---> Train loss: 0.6399, Train acc: 0.6498,
---> Dev   loss: 0.6378, Dev   acc: 0.6686

epoch7


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.84it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1106.36it/s]


---> Train loss: 0.6232, Train acc: 0.6834,
---> Dev   loss: 0.6189, Dev   acc: 0.7087

epoch8


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 433.46it/s]


---> Train loss: 0.6045, Train acc: 0.7047,
---> Dev   loss: 0.5989, Dev   acc: 0.7282

epoch9


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.84it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1105.78it/s]


---> Train loss: 0.5838, Train acc: 0.7384,
---> Dev   loss: 0.5775, Dev   acc: 0.7489

epoch10


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1105.24it/s]


---> Train loss: 0.5611, Train acc: 0.7653,
---> Dev   loss: 0.5560, Dev   acc: 0.7683

epoch11


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.84it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1048.09it/s]


---> Train loss: 0.5388, Train acc: 0.7964,
---> Dev   loss: 0.5350, Dev   acc: 0.7821

epoch12


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1111.98it/s]


---> Train loss: 0.5156, Train acc: 0.8148,
---> Dev   loss: 0.5159, Dev   acc: 0.7878

epoch13


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.84it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 695.71it/s]


---> Train loss: 0.4937, Train acc: 0.8333,
---> Dev   loss: 0.4985, Dev   acc: 0.7936

epoch14


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1025.11it/s]


---> Train loss: 0.4714, Train acc: 0.8452,
---> Dev   loss: 0.4834, Dev   acc: 0.8073

epoch15


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1132.48it/s]


---> Train loss: 0.4511, Train acc: 0.8543,
---> Dev   loss: 0.4704, Dev   acc: 0.8062

epoch16


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1078.11it/s]


---> Train loss: 0.4343, Train acc: 0.8600,
---> Dev   loss: 0.4603, Dev   acc: 0.8085

epoch17


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1114.02it/s]


---> Train loss: 0.4156, Train acc: 0.8711,
---> Dev   loss: 0.4497, Dev   acc: 0.8165

epoch18


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1119.70it/s]


---> Train loss: 0.4009, Train acc: 0.8688,
---> Dev   loss: 0.4454, Dev   acc: 0.8177

epoch19


Training: 100%|██████████| 1042/1042 [02:12<00:00,  7.84it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1111.81it/s]


---> Train loss: 0.3873, Train acc: 0.8775,
---> Dev   loss: 0.4377, Dev   acc: 0.8234

epoch20


Training: 100%|██████████| 1042/1042 [02:13<00:00,  7.83it/s]
Validation: 100%|██████████| 14/14 [00:00<00:00, 1097.76it/s]


---> Train loss: 0.3739, Train acc: 0.8780,
---> Dev   loss: 0.4351, Dev   acc: 0.8222


モデルを保存しました: model/bow_classifier_79_MLP.pth
開発セットにおける正解率：0.8222
