<a href="https://colab.research.google.com/github/GuraTom9/NLP100/blob/main/NLP100_80_89.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第9章　RNN, CNN

## 80. ID番号への変換

In [None]:
%cd "drive/MyDrive/NLP100"

/content/drive/MyDrive/NLP100


In [None]:
! head -15 train.txt

TITLE	CATEGORY
UPDATE 1-Sandwich chain Quiznos files for bankruptcy protection	b
Iraq concerns lift top-rated euro zone bonds but Fed limits gains	b
China Manufacturing Gauge Rises in Stabilization Sign: Economy	b
Crucifixion: A New Way to Think About Jesus' Death	e
Tracy Morgan still critical but 'doing better' following deadly crash which killed his  ...	e
FOREX-Dollar off to slow start in event-packed week	b
Alstom should be a good investment for France, says CEO Kron	b
Diabetes-Related Problems Have Decreased Over Last 20 Years	m
2 Cases Of Ebola Confirmed In Liberia	m
'You Are Not Pregnant. We're Pregnant!' Mila Kunis Tells Off Overeager Fathers  ...	e
Home > Justin Bieber > Justin Bieber Sends Selena Gomez $10k Flowers?	e
Data storage firm Box files for US IPO of about $250 million	b
Cocaine use in U.S. cut in HALF while marijuana use jumps 30 per cent	m
UPDATE 1-HKMA intervenes as deals, China optimism spur Hong Kong dollar  ...	b


In [None]:
import re
from collections import Counter
import pickle

# Build the word -> ID dictionary from the training data
def create_dictionary(file_path):
    """Build a word-to-ID mapping from a tab-separated training file.

    The first line of the file is treated as a header and skipped. For every
    remaining line, the text before the first tab is lower-cased and split
    into runs of regex word characters. Words that occur at least twice get
    the IDs 1, 2, 3, ... in order of decreasing frequency (ties keep their
    first-seen order). ID 0 is never assigned by this function.

    Args:
        file_path: Path of the training file; it is read as UTF-8.

    Returns:
        dict mapping each frequent word (str) to its integer ID.
    """
    word_counts = Counter()
    # Read as UTF-8 explicitly so the vocabulary does not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line; tolerate an empty file
        for line in f:
            text = line.strip().split('\t')[0]
            word_counts.update(re.findall(r'\w+', text.lower()))

    # most_common() yields (word, count) sorted by count, descending, and the
    # sort is stable, so equally frequent words keep their first-seen order.
    dictionary = {}
    for word, count in word_counts.most_common():
        if count < 2:
            # Everything after this point is rarer still.
            break
        dictionary[word] = len(dictionary) + 1

    return dictionary

# Build the dictionary from the training data file
train_file_path = 'train.txt'
w2id_dict = create_dictionary(train_file_path)
print(w2id_dict)

# Save the dictionary with pickle so later cells can reload it
with open("w2id_dict.pkl", 'wb') as f:
    pickle.dump(w2id_dict, f)




In [None]:
# 単語列をID列に変換する関数
import string

def w2id(text, w2id_dict, UNK=0):
    """Convert a text into the list of IDs of its words.

    The text is lower-cased and split into runs of regex word characters,
    i.e. with exactly the tokenization ``create_dictionary`` uses to build
    the vocabulary, so every word that made it into the dictionary can also
    be found again at lookup time.

    Args:
        text: Input string.
        w2id_dict: Mapping from lower-cased word to integer ID.
        UNK: ID returned for words missing from ``w2id_dict`` (default 0).

    Returns:
        list of int, one ID per token, in order of appearance.
    """
    words = re.findall(r'\w+', text.lower())
    return [w2id_dict.get(word, UNK) for word in words]

# Example: convert one headline into its sequence of word IDs
text = "UPDATE 1-Sandwich chain Quiznos files for bankruptcy protection"
text_id = w2id(text, w2id_dict)
print(text_id)

[9, 13, 0, 1197, 0, 593, 7, 911, 2510]


## 81. RNNによる予測

In [None]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    """Elman RNN classifier over sequences of word IDs.

    Pipeline: embedding -> dropout (p=0.1) -> ``nn.RNN`` (tanh, batch first)
    -> linear layer applied to the RNN output at the last real token of each
    sequence.

    Args:
        dict_size: Number of rows of the embedding table.
        emb_size: Embedding dimension.
        hidden_size: RNN hidden dimension.
        output_size: Number of output classes.
        n_layers: Number of stacked RNN layers.
        padding_idx: ID used for padding (passed to ``nn.Embedding``).
    """

    def __init__(self, dict_size, emb_size, hidden_size, output_size, n_layers, padding_idx):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.padding_idx = padding_idx  # needed in forward() to locate the last real token
        self.embedding = nn.Embedding(dict_size, emb_size, padding_idx=padding_idx)
        self.dropout = nn.Dropout(p=0.1)
        self.rnn = nn.RNN(emb_size, hidden_size,  n_layers, nonlinearity='tanh', batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class logits of shape (batch, output_size) for ID tensor ``x`` of shape (batch, seq_len).

        Right-padded batches are handled: the logits are computed from the
        RNN output at the last non-padding position of every row instead of
        the last column, which for shorter rows would be a state reached
        after consuming padding. Note that when the padding ID is also used
        for unknown words, trailing unknown words are treated as padding.
        """
        batch_size, seq_len = x.shape
        embedded = self.dropout(self.embedding(x))
        # No explicit h_0: nn.RNN then starts from zeros on the input's own
        # device, so the model works on CPU and GPU alike.
        out, _ = self.rnn(embedded)
        if self.padding_idx is None:
            last_idx = torch.full((batch_size,), seq_len - 1, dtype=torch.long, device=x.device)
        else:
            # Index of the last non-padding token per row (0 for all-padding rows).
            positions = torch.arange(seq_len, device=x.device)
            last_idx = ((x != self.padding_idx) * positions).max(dim=1).values
        rows = torch.arange(batch_size, device=x.device)
        out = self.fc(out[rows, last_idx])
        return out

In [None]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Pairs each ID sequence with its label and serves both as int64 tensors."""

    def __init__(self, X, y):
        # X: indexable collection of ID sequences, y: indexable collection of labels.
        self.X, self.y = X, y

    def __len__(self):
        # The dataset size is defined by the number of input sequences.
        return len(self.X)

    def __getitem__(self, idx):
        # Tensors are created on access, one sample at a time.
        return (
            torch.tensor(self.X[idx], dtype=torch.int64),
            torch.tensor(self.y[idx], dtype=torch.int64),
        )

In [None]:
import numpy as np

def divide_data(file_name, w2id, vocab=None):
    """Read a tab-separated TITLE/CATEGORY file into ID sequences and labels.

    The first line is treated as a header and skipped; blank lines are
    ignored. For every remaining line, the first column is converted with
    ``w2id(text, vocab)`` and the second column is mapped to an integer
    label (b -> 0, t -> 1, e -> 2, m -> 3).

    Args:
        file_name: Path of the UTF-8 encoded data file.
        w2id: Callable ``(text, dictionary) -> list of int``.
        vocab: Word-to-ID dictionary handed to ``w2id``. When omitted, the
            notebook-level ``w2id_dict`` is used, as before.

    Returns:
        Tuple ``(X, y)``: list of ID lists and list of integer labels.

    Raises:
        KeyError: If a line carries a category outside b/t/e/m.
    """
    if vocab is None:
        # Backward-compatible default: the dictionary defined at notebook level.
        vocab = w2id_dict

    label_map = {'b': 0, 't': 1, 'e': 2, 'm': 3}
    X, y = [], []
    with open(file_name, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line
        for line in f:
            if not line.strip():
                # A stray blank line has no category column; skip it.
                continue
            fields = line.split('\t')
            X.append(w2id(fields[0], vocab))
            y.append(label_map[fields[1].strip()])

    return X, y

# Paths of the train / validation / test splits
train_file_path = 'train.txt'
valid_file_path = 'valid.txt'
test_file_path = 'test.txt'

# Convert every split into (ID sequences, integer labels)
X_train, y_train = divide_data(train_file_path, w2id)
X_valid, y_valid = divide_data(valid_file_path, w2id)
X_test, y_test = divide_data(test_file_path, w2id)
# Show the first ten training samples as a sanity check
print(X_train[:10])
print(y_train[:10])

[[9, 13, 0, 1197, 0, 593, 7, 911, 2510], [227, 568, 1076, 135, 2249, 33, 306, 198, 62, 41, 2250, 126], [27, 489, 2825, 100, 3, 0, 912, 147], [5651, 12, 19, 447, 1, 863, 39, 2826, 169], [2511, 490, 171, 1703, 62, 2036, 751, 864, 718, 812, 1026, 1269, 70], [63, 43, 60, 1, 1467, 594, 3, 2037, 4540, 88], [429, 378, 35, 12, 341, 813, 7, 430, 21, 69, 5652], [1270, 3788, 1574, 115, 4541, 29, 421, 491, 148], [16, 814, 5, 237, 1575, 3, 3231], [51, 44, 46, 342, 184, 508, 342, 690, 691, 752, 60, 0, 3789]]
[0, 0, 0, 2, 2, 0, 0, 3, 3, 2]


In [None]:
# Create the datasets
train_dataset = MyDataset(X_train, y_train)
valid_dataset = MyDataset(X_valid, y_valid)
test_dataset = MyDataset(X_test, y_test)
print(train_dataset[0])

(tensor([   9,   13,    0, 1197,    0,  593,    7,  911, 2510]), tensor(0))


In [None]:
# RNN settings
dict_size = len(w2id_dict) + 1  # +1: the dictionary's IDs start at 1, ID 0 needs a row too
emb_size = 256
hidden_size = 64
output_size = 4
n_layers  = 1
padding_idx = 0

# Create the model instance
model = RNN(dict_size, emb_size, hidden_size, output_size, n_layers, padding_idx)
print(model)

RNN(
  (embedding): Embedding(7647, 256, padding_idx=0)
  (dropout): Dropout(p=0.1, inplace=False)
  (rnn): RNN(256, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=4, bias=True)
)


In [None]:
# Class probabilities the model predicts for the first ten training samples
for i in range(10):
  X = train_dataset[i][0]
  # unsqueeze(0) adds the batch dimension the model expects
  print(torch.softmax(model(X.unsqueeze(0)), dim=-1))

tensor([[0.2617, 0.1955, 0.2329, 0.3099]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2742, 0.1428, 0.2713, 0.3116]], grad_fn=<SoftmaxBackward0>)
tensor([[0.4855, 0.1992, 0.1721, 0.1432]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1153, 0.2781, 0.2975, 0.3092]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2616, 0.1923, 0.2893, 0.2569]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2899, 0.2068, 0.2517, 0.2515]], grad_fn=<SoftmaxBackward0>)
tensor([[0.3666, 0.1737, 0.1682, 0.2914]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2697, 0.1686, 0.2415, 0.3202]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1379, 0.1636, 0.5344, 0.1641]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2753, 0.1644, 0.2909, 0.2693]], grad_fn=<SoftmaxBackward0>)


## 82. 確率的勾配降下法による学習

In [None]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    """Elman RNN classifier over sequences of word IDs.

    Pipeline: embedding -> dropout (p=0.1) -> ``nn.RNN`` (tanh, batch first)
    -> linear layer applied to the RNN output at the last real token of each
    sequence.

    Args:
        dict_size: Number of rows of the embedding table.
        emb_size: Embedding dimension.
        hidden_size: RNN hidden dimension.
        output_size: Number of output classes.
        n_layers: Number of stacked RNN layers.
        padding_idx: ID used for padding (passed to ``nn.Embedding``).
        device: Stored as ``self.device`` for backward compatibility; the
            forward pass follows the device of its input instead.
    """

    def __init__(self, dict_size, emb_size, hidden_size, output_size, n_layers, padding_idx, device):
        super().__init__()
        self.device = device
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.padding_idx = padding_idx  # needed in forward() to locate the last real token
        self.embedding = nn.Embedding(dict_size, emb_size, padding_idx=padding_idx)
        self.dropout = nn.Dropout(p=0.1)
        self.rnn = nn.RNN(emb_size, hidden_size,  n_layers, nonlinearity='tanh', batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class logits of shape (batch, output_size) for ID tensor ``x`` of shape (batch, seq_len).

        Right-padded batches are handled: the logits are computed from the
        RNN output at the last non-padding position of every row instead of
        the last column, which for shorter rows would be a state reached
        after consuming padding. Note that when the padding ID is also used
        for unknown words, trailing unknown words are treated as padding.
        """
        batch_size, seq_len = x.shape
        embedded = self.dropout(self.embedding(x))
        # No explicit h_0: nn.RNN then starts from zeros on the input's own
        # device, which cannot disagree with where the model actually lives.
        out, _ = self.rnn(embedded)
        if self.padding_idx is None:
            last_idx = torch.full((batch_size,), seq_len - 1, dtype=torch.long, device=x.device)
        else:
            # Index of the last non-padding token per row (0 for all-padding rows).
            positions = torch.arange(seq_len, device=x.device)
            last_idx = ((x != self.padding_idx) * positions).max(dim=1).values
        rows = torch.arange(batch_size, device=x.device)
        out = self.fc(out[rows, last_idx])
        return out

In [None]:
# Compute the accuracy of the model's predictions
def calc_acc(model, dataloader, device):
    """Return the fraction of samples in ``dataloader`` the model classifies correctly.

    The model is switched to eval mode and run without gradient tracking.
    Each batch is an ``(inputs, labels)`` pair; both are moved to ``device``
    and the predicted class is the argmax over the last output dimension.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            predicted = model(batch_inputs).argmax(dim=-1)
            n_seen += len(batch_inputs)
            n_correct += (predicted == batch_labels).sum().item()
    return n_correct / n_seen

# Compute the mean loss of the model's predictions
def calc_loss(model, dataloader, device, loss_fn=None):
    """Return the loss of ``model`` on ``dataloader``, averaged over batches.

    The model is switched to eval mode and run without gradient tracking.
    The per-batch loss values are summed and divided by the number of
    batches (``len(dataloader)``), not by the number of samples.

    Args:
        model: Module called as ``model(inputs)``.
        dataloader: Sized iterable of ``(inputs, labels)`` batches.
        device: Device both tensors are moved to.
        loss_fn: Callable ``(outputs, labels) -> scalar tensor``. When
            omitted, the notebook-level ``criterion`` is used, as before.

    Returns:
        float: mean of the per-batch losses.
    """
    if loss_fn is None:
        # Backward-compatible default: the loss defined at notebook level.
        loss_fn = criterion

    model.eval()
    total = 0.0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            total += loss_fn(model(inputs), labels).item()
    return total / len(dataloader)

In [None]:
# Train the model, printing loss and accuracy after every epoch
def train_model(model, train_loader, valid_loader, epochs, criterion, optimizer, device=None):
    """Train ``model`` for ``epochs`` epochs and log metrics after each one.

    After every epoch the loss and accuracy on both loaders are computed
    with ``calc_loss`` / ``calc_acc``, printed, and a checkpoint holding the
    epoch index and the model and optimizer state dicts is written to
    ``checkpoint{epoch + 1}.pt`` in the current directory.

    Note: ``criterion`` drives the parameter updates, while the logged
    losses come from ``calc_loss``, which evaluates with the notebook-level
    ``criterion`` object.

    Args:
        model: Module to train; it is moved to ``device`` first.
        train_loader: Iterable of ``(x, y)`` training batches.
        valid_loader: Iterable of ``(x, y)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        criterion: Loss callable ``(outputs, y) -> scalar tensor``.
        optimizer: Optimizer over the model's parameters.
        device: Device the model and every batch are moved to.
    """
    model.to(device)
    for epoch in range(epochs):
        model.train()  # training mode
        for x, y in train_loader:  # one mini-batch at a time
            optimizer.zero_grad()  # reset the gradients

            x = x.to(device)
            y = y.to(device)
            outputs = model(x)  # forward pass
            loss = criterion(outputs, y)  # compute the loss
            loss.backward()  # back-propagation
            optimizer.step()  # update the parameters

        model.eval()  # evaluation mode
        # Compute loss and accuracy on both splits
        acc_train = calc_acc(model, train_loader, device=device)
        loss_train = calc_loss(model, train_loader, device=device)
        acc_valid = calc_acc(model, valid_loader, device=device)
        loss_valid = calc_loss(model, valid_loader, device=device)

        # Print the log line
        print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f}')

        # Save a checkpoint
        torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, f'checkpoint{epoch + 1}.pt')


In [None]:
from torch.utils.data import DataLoader
from torch import optim

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# RNN settings
dict_size = len(w2id_dict) + 1
emb_size = 256
hidden_size = 64
output_size = 4
n_layers  = 1
padding_idx = 0

# Create the model instance
model = RNN(dict_size, emb_size, hidden_size, output_size, n_layers, padding_idx, device)
print(model)

# Training hyper-parameters, loss and optimizer
epochs = 5
lr = 0.01
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

# Create the data loaders (batch_size=1: one sample per update)
train_dataset = MyDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
valid_dataset = MyDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False)

# Train the model
train_model(model, train_loader, valid_loader, epochs, criterion, optimizer, device=device)

RNN(
  (embedding): Embedding(7647, 256, padding_idx=0)
  (dropout): Dropout(p=0.1, inplace=False)
  (rnn): RNN(256, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=4, bias=True)
)
epoch: 1, loss_train: 0.8597, accuracy_train: 0.7121, loss_valid: 0.9961, accuracy_valid: 0.6849
epoch: 2, loss_train: 0.7187, accuracy_train: 0.7539, loss_valid: 0.8589, accuracy_valid: 0.7216
epoch: 3, loss_train: 0.6754, accuracy_train: 0.7599, loss_valid: 0.8659, accuracy_valid: 0.7141
epoch: 4, loss_train: 0.5632, accuracy_train: 0.8004, loss_valid: 0.7556, accuracy_valid: 0.7470
epoch: 5, loss_train: 0.4715, accuracy_train: 0.8302, loss_valid: 0.6985, accuracy_valid: 0.7650


## 83. ミニバッチ化・GPU上での学習

In [None]:
from torch.nn.utils.rnn import pad_sequence

# Pad the sequences inside one mini-batch
def collate_fn(batch):
    """Collate ``(sequence, label)`` samples into padded batch tensors.

    Args:
        batch: List of ``(x, y)`` pairs, where ``x`` is a 1-D ID tensor and
            ``y`` is a scalar label (0-dim tensor or int).

    Returns:
        Tuple ``(x_padded, y_batch)``: the sequences right-padded with 0 to
        the longest length in the batch, shape (batch, max_len), and the
        labels as a 1-D int64 tensor.
    """
    # Split the samples into texts and labels
    x_batch, y_batch = zip(*batch)
    # Right-pad every sequence with ID 0 up to the longest one
    x_padded = pad_sequence(list(x_batch), batch_first=True, padding_value=0)
    # Build the label tensor explicitly instead of via the legacy
    # torch.LongTensor constructor
    y_padded = torch.tensor([int(label) for label in y_batch], dtype=torch.int64)

    return x_padded, y_padded

In [None]:
from torch.utils.data import DataLoader
from torch import optim

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# RNN settings
dict_size = len(w2id_dict) + 1
emb_size = 256
hidden_size = 64
output_size = 4
n_layers  = 1
padding_idx = 0

# Create the model instance
model = RNN(dict_size, emb_size, hidden_size, output_size, n_layers, padding_idx, device)
print(model)

# Training hyper-parameters, loss and optimizer
epochs = 10
batch_size = 64
lr = 0.1
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

# Create the data loaders; collate_fn pads each mini-batch
train_dataset = MyDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
valid_dataset = MyDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

# Train the model
train_model(model, train_loader, valid_loader, epochs, criterion, optimizer, device=device)

RNN(
  (embedding): Embedding(7647, 256, padding_idx=0)
  (dropout): Dropout(p=0.1, inplace=False)
  (rnn): RNN(256, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=4, bias=True)
)
epoch: 1, loss_train: 1.1537, accuracy_train: 0.4645, loss_valid: 1.1595, accuracy_valid: 0.4506
epoch: 2, loss_train: 1.1028, accuracy_train: 0.5343, loss_valid: 1.1247, accuracy_valid: 0.5195
epoch: 3, loss_train: 1.0386, accuracy_train: 0.6249, loss_valid: 1.0739, accuracy_valid: 0.5958
epoch: 4, loss_train: 1.0222, accuracy_train: 0.6303, loss_valid: 1.0440, accuracy_valid: 0.6160
epoch: 5, loss_train: 1.1724, accuracy_train: 0.5923, loss_valid: 1.2393, accuracy_valid: 0.5636
epoch: 6, loss_train: 1.0867, accuracy_train: 0.5949, loss_valid: 1.1062, accuracy_valid: 0.5689
epoch: 7, loss_train: 1.0787, accuracy_train: 0.5905, loss_valid: 1.0938, accuracy_valid: 0.5831
epoch: 8, loss_train: 1.1979, accuracy_train: 0.4015, loss_valid: 1.2027, accuracy_valid: 0.4019
epoch: 9, loss_train: 1.1

## 84. 単語ベクトルの導入

In [None]:
from gensim.models import KeyedVectors

# Load the pre-trained word2vec vectors (binary word2vec format)
w2v_model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
import pickle

# Load the dictionary. This pickle is the one written earlier by this
# notebook; never unpickle files from an untrusted source.
with open("w2id_dict.pkl", 'rb') as p:
    w2id_dict = pickle.load(p)
# Report the size only: the full dictionary has thousands of entries.
print(f'vocabulary size: {len(w2id_dict)}')

# Build the embedding matrix aligned with the word IDs
dict_size = len(set(w2id_dict.values())) + 1
emb_size = 300
weights = np.zeros((dict_size, emb_size))

# Row k must hold the vector of the word whose ID is k, because the model
# looks embeddings up by ID. The IDs start at 1, so row 0 (used for
# padding / unknown words) keeps its all-zero vector.
for word, word_id in w2id_dict.items():
    try:
        weights[word_id] = w2v_model[word]
    except KeyError:
        # Word missing from the pre-trained vectors: random initialisation
        weights[word_id] = np.random.normal(scale=0.4, size=(emb_size,))
weights = torch.from_numpy(weights.astype(np.float32))

print(weights.shape)

torch.Size([7647, 300])


In [None]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    """Elman RNN classifier over word-ID sequences, optionally with pre-trained embeddings.

    Pipeline: embedding -> dropout (p=0.1) -> ``nn.RNN`` (tanh, batch first)
    -> linear layer applied to the RNN output at the last real token of each
    sequence.

    Args:
        dict_size: Number of rows of the embedding table (ignored when
            ``emb_weights`` is given).
        emb_size: Embedding dimension; must match ``emb_weights`` if given.
        hidden_size: RNN hidden dimension.
        output_size: Number of output classes.
        n_layers: Number of stacked RNN layers.
        padding_idx: ID used for padding (passed to the embedding layer).
        device: Stored as ``self.device`` for backward compatibility; the
            forward pass follows the device of its input instead.
        emb_weights: Optional float tensor of pre-trained embeddings, loaded
            with ``nn.Embedding.from_pretrained``.
    """

    def __init__(self, dict_size, emb_size, hidden_size, output_size, n_layers, padding_idx, device, emb_weights=None):
        super().__init__()
        self.device = device
        self.n_layers = n_layers
        self.padding_idx = padding_idx  # needed in forward() to locate the last real token
        # Identity check: comparing a tensor with `!= None` relies on
        # tensor comparison semantics rather than testing for absence.
        if emb_weights is not None:
            self.embedding = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
        else:
            self.embedding = nn.Embedding(dict_size, emb_size, padding_idx=padding_idx)
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(p=0.1)
        self.rnn = nn.RNN(emb_size, hidden_size,  n_layers, nonlinearity='tanh', batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class logits of shape (batch, output_size) for ID tensor ``x`` of shape (batch, seq_len).

        Right-padded batches are handled: the logits are computed from the
        RNN output at the last non-padding position of every row instead of
        the last column, which for shorter rows would be a state reached
        after consuming padding. Note that when the padding ID is also used
        for unknown words, trailing unknown words are treated as padding.
        """
        batch_size, seq_len = x.shape
        embedded = self.dropout(self.embedding(x))
        # No explicit h_0: nn.RNN then starts from zeros on the input's own
        # device, which cannot disagree with where the model actually lives.
        out, _ = self.rnn(embedded)
        if self.padding_idx is None:
            last_idx = torch.full((batch_size,), seq_len - 1, dtype=torch.long, device=x.device)
        else:
            # Index of the last non-padding token per row (0 for all-padding rows).
            positions = torch.arange(seq_len, device=x.device)
            last_idx = ((x != self.padding_idx) * positions).max(dim=1).values
        rows = torch.arange(batch_size, device=x.device)
        out = self.fc(out[rows, last_idx])
        return out


In [None]:
from torch.utils.data import DataLoader
from torch import optim

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# RNN settings
dict_size = len(w2id_dict) + 1
emb_size = 300
hidden_size = 64
output_size = 4
n_layers  = 1
padding_idx = 0

# Create the model instance; `weights` is passed as the pre-trained embedding matrix
model = RNN(dict_size, emb_size, hidden_size, output_size, n_layers, padding_idx, device, weights)
print(model)

# Training hyper-parameters, loss and optimizer
epochs = 10
batch_size = 64
lr = 0.1
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

# Create the data loaders; collate_fn pads each mini-batch
train_dataset = MyDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
valid_dataset = MyDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

# Train the model
train_model(model, train_loader, valid_loader, epochs, criterion, optimizer, device=device)

RNN(
  (embedding): Embedding(7647, 300, padding_idx=0)
  (dropout): Dropout(p=0.1, inplace=False)
  (rnn): RNN(300, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=4, bias=True)
)
epoch: 1, loss_train: 1.1695, accuracy_train: 0.3960, loss_valid: 1.1740, accuracy_valid: 0.3952
epoch: 2, loss_train: 1.1581, accuracy_train: 0.4578, loss_valid: 1.1650, accuracy_valid: 0.4476
epoch: 3, loss_train: 1.1618, accuracy_train: 0.4657, loss_valid: 1.1697, accuracy_valid: 0.4513
epoch: 4, loss_train: 1.1569, accuracy_train: 0.4395, loss_valid: 1.1618, accuracy_valid: 0.4626
epoch: 5, loss_train: 1.1605, accuracy_train: 0.4625, loss_valid: 1.1722, accuracy_valid: 0.4484
epoch: 6, loss_train: 1.1656, accuracy_train: 0.4269, loss_valid: 1.1696, accuracy_valid: 0.4259
epoch: 7, loss_train: 1.2397, accuracy_train: 0.4476, loss_valid: 1.2568, accuracy_valid: 0.4454
epoch: 8, loss_train: 1.1190, accuracy_train: 0.5187, loss_valid: 1.1251, accuracy_valid: 0.5120
epoch: 9, loss_train: 1.1

## 85. 双方向RNN・多層化

In [None]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    """Multi-layer, optionally bidirectional Elman RNN classifier over word-ID sequences.

    Pipeline: embedding -> dropout (p=0.1) -> ``nn.RNN`` (tanh, batch first)
    -> linear layer applied to the RNN output at the last real token of each
    sequence.

    Args:
        dict_size: Number of rows of the embedding table (ignored when
            ``emb_weights`` is given).
        emb_size: Embedding dimension; must match ``emb_weights`` if given.
        hidden_size: RNN hidden dimension per direction.
        output_size: Number of output classes.
        n_layers: Number of stacked RNN layers.
        padding_idx: ID used for padding (passed to the embedding layer).
        device: Stored as ``self.device`` for backward compatibility; the
            forward pass follows the device of its input instead.
        emb_weights: Optional float tensor of pre-trained embeddings, loaded
            with ``nn.Embedding.from_pretrained``.
        bidirectional: Whether the RNN runs in both directions.
    """

    def __init__(self, dict_size, emb_size, hidden_size, output_size, n_layers, padding_idx, device, emb_weights=None, bidirectional=False):
        super().__init__()
        self.device = device
        self.n_layers = n_layers
        self.num_directions = bidirectional + 1  # 1 if unidirectional, 2 if bidirectional
        self.padding_idx = padding_idx  # needed in forward() to locate the last real token
        # Identity check: comparing a tensor with `!= None` relies on
        # tensor comparison semantics rather than testing for absence.
        if emb_weights is not None:
            self.embedding = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
        else:
            self.embedding = nn.Embedding(dict_size, emb_size, padding_idx=padding_idx)
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(p=0.1)
        self.rnn = nn.RNN(emb_size, hidden_size,  n_layers, nonlinearity='tanh', bidirectional=bidirectional, batch_first=True)
        self.fc = nn.Linear(hidden_size * self.num_directions, output_size)

    def forward(self, x):
        """Return class logits of shape (batch, output_size) for ID tensor ``x`` of shape (batch, seq_len).

        Right-padded batches are handled by packing the sequences, so
        neither direction of the RNN ever consumes padding; the logits are
        computed from the RNN output at the last non-padding position of
        every row. Note that when the padding ID is also used for unknown
        words, trailing unknown words are treated as padding.
        """
        # Local import keeps this cell independent of other cells' imports.
        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

        batch_size, seq_len = x.shape
        embedded = self.dropout(self.embedding(x))
        if self.padding_idx is None:
            # No padding ID to look for: every row is taken at full length.
            out, _ = self.rnn(embedded)
            return self.fc(out[:, -1, :])

        # Index of the last non-padding token per row (0 for all-padding rows).
        positions = torch.arange(seq_len, device=x.device)
        last_idx = ((x != self.padding_idx) * positions).max(dim=1).values
        # pack_padded_sequence expects the lengths on the CPU.
        packed = pack_padded_sequence(embedded, (last_idx + 1).cpu(), batch_first=True, enforce_sorted=False)
        # No explicit h_0: nn.RNN then starts from zeros on the input's device.
        packed_out, _ = self.rnn(packed)
        out, _ = pad_packed_sequence(packed_out, batch_first=True)
        rows = torch.arange(batch_size, device=x.device)
        out = self.fc(out[rows, last_idx])
        return out


In [None]:
from torch.utils.data import DataLoader
from torch import optim

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Bidirectional RNN settings
dict_size = len(w2id_dict) + 1
emb_size = 300
hidden_size = 64
output_size = 4
n_layers  = 2
padding_idx = 0

# Create the model instance (pre-trained embeddings, two layers, both directions)
model = RNN(dict_size, emb_size, hidden_size, output_size, n_layers, padding_idx, device, emb_weights=weights, bidirectional=True).to(device)
print(model)

# Training hyper-parameters, loss and optimizer
epochs = 10
batch_size = 64
lr = 0.1
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

# Create the data loaders; collate_fn pads each mini-batch
train_dataset = MyDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
valid_dataset = MyDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

# Train the model
train_model(model, train_loader, valid_loader, epochs, criterion, optimizer, device=device)

RNN(
  (embedding): Embedding(7647, 300, padding_idx=0)
  (dropout): Dropout(p=0.1, inplace=False)
  (rnn): RNN(300, 64, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=128, out_features=4, bias=True)
)
epoch: 1, loss_train: 1.1640, accuracy_train: 0.4554, loss_valid: 1.1726, accuracy_valid: 0.4431
epoch: 2, loss_train: 1.1517, accuracy_train: 0.4619, loss_valid: 1.1578, accuracy_valid: 0.4528
epoch: 3, loss_train: 1.1551, accuracy_train: 0.4671, loss_valid: 1.1620, accuracy_valid: 0.4499
epoch: 4, loss_train: 1.1489, accuracy_train: 0.4738, loss_valid: 1.1541, accuracy_valid: 0.4611
epoch: 5, loss_train: 1.0980, accuracy_train: 0.5451, loss_valid: 1.1092, accuracy_valid: 0.5382
epoch: 6, loss_train: 1.0515, accuracy_train: 0.6031, loss_valid: 1.0700, accuracy_valid: 0.5921
epoch: 7, loss_train: 1.0492, accuracy_train: 0.6277, loss_valid: 1.0658, accuracy_valid: 0.6033
epoch: 8, loss_train: 1.0074, accuracy_train: 0.6387, loss_valid: 1.0458, accuracy_vali

## 86. 畳み込みニューラルネットワーク（CNN）

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class CNN(nn.Module):
    """Single-filter-width convolutional text classifier over word-ID sequences.

    Pipeline: embedding -> 2-D convolution spanning the full embedding width
    -> ReLU -> max-pooling over the sequence axis -> dropout (p=0.3) ->
    linear layer.

    Args:
        dict_size: Number of rows of the embedding table (ignored when
            ``emb_weights`` is given).
        emb_size: Embedding dimension; also the width of the conv kernel.
        output_size: Number of output classes.
        padding_idx: ID used for padding (passed to the embedding layer).
        out_channels: Number of convolution filters.
        kernel_heights: Number of consecutive tokens each filter covers.
        stride: Convolution stride.
        padding: Zero-padding added on both ends of the sequence axis.
        emb_weights: Optional float tensor of pre-trained embeddings, loaded
            with ``nn.Embedding.from_pretrained``.
    """

    def __init__(self, dict_size, emb_size, output_size, padding_idx, out_channels, kernel_heights, stride, padding, emb_weights=None):
        super().__init__()
        # Identity check: comparing a tensor with `!= None` relies on
        # tensor comparison semantics rather than testing for absence.
        if emb_weights is not None:
            self.embedding = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
        else:
            self.embedding = nn.Embedding(dict_size, emb_size, padding_idx=padding_idx)
        self.conv = nn.Conv2d(1, out_channels, (kernel_heights, emb_size), stride, (padding, 0))
        self.dropout = nn.Dropout(p=0.3)
        self.fc = nn.Linear(out_channels, output_size)

    def forward(self, x):
        """Return class logits of shape (batch, output_size) for ID tensor ``x`` of shape (batch, seq_len)."""
        # Add a channel axis -> [batch_size, 1, seq_len, emb_size]
        embedded = self.embedding(x).unsqueeze(1)
        # The kernel spans the whole embedding width, so the last axis
        # collapses to 1 -> [batch_size, out_channels, conv_len, 1]
        conv = self.conv(embedded)
        feature = F.relu(conv.squeeze(-1))
        # Max over the whole sequence axis -> [batch_size, out_channels, 1]
        max_pool = F.max_pool1d(feature, feature.shape[2])
        out = self.fc(self.dropout(max_pool.squeeze(-1)))
        return out

In [None]:
# CNN settings
dict_size = len(w2id_dict) + 1
emb_size = 300
output_size = 4
out_channels = 100
kernel_heights = 3  # convolution filter size (in tokens)
stride = 1  # convolution stride (in tokens)
padding = 1 # convolution padding along the sequence axis
padding_idx = 0

# Create the model instance
model = CNN(dict_size, emb_size, output_size, padding_idx, out_channels, kernel_heights, stride, padding)
print(model)

CNN(
  (embedding): Embedding(7647, 256, padding_idx=0)
  (conv): Conv2d(1, 100, kernel_size=(3, 256), stride=(1, 1), padding=(1, 0))
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


In [None]:
# Class probabilities the model predicts for the first ten training samples
for i in range(10):
  X = train_dataset[i][0]
  # unsqueeze(0) adds the batch dimension the model expects
  print(torch.softmax(model(X.unsqueeze(0)), dim=-1))

tensor([[0.2821, 0.1205, 0.1181, 0.4793]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1680, 0.3406, 0.2967, 0.1947]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2419, 0.2168, 0.2553, 0.2860]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1859, 0.2017, 0.2016, 0.4108]], grad_fn=<SoftmaxBackward0>)
tensor([[0.4319, 0.1319, 0.1190, 0.3173]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2348, 0.2919, 0.2940, 0.1793]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2438, 0.3199, 0.1067, 0.3296]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1960, 0.2594, 0.2243, 0.3202]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2679, 0.1513, 0.2981, 0.2827]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1392, 0.2919, 0.3581, 0.2107]], grad_fn=<SoftmaxBackward0>)


## 87. 確率的勾配降下法によるCNNの学習

In [None]:
from torch.utils.data import DataLoader
from torch import optim

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CNN settings
dict_size = len(w2id_dict) + 1
emb_size = 300
output_size = 4
padding_idx = 0
out_channels = 100
kernel_heights = 3  # convolution filter size (in tokens)
stride = 1  # convolution stride (in tokens)
padding = 1  # convolution padding along the sequence axis

# Create the model instance
model = CNN(dict_size, emb_size, output_size, padding_idx, out_channels, kernel_heights, stride, padding)
print(model)

# Training hyper-parameters, loss and optimizer
epochs = 10
batch_size = 64
lr = 0.1
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

# Create the data loaders; collate_fn pads each mini-batch
train_dataset = MyDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
valid_dataset = MyDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

# Train the model
train_model(model, train_loader, valid_loader, epochs, criterion, optimizer, device=device)

CNN(
  (embedding): Embedding(7647, 300, padding_idx=0)
  (conv): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1), padding=(1, 0))
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=100, out_features=4, bias=True)
)
epoch: 1, loss_train: 0.4289, accuracy_train: 0.8542, loss_valid: 0.5728, accuracy_valid: 0.7949
epoch: 2, loss_train: 0.2250, accuracy_train: 0.9315, loss_valid: 0.4472, accuracy_valid: 0.8398
epoch: 3, loss_train: 0.1400, accuracy_train: 0.9591, loss_valid: 0.4307, accuracy_valid: 0.8443
epoch: 4, loss_train: 0.0834, accuracy_train: 0.9845, loss_valid: 0.3956, accuracy_valid: 0.8608
epoch: 5, loss_train: 0.0502, accuracy_train: 0.9933, loss_valid: 0.3772, accuracy_valid: 0.8690
epoch: 6, loss_train: 0.0349, accuracy_train: 0.9940, loss_valid: 0.4087, accuracy_valid: 0.8675
epoch: 7, loss_train: 0.0264, accuracy_train: 0.9970, loss_valid: 0.3966, accuracy_valid: 0.8690
epoch: 8, loss_train: 0.0210, accuracy_train: 0.9977, loss_valid: 0.3962, accuracy_

## 88. パラメータチューニング

In [None]:
from torch.utils.data import DataLoader
from torch import optim

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CNN settings
dict_size = len(w2id_dict) + 1
emb_size = 300
output_size = 4
padding_idx = 0
out_channels = 100
kernel_heights = 3  # convolution filter size (in tokens)
stride = 1  # convolution stride (in tokens)
padding = 1  # convolution padding along the sequence axis

# Create the model instance
model = CNN(dict_size, emb_size, output_size, padding_idx, out_channels, kernel_heights, stride, padding)
print(model)

# Training hyper-parameters, loss and optimizer (Adam instead of SGD)
epochs = 10
batch_size = 64
lr = 0.01
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Create the data loaders; collate_fn pads each mini-batch
train_dataset = MyDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
valid_dataset = MyDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

# Train the model
train_model(model, train_loader, valid_loader, epochs, criterion, optimizer, device=device)

CNN(
  (embedding): Embedding(7647, 300, padding_idx=0)
  (conv): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1), padding=(1, 0))
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=100, out_features=4, bias=True)
)
epoch: 1, loss_train: 0.1825, accuracy_train: 0.9384, loss_valid: 0.3963, accuracy_valid: 0.8743
epoch: 2, loss_train: 0.0716, accuracy_train: 0.9771, loss_valid: 0.4281, accuracy_valid: 0.8945
epoch: 3, loss_train: 0.2269, accuracy_train: 0.9460, loss_valid: 0.8958, accuracy_valid: 0.8683
epoch: 4, loss_train: 0.0634, accuracy_train: 0.9858, loss_valid: 0.8659, accuracy_valid: 0.9042
epoch: 5, loss_train: 0.0810, accuracy_train: 0.9861, loss_valid: 1.1076, accuracy_valid: 0.9042
epoch: 6, loss_train: 0.0744, accuracy_train: 0.9889, loss_valid: 1.6570, accuracy_valid: 0.8967
epoch: 7, loss_train: 0.1235, accuracy_train: 0.9870, loss_valid: 2.3830, accuracy_valid: 0.8930
epoch: 8, loss_train: 0.1304, accuracy_train: 0.9886, loss_valid: 2.7025, accuracy_

## 89. 事前学習済み言語モデルからの転移学習

In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m78.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
import transformers
from transformers import BertTokenizer, BertModel

In [None]:
! head -15 train.txt

TITLE	CATEGORY
UPDATE 1-Sandwich chain Quiznos files for bankruptcy protection	b
Iraq concerns lift top-rated euro zone bonds but Fed limits gains	b
China Manufacturing Gauge Rises in Stabilization Sign: Economy	b
Crucifixion: A New Way to Think About Jesus' Death	e
Tracy Morgan still critical but 'doing better' following deadly crash which killed his  ...	e
FOREX-Dollar off to slow start in event-packed week	b
Alstom should be a good investment for France, says CEO Kron	b
Diabetes-Related Problems Have Decreased Over Last 20 Years	m
2 Cases Of Ebola Confirmed In Liberia	m
'You Are Not Pregnant. We're Pregnant!' Mila Kunis Tells Off Overeager Fathers  ...	e
Home > Justin Bieber > Justin Bieber Sends Selena Gomez $10k Flowers?	e
Data storage firm Box files for US IPO of about $250 million	b
Cocaine use in U.S. cut in HALF while marijuana use jumps 30 per cent	m
UPDATE 1-HKMA intervenes as deals, China optimism spur Hong Kong dollar  ...	b


In [None]:
import pandas as pd

# Load the tab-separated train / validation / test splits
train = pd.read_csv('train.txt', sep='\t')
valid = pd.read_csv('valid.txt', sep='\t')
test = pd.read_csv('test.txt', sep='\t')
print(train.head())

                                               TITLE CATEGORY
0  UPDATE 1-Sandwich chain Quiznos files for bank...        b
1  Iraq concerns lift top-rated euro zone bonds b...        b
2  China Manufacturing Gauge Rises in Stabilizati...        b
3  Crucifixion: A New Way to Think About Jesus' D...        e
4  Tracy Morgan still critical but 'doing better'...        e


In [None]:
class NewsDataset(Dataset):
    """Dataset that tokenizes headlines on access and pairs them with label vectors.

    Args:
        X: Indexable collection of texts.
        y: Indexable collection of label vectors (one row per text).
        tokenizer: Object providing ``encode_plus``; its result must contain
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, X, y, tokenizer, max_len):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # The dataset size is defined by the number of label rows.
        return len(self.y)

    def __getitem__(self, index):
        """Return a dict with the keys 'ids', 'mask' (int64 tensors) and 'labels' (float tensor)."""
        text = self.X[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated pad_to_max_length=True
            # and for the implicit truncation the tokenizer warned about.
            padding='max_length',
            truncation=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.LongTensor(ids),
            'mask': torch.LongTensor(mask),
            'labels': torch.Tensor(self.y[index])
        }

In [None]:
# One-hot encode the category labels; the column order fixes the class order b, t, e, m
y_train = pd.get_dummies(train, columns=['CATEGORY'])[['CATEGORY_b', 'CATEGORY_t', 'CATEGORY_e', 'CATEGORY_m']].values
y_valid = pd.get_dummies(valid, columns=['CATEGORY'])[['CATEGORY_b', 'CATEGORY_t', 'CATEGORY_e', 'CATEGORY_m']].values
y_test = pd.get_dummies(test, columns=['CATEGORY'])[['CATEGORY_b', 'CATEGORY_t', 'CATEGORY_e', 'CATEGORY_m']].values
y_train[:10]

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 1, 0]], dtype=uint8)

In [None]:
# Create the datasets
max_len = 30
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = NewsDataset(train['TITLE'], y_train, tokenizer, max_len)
valid_dataset = NewsDataset(valid['TITLE'], y_valid, tokenizer, max_len)
test_dataset = NewsDataset(test['TITLE'], y_test, tokenizer, max_len)

# Show every field of the first training sample as a sanity check
for var in train_dataset[0]:
    print(f'{var}: {train_dataset[0][var]}')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


ids: tensor([  101, 10651,  1015,  1011, 11642,  4677, 19461, 15460,  6764,  2005,
        10528,  3860,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])
labels: tensor([1., 0., 0., 0.])




In [None]:
# Definition of the BERT-based classification model
class BERTClass(torch.nn.Module):
    """Pre-trained 'bert-base-uncased' encoder followed by dropout and a linear layer.

    Args:
        drop_rate: Dropout probability applied before the linear layer.
        otuput_size: Number of output units of the linear layer (the
            parameter name is spelled this way in the public signature).
    """

    def __init__(self, drop_rate, otuput_size):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = torch.nn.Dropout(drop_rate)
        # 768 input features, chosen to match the BERT output
        self.fc = torch.nn.Linear(768, otuput_size)

    def forward(self, ids, mask):
        """Return the linear layer's output for token IDs ``ids`` and attention mask ``mask``."""
        bert_outputs = self.bert(ids, attention_mask=mask, return_dict=False)
        # Only the second element of the returned pair is used
        # (presumably BERT's pooled output; confirm against the transformers docs).
        _, second_output = bert_outputs
        logits = self.fc(self.drop(second_output))
        return logits

In [None]:
# Compute loss and accuracy
def calc_loss_acc(model, criterion, loader, device):
    """Evaluate ``model`` over every batch of ``loader``.

    Args:
        model: Module invoked as ``model(ids, mask)``. It is switched to
            eval mode and left in that mode on return.
        criterion: Callable ``criterion(outputs, labels)`` returning a
            scalar tensor.
        loader: Sized iterable of batches, each a dict with 'ids', 'mask'
            and 'labels' tensors. The argmax of each label vector along
            the last axis is taken as the true class index.
        device: Device the batch tensors are moved to before the forward
            pass.

    Returns:
        A tuple ``(loss, accuracy)``: the mean of the per-batch losses and
        the fraction of samples whose predicted class matches the label.
    """
    model.eval()
    loss = 0.0
    total = 0
    correct = 0
    with torch.no_grad():
        for data in loader:
            # Move the batch to the target device
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            labels = data['labels'].to(device)

            # Everything below must run once per batch. When it sat outside
            # the loop only the final batch was scored, while the loss was
            # still divided by the total number of batches.

            # Forward pass
            outputs = model(ids, mask)

            # Accumulate the batch loss
            loss += criterion(outputs, labels).item()

            # Accumulate accuracy counts
            pred = torch.argmax(outputs, dim=-1)  # predicted class per sample
            gold = torch.argmax(labels, dim=-1)   # true class per sample
            total += len(gold)
            correct += (pred == gold).sum().item()

    return loss / len(loader), correct / total

In [None]:
from torch import optim
from torch import cuda

# Hyperparameters
drop_rate = 0.4
output_size = 4
batch_size = 32
num_epochs = 4
lr = 2e-5

# Build the model
model = BERTClass(drop_rate, output_size)

# Loss function: BCEWithLogitsLoss is applied to the model's raw outputs and
# the float label vectors produced by the dataset
criterion = torch.nn.BCEWithLogitsLoss()

# Optimizer over all model parameters
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)

# Select the device: GPU when available, otherwise CPU
device = 'cuda' if cuda.is_available() else 'cpu'

# Move the model to the selected device
model.to(device)

# Data loaders: shuffled mini-batches for training; the validation set is
# served as a single batch containing every sample
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=len(valid_dataset), shuffle=False)

# Training
# log_train / log_valid collect one [loss, accuracy] pair per epoch
log_train = []
log_valid = []
for epoch in range(num_epochs):

    # Switch to training mode (calc_loss_acc below calls model.eval())
    model.train()
    for data in train_loader:
        # Move the batch to the selected device
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        labels = data['labels'].to(device)

        # Reset the gradients
        optimizer.zero_grad()

        # Forward pass + backpropagation + weight update
        outputs = model(ids, mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Compute loss and accuracy on the training and validation loaders
    loss_train, acc_train = calc_loss_acc(model, criterion, train_loader, device)
    loss_valid, acc_valid = calc_loss_acc(model, criterion, valid_loader, device)
    log_train.append([loss_train, acc_train])
    log_valid.append([loss_valid, acc_valid])

    # Save a checkpoint for this epoch. The stored 'epoch' value is the
    # zero-based loop index, while the file name uses epoch + 1.
    torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, f'checkpoint{epoch + 1}.pt')

    # Print the epoch's metrics
    print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f}')


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


epoch: 1, loss_train: 0.0003, accuracy_train: 0.9286, loss_valid: 0.1016, accuracy_valid: 0.9319
epoch: 2, loss_train: 0.0002, accuracy_train: 0.9286, loss_valid: 0.0839, accuracy_valid: 0.9461
epoch: 3, loss_train: 0.0000, accuracy_train: 1.0000, loss_valid: 0.0882, accuracy_valid: 0.9446
epoch: 4, loss_train: 0.0003, accuracy_train: 0.9643, loss_valid: 0.0856, accuracy_valid: 0.9476
