In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
PATH = "/content/drive/MyDrive/dataset/ch9/"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

Using cuda device


In [None]:
# 80
import re
import collections

# CATEGORYをencodeする
def Encoder(sign):
    if sign == "b":
      return 0
    elif sign == "t":
      return 1
    elif sign == "e":
      return 2
    elif sign == "m":
      return 3

def Process(lines):
    sign_regrex = re.compile('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`|＄＃＠£â€™é\n]')
    # 記号など削除する
    word_list = []
    text_list = []
    true_label = []

    for text in lines:

      # print(text)

      true_label.append(text.split("\t")[3])
      # CATEGORYをもらう
      # print(true_label)

      text = text.split("\t")[0]
      # titleの内容をもらう
      # print(text)
      text = sign_regrex.sub("", text)
      # 記号など削除する

      text = re.sub("(\d+)", r" \1 ", text)
      # 数字と単語の間にspaceを入れる
      # print(text)

      words = text.split(" ")
      # 単語をもらう
      words = list(filter(lambda x:x, words))
      #空リスト削除
      words = list(map(lambda x:x.lower(), words))
      #小文字にする

      word_list.extend(words)
      text_list.append(words)

    return word_list, text_list, true_label

def MakeDict(name):
  # 辞書を作る
    f = open(PATH + "{}.txt".format(name), "r")
    lines = f.readlines()
    f.close()
    word_list, _, _ = Process(lines)
    c = collections.Counter(word_list).most_common()
    # 単語頻度の降順に並べ替え
    word_dic = {}
    for id, word in enumerate(c, 1):
      # 単語を、データセット中の単語頻度の降順に並べ替え、IDを単語頻度の順位に対応させる
      # 単語頻度が1のすべての単語について、そのIDを0にする。
        if int(word[1]) < 2:
            word_dic[word[0]] = 0
        else:
            word_dic[word[0]] = id
    return word_dic

# trainに基づいて辞書を作る
word_dic = MakeDict("train")

def Word2Code(name, word_dic):
    f = open(PATH + "{}.txt".format(name), "r")
    lines = f.readlines()
    lines.pop(0)
    #カラムの行を除く

    _, text_list, true_label = Process(lines)

    true_label = list(map(Encoder, true_label))
    # true_label一行目ずつEncoder関数で処理する

    result_list = []

    # 単語をcodeにする
    for text in text_list:
        code_list = []
        for word in text:
            try:
                code = word_dic[word]
            except:
                code = 0
            code_list.append(code)
        result_list.append(code_list)


    # 処理したdataをfileに書き込んで保存する
    f = open(PATH + "{}_code.txt".format(name), "w")
    i = 0
    for t1, t2, t3 in zip(true_label, text_list, result_list):
      # 出力の形式を調整する
        if i==0:
            f.write(str(t1)+"\t"+" ".join(t2)+"\t"+" ".join(map(str, t3)))
            i = 1
        else:
            f.write("\n"+str(t1)+"\t"+" ".join(t2)+"\t"+" ".join(map(str, t3)))
    f.close()

In [None]:
# 80

Word2Code("train", word_dic)
Word2Code("test", word_dic)
Word2Code("valid", word_dic)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# LSTMを定義する
class LSTM(nn.Module):
    def __init__(self, vocab_size, dw, dh, output):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dw, padding_idx=vocab_size-1)
        # 入力したdataを300次元の単語ベクトルへ変更する
        self.lstm = nn.LSTM(dw, dh, batch_first=True)
        # LSTMを実現する
        self.fc1 = nn.Linear(dh, output, bias=True)
        # 300次元から４次元に変更する
        self.fc2 = nn.Softmax(dim=1)
        # Softmax関数を施す

        # 重みを初期化する
        nn.init.xavier_normal_(self.lstm.weight_ih_l0)
        nn.init.xavier_normal_(self.lstm.weight_hh_l0)
        nn.init.xavier_normal_(self.fc1.weight)


    def forward(self, x):
        x = x.to(device)
        x = self.embed(x)
        x, _ = self.lstm(x)
        x = self.fc1(x[:, -1, :])
        x = self.fc2(x)
        return x



def CountVocab(name):
    f = open(PATH + "{}_code.txt".format(name), "r")
    lines = f.readlines()
    f.close()
    max_num = []
    i = 0
    for line in lines:
      line_t = line.split("\t")[2].replace("\n", "").split(" ")
      # 単語のIDをもらう
      max_num.extend(map(int, line_t))

    vocab_max = max(max_num)+1
    # 最大のIDをもらう
    return vocab_max

def GetCodeLow(name):

    f = open(PATH + "{}_code.txt".format(name), "r")
    lines = f.readlines()
    f.close()
    num_list = []
    code_list = []
    pad_list = []

    for line in lines:

      try:
        line_s = line.split("\t")
        # print(line_s)
        code_list.append(int(line_s[0]))
        # print(code_list)
        # CATEGORYのcodeをもらう
        num = line_s[2].replace("\n", "").split(" ")
        num = list(map(int, num))
        num_list.append(num)
        # 単語のcodeをもらう
        num_tensor = torch.tensor(num)
        # print(num_tensor)
        # codeをtensorへ変更する
        pad_list.append(num_tensor)
      except:
          pass

    max_vocab = CountVocab("train")
    # 最大code
    # print(max_vocab)


    # 计算每个序列的长度
    lengths = [len(seq) for seq in num_list]
    # print(lengths)

    # 使用 pad_sequence 进行填充
    padded_sequences = pad_sequence(pad_list, batch_first=True, padding_value=max_vocab)

    # # 使用 pack_padded_sequence 打包序列
    # packed_sequences = pack_padded_sequence(padded_sequences, lengths, batch_first=True, enforce_sorted=False)

    code_list = torch.tensor(code_list)
    return padded_sequences, code_list






X_valid, Y_valid = GetCodeLow("valid")
X_valid, Y_valid = X_valid.to(device), Y_valid.to(device)
# print(Y_valid)

VOCAB_SIZE = CountVocab("train")+1
EMB_SIZE = 300
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50

model = LSTM(VOCAB_SIZE, EMB_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(device)
Y_pred = model(X_valid)
# print(Y_pred)

# 获取预测标签
pred = torch.argmax(Y_pred, dim=-1)
# print(pred)

# 正解率を計算する
accuracy = sum(1 for x, y in zip(Y_valid, pred) if x == y) / float(len(Y_pred))
print("accuracy: ", accuracy)

accuracy:  0.39280359820089955


In [None]:
# 82

X_train, y_train = GetCodeLow("train")
# print(X_train.shape, y_train.shape)

num_epochs = 50
batch_size = 64
lr = 5e-1

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# lossとSGDを定義する
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

# modelを訓練する
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)

      # 前向传播
      outputs = model(X_batch)
      loss = criterion(outputs, y_batch)

      # 反向传播和优化
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()


    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 評価する
model.eval()
with torch.no_grad():
    Y_pred = model(X_valid).to(device)
    pred = torch.argmax(Y_pred, dim=-1)
    accuracy = (Y_valid == pred).sum().item() / float(len(Y_pred))
    print("accuracy: ", accuracy)

Epoch [1/50], Loss: 0.8475
Epoch [2/50], Loss: 0.7864
Epoch [3/50], Loss: 0.9505
Epoch [4/50], Loss: 0.8478
Epoch [5/50], Loss: 0.9103
Epoch [6/50], Loss: 0.8281
Epoch [7/50], Loss: 0.8471
Epoch [8/50], Loss: 0.8559
Epoch [9/50], Loss: 0.9209
Epoch [10/50], Loss: 0.9124
Epoch [11/50], Loss: 0.9088
Epoch [12/50], Loss: 0.9287
Epoch [13/50], Loss: 0.8875
Epoch [14/50], Loss: 0.8479
Epoch [15/50], Loss: 0.8449
Epoch [16/50], Loss: 0.9103
Epoch [17/50], Loss: 0.8899
Epoch [18/50], Loss: 0.9521
Epoch [19/50], Loss: 0.9418
Epoch [20/50], Loss: 0.8267
Epoch [21/50], Loss: 0.8452
Epoch [22/50], Loss: 0.9728
Epoch [23/50], Loss: 0.9727
Epoch [24/50], Loss: 0.9313
Epoch [25/50], Loss: 0.9103
Epoch [26/50], Loss: 0.9102
Epoch [27/50], Loss: 0.8896
Epoch [28/50], Loss: 0.8687
Epoch [29/50], Loss: 0.9543
Epoch [30/50], Loss: 0.9103
Epoch [31/50], Loss: 0.8270
Epoch [32/50], Loss: 0.9302
Epoch [33/50], Loss: 0.8463
Epoch [34/50], Loss: 0.9101
Epoch [35/50], Loss: 0.9509
Epoch [36/50], Loss: 0.9307
E

In [None]:
# 83
bs_list = [2**i for i in range(10)]

for bs in bs_list:
  for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)

      # 前向传播
      outputs = model(X_batch)
      loss = criterion(outputs, y_batch)

      # 反向传播和优化
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()


  print(f'batch [{bs}], Loss: {loss.item():.4f}')
  # 評価する
  model.eval()
  with torch.no_grad():
    Y_pred = model(X_valid).to(device)
    pred = torch.argmax(Y_pred, dim=-1)
    accuracy = (Y_valid == pred).sum().item() / float(len(Y_pred))
    print("accuracy: ", accuracy)

batch [1], Loss: 0.7853
accuracy:  0.8335832083958021
batch [2], Loss: 0.8895
accuracy:  0.8328335832083958
batch [4], Loss: 0.8062
accuracy:  0.8328335832083958
batch [8], Loss: 0.7837
accuracy:  0.8320839580209896
batch [16], Loss: 0.7645
accuracy:  0.8320839580209896
batch [32], Loss: 0.7645
accuracy:  0.8328335832083958
batch [64], Loss: 0.8062
accuracy:  0.8328335832083958
batch [128], Loss: 0.7646
accuracy:  0.8133433283358321
batch [256], Loss: 0.7853
accuracy:  0.8110944527736131
batch [512], Loss: 0.8478
accuracy:  0.8110944527736131


In [None]:
# 84

from gensim.models import KeyedVectors
import numpy as np

# 重み行列をもらう

def GetInitWeight():
    vectors = KeyedVectors.load_word2vec_format(PATH + 'GoogleNews-vectors-negative300.bin', binary=True)
    # fileを読み込む
    worddic = MakeDict("train")
    # 辞書を作る

    # 重み行列を初期化する
    init_weight = []
    init_weight.append(list(np.zeros(300)))

    # 各単語にベクトルを付ける
    for key, value in worddic.items():
      # print(key, value)
      if value == 0:
        continue
      else:
        try:
          init_weight.append(list(vectors[key]))
        except:
          init_weight.append(list(np.zeros(300)))

    init_weight.append(list(np.zeros(300)))
    # listからtensorへ変更する
    weights = torch.tensor(init_weight)
    weights = weights.float()
    return weights

weights = GetInitWeight()

In [None]:
# 84

class LSTM(nn.Module):
    def __init__(self, vocab_size, dw, dh, output, init_weight=None):
        super().__init__()

        # Embedding layerに重み行列を提供する
        # 入力したdataの次元から300へ変更する
        if init_weight != None:
          self.embed = nn.Embedding.from_pretrained(init_weight, padding_idx=vocab_size-1)
        else:
          self.embed = nn.Embedding(vocab_size, dw, padding_idx=vocab_size-1)

        self.lstm = nn.LSTM(dw, dh, batch_first=True, bidirectional=True)
        # LSTMを実現する
        self.fc1 = nn.Linear(dh, output, bias=True)
        # 300次元から４次元に変更する

        self.fc2 = nn.Softmax(dim=1)
        # Softmax関数を施す

        # 重みを初期化する
        nn.init.xavier_normal_(self.lstm.weight_ih_l0)
        nn.init.xavier_normal_(self.lstm.weight_hh_l0)
        nn.init.xavier_normal_(self.fc1.weight)


    def forward(self, x):
        x = x.to(device)
        x = self.embed(x)
        x, _ = self.lstm(x)
        x = self.fc1(x[:, -1, :])
        x = self.fc2(x)
        return x

X_train, y_train = GetCodeLow("train")
# print(X_train.shape, y_train.shape)
X_test, y_test = GetCodeLow("test")
X_test, y_test = X_test.to(device), y_test.to(device)
# print(X_test.shape, y_test.shape)

num_epochs = 100
batch_size = 64
lr = 5e-1

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

train_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# lossとSGDを定義する
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

model = LSTM(VOCAB_SIZE, EMB_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, weights).to(device)

# modelを訓練する
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)

      # 前向传播
      outputs = model(X_batch)
      loss = criterion(outputs, y_batch)

      # 反向传播和优化
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()


    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 評価する
model.eval()
with torch.no_grad():
    Y_pred = model(X_test).to(device)
    pred = torch.argmax(Y_pred, dim=-1)
    accuracy = (y_test == pred).sum().item() / float(len(Y_pred))
    print("accuracy: ", accuracy)

Epoch [1/100], Loss: 1.2617
Epoch [2/100], Loss: 1.2513
Epoch [3/100], Loss: 1.2906
Epoch [4/100], Loss: 1.2742
Epoch [5/100], Loss: 1.2250
Epoch [6/100], Loss: 1.2968
Epoch [7/100], Loss: 1.3092
Epoch [8/100], Loss: 1.2575
Epoch [9/100], Loss: 1.2883
Epoch [10/100], Loss: 1.2421
Epoch [11/100], Loss: 1.0799
Epoch [12/100], Loss: 1.2509
Epoch [13/100], Loss: 1.2510
Epoch [14/100], Loss: 1.2517
Epoch [15/100], Loss: 1.2660
Epoch [16/100], Loss: 1.2486
Epoch [17/100], Loss: 1.2726
Epoch [18/100], Loss: 1.2577
Epoch [19/100], Loss: 1.3382
Epoch [20/100], Loss: 0.9755
Epoch [21/100], Loss: 0.9332
Epoch [22/100], Loss: 0.9582
Epoch [23/100], Loss: 1.0532
Epoch [24/100], Loss: 1.0146
Epoch [25/100], Loss: 0.9100
Epoch [26/100], Loss: 0.9421
Epoch [27/100], Loss: 0.9917
Epoch [28/100], Loss: 0.9382
Epoch [29/100], Loss: 0.9688
Epoch [30/100], Loss: 1.0541
Epoch [31/100], Loss: 0.8657
Epoch [32/100], Loss: 0.9440
Epoch [33/100], Loss: 0.9170
Epoch [34/100], Loss: 0.8915
Epoch [35/100], Loss: 1

In [None]:
# 85

class LSTM(nn.Module):
    def __init__(self, vocab_size, dw, dh, output, num_layers=5):
        super().__init__()

        self.embed = nn.Embedding(vocab_size, dw, padding_idx=vocab_size-1)
        # 入力したdataの次元から300へ変更する

        self.lstm = nn.LSTM(dw, dh, num_layers=num_layers, batch_first=True)
        # LSTMを実現する

        self.fc1 = nn.Linear(dh, output, bias=True)
        # 300次元から4次元に変更する

        self.fc2 = nn.Softmax(dim=1)
        # Softmax関数を施す

        # hidden state と cell stateをLSTMに初期化する
        for name, param in self.lstm.named_parameters():
            if 'weight_ih' in name or 'weight_hh' in name:
                nn.init.xavier_normal_(param)

        nn.init.xavier_normal_(self.fc1.weight)


    def forward(self, x):
        x = x.to(device)
        x = self.embed(x)

        # hidden state と cell state ゼロになる
        h0 = torch.zeros(self.lstm.num_layers, x.size(0), self.lstm.hidden_size).to(device)
        c0 = torch.zeros(self.lstm.num_layers, x.size(0), self.lstm.hidden_size).to(device)


        x, _ = self.lstm(x, (h0, c0))


        x = self.fc1(x[:, -1, :])
        x = self.fc2(x)

        return x

X_train, y_train = GetCodeLow("train")
# print(X_train.shape, y_train.shape)
X_test, y_test = GetCodeLow("test")
X_test, y_test = X_test.to(device), y_test.to(device)
# print(X_test.shape, y_test.shape)

num_epochs = 50
batch_size = 64
lr = 5e-1

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

train_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# lossとSGDを定義する
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

model = LSTM(VOCAB_SIZE, EMB_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(device)

# modelを訓練する
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)

      # 前向传播
      outputs = model(X_batch)
      loss = criterion(outputs, y_batch)

      # 反向传播和优化
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()


    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 評価する
model.eval()
with torch.no_grad():
    Y_pred = model(X_test).to(device)
    pred = torch.argmax(Y_pred, dim=-1)
    accuracy = (y_test == pred).sum().item() / float(len(Y_pred))
    print("accuracy: ", accuracy)

Epoch [1/50], Loss: 0.8386
Epoch [2/50], Loss: 0.7997
Epoch [3/50], Loss: 0.8470
Epoch [4/50], Loss: 0.8270
Epoch [5/50], Loss: 0.7437
Epoch [6/50], Loss: 0.8519
Epoch [7/50], Loss: 0.8063
Epoch [8/50], Loss: 0.7636
Epoch [9/50], Loss: 0.8251
Epoch [10/50], Loss: 0.8273
Epoch [11/50], Loss: 0.8062
Epoch [12/50], Loss: 0.7884
Epoch [13/50], Loss: 0.8058
Epoch [14/50], Loss: 0.8074
Epoch [15/50], Loss: 0.8155
Epoch [16/50], Loss: 0.7646
Epoch [17/50], Loss: 0.7445
Epoch [18/50], Loss: 0.8265
Epoch [19/50], Loss: 0.7844
Epoch [20/50], Loss: 0.8348
Epoch [21/50], Loss: 0.7649
Epoch [22/50], Loss: 0.8689
Epoch [23/50], Loss: 0.8475
Epoch [24/50], Loss: 0.8063
Epoch [25/50], Loss: 0.8132
Epoch [26/50], Loss: 0.8145
Epoch [27/50], Loss: 0.8271
Epoch [28/50], Loss: 0.8309
Epoch [29/50], Loss: 0.8073
Epoch [30/50], Loss: 0.7670
Epoch [31/50], Loss: 0.8686
Epoch [32/50], Loss: 0.8884
Epoch [33/50], Loss: 0.7759
Epoch [34/50], Loss: 0.7875
Epoch [35/50], Loss: 0.7658
Epoch [36/50], Loss: 0.8115
E

In [None]:
# 86

import torch.nn.functional as F

class CNN(nn.Module):
  def __init__(self, vocab_size, dw, dh, output):
      super().__init__()
      self.embed = nn.Embedding(vocab_size, dw, padding_idx=vocab_size-1)
      # 入力したdataを300次元の単語ベクトルへ変更する

      self.conv1 = nn.Conv2d(1, 3, kernel_size=(3, 300))
      # 畳み込みのストライド: 1 トークン
      # 畳み込みのフィルターのサイズ: 3 トークン
      # kernel_sizeのサイズは(3, 300)


      self.tanh = nn.ReLU()
      # 活性化関数ReLUを施す

      self.fc1 = nn.Linear(3, output, bias=True)
      # 3次元->4次元

      self.fc2 = nn.Softmax(dim=1)
      # Softmax関数を施す

  def forward(self, x):
      x = self.embed(x)
      x = x.unsqueeze(1)
      x = self.conv1(x)
      x = self.tanh(x)
      x = F.max_pool2d(x, kernel_size=(x.size()[2], 1))
      # 畳み込み層の出力に対する最大値プーリング

      x = x.view(-1, 3)
      x = self.fc1(x)
      x = self.fc2(x)
      return x


X_valid, Y_valid = GetCodeLow("valid")
X_valid, Y_valid = X_valid.to(device), Y_valid.to(device)
# print(Y_valid)

VOCAB_SIZE = CountVocab("train")+1
EMB_SIZE = 300
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50

model = CNN(VOCAB_SIZE, EMB_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(device)
Y_pred = model(X_valid)
# print(Y_pred)

# 获取预测标签
pred = torch.argmax(Y_pred, dim=-1)
# print(pred)

# 正解率を計算する
accuracy = sum(1 for x, y in zip(Y_valid, pred) if x == y) / float(len(Y_pred))
print("accuracy: ", accuracy)

accuracy:  0.11469265367316342


  return F.conv2d(input, weight, bias, self.stride,


In [None]:
# 87

X_train, y_train = GetCodeLow("train")
# print(X_train.shape, y_train.shape)
X_test, y_test = GetCodeLow("test")
X_test, y_test = X_test.to(device), y_test.to(device)
# print(X_test.shape, y_test.shape)

num_epochs = 50
batch_size = 64
lr = 5e-1

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

train_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# lossとSGDを定義する
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

# modelを訓練する
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)

      # 前向传播
      outputs = model(X_batch)
      loss = criterion(outputs, y_batch)

      # 反向传播和优化
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()


    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 評価する
model.eval()
with torch.no_grad():
    Y_pred = model(X_test).to(device)
    pred = torch.argmax(Y_pred, dim=-1)
    accuracy = (y_test == pred).sum().item() / float(len(Y_pred))
    print("accuracy: ", accuracy)

  return F.conv2d(input, weight, bias, self.stride,
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch [1/50], Loss: 0.9786
Epoch [2/50], Loss: 1.0566
Epoch [3/50], Loss: 1.0134
Epoch [4/50], Loss: 0.9410
Epoch [5/50], Loss: 1.0573
Epoch [6/50], Loss: 0.9548
Epoch [7/50], Loss: 1.0312
Epoch [8/50], Loss: 0.8907
Epoch [9/50], Loss: 1.0506
Epoch [10/50], Loss: 1.0337
Epoch [11/50], Loss: 0.9309
Epoch [12/50], Loss: 1.0137
Epoch [13/50], Loss: 0.9881
Epoch [14/50], Loss: 0.9128
Epoch [15/50], Loss: 0.9482
Epoch [16/50], Loss: 0.9096
Epoch [17/50], Loss: 0.9913
Epoch [18/50], Loss: 0.9927
Epoch [19/50], Loss: 0.9729
Epoch [20/50], Loss: 1.0335
Epoch [21/50], Loss: 0.8683
Epoch [22/50], Loss: 0.9616
Epoch [23/50], Loss: 0.9692
Epoch [24/50], Loss: 0.8871
Epoch [25/50], Loss: 0.9072
Epoch [26/50], Loss: 0.8465
Epoch [27/50], Loss: 1.0322
Epoch [28/50], Loss: 0.8887
Epoch [29/50], Loss: 0.9689
Epoch [30/50], Loss: 0.9937
Epoch [31/50], Loss: 1.0116
Epoch [32/50], Loss: 0.9488
Epoch [33/50], Loss: 1.0515
Epoch [34/50], Loss: 0.9313
Epoch [35/50], Loss: 0.9104
Epoch [36/50], Loss: 0.9088
E

In [None]:
%%bash
pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 380.1/380.1 kB 6.3 MB/s eta 0:00:00
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.4/233.4 kB 29.6 MB/s eta 0:00:00
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.6/78.6 kB 14.0 MB/s eta 0:00:00
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


In [None]:
# 88

# LSTMを定義する
class LSTM(nn.Module):
    def __init__(self, vocab_size, dw, dh, output):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dw, padding_idx=vocab_size-1)
        # 入力したdataを300次元の単語ベクトルへ変更する
        self.lstm = nn.LSTM(dw, dh, batch_first=True)
        # LSTMを実現する
        self.fc1 = nn.Linear(dh, output, bias=True)
        # 300次元から４次元に変更する
        self.fc2 = nn.Softmax(dim=1)
        # Softmax関数を施す

        # 重みを初期化する
        nn.init.xavier_normal_(self.lstm.weight_ih_l0)
        nn.init.xavier_normal_(self.lstm.weight_hh_l0)
        nn.init.xavier_normal_(self.fc1.weight)


    def forward(self, x):
        x = x.to(device)
        x = self.embed(x)
        x, _ = self.lstm(x)
        x = self.fc1(x[:, -1, :])
        x = self.fc2(x)
        return x
# CNNを定義する
class CNN(nn.Module):
  def __init__(self, vocab_size, dw, output, layer, unit, activation):
      super().__init__()
      self.layer = layer
      self.embed = nn.Embedding(vocab_size, dw, padding_idx = vocab_size-1)

      # unit と　layerによって異なる畳み込みlayerを設定する
      if unit == 6:
          units = [6, 4, 2]
      elif unit == 4:
          units = [4, 3, 2]
      elif unit == 2:
          units = [2, 2, 2]
      self.conv1 = nn.Conv2d(1, units[0], kernel_size=(units[0], 300))
      linearoutput = units[0]
      if layer > 1:
          self.conv2 = nn.Conv2d(units[0], units[1], kernel_size=(units[1],1))
          linearoutput = units[1]
      if layer > 2:
          self.conv3 = nn.Conv2d(units[1], units[2], kernel_size=(units[2],1))
          linearoutput = units[2]


      self.fc1 = nn.Linear(linearoutput, output, bias=True)
      self.fc2 = nn.Softmax(dim=1)


      # 活性化関数を選ぶ
      if activation == "Tanh":
          self.active = nn.Tanh()
      elif activation == "ReLU":
          self.active = nn.ReLU()
      elif activation == "Sigmoid":
          self.active = nn.Sigmoid()


  def forward(self, x):
      x = self.embed(x)
      x = x.unsqueeze(1)
      x = self.conv1(x)
      x = self.active(x)
      if self.layer > 1:
          x = self.conv2(x)
          x = self.active(x)
      if self.layer > 2:
          x = self.conv3(x)
          x = self.active(x)
      x = F.max_pool2d(x, kernel_size=(x.size()[2], 1))
      x = x.view(x.size()[0], -1)
      x = self.fc1(x)
      x = self.fc2(x)
      return x

# modelを訓練する
def train_model(X_train, y_train, X_test, y_test, batch_size, model, lr, num_epochs, device, collate_fn=None, optimizer_select="SGD"):
    dataset_train = TensorDataset(X_train, y_train)
    dataset_test = TensorDataset(X_test, y_test)
    model = model.to(device)
    dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=True)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    for ep in range(num_epochs):
        if ep%20==0:
            lr = lr * 0.1

        # optimizerを選ぶ
        if optimizer_select == "SGD":
            optimizer = torch.optim.SGD(model.parameters(), lr=lr)
        elif optimizer_select == "Adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        elif optimizer_select == "RMSprop":
          optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)

        model.train()
        for X, Y in dataloader_train:
            X = X.to(device)
            Y = Y.to(device)
            optimizer.zero_grad()
            Y_pred = model(X)
            loss = criterion(Y_pred, Y)
            loss.backward()
            optimizer.step()
    model.eval()
    _, acc_test = calculate_loss_and_accuracy(model, dataset_test, device, criterion=criterion)

    return acc_test

# loss と　正解率を計算する
def calculate_loss_and_accuracy(model, dataset, device, criterion=None):
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    loss = 0.0
    total = 0
    correct = 0
    model = model.to(device)
    with torch.no_grad():
        for X, Y in dataloader:
            X = X.to(device)
            Y = Y.to(device)
            Y_pred = model(X)
            if criterion != None:
                loss += criterion(Y_pred, Y).item()
            pred = torch.argmax(Y_pred, dim=-1)
            total += len(Y)
            correct += (pred == Y).sum().item()
    return loss / len(dataset), correct / total

In [None]:
def objective_LSTM(trial):

    # dataを読み込む
    X_train, Y_train = GetCodeLow("train")
    X_test, Y_test = GetCodeLow("test")

    # ハイパーパラメータを設定する
    BATCH_SIZE = 2
    NUM_EPOCHS = 10
    VOCAB_SIZE = CountVocab("train")+1
    EMB_SIZE = 300
    OUTPUT_SIZE = 4
    lr = 1e-2

    # Optuna を利用する
    model_name_display_only = trial.suggest_categorical("model_name_LSTM", ["LSTM"])
    HIDDEN_SIZE = trial.suggest_categorical("HIDDEN_SIZE", [10, 50, 100, 500])
    optimizer_select = trial.suggest_categorical("optimizer_select", ["SGD", "Adam", "RMSprop"])

    model = LSTM(VOCAB_SIZE, EMB_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)
    score = train_model(X_train, Y_train, X_test, Y_test, BATCH_SIZE, model, lr, NUM_EPOCHS, device, optimizer_select)
    return score

def objective_CNN(trial):

    # dataを読み込む
    X_train, Y_train = GetCodeLow("train")
    X_test, Y_test = GetCodeLow("test")

    # ハイパーパラメータを設定する
    BATCH_SIZE = 2
    NUM_EPOCHS = 10
    VOCAB_SIZE = CountVocab("train")+1
    EMB_SIZE = 300
    OUTPUT_SIZE = 4
    lr = 1e-2

    # Optuna を利用する
    model_name_display_only = trial.suggest_categorical("model_name_CNN", ["CNN"])
    layer = trial.suggest_categorical("layer", [1,2,3])
    unit = trial.suggest_categorical("unit", [2,4,6])
    activation = trial.suggest_categorical("activation", ["Tanh", "Sigmoid", "ReLU"])
    optimizer_select = trial.suggest_categorical("optimizer_select", ["SGD", "Adam", "RMSprop"])


    model = CNN(VOCAB_SIZE, EMB_SIZE, OUTPUT_SIZE, layer, unit, activation)
    score = train_model(X_train, Y_train, X_test, Y_test, BATCH_SIZE, model, lr, NUM_EPOCHS, device, optimizer_select)
    return score

In [None]:
import optuna
import torch.nn.functional as F

study = optuna.create_study(direction='maximize')

study.optimize(objective_CNN, n_trials=81)
study.optimize(objective_LSTM, n_trials=15)

print(study.best_params)
print(study.best_value)

[I 2024-06-23 07:50:12,257] A new study created in memory with name: no-name-d70107df-c36b-4942-ad4b-b7f00a1bf91c
[I 2024-06-23 07:51:53,037] Trial 0 finished with value: 0.6611694152923538 and parameters: {'model_name_CNN': 'CNN', 'layer': 3, 'unit': 2, 'activation': 'ReLU', 'optimizer_select': 'RMSprop'}. Best is trial 0 with value: 0.6611694152923538.
[I 2024-06-23 07:53:36,770] Trial 1 finished with value: 0.643928035982009 and parameters: {'model_name_CNN': 'CNN', 'layer': 3, 'unit': 6, 'activation': 'ReLU', 'optimizer_select': 'SGD'}. Best is trial 0 with value: 0.6611694152923538.
[I 2024-06-23 07:55:17,340] Trial 2 finished with value: 0.4325337331334333 and parameters: {'model_name_CNN': 'CNN', 'layer': 3, 'unit': 4, 'activation': 'Sigmoid', 'optimizer_select': 'Adam'}. Best is trial 0 with value: 0.6611694152923538.
[I 2024-06-23 07:56:35,306] Trial 3 finished with value: 0.6574212893553223 and parameters: {'model_name_CNN': 'CNN', 'layer': 1, 'unit': 2, 'activation': 'ReLU',

{'model_name_CNN': 'CNN', 'layer': 1, 'unit': 6, 'activation': 'Tanh', 'optimizer_select': 'Adam'}
0.7263868065967016


In [None]:
# 89
# Bertを使う

%%bash
pip install transformers -q

In [None]:
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn
import numpy as np
import torch

In [None]:
# 89

class Bert(nn.Module):
  def __init__(self):
      super().__init__()

      # 事前訓練済みBERTモデルをロードする
      self.bert = AutoModel.from_pretrained("bert-base-uncased")

      # 768次元->4次元
      self.classifier = nn.Linear(in_features = 768, out_features = 4)


  def forward(self, input_ids, attention_mask, token_type_ids):
      outputs = self.bert(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)

      # pooling
      pooler_output = outputs.pooler_output
      logits = self.classifier(pooler_output).squeeze(-1)
      return logits


# Bert modelの利用ため、datasetを改造する
class BertDataset(Dataset):
  def __init__(self, data, label):
      super().__init__()

      #　dataの長さ
      self.data_length = len(data["input_ids"])

      # 入力したdata
      self.x_input_ids = data["input_ids"]

      # 異なる文を区別するために使用される
      self.x_token_type_ids = data["token_type_ids"]

      # どのtokenが実際の入力の一部であり、どのtokenがpadding部分であるかを示すために使用される
      self.x_attention_mask = data["attention_mask"]

      # labelをもらう
      self.y = label

  def __len__(self):
      return self.data_length
  def __getitem__(self, idx):
      # idx番目のデータを取得する
      x_input_ids = torch.tensor(self.x_input_ids[idx])
      x_token_type_ids = torch.tensor(self.x_token_type_ids[idx])
      x_attention_mask = torch.tensor(self.x_attention_mask[idx])
      return {"input_ids":x_input_ids, "token_type_ids":x_token_type_ids, "x_attention_mask":x_attention_mask}, torch.tensor(self.y[idx])

# loss　と　正解率を計算する
def calculate_loss_and_accuracy(model, dataset, device, criterion=None):
    dataloader = DataLoader(dataset, batch_size=256, shuffle=False)
    loss = 0.0
    total = 0
    correct = 0
    model = model.to(device)
    with torch.no_grad():
      for X, Y in dataloader:
          input_ids = X["input_ids"].to(device)
          attention_mask = X["x_attention_mask"].to(device)
          token_type_ids = X["token_type_ids"].to(device)
          Y = Y.to(device)
          Y_pred =  model(input_ids, attention_mask, token_type_ids)
          if criterion != None:
              loss += criterion(Y_pred, Y).item()
          pred = torch.argmax(Y_pred, dim=-1)
          total += len(Y)
          correct += (pred == Y).sum().item()
    return loss / len(dataset), correct / total

# model を訓練する
def train_model(X_train, y_train, X_test, y_test, batch_size, model, lr, num_epochs, device, collate_fn=None):

    dataset_train = BertDataset(X_train, y_train)
    dataset_test = BertDataset(X_test, y_test)
    dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=True)

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    for ep in range(num_epochs):
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
        model.train()
        if ep%30==0:
            lr = lr * 0.1
        for X, Y in dataloader_train:
            input_ids = X["input_ids"].to(device)
            attention_mask = X["x_attention_mask"].to(device)
            token_type_ids = X["token_type_ids"].to(device)

            Y = Y.to(device)
            optimizer.zero_grad()
            Y_pred = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(Y_pred, Y)
            loss.backward()
            optimizer.step()

        model.eval()

        loss_train, acc_train = calculate_loss_and_accuracy(model, dataset_train, device, criterion=criterion)
        loss_test, acc_test = calculate_loss_and_accuracy(model, dataset_test, device, criterion=criterion)

        print(f'epoch: {ep + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_Test: {loss_test:.4f}, accuracy_Test: {acc_test:.4f}')


# title と　CATEGORYのcodeをもらう
def GetStrLow(name):
    f = open(PATH + "{}_code.txt".format(name), "r")
    lines = f.readlines()
    f.close()
    sent_list = []
    code_list = []

    for line in lines:

      try:
        line_s = line.split("\t")
        code_list.append(int(line_s[0]))
        sent = line_s[1].replace("\n", "")
        sent_list.append(sent)
      except:
        pass

    code_list = torch.tensor(code_list)
    return sent_list, code_list

X_train, Y_train = GetStrLow("train")
X_test, Y_test = GetStrLow("test")

MAX_LENGTH = 32
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
X_train_tokenizer = tokenizer.batch_encode_plus(X_train, padding = "max_length", max_length = MAX_LENGTH, truncation=True)
X_test_tokenizer = tokenizer.batch_encode_plus(X_test, padding = "max_length", max_length = MAX_LENGTH, truncation=True)

BATCH_SIZE = 8
NUM_EPOCHS = 20
lr = 1e-3

model = Bert()
train_model(X_train_tokenizer, Y_train, X_test_tokenizer, Y_test, BATCH_SIZE, model, lr, NUM_EPOCHS, device)

  return {"input_ids":x_input_ids, "token_type_ids":x_token_type_ids, "x_attention_mask":x_attention_mask}, torch.tensor(self.y[idx])


epoch: 1, loss_train: 0.0010, accuracy_train: 0.9190, loss_Test: 0.0013, accuracy_Test: 0.9085
epoch: 2, loss_train: 0.0009, accuracy_train: 0.9263, loss_Test: 0.0012, accuracy_Test: 0.9130
epoch: 3, loss_train: 0.0009, accuracy_train: 0.9317, loss_Test: 0.0012, accuracy_Test: 0.9175
epoch: 4, loss_train: 0.0008, accuracy_train: 0.9346, loss_Test: 0.0011, accuracy_Test: 0.9220
epoch: 5, loss_train: 0.0008, accuracy_train: 0.9376, loss_Test: 0.0011, accuracy_Test: 0.9213
epoch: 6, loss_train: 0.0007, accuracy_train: 0.9403, loss_Test: 0.0011, accuracy_Test: 0.9235
epoch: 7, loss_train: 0.0007, accuracy_train: 0.9397, loss_Test: 0.0011, accuracy_Test: 0.9258
epoch: 8, loss_train: 0.0007, accuracy_train: 0.9454, loss_Test: 0.0011, accuracy_Test: 0.9273
epoch: 9, loss_train: 0.0007, accuracy_train: 0.9477, loss_Test: 0.0011, accuracy_Test: 0.9228
epoch: 10, loss_train: 0.0006, accuracy_train: 0.9499, loss_Test: 0.0011, accuracy_Test: 0.9295
epoch: 11, loss_train: 0.0006, accuracy_train: 0.