In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!pip install portalocker

import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext import datasets
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import string
import re
from typing import List, Union

seed = 1234
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)


# 学習データ
X_train = np.load('/content/drive/MyDrive/Colab Notebooks/Homework (6)/x_train.npy', allow_pickle=True)
T_train = np.load('/content/drive/MyDrive/Colab Notebooks/Homework (6)/t_train.npy', allow_pickle=True)





In [33]:
# 検証データを取る
x_train, x_valid, t_train, t_valid = train_test_split(X_train, T_train, test_size=0.2, random_state=seed)

# テストデータ
x_test = np.load('/content/drive/MyDrive/Colab Notebooks/Homework (6)/x_test.npy', allow_pickle=True)


def text_transform(text: List[int], max_length=256):
    # <BOS>はすでに1で入っている．<EOS>は2とする．
    text = text[:max_length - 1] + [2]

    return text, len(text)

def collate_batch(batch):
    label_list, text_list, len_seq_list = [], [], []

    for sample in batch:
        if isinstance(sample, tuple):
            label, text = sample

            label_list.append(label)
        else:
            text = sample.copy()

        text, len_seq = text_transform(text)
        text_list.append(torch.tensor(text))
        len_seq_list.append(len_seq)

    # NOTE: 宿題用データセットでは<PAD>は3です．
    return torch.tensor(label_list), pad_sequence(text_list, padding_value=3).T, torch.tensor(len_seq_list)


word_num = np.concatenate(np.concatenate((x_train, x_test))).max() + 1
print(f"単語種数: {word_num}")

単語種数: 88587


In [3]:
batch_size = 128

train_dataloader = DataLoader(
    [(t, x) for t, x in zip(t_train, x_train)],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
valid_dataloader = DataLoader(
    [(t, x) for t, x in zip(t_valid, x_valid)],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    x_test,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)

In [23]:
def torch_log(x):
    return torch.log(torch.clamp(x, min=1e-10))


class Embedding(nn.Module):
    def __init__(self, emb_dim, vocab_size):
        super().__init__()
        self.embedding_matrix = nn.Parameter(torch.rand((vocab_size, emb_dim),
                                                        dtype=torch.float))

    def forward(self, x):
        return F.embedding(x, self.embedding_matrix)

class LSTM(nn.Module):
    def __init__(self, in_dim, hid_dim):
        super().__init__()
        self.hid_dim = hid_dim
        glorot = 6/(in_dim + hid_dim*2)

        self.W_i = nn.Parameter(torch.tensor(np.random.uniform(
                        low=-np.sqrt(glorot),
                        high=np.sqrt(glorot),
                        size=(in_dim + hid_dim, hid_dim)
                    ).astype('float32')))
        self.b_i = nn.Parameter(torch.tensor(np.zeros([hid_dim]).astype('float32')))

        self.W_f = nn.Parameter(torch.tensor(np.random.uniform(
                        low=-np.sqrt(glorot),
                        high=np.sqrt(glorot),
                        size=(in_dim + hid_dim, hid_dim)
                    ).astype('float32')))
        self.b_f = nn.Parameter(torch.tensor(np.zeros([hid_dim]).astype('float32')))

        self.W_o = nn.Parameter(torch.tensor(np.random.uniform(
                        low=-np.sqrt(glorot),
                        high=np.sqrt(glorot),
                        size=(in_dim + hid_dim, hid_dim)
                    ).astype('float32')))
        self.b_o = nn.Parameter(torch.tensor(np.zeros([hid_dim]).astype('float32')))

        self.W_c = nn.Parameter(torch.tensor(np.random.uniform(
                        low=-np.sqrt(glorot),
                        high=np.sqrt(glorot),
                        size=(in_dim + hid_dim, hid_dim)
                    ).astype('float32')))
        self.b_c = nn.Parameter(torch.tensor(np.zeros([hid_dim]).astype('float32')))

    def function(self, state_c, state_h, x):
        i = torch.sigmoid(torch.matmul(torch.cat([state_h, x], dim=1), self.W_i) + self.b_i)
        f = torch.sigmoid(torch.matmul(torch.cat([state_h, x], dim=1), self.W_f) + self.b_f)
        o = torch.sigmoid(torch.matmul(torch.cat([state_h, x], dim=1), self.W_o) + self.b_o)
        c = f*state_c + i*torch.tanh(torch.matmul(torch.cat([state_h, x], dim=1), self.W_c) + self.b_c)
        h = o*torch.tanh(c)
        return c, h

    def forward(self, x, len_seq_max=0, init_state_c=None, init_state_h=None):
        x = x.transpose(0, 1)  # 系列のバッチ処理のため、次元の順番を「系列、バッチ」の順に入れ替える
        state_c = init_state_c
        state_h = init_state_h
        if init_state_c is None:  # 初期値を設定しない場合は0で初期化する
            state_c = torch.zeros((x[0].size()[0], self.hid_dim)).to(x.device)
        if init_state_h is None:  # 初期値を設定しない場合は0で初期化する
            state_h = torch.zeros((x[0].size()[0], self.hid_dim)).to(x.device)

        size = list(state_h.unsqueeze(0).size())
        size[0] = 0
        output = torch.empty(size, dtype=torch.float).to(x.device)  # 一旦空テンソルを定義して順次出力を追加する

        if len_seq_max == 0:
            len_seq_max = x.size(0)
        for i in range(len_seq_max):
            state_c, state_h = self.function(state_c, state_h, x[i])
            output = torch.cat([output, state_h.unsqueeze(0)])  # 出力系列の追加
        return output



# class SequenceTaggingNet(nn.Module):
#     # WRITE ME


class BidirectionalLSTM(nn.Module):
    def __init__(self, word_num, emb_dim, hid_dim, dropout=0.5):
        super().__init__()
        # 順方向と逆方向のLSTMを用意する
        self.emb = nn.Embedding(word_num, emb_dim)
        self.dropout = nn.Dropout(dropout)
        self.forward_lstm = nn.LSTM(emb_dim, hid_dim, 1, batch_first=True)
        self.backward_lstm = nn.LSTM(emb_dim, hid_dim, 1, batch_first=True)
        self.linear = nn.Linear(hid_dim*2, 1) # ForwardとBackwardの出力をconcatしたものを渡すので2倍

    def forward(self, x, len_seq_max=0, len_seq=None, init_state=None):
        h = self.emb(x) # (batch_size, seq_length, emb_dim)
        h = self.dropout(h)
        # Backwardにはシークエンスを反転して渡す
        if len_seq_max > 0:
            h1, _ = self.forward_lstm(h[:, 0:len_seq_max, :], init_state)
            h2, _ = self.backward_lstm(torch.flip(h[:, 0:len_seq_max, :], dims=[1]), init_state)
        else:
            h1, _ = self.forward_lstm(h, init_state) # (batch_size, seq_length, hid_dim)
            h2, _ = self.backward_lstm(torch.flip(h, dims=[1]), init_state) # (batch_size, seq_length, hid_dim)
        # Backwardから返ってきたものを再び反転する
        h2 = torch.flip(h2, dims=[1])

        # ForwardとBackwardの出力を結合
        h = torch.cat([h1, h2], dim=2).transpose(0, 1)

        if len_seq is not None:
            h = h[len_seq - 1, list(range(len(x))), :]
        else:
            h = h[-1]

        y = self.linear(h)

        return y

In [None]:
emb_dim = 100
hid_dim = 30
n_epochs = 50
device = 'cuda'

# net = SequenceTaggingNet(word_num, emb_dim, hid_dim)
net = BidirectionalLSTM(word_num, emb_dim, hid_dim)
net.to(device)
optimizer = optim.Adam(net.parameters())

In [34]:
for epoch in range(n_epochs):
    losses_train = []
    losses_valid = []


    net.train()
    n_train = 0
    acc_train = 0
    for label, line, len_seq in train_dataloader:

        net.zero_grad()

        x = line.to(device)
        len_seq.to(device)
        t = label.to(device)

        h = net(x, torch.max(len_seq), len_seq)
        y = torch.sigmoid(h).squeeze()

        loss = -torch.mean(t*torch_log(y) + (1 - t)*torch_log(1 - y))

        loss.backward()

        torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)

        optimizer.step()

        losses_train.append(loss.tolist())

        n_train += t.size()[0]

    # Valid
    t_valid = []
    y_pred = []
    net.eval()
    for label, line, len_seq in valid_dataloader:

        # WRITE ME
        x = line.to(device)
        len_seq.to(device)
        t = label.to(device)

        h = net(x, torch.max(len_seq), len_seq)
        y = torch.sigmoid(h).squeeze()

        loss = -torch.mean(t*torch_log(y) + (1 - t)*torch_log(1 - y))

        pred = y.round().squeeze()  # 0.5以上の値を持つ要素を正ラベルと予測する

        t_valid.extend(t.tolist())
        y_pred.extend(pred.tolist())

        losses_valid.append(loss.tolist())

    print('EPOCH: {}, Train Loss: {:.3f}, Valid Loss: {:.3f}, Validation F1: {:.3f}'.format(
        epoch,
        np.mean(losses_train),
        np.mean(losses_valid),
        f1_score(t_valid, y_pred, average='macro')
    ))

EPOCH: 0, Train Loss: 0.024, Valid Loss: 0.789, Validation F1: 0.873
EPOCH: 1, Train Loss: 0.023, Valid Loss: 0.878, Validation F1: 0.866
EPOCH: 2, Train Loss: 0.025, Valid Loss: 0.800, Validation F1: 0.870
EPOCH: 3, Train Loss: 0.022, Valid Loss: 0.873, Validation F1: 0.866
EPOCH: 4, Train Loss: 0.021, Valid Loss: 0.861, Validation F1: 0.868
EPOCH: 5, Train Loss: 0.022, Valid Loss: 0.868, Validation F1: 0.866
EPOCH: 6, Train Loss: 0.023, Valid Loss: 0.806, Validation F1: 0.869
EPOCH: 7, Train Loss: 0.021, Valid Loss: 0.868, Validation F1: 0.865
EPOCH: 8, Train Loss: 0.018, Valid Loss: 0.927, Validation F1: 0.866
EPOCH: 9, Train Loss: 0.020, Valid Loss: 0.862, Validation F1: 0.871


KeyboardInterrupt: 

In [35]:
net.eval()

y_pred = []
for _, line, len_seq in test_dataloader:

    x = line.to(device)
    len_seq.to(device)

    h = net(x, torch.max(len_seq), len_seq)
    y = torch.sigmoid(h).squeeze()

    pred = y.round().squeeze()  # 0.5以上の値を持つ要素を正ラベルと予測する

    y_pred.extend(pred.tolist())


submission = pd.Series(y_pred, name='label')
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/Homework (6)/submission_pred.csv', header=True, index_label='id')