In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read and write
# files below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Miscellaneous setup
import warnings
warnings.simplefilter('ignore') # suppress all warnings
# Work from the project directory; later cells open paths relative to it
# (e.g. 'preprocess/...').
%cd /content/drive/MyDrive/grad_comp

/content/drive/MyDrive/grad_comp


In [None]:
# ====================
# ライブラリのインストール
# ====================

! pip install torch==1.6.0
! pip install torchtext==0.7.0
! pip install pytorch-lightning==1.0.8

In [None]:
# ====================
# ライブラリの読み込み
# ====================
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchtext.data import Example, Field, Dataset, BucketIterator
import numpy as np
from sklearn.metrics import cohen_kappa_score

In [None]:
# ====================
# Preprocessing: load the labels
# ====================

# Raw labels of the dev split and the set of distinct label values
labels = np.loadtxt('/content/drive/MyDrive/grad_comp/data/label.dev.txt')
label_set = set(labels)

# Dictionary that converts a label value into a class ID.
# IDs are 0-based and follow ascending label order. Enumerating the distinct
# values gives each label exactly one ID; numbering while looping over every
# row is only correct when each label occurs at least twice and otherwise
# produces -1 or an ID shared by two labels.
label2id = {int(label): class_id for class_id, label in enumerate(sorted(label_set))}

labels, label2id # display the raw labels and the 0-based label -> ID mapping

(array([ 0., -2., -1., ...,  2.,  2.,  2.]), {-2: 0, -1: 1, 0: 2, 1: 3, 2: 4})

In [None]:
# Change into the project directory (same target as the %cd in the setup cell).
%cd /content/drive/MyDrive/grad_comp

/content/drive/MyDrive/grad_comp


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Preprocessing variant: sub-directory of 'preprocess/' that holds the text files
prepro = 'sudachi/C'


def _read_lines(path):
    """Return the lines of the text file at ``path`` as a list of strings.

    The content is split on newline characters, and the single empty string
    left behind by a trailing newline is dropped. This replaces deleting a
    hard-coded index, which breaks as soon as the number of lines changes.
    The file handle is closed before the function returns.
    """
    with open(path, 'r') as f:
        lines = f.read().split('\n')
    if lines and lines[-1] == '':
        lines.pop()
    return lines


# Text data: one example per line
train_data = _read_lines('preprocess/' + prepro + '/text.prep_train.txt')
test_data = _read_lines('preprocess/' + prepro + '/text.prep_test.txt')
dev_data = _read_lines('preprocess/' + prepro + '/text.prep_dev.txt')

# Labels
y_train = np.loadtxt('/content/drive/MyDrive/grad_comp/data/label.train.txt')
y_dev = np.loadtxt('/content/drive/MyDrive/grad_comp/data/label.dev.txt')
y_test = np.loadtxt('/content/drive/MyDrive/grad_comp/data/label.dummy.txt')

# Pseudo-labelling: append the test texts to the training texts and the values
# loaded from pseudo/504.txt to the training labels.
pseudo_data = np.loadtxt('/content/drive/MyDrive/grad_comp/pseudo/504.txt')
train_data = np.concatenate([train_data,test_data])
y_train = np.concatenate([y_train, pseudo_data])
# Texts and labels are paired positionally later on, so they must line up.
assert train_data.shape == y_train.shape, "training texts and labels are misaligned"
print(train_data.shape,y_train.shape)

# Disabled label shift, kept as a string literal: load_corpus maps labels to
# class IDs through label2id, so the labels are left unshifted here.
"""
y_train += 2
y_dev += 2
y_test += 2
"""

(32500,) (32500,)


'\ny_train += 2\ny_dev += 2\ny_test += 2\n'

In [None]:
# ====================
# Preprocessing: build the data loaders
# ====================

# Field definitions. Examples are created from already-split word lists (see
# load_corpus), so no tokenizer is passed; the original note wondered whether
# an explicit tokenizer might still be needed here.
text_field = Field(
    sequential=True,
    use_vocab=True,
    tokenize=None,
)
label_field = Field(
    sequential=False,
    use_vocab=False,
    is_target=True,
)

# Attribute "x" carries the token sequence, "t" the label ID
fields = [("x", text_field), ("t", label_field)]

# Device the training / validation batches are placed on
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def load_corpus(text,labels):
    """Build a torchtext ``Dataset`` from parallel texts and labels.

    Each text is split on single spaces into a word list; each label is cast
    to ``int`` and converted to a class ID through the module-level
    ``label2id``. Every (word list, class ID) pair becomes one ``Example``
    laid out according to the module-level ``fields``.

    Args:
        text: iterable of space-separated strings.
        labels: iterable of label values, paired with ``text`` by position
            (``zip`` stops at the shorter of the two).

    Returns:
        A ``Dataset`` holding one example per (text, label) pair.
    """
    examples = [
        Example.fromlist([sentence.split(' '), label2id[int(raw_label)]], fields)
        for sentence, raw_label in zip(text, labels)
    ]
    return Dataset(examples, fields)


# One Dataset per split
dataset_train = load_corpus(train_data, y_train)
dataset_val = load_corpus(dev_data, y_dev)
dataset_test = load_corpus(test_data, y_test)

# Register the vocabulary: words of the training dataset get IDs (min_freq=2)
text_field.build_vocab(dataset_train, min_freq=2)

batch_size = 4

# Create an iterator for each dataset object
dataloader_train = BucketIterator(
    dataset_train,
    batch_size=batch_size,
    shuffle=True,
    device=device,
)
dataloader_val = BucketIterator(
    dataset_val,
    batch_size=batch_size,
    shuffle=False,
    device=device,
)
# The test iterator yields one example per batch and gets no explicit device
dataloader_test = BucketIterator(
    dataset_test,
    batch_size=1,
    shuffle=False,
)

In [None]:
# Set CUDA_LAUNCH_BLOCKING=1 in the kernel's environment (debugging aid:
# presumably so that CUDA errors surface at the failing call — confirm it is
# still wanted for normal training runs).
%env CUDA_LAUNCH_BLOCKING=1

env: CUDA_LAUNCH_BLOCKING=1


In [None]:
# ====================
# Text classification with an RNN
# ====================

class RNN(pl.LightningModule):
    """LSTM text classifier: embedding -> (bi)LSTM -> linear layer.

    Training and validation use cross-entropy loss; the test step logs the
    per-batch classification accuracy.
    """

    def __init__(self, n_input, n_embed, n_hidden, n_layers, n_output, dropout, bidirectional):
        """Create the embedding, LSTM and fully connected layers.

        Args:
            n_input: number of rows of the embedding table.
            n_embed: embedding dimension (LSTM input size).
            n_hidden: LSTM hidden size per direction.
            n_layers: number of stacked LSTM layers.
            n_output: size of the output vector of the linear layer.
            dropout: dropout value passed to ``nn.LSTM``.
            bidirectional: whether the LSTM is bidirectional.
        """
        super().__init__()
        # padding_idx=1: presumably the ID of the pad token in the torchtext
        # vocabulary — confirm against the vocab.
        self.embed = nn.Embedding(
            num_embeddings=n_input,
            embedding_dim=n_embed,
            padding_idx=1,
        )
        self.lstm = nn.LSTM(
            input_size=n_embed,
            hidden_size=n_hidden,
            num_layers=n_layers,
            dropout=dropout,
            bidirectional=bidirectional,
        )
        # The linear layer takes twice the hidden size for a bidirectional LSTM.
        fc_width = n_hidden * 2 if bidirectional == True else n_hidden
        self.fc = nn.Linear(in_features=fc_width, out_features=n_output)

    def forward(self, x):
        """Forward pass.

        Embeds ``x``, runs the LSTM, averages the LSTM output at the last
        index and at the middle index (``size(0) // 2``) of the first
        dimension, and feeds the result to the linear layer.
        """
        outputs, _ = self.lstm(self.embed(x))
        middle = outputs.size(0) // 2
        pooled = (outputs[-1] + outputs[middle]) / 2
        # Other pooling options noted by the author: mean, max, attention
        return self.fc(pooled)

    def _batch_loss(self, batch):
        """Unpack ``batch`` into inputs and targets and return the loss."""
        x, t = batch
        return self.lossfun(self(x), t)

    def training_step(self, batch, batch_idx):
        """Compute, log ("train_loss") and return the loss of a training batch."""
        loss = self._batch_loss(batch)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log ("val_loss") the loss of a validation batch."""
        self.log("val_loss", self._batch_loss(batch))

    def test_step(self, batch, batch_idx):
        """Log ("test_acc") the fraction of correct predictions in a test batch."""
        x, t = batch
        predicted = torch.argmax(self(x), dim=1)
        n_correct = (t == predicted).sum().item()
        self.log("test_acc", n_correct / float(len(predicted)))

    def lossfun(self, y, t):
        """Cross-entropy between the outputs ``y`` and the targets ``t``."""
        return F.cross_entropy(y, t)

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 2e-4."""
        return torch.optim.Adam(self.parameters(), lr=2e-4)

In [None]:
# ====================
# 訓練
# ====================

# パラメータ設定
EMBED = 128
HIDDEN = 256
LAYER = 2
DROPOUT = 0.5
BID = True

# モデルの保存用ディレクトリがすでにあり、新たに訓練する場合は、そのディレクトリを消す
# BiLSTM
! rm -r model_rnnC

n_input = len(text_field.vocab)
n_embed = EMBED  # 単語ベクトルの次元
n_hidden = HIDDEN  # 文ベクトルの次元
n_layers = LAYER  # RNN層の数
n_output = len(label2id)  # 出力ベクトルの次元（=ラベルの種類数）
dropout = DROPOUT  # ドロップアウトによる正則化の割合
bidirectional = BID

rnn1 = RNN(n_input, n_embed, n_hidden, n_layers, n_output, dropout, bidirectional)

# 訓練中にモデルを保存するための設定
checkpoint = pl.callbacks.ModelCheckpoint(
    # 検証用データにおける損失が最も小さいモデルを保存する
    monitor="val_loss", mode="min", save_top_k=3,
    # モデルファイル（重みのみ）を "model" というディレクトリに保存する
    save_weights_only=True, dirpath="model_rnnC/"
)
early_stopping = pl.callbacks.EarlyStopping(
    monitor='val_loss',
    mode="min",
    patience=5,
)

# 訓練
trainer = pl.Trainer(gpus = 1,max_epochs=30, callbacks=[checkpoint,early_stopping]) # early_stopping
trainer.fit(rnn1, dataloader_train, dataloader_val)

# ベストモデルの確認
print("ベストモデル: ", checkpoint.best_model_path)
print("ベストモデルの検証用データにおける損失: ", checkpoint.best_model_score)

## 推論

In [None]:
# Path of the best checkpoint recorded by the ModelCheckpoint callback
CHECK_PATH = checkpoint.best_model_path

# Constructor arguments, identical to the ones used for training
n_input = len(text_field.vocab)
n_embed, n_hidden, n_layers = EMBED, HIDDEN, LAYER  # word-vector dim, sentence-vector dim, RNN layers
n_output = len(label2id)  # output dimension (= number of label classes)
dropout, bidirectional = DROPOUT, BID  # dropout value, bidirectional or not

# Load the best model
rnn = RNN.load_from_checkpoint(
    checkpoint_path=CHECK_PATH,
    n_input=n_input,
    n_embed=n_embed,
    n_hidden=n_hidden,
    n_layers=n_layers,
    n_output=n_output,
    dropout=dropout,
    bidirectional=bidirectional,
)

# 正解率の計算
def test_model(test_loader):

    device = "cpu"

    with torch.no_grad():

        y_preds = [] # 各バッチごとの結果格納用

        for batch in test_loader:
            x, t = batch
            x = x.to(device)
            t = t.to(device)
            y = rnn(x)
            y_label = torch.argmax(y, dim=1) - 2
            y_label = y_label.to('cpu').detach().numpy().copy()
            y_preds.append(y_label)
    return y_preds
# Predict on the test data and show the number of batches plus a small sample
y_preds = test_model(dataloader_test)
print(len(y_preds), y_preds[:10])

# Write the submission file: one predicted label per line

with open('rnn_eval.txt','w') as f:
  for batch_preds in y_preds:
    # Each entry is a per-batch array; flatten it so every prediction is
    # written, whatever the batch size, without converting a whole array
    # to a scalar.
    for y_pred in np.ravel(batch_preds):
      f.write(str(int(y_pred)) + '\n')