In [32]:
"""
BERTの勉強 note1
"""
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from transformers import AutoModel, AutoTokenizer

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import glob, pickle

pretrained_model_name = "cl-tohoku/bert-base-japanese"

In [13]:
# データ読み込み
# 事前にトークナイズして保存しておいてもいい
df_train = pd.read_csv("../../DataSet/ldcc/reshaped/train.tsv", sep="\t", header=None)
df_valid = pd.read_csv("../../DataSet/ldcc/reshaped/valid.tsv", sep="\t", header=None)
df_test = pd.read_csv("../../DataSet/ldcc/reshaped/test.tsv", sep="\t", header=None)


# textとlabelに分ける
text_train, labels_train = list(df_train[0].values), list(df_train[1].values)
text_valid, labels_valid = list(df_valid[0].values), list(df_valid[1].values)
text_test,  labels_test  = list(df_test[0].values),  list(df_test[1].values)


In [15]:
# トークナイザを事前モデルからロード
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

In [16]:
# トークナイズ処理
enc_train = tokenizer(text_train, truncation=True, padding=True)
enc_valid = tokenizer(text_valid, truncation=True, padding=True)
enc_test = tokenizer(text_test, truncation=True, padding=True)

In [17]:
# タスク用Datasetクラスを定義
class LivedoorDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = { key: torch.tensor(val[idx]) for key, val in self.encodings.items() }
        item["labels"] = torch.tensor(self.labels[idx]) # item["label"]でなくitem["labels"]が正しい！
        return item
    
    def __len__(self):
        return len(self.labels)     

In [18]:
# Datasetを作成
ds_train = LivedoorDataset(enc_train, labels_train)
ds_valid = LivedoorDataset(enc_valid, labels_valid)
ds_test = LivedoorDataset(enc_test, labels_test)

# pkl保存
with open("../../DataSet/ldcc/dataloader/ds_train.pkl", "wb") as f:
    pickle.dump(ds_train, f)
with open("../../DataSet/ldcc/dataloader/ds_valid.pkl", "wb") as f:
    pickle.dump(ds_valid, f)
with open("../../DataSet/ldcc/dataloader/ds_test.pkl", "wb") as f:
    pickle.dump(ds_test, f)


#### データの準備ここまで
ここからはファインチューニング用にモデル作る\
自作だろうがTransformersのモデル使おうが上記までは共通事項

In [19]:
batch_size_train = 16
batch_size_val = 64

bt_train = DataLoader(ds_train, batch_size=batch_size_train)
bt_val = DataLoader(ds_valid, batch_size=batch_size_val)
dataloader_dict = {"train": bt_train, "val": bt_val}

In [20]:
"""
ファインチューニング用モデル
"""
class BertClassifier(nn.Module):
    def __init__(self, pretrained_model):
        super(BertClassifier, self).__init__()
        
        self.bert = pretrained_model
        self.dropout = nn.Dropout(p=.1)
        self.classifier = nn.Linear(in_features=768, out_features=9) #9カテゴリのクラス分類
        
        # 重み初期化
        nn.init.normal_(self.classifier.weight, std=.02)
        nn.init.normal_(self.classifier.bias, 0)
        
    def forward(self, input_ids, labels=None, **kwargs):
        output = self.bert(input_ids)
        pooler_output = output.pooler_output
        pooler_output = self.dropout(pooler_output)
        output_classifier = self.classifier(pooler_output)
        return output_classifier        

In [21]:
# 事前学習モデルを用意
pretrained_model = AutoModel.from_pretrained(pretrained_model_name)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# 自作Bertファインチューニングモデル
my_model = BertClassifier(pretrained_model)

my_model

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [31]:
# 計算時間短縮・モデルの性能低下の防止を目的に事前学習のレイヤを学習に関与しないようにする（＝Freeze）ことがある
# Freezeのやり方は各レイヤのparameters()で取り出せるparam.requires_gradをT/FでOn/Offする

# これにより全パラメータ固定
for param in my_model.parameters():
    param.requires_grad = False

# BERTの最終層の更新をON
for param in my_model.bert.encoder.layer[-1].parameters():
    param.requires_grad = True
# ドロップアウト層の更新をON
for param in my_model.dropout.parameters():
    param.requires_grad = True
# クラス分類層の更新をON
for param in my_model.classifier.parameters():
    param.requires_grad = True

In [33]:
# 最適化関数（重み更新式）の定義 Adam使う
optimizer = optim.Adam([
    {"params": my_model.bert.encoder.layer[-1].parameters(), "lr":5e-5},
    {"params": my_model.dropout.parameters(), "lr":1e-3},
    {"params": my_model.classifier.parameters(), "lr":1e-4},
])

# 損失関数　クラス分類なのでCrossEntropy
criterion = nn.CrossEntropyLoss()

#### モデルの定義ここまで
ここからは学習の工程を定義する


In [43]:
def train_model(net, dataloader_dict, criterion, optimizer, num_epochs):
    # GPU利用可能ならそうする
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net.to(device) # モデルをGPU or CPUに送る
    
    torch.backends.cudnn.benchmark = True # 高速化するらしい？
    
    # 以下、epochループ
    for epoch in tqdm(range(num_epochs)):
        # train と val をそれぞれ実施
        for phase in ["train", "val"]:
            batch_size = dataloader_dict[phase].batch_size
            if phase == "train":
                net.train()
            else:
                net.eval()
            
            epoch_loss = .0
            epoch_corrects = 0
            iteration = 1
            
            for batch in (dataloader_dict[phase]): # ミニバッチとしてデータ取り出す
                inputs = batch["input_ids"].to(device) # GPU or CPUへデータ送る
                labels = batch["labels"].to(device) # GPU or CPUへデータ送る
                
                optimizer.zero_grad() # optimizer初期化
                
                # feed forward処理
                with torch.set_grad_enabled(phase == "train"):
                    outputs = net(inputs)
                    loss = criterion(outputs, labels) # nn.CrossEntropyLossは内部でSoftmax相当の処理をするのでoutput直でOK
                    _, preds = torch.max(outputs, 1)
                    
                    if phase == "train": # trainモードならback prop
                        loss.backward() # 損失から勾配計算しパラメータへ逆伝播
                        optimizer.step() # パラメータ更新
                    
                        if (iteration % 10 == 0):
                            acc = (torch.sum(preds == labels.data)).double() / batch_size
                            print(f"It:{iteration:3d}|Loss: {loss.item():.4f}|accuracy:{acc:.4f}")
                
                iteration += 1
                
                epoch_loss += loss.item() * batch_size
                epoch_corrects += torch.sum(preds == labels.data)
            
        # epoch毎に評価結果を出力
        epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
        epoch_acc = epoch_corrects.double() / len(dataloader_dict[phase].dataset)
        print(f"Epoch {epoch+1} / {num_epochs} | {phase} | Loss: {epoch_loss:.4f} | Acc: {epoch_acc}")
    
    return net #訓練後のモデルを出力    

In [44]:
# ファインチューニング
num_epochs = 1
my_model_trained = train_model(my_model, dataloader_dict, criterion, optimizer, num_epochs=num_epochs)


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

It: 10|Loss: 0.5591|accuracy:0.8125
It: 20|Loss: 0.3166|accuracy:0.8750
It: 30|Loss: 0.0837|accuracy:1.0000
It: 40|Loss: 0.3079|accuracy:0.9375
It: 50|Loss: 0.4084|accuracy:0.8125
It: 60|Loss: 0.1415|accuracy:0.9375
It: 70|Loss: 0.5575|accuracy:0.8125
It: 80|Loss: 0.3694|accuracy:0.8750
It: 90|Loss: 0.1939|accuracy:0.9375
It:100|Loss: 0.4419|accuracy:0.8750
It:110|Loss: 0.1446|accuracy:1.0000
It:120|Loss: 0.5115|accuracy:0.8750
It:130|Loss: 0.6629|accuracy:0.6875
It:140|Loss: 0.3873|accuracy:0.8750
It:150|Loss: 0.2579|accuracy:0.9375
It:160|Loss: 0.1507|accuracy:0.9375
It:170|Loss: 0.3413|accuracy:0.9375
It:180|Loss: 0.3615|accuracy:0.8750
It:190|Loss: 0.2028|accuracy:0.9375
It:200|Loss: 0.2679|accuracy:0.8750
It:210|Loss: 0.2646|accuracy:0.8750
It:220|Loss: 0.1433|accuracy:0.9375
It:230|Loss: 0.4972|accuracy:0.7500
It:240|Loss: 0.1533|accuracy:1.0000
It:250|Loss: 0.3262|accuracy:0.9375
It:260|Loss: 0.0727|accuracy:1.0000
It:270|Loss: 0.4632|accuracy:0.8750
It:280|Loss: 0.1239|accuracy

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [03:26<00:00, 206.22s/it]

Epoch 1 / 1 | val | Loss: 0.2856 | Acc: 0.9050203527815467





In [45]:
# ファインチューニングしたモデルをテストデータで精度評価
bt_test = DataLoader(ds_test, batch_size=32)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
my_model_trained.eval()
my_model_trained.to(device)

epochs_corrects = 0

for batch in tqdm(bt_test):
    inputs = batch["input_ids"].to(device)
    labels = batch["labels"].to(device)
    
    with torch.no_grad():
        outputs = my_model_trained(inputs)
        loss = criterion(outputs, labels)
        
        _, preds = torch.max(outputs, 1)
        epochs_corrects += torch.sum(preds == labels.data)

epoch_acc = epochs_corrects.double() / (len(bt_test) * bt_test.batch_size)
print(f"テストデータ{len(ds_test)}個でのaccuracy: {epoch_acc:.4f}")

100%|██████████████████████████████████████████████████████████████████████████████████| 24/24 [00:18<00:00,  1.30it/s]

テストデータ737個でのaccuracy: 0.8724



