In [2]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

from transformers import BertJapaneseTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix

MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
print(torch.cuda.is_available())

True


### 事前準備
学習データ成形、事前学習クラスのロード、BERTを使ったモデルのクラス作成

In [2]:
# ポジネガデータセット
df_dataset = pd.read_csv(
    'D:/DataSet/chABSA-dataset/chABSA-dataset/dataset.tsv',
    sep='\t', 
    header=None
).rename(columns={0:'text', 1:'label'}).loc[:, ['text', 'label']]

# ひとまずこういうFmtのデータに成形するところまでがんばる
df_dataset

Unnamed: 0,text,label
0,当社グループを取り巻く環境は、実質賃金が伸び悩むなか、消費者の皆様の生活防衛意識の高まりや節...,0
1,春から夏にかけましては個人消費の低迷などにより、きのこの価格は厳しい状況で推移いたしました,0
2,台湾の現地法人「台灣北斗生技股份有限公司」におきましては、ブランドの構築、企画提案などに力を...,0
3,化成品事業におきましては、引き続き厳しい販売環境にありましたが、中核である包装資材部門におき...,0
4,以上の結果、化成品事業の売上高は92億45百万円（同1.7％減）となりました,0
...,...,...
2808,当連結会計年度におきましては、連結子会社のデジタル・アドバタイジング・コンソーシアム株式会社...,1
2809,新規の自動ドアの売上台数は僅かに減少したものの、シートシャッターの大型物件に加え、取替の売上...,1
2810,"加えて、保守契約が堅調に増加し、売上高は6,952百万円（前年同期比1.2％増）となりました",1
2811,利益につきましては、取替工事の増加及び保守契約による安定的な利益の確保により、セグメント利益...,1


In [3]:
"""
https://dreamer-uma.com/pytorch-dataset/

対象タスクのデータを扱うDataset
データの格納と引き出し　DataLoaderと組み合わせてミニバッチ学習が可能
Datasetを自作する場合は必ず以下のメソッドを実装すること
__len__(): Datasetのサイズ（データ数）
__getitem__(): Datasetの要素にアクセス

"""
class PosiNegaDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.encodings["input_ids"])
    
    def __getitem__(self, idx):
        item = { k: torch.tensor(v[idx]) for k, v in self.encodings.items() }
#         item = { k: torch.tensor(v[idx]).cuda() for k, v in self.encodings.items() }
        item["labels"] = torch.tensor(self.labels[idx])
        return item

### 事前準備、以上
ここからは上で用意した各種クラスやモデルやデータセットを使ってタスクを解くコーディングをしていく

In [4]:
# トークナイザ
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

# 事前学習モデル
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# model = model.cuda()  
model.device

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

device(type='cuda', index=0)

In [5]:
# 特徴量X、ラベルyを取得
X, y = df_dataset["text"].values, df_dataset["label"].values

# train, val分割
# random_stateはシャッフルの乱数シード固定、stratifyは正例、負例のラベル数均一にする処理
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=0, stratify=y_val)


# トークナイザでモデルへのinputとなるようencodingする
max_len = 256 #512

enc_train = tokenizer(
    X_train.tolist(), 
    add_special_tokens=True, 
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
# tokenizer.convert_ids_to_tokens(enc_train['attention_masks'].tolist()[0])

enc_val = tokenizer(
    X_val.tolist(), 
    add_special_tokens=True, 
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

enc_test = tokenizer(
    X_test.tolist(), 
    add_special_tokens=True, 
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

In [6]:
# Datasetを作成
ds_train = PosiNegaDataset(enc_train, y_train)
ds_val = PosiNegaDataset(enc_val, y_val)

# DataLoaderを作成
batch_size = 8
dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
dl_val = DataLoader(ds_val, batch_size=batch_size, shuffle=False)

In [7]:
# compute_matricsを定義
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)
    
    return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1": f1}


In [8]:
# Trainerを作成
training_args = TrainingArguments(
    output_dir='./outputs',
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir='./logs',
    logging_steps=10,
    no_cuda=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    compute_metrics=compute_metrics,
)


In [9]:
# ファインチューニング
trainer.train()

***** Running training *****
  Num examples = 2250
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 282
  item = { k: torch.tensor(v[idx]) for k, v in self.encodings.items() }


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=282, training_loss=0.423732172724203, metrics={'train_runtime': 95.775, 'train_samples_per_second': 23.493, 'train_steps_per_second': 2.944, 'total_flos': 295999937280000.0, 'train_loss': 0.423732172724203, 'epoch': 1.0})

### モデルのテスト
testデータを使って推論の精度確認

In [10]:
# データセット
ds_test = PosiNegaDataset(enc_test, y_test)

# モデル読み込み
model_path = "./outputs/checkpoint-500"
trained_model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)
test_trainer = Trainer(trained_model)

# 推論
raw_pred, _, _ = test_trainer.predict(ds_test)

y_pred = np.argmax(raw_pred, axis=1)

loading configuration file ./outputs/checkpoint-500\config.json
Model config BertConfig {
  "_name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "tokenizer_class": "BertJapaneseTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file ./outputs/checkpoint-500\pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassif

In [11]:
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

print(accuracy, recall, precision, f1)

0.9148936170212766 0.874251497005988 0.9798657718120806 0.9240506329113924


In [12]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[112   3]
 [ 21 146]]


### 実用フェーズ
作成したモデルで推論してみる

In [3]:
# テスト用トークナイザ
tokenizer_for_test = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
# ファインチューニング済みモデル
model_path = "./outputs/checkpoint-500"
trained_model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# 使ってみる
text = input('> ')

encoding = tokenizer_for_test( 
    text,
    add_special_tokens=True, 
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

#print(tokenizer.convert_ids_to_tokens(encoding["input_ids"].tolist()[0]))

if trained_model.device.type == 'cuda':
    encoding = { k: v.cuda() for k, v in encoding.items() }

with torch.no_grad():
    output = trained_model(**encoding)


> 当社の売り上げは１０年連続減少、経営黒字が続いている


In [6]:
# 分類スコア
scores = output.logits
# 確率
prob = scores.softmax(dim=1)
# 予測ラベル
predicted_labels = scores.argmax(-1)

print("分類スコア")
print(scores)
print("確率")
print(prob)
print("推論結果")
print(predicted_labels)

分類スコア
tensor([[ 2.0845, -2.4853]])
確率
tensor([[0.9897, 0.0103]])
推論結果
tensor([0])


In [13]:
texts = [
    '当社の売り上げは１０年連続減少、経営赤字が続いている',
    '新製品の開発に成功、収益は過去最高を達成しました',
    'あの映画は本当に面白いからぜひ見てね',    
]
encoding = tokenizer_for_test( 
    texts,
    add_special_tokens=True, 
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)
if trained_model.device.type == 'cuda':
    encoding = { k: v.cuda() for k, v in encoding.items() }

with torch.no_grad():
    output = trained_model(**encoding)

scores = output.logits
prob = scores.softmax(dim=1)
predicted_labels = scores.argmax(-1)
print("分類スコア")
print(scores)
print("確率")
print(prob)
print("推論結果")
print(predicted_labels)

分類スコア
tensor([[ 2.1422, -2.4919],
        [-0.4833,  2.7143],
        [-0.5637,  2.2063]])
確率
tensor([[0.9904, 0.0096],
        [0.0393, 0.9607],
        [0.0590, 0.9410]])
推論結果
tensor([0, 1, 1])
