<a href="https://colab.research.google.com/github/ITU-TKG/Websystem_A7_2/blob/master/JNLI_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 作業用フォルダの作成
from google.colab import drive
drive.mount('/content/drive')
!mkdir -p '/content/drive/My Drive/work/'
%cd '/content/drive/My Drive/work/'


Mounted at /content/drive
/content/drive/My Drive/work


In [14]:
!pip install transformers[ja,torch] datasets matplotlib japanize-matplotlib



In [15]:
#データセットの準備
from pprint import pprint
from datasets import load_dataset

# Hugging Face Hub上のllm-book/JGLUEのリポジトリから
# JNLIのデータを読み込む
train_dataset = load_dataset(
    "llm-book/JGLUE",name="JNLI",split="train"
)

valid_dataset = load_dataset(
    "llm-book/JGLUE",name="JNLI",split="validation"
)
#データセットへのラベル情報を表示
print(train_dataset.features["label"])
#データ例を表示
pprint(train_dataset[0])

ClassLabel(names=['entailment', 'contradiction', 'neutral'], id=None)
{'label': 2,
 'sentence1': '二人の男性がジャンボジェット機を見ています。',
 'sentence2': '2人の男性が、白い飛行機を眺めています。',
 'sentence_pair_id': '0',
 'yjcaptions_id': '100124-104404-104405'}


In [16]:
#トークナイザ
from transformers import AutoTokenizer

model_name = "cl-tohoku/bert-base-japanese-v3"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
#データセット統計の可視化
from collections import Counter
import japanize_matplotlib
import matplotlib.pyplot as plt
from datasets import Dataset
from tqdm import tqdm

plt.rcParams["font.size"] = 18 #文字大きくする

def visualize_text_length(dataset: Dataset):
  length_counter1 = Counter()
  length_counter2 = Counter()
  for data in tqdm(dataset):
      length1 = len(tokenizer.tokenize(data["sentence1"]))
      length_counter1[length1] += 1
      length2 = len(tokenizer.tokenize(data["sentence2"]))
      length_counter2[length2] += 1
  plt.bar(length_counter1.keys(),length_counter1.values(),width=1.0,alpha=0.5)
  plt.bar(length_counter2.keys(),length_counter2.values(),width=1.0,alpha=0.5)
  plt.xlabel("トークン数")
  plt.ylabel("事例数")
  plt.show()

visualize_text_length(train_dataset)
visualize_text_length(valid_dataset)

In [None]:
#ラベル分布可視化
def visualize_labels(dataset:Dataset):
  label_counter = Counter()
  for data in tqdm(dataset):
    label_id = data["label"]
    label_name = dataset.features["label"].names[label_id]
    label_counter[label_name] += 1
  plt.bar(label_counter.keys(),label_counter.values())
  plt.xlabel("ラベル")
  plt.ylabel("事例数")
  plt.show()

visualize_labels(train_dataset)
visualize_labels(valid_dataset)

In [17]:
#データセットの前処理 textをトークンに分割し、IDの系列に変換する処理を定義する
from transformers import BatchEncoding

def preprocess_text_pair_classification(
    example:dict[str,str|int]
) -> BatchEncoding:
    encoded_example = tokenizer(
        example["sentence1"],example["sentence2"],max_length=512
    )

    encoded_example["labels"] = example["label"]
    return encoded_example

In [None]:
#出力の確認
from transformers import AutoTokenizer

transformers_model_name = "cl-tohoku/bert-base-japanese-v3"
tokenizer = AutoTokenizer.from_pretrained(transformers_model_name)

example = train_dataset[0]
encoded_example = preprocess_text_pair_classification(example)
print(tokenizer.convert_ids_to_tokens(encoded_example["input_ids"]))
print(encoded_example["input_ids"])
print(encoded_example)
#クラス確認
print(type(encoded_example).__name__)

#以下marc-jaと一緒

In [18]:
encoded_train_dataset = train_dataset.map(
    preprocess_text_pair_classification,
    remove_columns=train_dataset.column_names,
)
encoded_valid_dataset = valid_dataset.map(
    preprocess_text_pair_classification,
    remove_columns=valid_dataset.column_names,
)

Map:   0%|          | 0/20073 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Map:   0%|          | 0/2434 [00:00<?, ? examples/s]

In [19]:
#ミニバッチ構築
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

batch_inputs = data_collator(encoded_train_dataset[0:4])
pprint({name: tensor.size() for name, tensor in batch_inputs.items()})

{'attention_mask': torch.Size([4, 31]),
 'input_ids': torch.Size([4, 31]),
 'labels': torch.Size([4]),
 'token_type_ids': torch.Size([4, 31])}


In [20]:
#モデルの準備
from transformers import AutoModelForSequenceClassification

class_label = train_dataset.features["label"]
label2id = {label: id for id, label in enumerate(class_label.names)}
id2label = {id: label for id, label in enumerate(class_label.names)}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=class_label.num_classes,
    label2id=label2id,  # ラベル名からIDへの対応を指定
    id2label=id2label,  # IDからラベル名への対応を指定
)
print(type(model).__name__)

# パラメータをメモリ上に隣接した形で配置
# これを実行しない場合、モデルの保存でエラーになることがある
for param in model.parameters():
    param.data = param.data.contiguous()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification


In [31]:
#動作確認
print(model.forward(**data_collator(encoded_train_dataset[0:4])))

SequenceClassifierOutput(loss=tensor(1.7583, grad_fn=<NllLossBackward0>), logits=tensor([[-0.9217,  0.5567, -0.7348],
        [-0.9195,  0.5496, -0.7490],
        [-0.9811,  0.6157, -0.6104],
        [-0.9874,  0.6109, -0.6239]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [21]:
#訓練の実行
#学習に関わる設定

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output_jnli",       #結果の保存フォルダ
    per_device_train_batch_size=32, #訓練時のバッチサイズ
    per_device_eval_batch_size=32,  #評価時のバッチサイズ
    learning_rate = 2e-5,           #学習率
    lr_scheduler_type="linear",      #学習率スケジューラの種類 liner
    warmup_ratio=0.1,               #学習率のウォームアップの長さを指定
    num_train_epochs=3,             #エポック数
    save_strategy="epoch",          #チェックポイントの保存タイミング
    logging_strategy="epoch",       #ロギングのタイミング
    evaluation_strategy="epoch",    #検証セットによる評価のタイミング
    load_best_model_at_end=True,    #訓練後に開発セットで最良のモデルをロード
    metric_for_best_model="accuracy",#最良のモデルを決定する評価指標
    fp16=True,                       #自動近藤制度演算の有効化
    report_to="none",               # 外部ツールへのログを無効化
)

In [None]:
#推論 訓練の実行


#関数定義
import numpy as np

def compute_accuracy(
    eval_pred: tuple[np.ndarray, np.ndarray]
) -> dict[str,float]:
    """予測ラベルと正解ラベルから正解率を計算"""
    predictions,labels = eval_pred
    #predictionsは各ラベルについてのスコア
    #もっともスコアの高いインデックスを予測ラベルとする
    predictions = np.argmax(predictions,axis=1)
    return {"accuracy":(predictions == labels).mean()}



from pprint import pprint
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
)

model_name = "llm-book/bert-base-japanese-v3-jnli"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

#データセットの準備
valid_dataset = load_dataset(
    "llm-book/JGLUE",name="JNLI",split="validation"
)
encoded_valid_dataset = valid_dataset.map(
    preprocess_text_pair_classification,
    remove_columns=valid_dataset.column_names,
)



#評価の実行
trainer = Trainer(
    model=model,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_valid_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    args=training_args,
    compute_metrics = compute_accuracy,
)

trainer.train()

#モデル評価
eval_metrics = trainer.evaluate(encoded_valid_dataset)
pprint(eval_metrics)

In [None]:
#モデルをpipeline関数を通じて使用するコード

from transformers import pipeline

#Jstsでファインチューニングしたモデルのpipelineを読み込む
nli_pipeline = pipeline(model="llm-book/bert-base-japanese-v3-jnli")
#pipeline実行
text1=""
text2="" #ユーザ入力
print(nli_pipeline({"text":text1,"text_pair":text2}))

In [26]:
#Google Driveへの保存
from google.colab import drive
drive.mount("drive")

ValueError: mount failed

In [24]:
!mkdir -p drive/MyDrive/llm-book
!cp -r output_jnli drive/MyDrive/llm-book

In [25]:
#hugging face　への保存
from huggingface_hub import login

login()

#Hugging Face Hubのリポジトリ名
repo_name = "ITU-TKG/bert-base-japanese-v3-jnli"
#トークナイザとモデルをアップロード
tokenizer.push_to_hub(repo_name)
model.push_to_hub(repo_name)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6787e307-7ad8f4817a22a0a607c4833d;fadd9511-0b43-4755-be08-e5a2a93079f0)

Invalid username or password.