In [20]:
import os
import re
import csv
import time
import numpy as np
import pandas as pd
import torch
from transformers import BertJapaneseTokenizer, BertModel

In [49]:
# Hugging Face checkpoint used for both the tokenizer and the encoder.
pretrained_model = "cl-tohoku/bert-base-japanese-v2"
pretrained_version = "v2"
# CSV of scene descriptions; each CSV row becomes one list of description strings.
file_path = "annotation.csv"

tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_model)
model = BertModel.from_pretrained(
    pretrained_model,  # Japanese pre-trained checkpoint to load
    output_attentions=False,  # do not return attention weights
    output_hidden_states=True,  # return the hidden states of every layer
)

# Load the descriptions.
# - "utf-8-sig" drops a leading byte-order mark if the file has one.
# - newline="" lets the csv module do its own newline handling, so quoted
#   fields that contain line breaks are parsed correctly.
with open(file_path, "r", encoding="utf-8-sig", newline="") as f:
    scene_descs_tab = list(csv.reader(f))
scene_descs_tab

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[['とっても冷えてそうです。喉が渇いてきました。', 'クリアな気分になりました。', '松本さんが楽しそうです。こちらも楽しくなってきました。'],
 ['目が回らないのでしょうか。', 'どうやらレンタカーの会社らしいです。', '社長がとっても楽しそう。開店開店…'],
 [' 膝をあんなに回してどうしたのでしょうか？', '比較的お年を召した方が多く出演しているらしいです。', '何かサプリメントのご紹介のようですね。']]

In [50]:
# Build one embedding per scene and per hidden state:
#   sentence -> [CLS] vector of every hidden state
#   description -> mean over its sentences
#   scene (CSV row) -> mean over its non-empty descriptions
# Result: embed_tab with axes (hidden state, scene, feature).
start_time = time.time()
embed_tab = []

# Inference only: put the model in eval mode once instead of once per sentence.
model.eval()

for row_idx, scene_descs in enumerate(scene_descs_tab):
    print(".", end="")

    scene_embed = []
    for scene_desc in scene_descs:
        # Split on the Japanese full stop, keeping the delimiter with its sentence.
        # Whitespace-only fragments (e.g. a trailing space after the last "。")
        # are dropped so they do not enter the average as an empty sentence.
        sentences = [s for s in re.findall("[^。]+。?", scene_desc) if s.strip()]
        if not sentences:
            # Empty (or blank) description: nothing to embed.
            continue

        scene_subj_embed = []
        for one_text in sentences:
            # encode() adds the special tokens; wrap in a list for a batch of 1.
            tokens_tensor = torch.tensor([tokenizer.encode(one_text)])

            with torch.no_grad():  # no gradient tracking needed
                all_encoder_layers = model(tokens_tensor)["hidden_states"]

            # For every returned hidden state take batch item 0, token 0,
            # i.e. the vector at the [CLS] position.
            scene_subj_embed.append(
                np.stack([one_layer[0][0].numpy() for one_layer in all_encoder_layers])
            )

        scene_embed.append(np.mean(scene_subj_embed, axis=0))

    if not scene_embed:
        # Averaging an empty list would yield NaN and break the stacking below
        # with an unrelated-looking shape error; fail here with the row number.
        raise ValueError(f"row {row_idx} of scene_descs_tab has no non-empty description")

    embed_tab.append(np.mean(scene_embed, axis=0))

if not embed_tab:
    raise ValueError("scene_descs_tab is empty; nothing to embed")

# (scene, hidden state, feature) -> (hidden state, scene, feature)
embed_tab = np.array(embed_tab).transpose(1, 0, 2)
print(embed_tab.shape)

end_time = time.time()
print("time:", end_time-start_time)

...(13, 3, 768)
time: 0.2912912368774414


In [58]:
# Save the embeddings to CSV, one file per entry along embed_tab's first axis.
# Files are numbered from 01; each file has a 1-based column-index header
# followed by one row per scene.
# NOTE(review): file 01 presumably corresponds to the first element of the
# model's hidden_states (the embedding output) — confirm before comparing layers.
for i, one_layer_embed in enumerate(embed_tab):
    # newline="" stops the csv module's own line endings from being translated
    # again by the text layer (which otherwise produces blank rows on Windows).
    with open(f"embed_layer{i+1:02}.csv", 'w', encoding="utf-8", newline="") as wf:
        writer = csv.writer(wf)
        # Header width follows the data instead of a hard-coded 768.
        writer.writerow(range(1, one_layer_embed.shape[1] + 1))
        writer.writerows(one_layer_embed)

print("finished")

finished


In [59]:
# ファイルの確認
with open("embed_layer01.csv", "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    scene_descs_tab = [row for row in reader]
np.array(scene_descs_tab)

array([['1', '2', '3', ..., '766', '767', '768'],
       ['0.241623', '-0.38691887', '0.3015562', ..., '-0.18987507',
        '0.3637279', '0.17413235'],
       ['0.241623', '-0.38691887', '0.3015562', ..., '-0.18987507',
        '0.3637279', '0.17413235'],
       ['0.241623', '-0.38691887', '0.3015562', ..., '-0.18987507',
        '0.3637279', '0.17413235']], dtype='<U13')