<a href="https://colab.research.google.com/github/Gityosan/google-colab/blob/main/step2-ver1-1-bertClustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### インストール・初期設定等

In [None]:
# Quietly install transformers, ipadic and fugashi into this runtime.
!pip install -q transformers ipadic fugashi
# Print the Python version of this runtime.
!python -V
# Delete the sample_data/ directory from the working directory.
!rm -rf sample_data/
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read and write under drive/MyDrive.
drive.mount("/content/drive")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.9/615.9 KB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m92.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
Python 3.8.16
Mounted at /content/drive


In [None]:
# 標準ライブラリ
import glob
import json
import os
import pickle
import random
import re
import unicodedata

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import BertJapaneseTokenizer, BertModel

In [None]:
# @title  { vertical-output: true, form-width: "35%", display-mode: "both" }

# Initial configuration
# Name of the pretrained checkpoint used for both the tokenizer and the model.
target = "cl-tohoku/bert-base-japanese-whole-word-masking"  # @param {type:"string"}
tokenizer = BertJapaneseTokenizer.from_pretrained(target, tokenize_chinese_chars=False)
# The model is moved to cuda:0 when CUDA is available, otherwise it stays on the CPU.
model = BertModel.from_pretrained(target).to(
    "cuda:0" if torch.cuda.is_available() else "cpu"
)


# Words whose usages are extracted by the main loop; commented-out entries are skipped.
target_words = [
    "失笑",
    "なし崩し",
    "なしくずし",
    "御の字",
    # "姑息",
    "すべからく",
    "割愛",
    "破天荒",
    "役不足",
    "確信犯",
    "炎上",
    "草",
]
# Normalize to NFC so the words compare equal to the NFC-normalized corpus paths
# they are matched against in the main loop.
target_words = [unicodedata.normalize("NFC", w) for w in target_words]
# Directory (relative to the working directory) holding the stop-word list,
# the corpus JSON files and the output vectors.
base_dir = "drive/MyDrive/script/bert/"  # @param {type:"string"}
# Stop words: one stripped line of Japanese.txt each, plus four extra entries.
with open(base_dir + "Japanese.txt", "r", encoding="utf-8") as f:
    stop_words = set([w.strip() for w in f] + ["する", "なる", "いる", "ある"])

# Sorted list of every corpus JSON file under twitter-corpus/.
path_list = glob.glob(base_dir + "twitter-corpus/*.json")
path_list.sort()

Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### 関数群

In [None]:
# Preprocessing
# The character tables and patterns are constant, so they are built once here
# rather than on every call.
_FULLWIDTH_ASCII = "".join(chr(0xFF01 + i) for i in range(94))  # U+FF01..U+FF5E
_HALFWIDTH_ASCII = "".join(chr(0x21 + i) for i in range(94))  # U+0021..U+007E
_CJK_SYMBOLS = "".join(chr(0x3000 + i) for i in range(30))  # U+3000..U+301D
_FULL_TO_HALF = str.maketrans(_FULLWIDTH_ASCII, _HALFWIDTH_ASCII)
# Half-width characters that are not word characters (letters, digits and "_" are kept).
_HALFWIDTH_PUNCTUATION = re.sub(r"[\w]+", "", _HALFWIDTH_ASCII)
# re.escape keeps "\", "]", "^" and "-" literal inside the character class.
# Building the class by plain concatenation turned "[\]" into an escaped "]",
# so backslashes were never stripped.
_SYMBOL_PATTERN = re.compile(
    "[" + re.escape("~*＊" + _CJK_SYMBOLS + _HALFWIDTH_PUNCTUATION) + "]"
)
# "@mention" followed by an optional colon and a space. "\w" and the escaped "-"
# replace the literal "w" and the accidental "+-…" range of the previous pattern.
_MENTION_PATTERN = re.compile(r"@[\w/:%#$&?()~.=+\-…]+:? ")


def preprocessing(text):
    """Normalize one tweet text before tokenization.

    Steps, in order: remove URLs, remove "@mention " prefixes, remove a leading
    "RT ", lowercase, remove CR/LF, remove digit runs, convert full-width ASCII
    variants to half-width, and replace every remaining symbol with a space.

    Args:
        text (str): raw text.

    Returns:
        str: cleaned text. Symbols become single spaces, so runs of spaces and
        leading/trailing spaces may remain.
    """
    text = re.sub(r"http\S+", "", text)
    text = _MENTION_PATTERN.sub("", text)
    text = re.sub(r"(^RT )", "", text)
    text = text.lower()  # lowercase
    text = text.replace("\r", "").replace("\n", "")  # drop line breaks
    text = re.sub(r"\d+", "", text)  # drop digit runs (full-width digits included)
    text = text.translate(_FULL_TO_HALF)  # full-width ASCII variants -> half-width
    return _SYMBOL_PATTERN.sub(" ", text)  # blank out symbols


# Build a DataFrame from a JSON file
def JSONtoDF(path):
    """Load one corpus JSON file into a flat DataFrame.

    The file must contain a top-level "data" list of records; nested fields are
    flattened by ``pd.json_normalize``.

    Args:
        path (str): path of the JSON file.

    Returns:
        pandas.DataFrame: one row per record, without the "created_at" and
        "edit_history_tweet_ids" columns when they are present.

    Raises:
        KeyError: if the JSON object has no "data" key.
    """
    # Read as UTF-8 explicitly, like the stop-word file, so the result does not
    # depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        payload = json.load(f)
    # errors="ignore": a file that lacks one of these columns should not abort the run.
    return pd.json_normalize(data=payload["data"]).drop(
        columns=["created_at", "edit_history_tweet_ids"], errors="ignore"
    )


# Combine the two functions above to prepare the corpus
def prepareCorpus(path_list):
    """Load every JSON file in `path_list` and add a preprocessed-text column.

    Args:
        path_list (list[str]): corpus JSON file paths.

    Returns:
        pandas.DataFrame: the rows of all files concatenated in order with a
        fresh 0..n-1 index and an extra "preprocessedText" column. An empty
        DataFrame is returned when `path_list` is empty.
    """
    frames = []
    for path in path_list:
        df = JSONtoDF(path)
        df["preprocessedText"] = [preprocessing(text) for text in df["text"]]
        frames.append(df)
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop, which copied
    # all previous rows on every iteration.
    return pd.concat(frames, ignore_index=True)


def get_context(tokens=None, target_position=0, sequence_length=128):
    """Cut or pad `tokens` to exactly `sequence_length` ids around a target.

    Args:
        tokens (list[int] | None): token ids of one sentence.
        target_position (int): index of the target token inside `tokens`.
        sequence_length (int): length of the returned window.

    Returns:
        tuple[list[int], int]: the window and the index of the target inside it.

        * Shorter input is right-padded with 0 and the position is unchanged.
        * Otherwise a window of `sequence_length` ids is returned. The target
          sits at `sequence_length // 2` when there is room on both sides; near
          either end the window is shifted so that it stays full length.
    """
    tokens = [] if tokens is None else list(tokens)
    token_length = len(tokens)

    if token_length < sequence_length:
        padding = [0] * (sequence_length - token_length)
        return tokens + padding, target_position

    half_size = sequence_length // 2
    # Clamp the start on both sides. Without the upper clamp a target close to
    # the end produced a window shorter than sequence_length, and such ragged
    # rows cannot be batched into one tensor.
    start = max(0, min(target_position - half_size, token_length - sequence_length))
    return tokens[start : start + sequence_length], target_position - start


def get_usage(
    text_list=[],
    target_word=None,
    output_path="word-vectors/{}.dict".format(random.randrange(1000, 10000)),
    sequence_length=256,
    buffer_size=512,
    layer_range=(0, 12),
):
    print("------------------------")
    print("Start process : {}".format(output_path.split("/")[-1]))

    # check params
    if not target_word or not len(text_list):
        print("Exit from function due to improper parameters")
        return

    # check outputs
    if os.path.exists("/content/" + output_path):
        print("Exit from function because the file already exists")
        return

    # initialize
    TW_batches = []
    surrounding_words = []
    TW_token = tokenizer.encode(target_word)  # TargetWord_token
    TW_token = TW_token[1 : len(TW_token) - 1]

    for text in tqdm(text_list):
        tokens = tokenizer.encode(text)
        if not len(TW_token) or TW_token[0] not in tokens:
            continue

        # このループで取り扱っている一文に対象の単語が含まれている場合以下の処理を実行する
        TW_input_ids, TW_position = get_context(
            tokens, tokens.index(TW_token[0]), sequence_length
        )
        TW_batches.append(
            {
                "word": target_word,
                "position": (TW_position, TW_position + len(TW_token)),
                "input_ids": TW_input_ids,
            }
        )

        # 周辺単語についてもtokenizer.tokenizeで文を分かち書きし、必要な単語だけ重複なくListに格納
        [
            surrounding_words.append(w)
            for w in tokenizer.tokenize(text)
            if w not in stop_words and w not in surrounding_words
        ]

        if len(TW_batches) >= buffer_size:
            break

    usages = {"target_word": {}, "surrounding_words": [], "all": []}

    # target_wordの分散表現を取得
    with torch.no_grad():
        TW_input_ids = torch.tensor([b["input_ids"] for b in TW_batches])
        if torch.cuda.is_available():
            TW_input_ids = TW_input_ids.cuda()
        TW_outputs = model(TW_input_ids, output_hidden_states=True)
    TW_hidden_states = np.stack([v.detach().cpu().numpy() for v in TW_outputs[2]])


    # TW_outputs[2]は13(隠れ12層+最終層)×文章数×単語数×768次元になる (例： 13, 512, 256, 768)
    # 文章の分散表現 : defaultで12層すべての和をとる(TW_vectorsは最終的に512×768次元になる)
    TW_vectors = np.sum(
        TW_hidden_states[layer_range[0] : layer_range[1], :, :, :], axis=0
    ) / (layer_range[1] - layer_range[0])
    TW_vectors = np.stack(
        [
            np.sum(TW_vectors[i, b["position"][0] : b["position"][1], :], axis=0)
            / (b["position"][1] - b["position"][0])
            for i, b in enumerate(TW_batches)
        ]
    )

    usages["target_word"] = {
        "word": target_word,
        "vector": np.sum(TW_vectors, axis=0) / len(TW_vectors),
    }
    usages["all"].append(usages["target_word"])

    # target_wordの周辺単語についても分散表現を取得
    SW_batches = []
    for w in surrounding_words:
        SW_token = tokenizer.encode(w)
        SW_input_ids, SW_position = get_context(SW_token, 1, 10)
        SW_batches.append(
            {
                "word": w,
                "position": (SW_position, SW_position + len(SW_token) - 2),
                "input_ids": SW_input_ids,
            }
        )

    with torch.no_grad():
        SW_input_ids = torch.tensor([s["input_ids"] for s in SW_batches])
        if torch.cuda.is_available():
            SW_input_ids = SW_input_ids.cuda()
        SW_outputs = model(SW_input_ids)
        SW_hidden_states = SW_outputs.last_hidden_state.detach().cpu().numpy()
    [
        usages["surrounding_words"].append(
            {
                "word": s["word"],
                "vector": np.sum(
                    SW_hidden_states[i, s["position"][0] : s["position"][1], :], axis=0
                )
                / (s["position"][1] - s["position"][0]),
            }
        )
        for i, s in enumerate(SW_batches)
    ]

    usages["all"].extend(usages["surrounding_words"])

    print("Finish process : {} / {}".format(target_word, len(TW_batches)))

    with open(output_path, "wb") as f:
        pickle.dump(usages, f)

### 主要処理

In [None]:
# Decide up front which (word, year) pairs to process and which corpus files
# belong to each pair.
# A file belongs to a pair when its NFC-normalized path contains the word and
# its path contains the year.
# (The name `oparation` is kept as-is in case other cells refer to it.)
oparation = [
    {
        "word": w,
        "year": y,
        "path_list": [
            p
            for p in path_list
            if w in unicodedata.normalize("NFC", p) and str(y) in p
        ],
    }
    for w in target_words
    for y in range(2007, 2021)
]
for o in oparation:
    if not o["path_list"]:
        continue
    output_path = unicodedata.normalize(
        "NFC", base_dir + "word-vectors/3-1536/" + o["word"] + "-" + str(o["year"]) + ".dict"
    )
    # Skip finished pairs before loading anything: reading and preprocessing the
    # corpus is the expensive part, and get_usage would discard it anyway.
    if os.path.exists(output_path):
        print("Skip {}-{}: output already exists".format(o["word"], o["year"]))
        continue
    corpus = prepareCorpus(o["path_list"])
    get_usage(
        text_list=corpus["preprocessedText"].values.tolist(),
        target_word=o["word"],
        output_path=output_path,
        sequence_length=256,
        buffer_size=1536,
        layer_range=(0, 12),
    )

### 保存後処理

In [None]:
# formatter: install black with its jupyter extra, then run it on the
# bertClustering.ipynb notebook stored in Drive.
!pip install -q black[jupyter]
!black "/content/drive/MyDrive/Colab Notebooks/bertClustering.ipynb"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.6/96.6 KB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:

# Scratch check: encode a single word and inspect the last-layer output.
word = "男"
# Named `encoded` so the builtin input() is not shadowed. The tensors are moved
# to the model's device because the model may live on cuda:0.
encoded = tokenizer([word], return_tensors="pt").to(next(model.parameters()).device)
with torch.no_grad():  # inference only; no autograd graph needed
    outputs = model(**encoded)
# Shape of the last-layer tensor
print(outputs.last_hidden_state.shape)
# print(outputs.last_hidden_state[0][1])
print(encoded)
# Last-layer output with every masked-out position zeroed.
print(outputs.last_hidden_state * encoded["attention_mask"].unsqueeze(-1))

NameError: ignored

In [None]:
# Scratch check: compare two sentence vectors with and without padding in the mean.
sentences = ["私はラーメンが好きですですます口調","チャーシューメンが好きです"]

encoded = tokenizer(
    sentences, return_tensors="pt", padding=True, truncation=True
).to(next(model.parameters()).device)
print(encoded)
with torch.no_grad():
    # Unpack the encoding: passing the object itself positionally would hand
    # the whole mapping to the model as `input_ids`.
    outputs = model(**encoded)

# Naive mean over every position, padding included.
sentence_vecs = torch.mean(outputs.last_hidden_state, 1)
print(torch.cosine_similarity(sentence_vecs[0], sentence_vecs[1], dim=0))

# Mean over real tokens only: zero the padded positions and divide by the
# number of unmasked tokens of each sentence.
attention_mask = encoded["attention_mask"].unsqueeze(-1)
valid_token_num = attention_mask.sum(1)
sentence_vecs = (outputs.last_hidden_state * attention_mask).sum(1) / valid_token_num
print(outputs.last_hidden_state.shape)
print(sentence_vecs.shape)
print(torch.cosine_similarity(sentence_vecs[0], sentence_vecs[1], dim=0))

{'input_ids': tensor([[    2,  1325,     9,  9714,    14,  3596,  2992,  2992,  2610,  1285,
         28913,     3],
        [    2,  7101,  5434,  5470,    14,  3596,  2992,     3,     0,     0,
             0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}
tensor(0.7618, grad_fn=<SumBackward1>)
torch.Size([2, 12, 768])
torch.Size([2, 768])
tensor(0.7618, grad_fn=<SumBackward1>)


In [None]:
# Scratch check: show one encoded sentence as a batched tensor, as a plain list
# of ids, and as a 1-D tensor. The sentence is encoded once and the other two
# views are derived from that result (assumes tokenizer.encode is deterministic).
input_ids_A = torch.tensor(
    [tokenizer.encode("吾輩は神である。", add_special_tokens=True)]
)
print(input_ids_A)
print(input_ids_A[0].tolist())
print(input_ids_A[0])


tensor([[    2,  7184, 30046,     9,   299,    12,    31,     8,     3]])
[2, 7184, 30046, 9, 299, 12, 31, 8, 3]
tensor([    2,  7184, 30046,     9,   299,    12,    31,     8,     3])


In [None]:
# Scratch check: run the sentence encoded above through the model.
with torch.no_grad():
    # Move the ids to the model's device; the model may live on cuda:0.
    opa = model(
        input_ids_A.to(next(model.parameters()).device), output_hidden_states=True
    )
# Mean over the token axis of hidden_states[12].
embeddings_A = torch.mean(opa.hidden_states[12], 1)
# .cpu() first: a CUDA tensor cannot be converted to numpy directly.
print(opa.last_hidden_state.detach().cpu().numpy().shape)

(1, 9, 768)


### その他

In [None]:
from tensorflow.python.client import device_lib
import tensorflow as tf
from psutil import virtual_memory

# Report the total RAM of this runtime, in units of 1e9 bytes.
ram_gb = virtual_memory().total / 1e9
print("Your runtime has {:.1f} gigabytes of available RAM\n".format(ram_gb))

# Anything below 20 is reported as a standard (not high-RAM) runtime.
ram_message = (
    "Not using a high-RAM runtime"
    if ram_gb < 20
    else "You are using a high-RAM runtime!"
)
print(ram_message)

# Number of GPUs visible to TensorFlow
gpu_devices = tf.config.list_physical_devices("GPU")
print("Num GPUs Available: ", len(gpu_devices))

# List every local device TensorFlow recognizes
print(device_lib.list_local_devices())

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!
Num GPUs Available:  1
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6354157854472975992
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11586961408
locality {
  bus_id: 1
  links {
  }
}
incarnation: 14767642777879333371
physical_device_desc: "device: 0, name: A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0"
xla_global_id: 416903419
]
