In [2]:
!pip install transformers ipadic fugashi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import glob
import json
import os
import pickle
import random
import re

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import BertJapaneseTokenizer, BertModel

In [4]:
# Helper functions
# Text preprocessing
def preprocessing(text):
    """Normalize a raw tweet into lowercase text with symbols blanked out.

    Steps, in order: drop URLs, drop ``@mention`` prefixes, drop a leading
    ``RT ``, lowercase, drop CR/LF characters, drop digit runs, fold
    full-width ASCII (U+FF01-U+FF5E) to half-width, and finally replace every
    remaining punctuation character with a single space.

    :param text: raw tweet text (``str``)
    :return: cleaned text; each removed symbol becomes one space, so runs of
        spaces may remain in the result
    """
    text = re.sub(r"http\S+", "", text)
    # NOTE(review): inside this class `w` is a literal letter (not \w) and
    # `+-…` is a range U+002B..U+2026; that range is what actually makes ASCII
    # handles match. Pattern kept unchanged so mention matching does not shift.
    text = re.sub(r"@[w/:%#$&?()~.=+-…]+[:]? ", "", text)
    text = re.sub(r"(^RT )", "", text)
    text = text.lower()  # lowercase (also lowercases full-width letters)
    text = re.sub("\r", "", text)  # drop carriage returns
    text = re.sub("\n", "", text)  # drop line feeds
    text = re.sub(r"\d+", "", text)  # drop digit runs (\d is Unicode-aware)

    zen = "".join(chr(0xFF01 + i) for i in range(94))  # full-width ASCII block
    han = "".join(chr(0x21 + i) for i in range(94))  # half-width ASCII block
    etc = "".join(chr(0x3000 + i) for i in range(30))  # CJK punctuation U+3000..U+301D
    text = text.translate(str.maketrans(zen, han))  # full-width -> half-width

    # Punctuation only: strip letters, digits and "_" from the ASCII block.
    han_symbols = re.sub(r"\w+", "", han)
    # Escape before embedding in a character class. Unescaped, the sequence
    # `\]` was parsed as an escaped "]" (so backslashes were never removed)
    # and `,-.` only covered "-" by forming an accidental range.
    symbol_class = "[" + re.escape("~" + "*" + "＊" + etc + han_symbols) + "]"
    return re.sub(symbol_class, " ", text)  # blank out symbols


# Build a DataFrame from a JSON dump
def JSONtoDF(path):
    """Load a JSON file and flatten its ``"data"`` list into a DataFrame.

    The ``created_at`` and ``edit_history_tweet_ids`` columns are discarded
    when present.

    :param path: path to a JSON file whose top-level object has a ``"data"`` key
    :return: ``pandas.DataFrame`` with one row per element of ``"data"``
    :raises KeyError: if the JSON object has no ``"data"`` key
    """
    # Explicit UTF-8: the platform default encoding is not UTF-8 everywhere,
    # which corrupts or rejects Japanese text.
    with open(path, encoding="utf-8") as f:
        payload = json.load(f)
    # errors="ignore": these columns are only being thrown away, so a dump that
    # lacks one of them should not abort the whole load.
    return pd.json_normalize(data=payload["data"]).drop(
        columns=["created_at", "edit_history_tweet_ids"], errors="ignore"
    )


def prepareCorpus(path_list):
    """Load several JSON dumps into one DataFrame with a cleaned-text column.

    Each file is read with ``JSONtoDF``; a ``preprocessedText`` column holding
    ``preprocessing(text)`` for every row of ``text`` is added; the frames are
    then stacked in ``path_list`` order under a fresh 0..n-1 index.

    :param path_list: iterable of JSON file paths
    :return: the combined ``pandas.DataFrame`` (empty if ``path_list`` is empty)
    """
    frames = []
    for path in path_list:
        df = JSONtoDF(path)
        df["preprocessedText"] = [preprocessing(text) for text in df["text"]]
        frames.append(df)
    if not frames:
        # pd.concat raises on an empty list; keep the "empty in, empty out" contract.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames, ignore_index=True)


def get_context(token_ids, target_position, sequence_length=128, pad_token_id=0):
    """
    Cut a fixed-length window of token ids around a target token.

    The window spans ``window_size = int((sequence_length - 2) / 2)`` tokens to
    the left of the target and ``window_size`` tokens starting at the target.
    Whatever is missing on either side is made up by appending
    ``pad_token_id`` on the right, so the target's index in the result is
    never shifted by padding. ``token_ids`` itself is not modified.

    :param token_ids: list of token ids (for an entire line of text)
    :param target_position: index of the target word's position in `token_ids`
    :param sequence_length: desired length for output sequence (e.g. 128, 256, 512),
        counting the two special tokens the caller adds afterwards
    :param pad_token_id: id used for right-padding (default 0, the previous
        hard-coded value)
    :return: (context_ids, new_target_position)
                context_ids: list of token ids for the output sequence
                new_target_position: index of the target word's position in `context_ids`
    """
    # -2 as [CLS] and [SEP] tokens will be added later; /2 as it's a one-sided window
    window_size = int((sequence_length - 2) / 2)
    context_start = max(0, target_position - window_size)

    # Shortfall on the left plus shortfall on the right, both padded on the right.
    missing_left = max(0, window_size - target_position)
    missing_right = max(0, target_position + window_size - len(token_ids))

    # Slicing copies, so the caller's list is left untouched.
    context_ids = token_ids[context_start : target_position + window_size]
    context_ids = context_ids + (missing_left + missing_right) * [pad_token_id]

    new_target_position = target_position - context_start
    return context_ids, new_target_position


def get_i2w(target_words):
    """Map the vocabulary id of each target word's first token to that word.

    Uses the module-level ``tokenizer``. Words that produce no tokens, or whose
    first token is a special token ([UNK], [CLS], [SEP], [PAD], ...), are
    skipped: registering such an id would make every occurrence of that
    special token in the corpus count as a usage of the word.

    NOTE(review): only the FIRST token identifies a word. A word the tokenizer
    splits into several tokens is matched wherever its first token occurs, and
    two words sharing a first token collide (the later one wins).

    :param target_words: iterable of target word strings
    :return: dict ``{token_id: word}``
    """
    special_ids = set(tokenizer.all_special_ids)
    i2w = {}
    for word in target_words:
        # No special tokens: index 0 is then the word's own first token, and
        # an empty string yields an empty list instead of [CLS], [SEP].
        token_ids = tokenizer.encode(word, add_special_tokens=False)
        if not token_ids or token_ids[0] in special_ids:
            continue
        i2w[token_ids[0]] = word
    return i2w


# Value get_context uses for right-padding when called without a pad id.
_CONTEXT_PAD_ID = 0


def _collect_usages(text_list, i2w, sequence_length, buffer_size):
    """Scan texts for target tokens; build one model-input record per hit (at most buffer_size)."""
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    records = []
    for text in text_list:
        # Special tokens are added exactly once, below, around the window.
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        for index, token_id in enumerate(token_ids):
            if token_id not in i2w:
                continue
            context_ids, position = get_context(token_ids, index, sequence_length)
            # get_context pads on the right only; count back to the last real
            # token (never past the target itself).
            n_real = len(context_ids)
            while n_real > position + 1 and context_ids[n_real - 1] == _CONTEXT_PAD_ID:
                n_real -= 1
            padding = context_ids[n_real:]
            records.append(
                {
                    "word": i2w[token_id],
                    "position": position,
                    # [CLS] tokens [SEP] pad...  -> target sits at position + 1
                    "input_ids": [cls_id] + context_ids[:n_real] + [sep_id] + padding,
                    "attention_mask": [1] * (n_real + 2) + [0] * len(padding),
                    "context_ids": context_ids,
                }
            )
            if len(records) >= buffer_size:
                return records
    return records


def _embed_usages(records, layer_range, batch_size):
    """Run the model in mini-batches; return an (N, hidden) array of layer-summed target vectors."""
    vectors = []
    with torch.no_grad():
        for start in range(0, len(records), batch_size):
            chunk = records[start : start + batch_size]
            input_ids = torch.tensor([r["input_ids"] for r in chunk])
            attention_mask = torch.tensor([r["attention_mask"] for r in chunk])
            outputs = model(
                input_ids, attention_mask=attention_mask, output_hidden_states=True
            )
            # outputs[2]: one (batch, seq, hidden) tensor per layer, embedding
            # output first. Sum the selected layers.
            summed = torch.stack(
                tuple(outputs[2][layer_range[0] : layer_range[1]])
            ).sum(dim=0)
            rows = torch.arange(len(chunk))
            # +1: the leading [CLS] shifts the target one slot to the right.
            cols = torch.tensor([int(r["position"]) + 1 for r in chunk])
            # Keep only the target token's vector instead of every position.
            vectors.append(summed[rows, cols].cpu().numpy())
    return np.concatenate(vectors, axis=0)


def get_usage(
    text_list=[],
    i2w={},
    output_path="word-vectors/{}.dict".format(random.randrange(1000, 10000)),
    sequence_length=256,
    buffer_size=512,
    layer_range=(1, 14),
    batch_size=32,
):
    """Extract contextual vectors for target-word occurrences and pickle them.

    Every token of every text whose id is a key of ``i2w`` becomes one usage,
    until ``buffer_size`` usages are collected. Each usage's window (from
    ``get_context``) is wrapped in the tokenizer's own [CLS]/[SEP] ids, run
    through the module-level ``model`` with an attention mask that hides
    padding, and the hidden states of layers
    ``layer_range[0]:layer_range[1]`` are summed at the target position.

    Results are stored in ``output_path`` as a pickled dict
    ``{word: [{"word", "position", "input_ids", "context_ids", "vector"}, ...]}``.
    If the file already exists, new usages are appended to its contents.
    Nothing is written when no target occurrence is found.

    Uses the module-level ``tokenizer``, ``model`` and ``get_context``.

    NOTE: the default ``output_path`` is computed once, when this function is
    defined, so all calls relying on the default share the same file.

    :param text_list: list of (preprocessed) texts to scan; not modified
    :param i2w: dict ``{token_id: word}`` of target tokens; not modified
    :param output_path: pickle file to create or extend
    :param sequence_length: model input length including [CLS] and [SEP]
    :param buffer_size: maximum number of usages collected in this call
    :param layer_range: ``(start, stop)`` slice over the hidden-state tuple;
        the default ``(1, 14)`` sums every layer after the embedding output
    :param batch_size: usages per forward pass (bounds peak memory)
    :return: None
    """
    batches = _collect_usages(text_list, i2w, sequence_length, buffer_size)

    print("=====================")
    print(
        "{} data_size: {}".format(
            output_path.split("/")[-1].split(".")[0], len(batches)
        )
    )
    print("=====================")

    if not batches:
        # An empty batch cannot be fed to the model; nothing to store either.
        return

    print("Start model fit")
    usage_vectors = _embed_usages(batches, layer_range, max(1, int(batch_size)))
    print("Finish model fit")

    usages = {}
    # If the file already exists, append to its contents.
    if os.path.exists(output_path):
        # NOTE(security): pickle.load can execute arbitrary code; only resume
        # from files produced by this notebook.
        with open(output_path, "rb") as f:
            usages = pickle.load(f)
    else:
        out_dir = os.path.dirname(output_path)
        if out_dir:  # a bare filename has no directory to create
            os.makedirs(out_dir, exist_ok=True)

    for b, usage_vector in zip(batches, usage_vectors):
        usages.setdefault(b["word"], []).append(
            {
                "word": b["word"],
                "position": b["position"],
                "input_ids": b["input_ids"],
                "context_ids": b["context_ids"],
                "vector": usage_vector,
            }
        )
    with open(output_path, "wb") as f:
        pickle.dump(usages, f)


# TODO (not yet implemented):
# - stop-word removal based on Japanese.txt
# - removal of duplicate tweets

In [None]:
# Initial setup: pretrained Japanese BERT tokenizer and encoder.
target = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = BertJapaneseTokenizer.from_pretrained(target)
model = BertModel.from_pretrained(target)

# Words whose usages are extracted from the corpus.
target_words = [
    "失笑",
    "なし崩し",
    "なしくずし",
    "御の字",
    "姑息",
    "すべからく",
    "割愛",
    "破天荒",
    "役不足",
    "確信犯",
    "炎上",
    "草",
]

# The id -> word table depends only on the tokenizer and the word list, so it
# is built once rather than once per corpus file.
i2w = get_i2w(target_words)

data_dir = "twitter-corpus"
# sorted(): glob order is filesystem-dependent; fix it so runs are repeatable.
path_list = sorted(glob.glob(data_dir + "/*.json"))
for path in path_list:
    # basename/splitext handle either path separator and keep dots that are
    # part of the file's stem.
    stem = os.path.splitext(os.path.basename(path))[0]
    output_path = "word-vectors/" + stem + ".dict"
    corpus = prepareCorpus([path])
    get_usage(
        text_list=corpus["preprocessedText"].values.tolist(),
        i2w=i2w,
        output_path=output_path,
        sequence_length=256,
        buffer_size=378,
        layer_range=(1, 14),
    )

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


確信犯-2020 data_size: 378
Start model fit
Finish model fit
