In [None]:
# Install the third-party packages this notebook needs on top of the runtime's
# preinstalled ones (transformers is imported below; ipadic and fugashi are
# presumably required by BertJapaneseTokenizer - TODO confirm).
# NOTE(review): the install is unpinned. The recorded run resolved
# transformers 4.25.1, ipadic 1.0.0 and fugashi 1.2.1 (cp38 wheels); a fresh run
# may pick up different versions, so consider pinning for reproducibility.
!pip install transformers ipadic fugashi
# Print the interpreter version so it is stored next to the results.
!python -V

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.1 MB/s 
[?25hCollecting ipadic
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 25.8 MB/s 
[?25hCollecting fugashi
  Downloading fugashi-1.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615 kB)
[K     |████████████████████████████████| 615 kB 69.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 81.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 53.4 MB/s 
Building wheels for collected packages: ipadic
  Building wheel for ipadi

In [None]:
import glob
import json
import os
import pickle
import random
import re

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import BertJapaneseTokenizer, BertModel

In [None]:
# 関数群
# 前処理
# Full-width ASCII block (U+FF01-U+FF5E) and the half-width characters it maps onto.
_ZEN = "".join(chr(0xFF01 + i) for i in range(94))
_HAN = "".join(chr(0x21 + i) for i in range(94))
# CJK symbols U+3000-U+301D (ideographic space and the punctuation that follows it).
_ETC = "".join(chr(0x3000 + i) for i in range(30))
_ZEN_TO_HAN = str.maketrans(_ZEN, _HAN)
# Half-width ASCII with every \w character (letters, digits, "_") removed,
# i.e. punctuation only.
_HAN_SYMBOLS = re.sub(r"[\w]+", "", _HAN)
# re.escape keeps "\", "]", "^" and "-" literal inside the character class.
# Built by plain concatenation, the "\]" pair was read as an escaped "]" and
# the backslash itself was never stripped.
_SYMBOL_PATTERN = re.compile("[" + re.escape("~*＊" + _ETC + _HAN_SYMBOLS) + "]")


def preprocessing(text):
    """Normalise one raw tweet before it is tokenised.

    Steps, in order: drop URLs, drop ``@mention `` handles, drop a leading
    ``RT ``, lower-case, drop CR/LF, drop digit runs, fold full-width ASCII to
    half-width, and finally replace every remaining symbol with a single space.

    Stop words are *not* filtered here (that happens on tokens in
    ``get_usage``), so this function no longer opens ``Japanese.txt`` on every
    call.

    :param text: raw tweet text
    :return: cleaned text; symbols become spaces, so runs of spaces may remain
    """
    text = re.sub(r"http\S+", "", text)
    # NOTE(review): inside this class "+-…" is a range (U+002B-U+2026), so it
    # already covers ASCII letters and digits; the pattern is kept unchanged so
    # that handle matching behaves exactly as before.
    text = re.sub(r"@[w/:%#$&?()~.=+-…]+[:]? ", "", text)
    text = re.sub(r"(^RT )", "", text)
    text = text.lower()
    text = re.sub(r"[\r\n]", "", text)
    # \d also matches full-width digits, which is why this runs before the
    # full-width -> half-width fold.
    text = re.sub(r"\d+", "", text)
    text = text.translate(_ZEN_TO_HAN)
    return _SYMBOL_PATTERN.sub(" ", text)


# JSONからdf作成
def JSONtoDF(path):
    """Load one JSON dump and flatten its ``"data"`` list into a DataFrame.

    The file is always read as UTF-8 (matching the other file reads in this
    notebook) instead of the platform default encoding. The ``created_at`` and
    ``edit_history_tweet_ids`` columns are dropped when present; a dump that
    lacks either of them no longer raises ``KeyError``.

    :param path: path to a JSON file whose top level has a ``"data"`` list
    :return: DataFrame with one row per entry of ``"data"``
    :raises KeyError: if the file has no top-level ``"data"`` key
    """
    with open(path, encoding="utf-8") as f:
        payload = json.load(f)
    return pd.json_normalize(data=payload["data"]).drop(
        columns=["created_at", "edit_history_tweet_ids"], errors="ignore"
    )


def prepareCorpus(path_list):
    """Load every JSON file in ``path_list`` and attach the cleaned text.

    Each file is read with ``JSONtoDF`` and gains a ``preprocessedText`` column
    holding ``preprocessing(text)`` for every row. The per-file frames are
    collected in a list and concatenated once, rather than re-copying a growing
    frame on every iteration.

    :param path_list: iterable of JSON file paths
    :return: a single DataFrame with a fresh 0..n-1 index; an empty DataFrame
             (without any columns) when ``path_list`` is empty
    """
    frames = []
    for path in path_list:
        df = JSONtoDF(path)
        df["preprocessedText"] = [preprocessing(text) for text in df["text"]]
        frames.append(df)
    if not frames:
        # pd.concat raises on an empty list; keep the old "empty frame" result.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def get_context(token_ids, target_position, sequence_length=128):
    """
    Cut a fixed-size window of token ids around a target token.

    The window spans ``(sequence_length - 2) / 2`` tokens on each side of the
    target (2 slots are reserved for [CLS] and [SEP]). Wherever the window
    sticks out past either end of ``token_ids`` the missing slots are filled
    with 0, appended at the END of the snippet.

    :param token_ids: list of token ids (for an entire line of text)
    :param target_position: index of the target word in `token_ids`
    :param sequence_length: desired length for output sequence (e.g. 128, 256, 512)
    :return: (context_ids, new_target_position)
                context_ids: list of token ids for the output sequence
                new_target_position: index of the target word in `context_ids`
    """
    half_window = int((sequence_length - 2) / 2)
    window_end = target_position + half_window

    # How far the window reaches beyond the left / right end of the text.
    overshoot_left = half_window - target_position
    overshoot_right = window_end - len(token_ids)

    context_start = 0 if overshoot_left > 0 else target_position - half_window

    n_padding = 0
    if overshoot_left > 0:
        n_padding += overshoot_left
    if overshoot_right > 0:
        n_padding += overshoot_right

    # The slice is a new list, so the caller's token_ids is left untouched.
    context_ids = token_ids[context_start:window_end]
    context_ids += [0] * n_padding
    return context_ids, target_position - context_start


def _find_token_span(token_ids, pattern):
    """Return the start index of the first occurrence of `pattern` in `token_ids`, or None."""
    width = len(pattern)
    if width == 0:
        return None
    for start in range(len(token_ids) - width + 1):
        if token_ids[start : start + width] == pattern:
            return start
    return None


def get_usage(
    text_list=None,
    target_word=None,
    # NOTE: this default is evaluated once, when the function is defined, so
    # every call that omits output_path writes to the same random file name.
    output_path="word-vectors/{}.dict".format(random.randrange(1000, 10000)),
    sequence_length=256,
    buffer_size=512,
    layer_range=(1, 14),
    batch_size=32,
):
    """Collect contextual BERT vectors for `target_word` and pickle them.

    Relies on the module-level ``tokenizer``, ``model`` and ``stop_words`` and on
    ``get_context``. Texts are scanned in order until ``buffer_size`` usages of
    the target word have been found. The pickle at ``output_path`` holds a list
    to which this call appends:

    * one ``{"word", "vector"}`` dict per distinct non-stop-word token seen in
      the texts that contain ``target_word``; ``vector`` is the last hidden
      state summed over the word's own (non-padding) token positions;
    * one dict for the target word with ``"vector"`` (one row per usage, or a
      1-D array when there is a single usage), ``"position"`` (half-open
      ``(start, end)`` token spans) and ``"input_ids"``.

    Differences from the previous implementation:

    * the surrounding-word vector is summed along the token axis only; it used
      to be summed over every axis, which produced a scalar;
    * an attention mask is passed so that the zero padding is not attended to;
    * the target is located by matching ALL of its sub-tokens, not only the
      first one, so the recorded span really covers the target word;
    * the model runs in chunks of ``batch_size`` to bound memory;
    * no usage / no surrounding word / ``output_path`` without a directory no
      longer crash.

    :param text_list: iterable of preprocessed texts (None means empty)
    :param target_word: word to collect usages for; nothing happens if falsy
    :param output_path: pickle file to create, or to extend if it exists
    :param sequence_length: window length handed to ``get_context``
    :param buffer_size: maximum number of usages to collect
    :param layer_range: ``(start, stop)`` slice into ``hidden_states``; the
                        selected layers are summed
    :param batch_size: number of sequences per forward pass
    :return: None
    """
    if not target_word:
        return
    if text_list is None:
        text_list = []

    # get_context pads with 0. NOTE(review): this is assumed to be the [PAD] id
    # of the loaded checkpoint - confirm before switching models.
    pad_id = 0

    # Ids of the target word without the special tokens that encode() adds at
    # both ends.
    TW_token = tokenizer.encode(target_word)[1:-1]

    batches = []
    surrounding_words = []
    seen_words = set()  # O(1) duplicate check; the list keeps first-seen order
    for text in tqdm(text_list):
        if target_word not in text:
            continue
        for w in tokenizer.tokenize(text):
            if w not in stop_words and w not in seen_words:
                seen_words.add(w)
                surrounding_words.append(w)
        tokens = tokenizer.encode(text)
        TW_position = _find_token_span(tokens, TW_token)
        if TW_position is None:
            continue
        # The text contains the target word: cut the window around it.
        input_ids, position = get_context(tokens, TW_position, sequence_length)
        batches.append(
            {
                "word": target_word,
                "position": (position, position + len(TW_token)),
                "input_ids": input_ids,
            }
        )
        if len(batches) >= buffer_size:
            break

    if not batches:
        # An empty batch used to crash inside the model call.
        print("No usage found: word: {}".format(target_word))
        return

    print("Start model fit")
    usages = []
    # If the file already exists, append to its contents.
    if os.path.exists(output_path):
        # NOTE: pickle.load can execute arbitrary code; only resume from files
        # written by this notebook.
        with open(output_path, "rb") as f:
            usages = pickle.load(f)
    else:
        output_dir = os.path.dirname(output_path)
        if output_dir:  # os.makedirs("") raises for a bare file name
            os.makedirs(output_dir, exist_ok=True)

    target_vectors = []
    with torch.no_grad():
        for start in range(0, len(batches), batch_size):
            chunk = batches[start : start + batch_size]
            input_ids_tensor = torch.tensor([b["input_ids"] for b in chunk])
            outputs = model(
                input_ids_tensor,
                attention_mask=(input_ids_tensor != pad_id).long(),
                output_hidden_states=True,
            )
            # hidden_states holds one (batch, tokens, hidden) tensor per layer,
            # the embedding output first. Sum the selected layers.
            selected = outputs.hidden_states[layer_range[0] : layer_range[1]]
            layer_sum = np.sum(
                np.stack([l.detach().numpy() for l in selected]), axis=0
            )
            for row, b in zip(layer_sum, chunk):
                # Sum over the target word's own token positions.
                target_vectors.append(
                    np.sum(row[b["position"][0] : b["position"][1], :], axis=0)
                )

        for start in range(0, len(surrounding_words), batch_size):
            words = surrounding_words[start : start + batch_size]
            encoded = [tokenizer.encode(w) for w in words]
            width = max(len(ids) for ids in encoded)
            SW_tokens = torch.tensor(
                [ids + [pad_id] * (width - len(ids)) for ids in encoded]
            )
            mask = SW_tokens != pad_id
            outputs = model(SW_tokens, attention_mask=mask.long())
            # Zero the padding positions, then sum along the token axis.
            hidden = outputs.last_hidden_state * mask.unsqueeze(-1).float()
            for w, vector in zip(words, hidden.sum(dim=1).numpy()):
                usages.append({"word": w, "vector": vector})

    print("Finish model fit: word: {} / size: {}".format(target_word, len(batches)))

    usages.append(
        {
            "word": target_word,
            # Same layout as before: a 1-D vector for a single usage, otherwise
            # one row per usage.
            "vector": np.vstack(target_vectors)
            if len(target_vectors) > 1
            else target_vectors[0],
            "position": [b["position"] for b in batches],
            "input_ids": [b["input_ids"] for b in batches],
        }
    )
    with open(output_path, "wb") as f:
        pickle.dump(usages, f)


# TODO (未実施処理 / not yet implemented):
# - 重複Tweet削除 (remove duplicate tweets)

In [None]:
# Initial setup: pretrained checkpoint, tokenizer and model shared by get_usage
target = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = BertJapaneseTokenizer.from_pretrained(
    target, tokenize_chinese_chars=False
)
model = BertModel.from_pretrained(target)

# Words whose usages are collected
target_words = [
    "失笑",
    "なし崩し",
    "なしくずし",
    "御の字",
    # "姑息",
    "すべからく",
    "割愛",
    "破天荒",
    "役不足",
    "確信犯",
    "炎上",
    "草",
]

# Stop words: one per line in Japanese.txt, plus four very common verbs
with open("Japanese.txt", "r", encoding="utf-8") as f:
    stop_words = {line.strip() for line in f}
stop_words.update(["する", "なる", "いる", "ある"])

data_dir = "twitter-corpus"
path_list = sorted(glob.glob(data_dir + "/*.json"))

# Decide up front which files belong to which (word, year) pair: a file is
# selected when its path contains both the word and the year as substrings.
oparation = [
    {
        "word": w,
        "year": y,
        "path_list": [p for p in path_list if w in p and str(y) in p],
    }
    for w in target_words
    for y in range(2007, 2021)
]

for o in oparation:
    # Skip (word, year) pairs for which no file was found.
    if not o["path_list"]:
        continue
    output_path = "word-vectors/" + o["word"] + "-" + str(o["year"]) + ".dict"
    corpus = prepareCorpus(o["path_list"])
    get_usage(
        text_list=corpus["preprocessedText"].values.tolist(),
        target_word=o["word"],
        output_path=output_path,
        sequence_length=256,
        buffer_size=512,
        layer_range=(1, 14),
    )

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 36%|███▌      | 511/1420 [00:00<00:00, 1532.30it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 20%|█▉        | 511/2587 [00:00<00:01, 1071.88it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 10%|█         | 516/4972 [00:00<00:04, 1017.00it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 10%|█         | 511/4934 [00:00<00:04, 951.30it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 10%|█         | 512/4948 [00:00<00:03, 1127.84it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 10%|█         | 512/4906 [00:00<00:03, 1238.72it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 11%|█         | 511/4827 [00:00<00:03, 1140.23it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 10%|█         | 511/4917 [00:00<00:04, 1039.45it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 11%|█         | 511/4826 [00:00<00:04, 986.13it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 11%|█         | 529/4849 [00:00<00:04, 953.32it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 11%|█         | 512/4829 [00:00<00:05, 845.23it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 12%|█▏        | 556/4663 [00:00<00:04, 988.76it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 11%|█         | 514/4855 [00:00<00:04, 958.85it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


 12%|█▏        | 536/4406 [00:00<00:04, 901.30it/s]


Start model fit
Finish model fit: word: 失笑 / size: 512


100%|██████████| 420/420 [00:00<00:00, 1705.42it/s]


Start model fit
Finish model fit: word: なし崩し / size: 420


 33%|███▎      | 511/1550 [00:00<00:00, 1373.84it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 13%|█▎        | 513/3974 [00:00<00:04, 856.97it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 10%|█         | 511/4922 [00:00<00:05, 790.81it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 12%|█▏        | 579/4843 [00:00<00:04, 915.39it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 11%|█         | 542/4844 [00:02<00:16, 256.25it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 11%|█         | 524/4918 [00:00<00:03, 1130.66it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 11%|█         | 516/4858 [00:00<00:04, 916.90it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 11%|█▏        | 553/4830 [00:00<00:04, 886.52it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 11%|█         | 514/4825 [00:00<00:04, 873.71it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 21%|██        | 991/4814 [00:00<00:02, 1469.51it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 11%|█         | 516/4830 [00:00<00:06, 706.55it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 10%|█         | 516/4923 [00:00<00:05, 797.76it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


 11%|█         | 517/4786 [00:00<00:06, 653.94it/s]


Start model fit
Finish model fit: word: なし崩し / size: 512


100%|██████████| 80/80 [00:00<00:00, 2947.02it/s]


Start model fit
Finish model fit: word: なしくずし / size: 80


100%|██████████| 430/430 [00:00<00:00, 680.19it/s]


Start model fit
Finish model fit: word: なしくずし / size: 430


 23%|██▎       | 511/2210 [00:00<00:01, 1100.35it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


 23%|██▎       | 511/2257 [00:00<00:02, 833.66it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


 12%|█▏        | 514/4450 [00:00<00:05, 733.25it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


 11%|█         | 514/4715 [00:00<00:03, 1082.15it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


 11%|█         | 516/4813 [00:00<00:07, 602.80it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


 12%|█▏        | 568/4921 [00:00<00:04, 939.25it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


 11%|█         | 511/4847 [00:00<00:05, 864.32it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


 11%|█         | 519/4895 [00:00<00:05, 771.88it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


 17%|█▋        | 511/3016 [00:00<00:04, 580.82it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


 11%|█         | 516/4875 [00:00<00:06, 719.22it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


 11%|█         | 515/4904 [00:00<00:05, 786.45it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


 11%|█         | 518/4859 [00:01<00:12, 347.16it/s]


Start model fit
Finish model fit: word: なしくずし / size: 512


100%|██████████| 300/300 [00:00<00:00, 2060.36it/s]


Start model fit
Finish model fit: word: 御の字 / size: 300


 21%|██▏       | 514/2400 [00:00<00:01, 1400.68it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 16%|█▌        | 511/3221 [00:00<00:02, 994.51it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 10%|█         | 511/4944 [00:00<00:04, 994.16it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 10%|█         | 511/4963 [00:00<00:04, 1057.40it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 10%|█         | 511/4969 [00:00<00:04, 1060.80it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 10%|█         | 516/4953 [00:00<00:03, 1152.94it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 10%|█         | 511/4962 [00:00<00:04, 1058.40it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 10%|█         | 511/4959 [00:00<00:03, 1141.95it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 11%|█         | 514/4785 [00:00<00:03, 1139.52it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 10%|█         | 514/4943 [00:00<00:04, 984.80it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 11%|█         | 527/4937 [00:00<00:04, 970.27it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 11%|█         | 516/4864 [00:00<00:04, 930.88it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 10%|█         | 511/4900 [00:00<00:05, 851.83it/s]


Start model fit
Finish model fit: word: 御の字 / size: 512


 85%|████████▌ | 511/600 [00:00<00:00, 1810.65it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 13%|█▎        | 511/3810 [00:00<00:02, 1353.31it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 13%|█▎        | 511/3859 [00:00<00:03, 988.28it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 10%|█         | 511/4961 [00:00<00:04, 1105.58it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 11%|█         | 511/4830 [00:00<00:04, 1051.26it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 11%|█         | 513/4849 [00:00<00:03, 1369.58it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 10%|█         | 512/4879 [00:00<00:02, 1564.61it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 10%|█         | 512/4887 [00:00<00:03, 1366.65it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 10%|█         | 512/4911 [00:00<00:03, 1445.18it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 11%|█         | 512/4873 [00:00<00:03, 1313.65it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 10%|█         | 512/4912 [00:00<00:03, 1375.89it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 11%|█         | 548/4897 [00:00<00:03, 1252.83it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 11%|█         | 519/4871 [00:00<00:03, 1162.02it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


 11%|█         | 511/4644 [00:00<00:03, 1080.01it/s]


Start model fit
Finish model fit: word: すべからく / size: 512


100%|██████████| 300/300 [00:00<00:00, 1825.78it/s]


Start model fit
Finish model fit: word: 割愛 / size: 300


 22%|██▏       | 513/2340 [00:00<00:01, 1083.70it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 11%|█         | 512/4825 [00:00<00:04, 884.20it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 10%|█         | 513/4964 [00:00<00:05, 812.21it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 11%|█         | 516/4914 [00:00<00:05, 843.59it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 11%|█         | 517/4864 [00:00<00:03, 1097.73it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 10%|█         | 511/4867 [00:00<00:05, 851.29it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 10%|█         | 513/4909 [00:00<00:05, 795.51it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 10%|█         | 511/4897 [00:00<00:05, 828.62it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 11%|█         | 513/4674 [00:00<00:05, 804.41it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 11%|█         | 518/4788 [00:02<00:19, 219.69it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 11%|█         | 523/4870 [00:00<00:06, 663.23it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 11%|█         | 517/4899 [00:00<00:06, 671.53it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


 11%|█         | 525/4835 [00:00<00:06, 675.59it/s]


Start model fit
Finish model fit: word: 割愛 / size: 512


100%|██████████| 350/350 [00:00<00:00, 1606.82it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 350


 88%|████████▊ | 511/580 [00:00<00:00, 1300.09it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 24%|██▍       | 511/2142 [00:00<00:01, 864.30it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 10%|█         | 511/4970 [00:00<00:04, 1019.30it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 10%|█         | 511/4949 [00:00<00:04, 1033.50it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 10%|█         | 511/4888 [00:00<00:04, 1086.33it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 10%|█         | 511/4883 [00:00<00:04, 933.06it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 11%|█         | 512/4677 [00:00<00:04, 982.50it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 11%|█         | 512/4607 [00:01<00:11, 365.00it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 10%|█         | 511/4898 [00:00<00:04, 1021.81it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 11%|█         | 519/4934 [00:00<00:05, 855.07it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 11%|█         | 516/4875 [00:00<00:06, 706.05it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 11%|█         | 519/4921 [00:00<00:05, 804.97it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


 11%|█         | 512/4817 [00:00<00:04, 911.99it/s]


Start model fit
Finish model fit: word: 破天荒 / size: 512


100%|██████████| 200/200 [00:00<00:00, 1584.26it/s]


Start model fit
Finish model fit: word: 役不足 / size: 200


 22%|██▏       | 513/2330 [00:00<00:01, 1534.46it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 34%|███▍      | 511/1507 [00:00<00:00, 1221.47it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 10%|█         | 515/4959 [00:00<00:04, 1032.77it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 10%|█         | 513/4960 [00:00<00:04, 1093.87it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 10%|█         | 515/4940 [00:00<00:03, 1378.93it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 10%|█         | 512/4934 [00:00<00:03, 1351.96it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 10%|█         | 512/4946 [00:00<00:03, 1253.41it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 10%|█         | 511/4961 [00:00<00:03, 1249.99it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 10%|█         | 512/4969 [00:00<00:03, 1186.78it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 10%|█         | 511/4941 [00:00<00:04, 1033.03it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 10%|█         | 513/4933 [00:00<00:04, 891.77it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 11%|█         | 518/4865 [00:00<00:04, 1058.50it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 11%|█         | 516/4870 [00:00<00:04, 1036.01it/s]


Start model fit
Finish model fit: word: 役不足 / size: 512


 34%|███▎      | 511/1520 [00:00<00:00, 1974.29it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 25%|██▍       | 511/2082 [00:00<00:01, 1452.77it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 10%|█         | 511/4978 [00:00<00:04, 1031.40it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 10%|█         | 511/4949 [00:00<00:04, 1027.76it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 10%|█         | 514/4927 [00:00<00:03, 1274.04it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 11%|█         | 514/4844 [00:00<00:03, 1327.54it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 11%|█         | 516/4884 [00:00<00:03, 1434.10it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 11%|█         | 512/4699 [00:00<00:03, 1376.74it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 11%|█         | 512/4751 [00:00<00:03, 1312.67it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 11%|█         | 515/4783 [00:00<00:03, 1165.21it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 10%|█         | 513/4916 [00:00<00:04, 1097.45it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 10%|█         | 514/4929 [00:00<00:03, 1273.56it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 11%|█         | 513/4739 [00:00<00:03, 1222.35it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


 11%|█▏        | 530/4698 [00:00<00:04, 922.28it/s]


Start model fit
Finish model fit: word: 確信犯 / size: 512


  7%|▋         | 518/7138 [00:00<00:04, 1395.63it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  6%|▌         | 520/9031 [00:00<00:06, 1296.62it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  5%|▌         | 517/9944 [00:00<00:08, 1074.47it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  5%|▌         | 521/9866 [00:00<00:08, 1090.87it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  5%|▌         | 516/9709 [00:00<00:08, 1124.18it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  6%|▌         | 547/9778 [00:00<00:08, 1094.41it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  5%|▌         | 519/9685 [00:00<00:08, 1088.15it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  5%|▌         | 525/9759 [00:00<00:08, 1037.46it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  5%|▌         | 517/9659 [00:00<00:09, 952.37it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  5%|▌         | 519/9637 [00:00<00:07, 1164.94it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  5%|▌         | 513/9608 [00:00<00:09, 1009.91it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  6%|▌         | 534/9665 [00:00<00:09, 927.34it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  5%|▌         | 519/9562 [00:00<00:10, 902.18it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


  6%|▌         | 525/9161 [00:00<00:08, 1004.34it/s]


Start model fit
Finish model fit: word: 炎上 / size: 512


 10%|█         | 519/4992 [00:00<00:03, 1126.77it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  6%|▌         | 514/8842 [00:00<00:06, 1344.55it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 514/9947 [00:00<00:07, 1185.11it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 514/9878 [00:00<00:07, 1180.64it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 512/9932 [00:00<00:08, 1121.74it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 514/9836 [00:00<00:08, 1096.87it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 516/9810 [00:00<00:07, 1204.00it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 512/9812 [00:00<00:07, 1215.92it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 512/9829 [00:00<00:05, 1603.71it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 513/9810 [00:00<00:06, 1462.43it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 513/9814 [00:00<00:05, 1716.40it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 511/9871 [00:00<00:04, 2184.12it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 522/9733 [00:00<00:04, 2017.23it/s]


Start model fit
Finish model fit: word: 草 / size: 512


  5%|▌         | 515/9600 [00:00<00:06, 1402.73it/s]


Start model fit
Finish model fit: word: 草 / size: 512


In [None]:
# Pack everything under /content/word-vectors (the pickles written by
# get_usage) into a single archive.
!zip -r /content/word-vectors.zip /content/word-vectors
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell is not expected to run elsewhere - confirm.
from google.colab import files
# Hand the archive to the Colab file-download helper.
files.download("/content/word-vectors.zip")

  adding: content/word-vectors/ (stored 0%)
  adding: content/word-vectors/すべからく-2010.dict (deflated 29%)
  adding: content/word-vectors/なし崩し-2018.dict (deflated 48%)
  adding: content/word-vectors/確信犯-2018.dict (deflated 76%)
  adding: content/word-vectors/役不足-2007.dict (deflated 21%)
  adding: content/word-vectors/御の字-2015.dict (deflated 25%)
  adding: content/word-vectors/すべからく-2018.dict (deflated 41%)
  adding: content/word-vectors/御の字-2011.dict (deflated 24%)
  adding: content/word-vectors/なし崩し-2008.dict (deflated 21%)
  adding: content/word-vectors/炎上-2014.dict (deflated 31%)
  adding: content/word-vectors/失笑-2009.dict (deflated 23%)
  adding: content/word-vectors/役不足-2013.dict (deflated 27%)
  adding: content/word-vectors/失笑-2017.dict (deflated 25%)
  adding: content/word-vectors/確信犯-2008.dict (deflated 23%)
  adding: content/word-vectors/なしくずし-2019.dict (deflated 47%)
  adding: content/word-vectors/失笑-2011.dict (deflated 25%)
  adding: content/word-vectors/割愛-2017.dict (deflate

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>