<a href="https://colab.research.google.com/github/Gityosan/google-colab/blob/main/step2-ver1-2-kmeansClustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### インストール初期設定等

In [None]:
# Install japanize_matplotlib. Per the original inline note, matplotlib, numpy, plotly,
# networkx, sklearn and tqdm are already preinstalled on the runtime.
!pip install -q japanize_matplotlib # matplotlib numpy plotly networkx sklearn tqdm はプリインストール済み
# Print the Python version of the runtime.
!python -V
# Remove the sample_data/ directory from the current working directory.
!rm -rf sample_data/
from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount("/content/drive")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/4.1 MB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m2.3/4.1 MB[0m [31m32.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.1/4.1 MB[0m [31m42.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.1/4.1 MB[0m [31m42.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.1/4.1 MB[0m [31m42.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for japanize_matplotlib (setup.py) 

In [None]:
# 標準ライブラリ
import gc
import glob
import itertools
import logging
import operator
import os
import pickle
import random
import unicodedata

import japanize_matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import networkx as nx  # グラフ/ネットワーク理論系の計算を行うためのPythonのパッケージ
import numpy as np
import plotly.graph_objs as go
import plotly.io as pio
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import (
    calinski_harabasz_score,
    silhouette_samples,
    silhouette_score,
)
from tqdm import tqdm

In [None]:
# @title  { vertical-output: true, form-width: "35%", display-mode: "both" }

# Initial configuration
# Target words, NFC-normalised so every entry uses the same Unicode form.
target_words = [
    unicodedata.normalize("NFC", w)
    for w in (
        "失笑",
        "なし崩し",
        "なしくずし",
        "御の字",
        "すべからく",
        "割愛",
        "破天荒",
        "役不足",
        "確信犯",
        "炎上",
        "草",
    )
]
base_dir = "drive/MyDrive/script/bert/"  # @param {type:"string"}
# Every *.dict file below word-vectors/1-512/ (relative to the working directory).
path_list = glob.glob(base_dir + "word-vectors/1-512/*.dict")
# For each target word, the paths that contain that word as a substring.
path_list_map = {w: [p for p in path_list if w in p] for w in target_words}

### 関数群

In [None]:
# 関数群
def best_kmeans(X, max_range=np.arange(2, 11), criterion="silhouette", SEED=42):
    """
    Return the best K-Means clustering given the data, a range of K values, and a K-selection criterion.

    :param X: usage matrix (made of usage vectors); must expose ``shape``, one row per sample
    :param max_range: iterable of candidate numbers of clusters; candidates that are
        not smaller than the number of rows of X are skipped
    :param criterion: K-selection criterion: 'silhouette', or any of
        'calinski' / 'harabasz' / 'calinski-harabasz' (all use the Calinski-Harabasz score)
    :param SEED: value passed to KMeans as ``random_state``
    :return: tuple ``(best_n_cluster, best_model, best_clusters)``: the selected K,
        the KMeans model fitted with that K, and the labels returned by its ``fit_predict``
    :raises ValueError: if ``criterion`` is unknown, or if no candidate K is smaller
        than the number of samples (so nothing could be fitted)
    """
    valid_criteria = ("silhouette", "calinski", "harabasz", "calinski-harabasz")
    if criterion not in valid_criteria:
        raise ValueError(
            'Invalid criterion "{}". Choose one of {}.'.format(criterion, valid_criteria)
        )

    n_samples = X.shape[0]
    best_n_cluster, best_model, best_clusters, best_score = None, None, None, None
    # Try every candidate K and keep the clustering with the highest score.
    for k in max_range:
        # A clustering with as many clusters as samples cannot be scored.
        if k >= n_samples:
            continue
        # Run K-Means with the current number of clusters.
        kmeans = KMeans(n_clusters=k, random_state=SEED)
        clusters = kmeans.fit_predict(X)
        # Score the clustering with the requested criterion.
        if criterion == "silhouette":
            score = silhouette_score(X, clusters)
        else:
            score = calinski_harabasz_score(X, clusters)

        # Strict ">" means that, on ties, the clustering tried first is kept.
        # The running best score must be updated here, otherwise every later K
        # would overwrite the selection regardless of its score.
        if best_score is None or score > best_score:
            best_n_cluster, best_model, best_clusters = k, kmeans, clusters
            best_score = score

    if best_model is None:
        raise ValueError(
            "No K in max_range is smaller than the number of samples ({}); "
            "cannot cluster.".format(n_samples)
        )
    print("best_n_cluster: {}".format(best_n_cluster))
    return best_n_cluster, best_model, best_clusters


def cluster_usages(
    Uw, method="kmeans", k_range=np.arange(2, 11), criterion="silhouette"
):
    """
    Return the best clustering for a usage matrix.

    :param Uw: usage matrix (one usage vector per row)
    :param method: clustering method; only 'kmeans' is implemented
    :param k_range: range of possible K values (number of clusters)
    :param criterion: K selection criterion, forwarded to ``best_kmeans``
    :return: tuple ``(best_n_cluster, best_model, best_clusters)`` as returned by ``best_kmeans``
    :raises ValueError: if ``method`` is not 'kmeans'
    """
    # Validate first so an unsupported method fails before any scaling work is done.
    # A Gaussian-mixture ('gmm') variant is not implemented in this file.
    if method != "kmeans":
        raise ValueError(
            'Invalid method "{}". Only "kmeans" is supported.'.format(method)
        )

    # standardize usage matrix by removing the mean and scaling to unit variance
    X = preprocessing.StandardScaler().fit_transform(Uw)

    # get best model according to a K-selection criterion
    best_n_cluster, best_model, best_clusters = best_kmeans(
        X, k_range, criterion=criterion
    )
    return best_n_cluster, best_model, best_clusters


def show_scatter_plot(
    words=[], vectors=[], best_clusters=[], output_path="", target_word=None
):
    """
    Project the vectors with t-SNE, draw one labelled point per word coloured by
    its cluster, and save the figure to ``output_path``.

    :param words: labels to annotate, one per vector
    :param vectors: vectors to project; converted with ``np.array`` before t-SNE
    :param best_clusters: cluster index for each word, used to pick the colour
    :param output_path: file path passed to ``plt.savefig``
    :param target_word: word to draw with a large (size 50) label. When omitted,
        the notebook-level global ``usages["target_word"]["word"]`` is used, which
        is how this function behaved before the parameter existed.
    """
    # Dimensionality reduction
    tsne = TSNE(random_state=0, perplexity=30, learning_rate=500).fit_transform(
        np.array(vectors)
    )
    # Drawing
    fig, ax = plt.subplots(1, 1, figsize=(40, 40), tight_layout=True)
    # NOTE(review): "Dark2" provides 8 colours while the clustering may use up to
    # 10 clusters, so the highest cluster indices may share a colour - confirm.
    cmap = plt.get_cmap("Dark2")

    # Legacy fallback: read the target word from the global set by the calling cell.
    if target_word is None:
        target_word = usages["target_word"]["word"]

    # Walk over each word together with the cluster index at the same position.
    for idx, word in enumerate(words):
        # Colour for this word's cluster
        cval = cmap(best_clusters[idx])
        point = (tsne[idx, 0], tsne[idx, 1])
        # Plot the projected position of the word
        ax.scatter(point[0], point[1], marker=".", color=cval)
        # Label the point; the target word gets a much larger font
        if word == target_word:
            ax.annotate(word, xy=point, size=50, color=cval)
        else:
            ax.annotate(word, xy=point, color=cval)
    plt.savefig(output_path)


# Visualise a silhouette analysis: silhouette coefficient on the horizontal axis,
# cluster number on the vertical axis.
def show_silhouette(fitted_model, vectors=[]):
    """
    Draw and show a silhouette plot for a fitted clustering model.

    :param fitted_model: fitted model exposing ``labels_`` (one label per sample)
    :param vectors: the samples, passed to ``silhouette_samples`` together with the labels
    """
    labels = fitted_model.labels_
    cluster_labels = np.unique(labels)
    num_cluster = cluster_labels.shape[0]
    # Per-sample silhouette coefficients
    silhouette_vals = silhouette_samples(vectors, labels)
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib releases removed;
    # the colormap does not change between clusters, so look it up once.
    cmap = plt.get_cmap("Spectral")

    y_ax_lower = 0
    y_ticks = []
    for idx, cluster_label in enumerate(cluster_labels):
        # Sorted coefficients of the samples assigned to this cluster
        cluster_vals = np.sort(silhouette_vals[labels == cluster_label])
        y_ax_upper = y_ax_lower + len(cluster_vals)
        # RGBA colour for this cluster, with alpha forced to 0.7
        rgba = list(cmap(idx / num_cluster))
        rgba[-1] = 0.7
        plt.barh(
            y=range(y_ax_lower, y_ax_upper),
            width=cluster_vals,
            height=1.0,
            edgecolor="none",
            color=rgba,
        )
        # Tick at the vertical middle of this cluster's band
        y_ticks.append((y_ax_lower + y_ax_upper) / 2.0)
        y_ax_lower = y_ax_upper

    silhouette_avg = np.mean(silhouette_vals)
    # Vertical line at the mean silhouette coefficient over all samples
    plt.axvline(silhouette_avg, color="orangered")
    plt.xlabel("silhouette coefficient")
    plt.ylabel("cluster")
    # Label the bands with 1-based cluster numbers
    plt.yticks(y_ticks, cluster_labels + 1)
    plt.show()

def log_word_same_cluster(tuple_list):  # list(zip(ar1,ar2))
    """
    Print, for every cluster, the first 20 words assigned to it.

    :param tuple_list: sequence of ``(cluster_id, word)`` pairs, e.g.
        ``list(zip(labels, words))``; only items 0 and 1 of each entry are read
    """
    # Group the words by cluster id in a single pass, keeping input order within a cluster.
    relative_words = {}
    for entry in tuple_list:
        relative_words.setdefault(entry[0], []).append(entry[1])
    # Sort the ids so the output order does not depend on set/dict iteration order.
    for cluster_id in sorted(relative_words):
        print(relative_words[cluster_id][:20])

### 主要処理

In [None]:
# For every target word: load the "target_word" entry of each matching pickle file,
# project the vectors with t-SNE and save/show one plot labelled with the file names.
for key in path_list_map:
    print("-----------------")
    print("Start with {}".format(key))
    path_list = path_list_map[key]
    # skip target words for which no file matched
    if not len(path_list):
        continue
    path_list.sort()
    # Output name is derived from the first (sorted) path: its last two path components
    # joined by "/", of which the second-to-last dot-separated piece is kept, plus ".png".
    output_path = (
        base_dir
        + "transition-plot/"
        + "/".join(path_list[0].split("/")[-2:]).split(".")[-2]
        + ".png"
    )
    # check outputs
    # NOTE(review): the existence check prefixes "/content/" while makedirs/savefig use the
    # relative path; presumably the working directory is /content - confirm.
    if os.path.exists("/content/" + output_path):
        print("Exit from function because the file already exists")
        continue
    else:
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # loading data
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    embeddings = []
    for path in path_list:
        with open(path, "rb") as f:
            binary = pickle.load(f)
            # keep the file name up to its first dot together with the target-word entry
            embeddings.append(
                {"filename": path.split("/")[-1].split(".")[0], "usage": binary["target_word"]}
            )

    # Dimensionality reduction
    tsne = TSNE(random_state=0, perplexity=30, learning_rate=500).fit_transform(
        np.array([u["usage"]["vector"] for u in embeddings])
    )
    # Drawing
    fig, ax = plt.subplots(1, 1, figsize=(30, 20), tight_layout=True)
    cmap = plt.get_cmap("Dark2")
    for idx, u in enumerate(embeddings):
        # plot the projected point of this file's target-word vector (single colour)
        ax.scatter(tsne[idx, 0], tsne[idx, 1], 10, marker=".", color=cmap(0))
        # label the point with the file name
        ax.annotate(u["filename"], xy=(tsne[idx, 0], tsne[idx, 1]), size=20, color=cmap(0))
    plt.savefig(output_path)
    plt.show()

In [None]:
# For every pickle file of every target word: cluster the usage vectors with K-Means,
# print sample words per cluster, and save a t-SNE scatter plot coloured by cluster.
for key in path_list_map:
    for path in path_list_map[key]:
        print("-----------------")
        print("Start with {}".format(path.split("/")[-1]))
        # check outputs
        # The output name is the last two path components (including the original
        # file extension) followed by ".png".
        output_path = base_dir + "scatter-plot/" + "/".join(path.split("/")[-2:]) + ".png"
        # NOTE(review): the existence check prefixes "/content/" while the file is written
        # through the relative path; presumably the working directory is /content - confirm.
        if os.path.exists("/content/" + output_path):
            print("Exit from function because the file already exists")
            continue
        # savefig does not create missing directories, so create the target directory
        # before the (expensive) clustering instead of failing at the very end.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        with open(path, "rb") as f:
            usages = pickle.load(f)
        # Drop entries whose word contains "#".
        usages["all"] = [u for u in usages["all"] if "#" not in u["word"]]
        best_n_cluster, best_model, best_clusters = cluster_usages(
            Uw=[u["vector"] for u in usages["all"]],
            method="kmeans",
            k_range=np.arange(2, 11),
            criterion="silhouette",
        )
        log_word_same_cluster(
            list(zip(best_model.labels_, [u["word"] for u in usages["all"]]))
        )
        # show_scatter_plot reads the global `usages` to find the target word.
        show_scatter_plot(
            [u["word"] for u in usages["all"]],
            [u["vector"] for u in usages["all"]],
            best_clusters,
            output_path,
        )

### 保存後処理

In [None]:
# formatter: install black with its Jupyter extra and run it on the notebook file
!pip install -q black[jupyter]
!black "/content/drive/MyDrive/Colab Notebooks/bertClustering.ipynb"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.6/96.6 KB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[?25h

### その他

In [None]:
from tensorflow.python.client import device_lib
import tensorflow as tf
from psutil import virtual_memory

# Check the RAM size: total memory reported by psutil, in units of 1e9 bytes
ram_gb = virtual_memory().total / 1e9
print("Your runtime has {:.1f} gigabytes of available RAM\n".format(ram_gb))

# Below 20 the runtime is reported as not high-RAM
print(
    "Not using a high-RAM runtime"
    if ram_gb < 20
    else "You are using a high-RAM runtime!"
)
# Check the number of GPUs
print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))
# Check GPU recognized
print(device_lib.list_local_devices())

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!
Num GPUs Available:  1
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6354157854472975992
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11586961408
locality {
  bus_id: 1
  links {
  }
}
incarnation: 14767642777879333371
physical_device_desc: "device: 0, name: A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0"
xla_global_id: 416903419
]
