In [110]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from transformers import BertJapaneseTokenizer, BertModel
import torch
import sudachipy.dictionary
import sudachipy.tokenizer
import pickle
import fasttext
import fasttext.util
import bcubed
import numpy as np
from gensim.corpora.dictionary import Dictionary as GensimDictionary
from gensim.models import LdaModel

# クラスタリングの類似度を評価する関数

## Counting Pair based f-measure

In [2]:

def cluster_similarity_pair(correct_cluster, test_cluster):
  def get_pair_label(cluster):
    labels = []
    for i0, v0 in enumerate(cluster):
      for i1, v1 in enumerate(cluster):
        if i1<=i0: continue
        labels.append(v0==v1)
    return labels

  correct_pairs = get_pair_label(correct_cluster)
  test_pairs = get_pair_label(test_cluster)
  combined_pairs = [(v0,v1) for v0, v1 in zip(correct_pairs, test_pairs)]

  correct_true, correct_false = correct_pairs.count(True), correct_pairs.count(False)
  test_true, test_false = test_pairs.count(True), test_pairs.count(False)
  true_positive = combined_pairs.count((True, True))
  false_positive = combined_pairs.count((False, True))
  true_negative = combined_pairs.count((False, False))
  false_negative = combined_pairs.count((True, False))

  scores = {
    "ct_cf_tt_tf": (correct_true, correct_false, test_true, test_false)
    , "tp_fp_tn_fn": (true_positive, false_positive, true_negative, false_negative)
    , "precision": metrics.precision_score(correct_pairs, test_pairs)
    , "recall": metrics.recall_score(correct_pairs, test_pairs)
    , "f1": metrics.f1_score(correct_pairs, test_pairs)
    , "accuracy": metrics.accuracy_score(correct_pairs, test_pairs)
  }
  return scores    

In [14]:
cluster_similarity_pair([0,0,0,1,1,1],[1,0,0,0,0,2])

{'ct_cf_tt_tf': (6, 9, 6, 9),
 'tp_fp_tn_fn': (2, 4, 5, 4),
 'precision': 0.3333333333333333,
 'recall': 0.3333333333333333,
 'f1': 0.3333333333333333,
 'accuracy': 0.4666666666666667}

## Purity-Inverse Purity F-measure

In [38]:
# 参考 https://stackoverflow.com/questions/34047540/python-clustering-purity-metric
def cluster_similarity_purity(correct_cluster, test_cluster):
  # compute contingency matrix (also called confusion matrix)
  contingency_matrix = metrics.cluster.contingency_matrix(correct_cluster, test_cluster)
  # purity
  purity = np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)
  inverse_purity = np.sum(np.amax(contingency_matrix, axis=1)) / np.sum(contingency_matrix)

  f1 = 2*purity*inverse_purity/(purity+inverse_purity)

  score = {
    "precision": purity
    , "recall": inverse_purity
    , "f1": f1
  }
  return score


In [40]:
cluster_similarity_purity([0,0,0,1,1,1],[1,0,0,0,0,2])

{'precision': 0.6666666666666666,
 'recall': 0.6666666666666666,
 'f1': 0.6666666666666666}

## BCubed

In [12]:
def cluster_similarity_bcubed(correct_cluster, test_cluster):
  def get_bcubed_input(cluster):
    labels = {i: {v} for i,v in enumerate(cluster)}
    return labels

  correct_bcubed_input = get_bcubed_input(correct_cluster)
  test_bcubed_input = get_bcubed_input(test_cluster)

  precision = bcubed.precision(correct_bcubed_input, test_bcubed_input)
  recall = bcubed.recall(correct_bcubed_input, test_bcubed_input)
  f1 = bcubed.fscore(precision, recall)
  
  scores = {
    "precision": precision
    , "recall": recall
    , "f1": f1
  }
  return scores    

In [13]:
cluster_similarity_bcubed([0,0,0,1,1,1],[1,0,0,0,0,2])

{'precision': 0.5555555555555555,
 'recall': 0.6666666666666666,
 'f1': 0.606060606060606}

# データの読み込み

In [17]:
FILE_PATH = "text/titles.csv"

In [18]:
# テストデータの読み込み
df = pd.read_csv(FILE_PATH)
# 検証を素早くできるようにテストデータ数を制限
train_df, test_df = train_test_split(df, train_size=0.9, random_state = 0, shuffle=True, stratify=df["category"])
# indexをリセット
train_df, test_df = train_df.reset_index(drop=True), test_df.reset_index(drop=True)

print("all")
print(df["category"].value_counts())
print("")
print("train")
print(train_df["category"].value_counts())
print("")
print("test")
print(test_df["category"].value_counts())


all
sports-watch      900
dokujo-tsushin    870
it-life-hack      870
movie-enter       870
smax              870
kaden-channel     864
peachy            842
topic-news        770
livedoor-homme    511
Name: category, dtype: int64

train
sports-watch      810
smax              783
it-life-hack      783
dokujo-tsushin    783
movie-enter       783
kaden-channel     777
peachy            758
topic-news        693
livedoor-homme    460
Name: category, dtype: int64

test
sports-watch      90
kaden-channel     87
smax              87
dokujo-tsushin    87
movie-enter       87
it-life-hack      87
peachy            84
topic-news        77
livedoor-homme    51
Name: category, dtype: int64


# Embedding取得のための関数の定義

## TF-IDF

In [90]:
# tfidf
def ngram_tfidf(texts, *, ngram_range = (3,3)):
  vectorizer = TfidfVectorizer(
                    analyzer="char"
                    , ngram_range=ngram_range
                    , max_df=0.9
                    , min_df = 5)
  return vectorizer.fit_transform(texts)

def word_tfidf(texts, *, ngram_range = (1,1)):
  tokenizer_obj = sudachipy.dictionary.Dictionary(dict="full").create()
  mode = sudachipy.tokenizer.Tokenizer.SplitMode.A
  wakachi_texts = [" ".join([m.surface() for m in tokenizer_obj.tokenize(text, mode)]) for text in texts]
  vectorizer = TfidfVectorizer(
    analyzer = "word"
    , ngram_range = ngram_range
    , max_df = 0.9
    , min_df = 5
  )
  return vectorizer.fit_transform(wakachi_texts)

## FastText

In [92]:
def fasttext_vector(texts, *, model=None, model_path = "fasttext/cc.ja.300.bin"):
  ft = model or fasttext.load_model(model_path)
  tokenizer_obj = sudachipy.dictionary.Dictionary(dict="full").create()
  mode = sudachipy.tokenizer.Tokenizer.SplitMode.A
  vectors = []
  for text in texts:
    tokens = tokenizer_obj.tokenize(text)
    words = [token.surface() for token in tokens]
    vec = ft.get_word_vector(words[0])
    for w in words[1:]:
      vec += ft.get_word_vector(w)
    mean_vec = vec / len(words)
    vectors.append(mean_vec)
  return vectors

## Sentence-Bert

In [22]:
# 参考 https://qiita.com/sonoisa/items/1df94d0a98cd4f209051
class SentenceBertJapanese:
    def __init__(self, model_name_or_path, device=None):
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name_or_path)
        self.model = BertModel.from_pretrained(model_name_or_path)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(device)

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


    def encode(self, sentences, batch_size=8):
        all_embeddings = []
        iterator = range(0, len(sentences), batch_size)
        for batch_idx in iterator:
            batch = sentences[batch_idx:batch_idx + batch_size]

            encoded_input = self.tokenizer.batch_encode_plus(batch, padding="longest", 
                                           truncation=True, return_tensors="pt").to(self.device)
            model_output = self.model(**encoded_input)
            sentence_embeddings = self._mean_pooling(model_output, encoded_input["attention_mask"]).to('cpu')

            all_embeddings.extend(sentence_embeddings)

        # return torch.stack(all_embeddings).numpy()
        return torch.stack(all_embeddings)

def sentencebert(texts, *, model=None):
    MODEL_NAME = "sonoisa/sentence-bert-base-ja-mean-tokens-v2"  # <- v2です。
    model = model or SentenceBertJapanese(MODEL_NAME)
    sentence_embeddings = model.encode(texts, batch_size=8)
    return sentence_embeddings.detach().numpy()

## BERT

In [95]:
# 参考：近江崇宏; 金田健太郎; 森長誠; 江間見亜利. BERTによる自然言語処理入門 ―Transformersを使った実践プログラミング― (p.169). Kindle 版. 
#トークナイザとモデルのロード 
# 参考 https://qiita.com/sonoisa/items/1df94d0a98cd4f209051
class BertJapanese:
    def __init__(self, model_name_or_path, *, device=None, model = None):
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name_or_path)
        self.model = model or BertModel.from_pretrained(model_name_or_path)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(device)

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output.last_hidden_state 
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def encode(self, sentences, *, batch_size=8, max_length = 256):
        iterator = range(0, len(sentences), batch_size)
        all_embeddings = []

        for batch_idx in iterator:
            batch = sentences[batch_idx:batch_idx + batch_size]
            encoded_input = self.tokenizer.batch_encode_plus(
                        batch, 
                        max_length=max_length, 
                        padding='max_length', 
                        truncation=True, 
                        return_tensors='pt'
                ).to(self.device)
            
            # 文章ベクトルを計算
            # BERTの最終層の出力を平均を計算する。（ただし、[PAD]は除く。）
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            sentence_embeddings = self._mean_pooling(model_output, encoded_input["attention_mask"]).to('cpu')
                
            all_embeddings.extend(sentence_embeddings)
        return torch.stack(all_embeddings)

def get_bert_embeddings(texts, *, model=None):
    MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    model = BertJapanese(MODEL_NAME, model=model)
    sentence_embeddings = model.encode(texts, batch_size=8)
    return sentence_embeddings.detach().numpy()

In [86]:
embeddings = get_bert_embeddings(test_df["title"])


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [97]:
with open("embedding/bert_embedding.pickle","wb") as f:
  pickle.dump(torch.stack(embeddings).detach().numpy(), f)

# Clustering用の関数

## kmeans

In [23]:
# k-meansでクラスタ分析。とりあえず9つのグループに分けてみる
def kmeans_clustering(vectors, *, n_clusters=9):
  km_model = KMeans(n_clusters=n_clusters, random_state = 0)
  km_model.fit(vectors)
  return km_model.labels_


## 教師あり（トリグラム、ナイーブベイズ）

In [43]:
# 参考 
def supervised_naivebayse_vector(texts, *, model=None, train_text = train_df["title"], train_category=train_df["category"]):
  if model is None:
    model = make_pipeline(
      TfidfVectorizer(
                      analyzer="char"
                      , ngram_range=(3,3)
                      , max_df=0.9
                      , min_df = 5)
      , MultinomialNB()
    )
    model.fit(train_text, train_category)
  return model.predict(texts)

## LDA

In [130]:
# 参考 https://zenn.dev/robes/articles/424cb97503d16e
def get_lda_label(texts, *, train_texts = train_df["title"], num_topics=9):
  tokenizer_obj = sudachipy.dictionary.Dictionary(dict="full").create()
  mode = sudachipy.tokenizer.Tokenizer.SplitMode.A
  def wakachi(text):
    return [m.surface() for m in tokenizer_obj.tokenize(text, mode)]

  wakachi_texts = [wakachi(text) for text in texts]
  # モデル作成
  dictionary = GensimDictionary(wakachi_texts)
  corpus = [dictionary.doc2bow(text) for text in wakachi_texts]
  lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
  # maxを取り出す
  topics = []
  for doc in corpus:
    topic, prob = max(lda[doc], key=lambda x:x[1])
    topics.append(topic)
  return topics

# Clustering

In [132]:
# ngram tfidf, kmeans
X = ngram_tfidf(test_df["title"])
test_labels = kmeans_clustering(X)
test_df["pred_trigram_tfidf_kmeans"] = test_labels

# word tfidf, kmeans
X = word_tfidf(test_df["title"])
test_labels = kmeans_clustering(X)
test_df["pred_word_tfidf_kmeans"] = test_labels

# fasttext, kmeans
# モデルは以下からダウンロード
# https://fasttext.cc/docs/en/crawl-vectors.html
#ft = fasttext.load_model('fasttext/cc.ja.300.bin')
vectors = fasttext_vector(test_df["title"], model = ft)
test_labels = kmeans_clustering(vectors)
test_df["pred_fasttext_kmeans"] = test_labels

# sentence bert, kmeans
embedding_binary_path = "embedding/sentencebert_embedding.pickle"
if Path(embedding_binary_path).exists():
  with open(embedding_binary_path, "rb") as f:
    sentence_embeddings = pickle.load(f)
else:
  sentence_embeddings = sentencebert(test_df["title"])
  Path(embedding_binary_path).parent.mkdir(parents=True, exist_ok=True)
  with open("embedding/sentencebert_embedding.pickle", "wb") as f:
    pickle.dump(sentence_embeddings.detach().numpy(), f)

test_labels = kmeans_clustering(sentence_embeddings)
test_df["pred_sentencebert_kmeans"] = test_labels

# bert, kmeans
embedding_binary_path = "embedding/bert_embedding.pickle"
if Path(embedding_binary_path).exists():
  with open(embedding_binary_path, "rb") as f:
    sentence_embeddings = pickle.load(f)
else:
  sentence_embeddings = get_bert_embeddings(test_df["title"])
  Path(embedding_binary_path).parent.mkdir(parents=True, exist_ok=True)
  with open("embedding/sentencebert_embedding.pickle", "wb") as f:
    pickle.dump(sentence_embeddings.detach().numpy(), f)

test_labels = kmeans_clustering(sentence_embeddings)
test_df["pred_bert_kmeans"] = test_labels

test_labels = get_lda_label(test_df["title"])
test_df["pred_lda"] = test_labels

test_labels = supervised_naivebayse_vector(test_df["title"])
test_df["pred_supervised_naivebayse"] = test_labels

In [99]:
# save
OUT_DIR = "prediction"
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
test_df.to_csv(str(Path(OUT_DIR).joinpath("livedoor.csv")), index=False)

# Calculate Similarity

## Pair

In [104]:
test_df = pd.read_csv("prediction/livedoor.csv")
cluster_similarity_func = cluster_similarity_pair
print("ngram tfidf, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_trigram_tfidf_kmeans"]))
print("word tfidf, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_word_tfidf_kmeans"]))
print("fasttext, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_fasttext_kmeans"]))
print("sentencebert, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_sentencebert_kmeans"]))
print("bert, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_bert_kmeans"]))
print("supervised, naivebayse")
print(cluster_similarity_func(test_df["category"], test_df["pred_supervised_naivebayse"]))

ngram tfidf, kmeans
{'ct_cf_tt_tf': (30397, 240819, 172013, 99203), 'tp_fp_tn_fn': (21785, 150228, 90591, 8612), 'precision': 0.12664740455663234, 'recall': 0.7166825673586209, 'f1': 0.21525616323304184, 'accuracy': 0.4143413367942894}
word tfidf, kmeans
{'ct_cf_tt_tf': (30397, 240819, 85461, 185755), 'tp_fp_tn_fn': (12802, 72659, 168160, 17595), 'precision': 0.14979932366810592, 'recall': 0.4211599828930487, 'f1': 0.22099466588409952, 'accuracy': 0.667224647513421}
fasttext, kmeans
{'ct_cf_tt_tf': (30397, 240819, 33096, 238120), 'tp_fp_tn_fn': (6382, 26714, 214105, 24015), 'precision': 0.19283297075175249, 'recall': 0.20995492976280555, 'f1': 0.2010300348069866, 'accuracy': 0.812957200165182}
sentencebert, kmeans
{'ct_cf_tt_tf': (30397, 240819, 32532, 238684), 'tp_fp_tn_fn': (8420, 24112, 216707, 21977), 'precision': 0.25882208287224884, 'recall': 0.277001019837484, 'f1': 0.2676031718285687, 'accuracy': 0.8300653353784437}
bert, kmeans
{'ct_cf_tt_tf': (30397, 240819, 35909, 235307), '

## Purity-Inverse Purity F-measure

In [102]:
test_df = pd.read_csv("prediction/livedoor.csv")
cluster_similarity_func = cluster_similarity_purity
print("ngram tfidf, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_trigram_tfidf_kmeans"]))
print("word tfidf, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_word_tfidf_kmeans"]))
print("fasttext, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_fasttext_kmeans"]))
print("sentencebert, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_sentencebert_kmeans"]))
print("bert, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_bert_kmeans"]))
print("supervised, naivebayse")
print(cluster_similarity_func(test_df["category"], test_df["pred_supervised_naivebayse"]))

ngram tfidf, kmeans
{'precision': 0.2903663500678426, 'recall': 0.8208955223880597, 'f1': 0.4289905782443096}
word tfidf, kmeans
{'precision': 0.3514246947082768, 'recall': 0.582089552238806, 'f1': 0.43825928497049643}
fasttext, kmeans
{'precision': 0.3147896879240163, 'recall': 0.34328358208955223, 'f1': 0.32841975688567476}
sentencebert, kmeans
{'precision': 0.41112618724559025, 'recall': 0.4056987788331072, 'f1': 0.4083944517821643}
bert, kmeans
{'precision': 0.48846675712347354, 'recall': 0.5183175033921302, 'f1': 0.5029495989788865}
supervised, naivebayse
{'precision': 0.7815468113975577, 'recall': 0.7815468113975577, 'f1': 0.7815468113975577}


In [67]:
test_df.query("pred_sentencebert_kmeans == 5")["category"].value_counts()

peachy            26
dokujo-tsushin     8
livedoor-homme     5
kaden-channel      3
smax               1
Name: category, dtype: int64

## BCubed

In [103]:
test_df = pd.read_csv("prediction/livedoor.csv")
cluster_similarity_func = cluster_similarity_bcubed
print("ngram tfidf, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_trigram_tfidf_kmeans"]))
print("word tfidf, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_word_tfidf_kmeans"]))
print("fasttext, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_fasttext_kmeans"]))
print("sentencebert, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_sentencebert_kmeans"]))
print("bert, kmeans")
print(cluster_similarity_func(test_df["category"], test_df["pred_bert_kmeans"]))
print("supervised, naivebayse")
print(cluster_similarity_func(test_df["category"], test_df["pred_supervised_naivebayse"]))


ngram tfidf, kmeans
{'precision': 0.7302436789202665, 'recall': 0.2562694948629095, 'f1': 0.3793951945032219}
word tfidf, kmeans
{'precision': 0.4351869409102146, 'recall': 0.27031032765763474, 'f1': 0.3334825798222082}
fasttext, kmeans
{'precision': 0.21664901046033372, 'recall': 0.1995891309097131, 'f1': 0.2077694637396775}
sentencebert, kmeans
{'precision': 0.2831008772589822, 'recall': 0.27139126072269565, 'f1': 0.2771224287171229}
bert, kmeans
{'precision': 0.369014582491454, 'recall': 0.3341771760861575, 'f1': 0.350732924859773}
supervised, naivebayse
{'precision': 0.6364946721625434, 'recall': 0.629575059396474, 'f1': 0.6330159564573605}


# 参考文献／記事

# References

文章のベクトル化
* [SudachiPy](https://github.com/WorksApplications/SudachiPy/blob/develop/docs/tutorial.md)
* [tf-idfでベクトル化したラジオ感想ツイートをクラスタリングして可視化する](https://note.com/himaratsu/n/necefee6e5454)
* [機械学習 〜 テキスト分類（ナイーブベイズ分類器） 〜](https://qiita.com/fujin/items/39d450b910bf2be866b5)
* [fastTextとDoc2Vecのモデルを作成してニュース記事の多クラス分類の精度を比較する](https://qiita.com/kazuki_hayakawa/items/ca5d4735b9514895e197)
* [【日本語モデル付き】2020年に自然言語処理をする人にお勧めしたい文ベクトルモデル](https://qiita.com/sonoisa/items/1df94d0a98cd4f209051)
* [https://huggingface.co/sonoisa/sentence-bert-base-ja-mean-tokens-v2](https://huggingface.co/sonoisa/sentence-bert-base-ja-mean-tokens-v2)

エラー・デバッグ関係
* [Pytorch: Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead](https://stackoverflow.com/questions/55466298/pytorch-cant-call-numpy-on-variable-that-requires-grad-use-var-detach-num)
  * sentence-bertの出力をkmeansに入力したらエラーが出たときの解消方法


クラスタリング結果の比較方法
* [２つのクラスタリング結果がどのくらい似ているかの指標](https://takemikami.com/2019/02/25/clustdiff.html)
* [Precision and recall for clustering?](https://stats.stackexchange.com/questions/15158/precision-and-recall-for-clustering/80194)
* [Evaluation of clustering](https://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-clustering-1.html)