In [1]:
import pandas as pd

In [2]:
data_1 = pd.read_csv("/Users/tanakahiroshi/Desktop/Programming/scraping/extracted_relations_final_1.csv")
data_2 = pd.read_csv("/Users/tanakahiroshi/Desktop/Programming/scraping/extracted_relations_final_2.csv")

In [3]:
data = pd.concat([data_1, data_2], ignore_index=True)

In [4]:
temp = data[data["関係"]=="原因・理由"].copy()
temp.reset_index(drop=True, inplace=True)
temp_unique = temp.drop_duplicates()
temp = temp_unique

In [5]:
temp

Unnamed: 0,原因,関係,結果
0,今回もカラーも今どきな雰囲気に仕上がったので,原因・理由,良かったですまたよろしくお願いします♪
1,シャンプーを使っていただいたので、,原因・理由,来月またよろしくお願いします
2,とても大事な友達の式だったので、,原因・理由,アレンジが不慣れなら他のスタッフの方にもサポートいただきたかったです
3,とって前髪は顔のパーツの一部なので、,原因・理由,もっと研究していただきたいと思いました
4,今後も通いたいので、,原因・理由,改善していただければいいなと思います
...,...,...,...
3347,トリートメントもとても良かったので、,原因・理由,次回もよろしくお願いします
3348,色味のアドバイスをいただけるので、,原因・理由,大変有り難いです
3349,のは、担当スタイリストさんなので、,原因・理由,改めて感謝したいなと思いました
3350,つけっぱなしの上でのカラーなので,原因・理由,マスクを仕方ないのかもしれませんが


In [6]:
# 重複するセットを見つける
duplicate_indices = []
seen = {}
for index, row in temp.iterrows():
    key = (row['原因'], row['結果'])
    if key in seen:
        duplicate_indices.append((seen[key], index))
        print(f"重複: インデックス {seen[key]} と {index}")
    else:
        seen[key] = index

# 重複を削除する
for idx1, idx2 in duplicate_indices:
    temp = temp.drop(idx2)

# 結果を表示
print("\n重複削除後のデータフレーム:")
print(temp)


重複削除後のデータフレーム:
                          原因     関係                                  結果
0     今回もカラーも今どきな雰囲気に仕上がったので  原因・理由                 良かったですまたよろしくお願いします♪
1          シャンプーを使っていただいたので、  原因・理由                      来月またよろしくお願いします
2           とても大事な友達の式だったので、  原因・理由  アレンジが不慣れなら他のスタッフの方にもサポートいただきたかったです
3         とって前髪は顔のパーツの一部なので、  原因・理由                 もっと研究していただきたいと思いました
4                 今後も通いたいので、  原因・理由                  改善していただければいいなと思います
...                      ...    ...                                 ...
3347      トリートメントもとても良かったので、  原因・理由                       次回もよろしくお願いします
3348       色味のアドバイスをいただけるので、  原因・理由                            大変有り難いです
3349       のは、担当スタイリストさんなので、  原因・理由                     改めて感謝したいなと思いました
3350        つけっぱなしの上でのカラーなので  原因・理由                   マスクを仕方ないのかもしれませんが
3351             またお願いしたいので、  原因・理由                          よろしくお願いします

[3348 rows x 3 columns]


In [7]:
#BERTを読み込む
#KMEANSをする（k=50) （自分の解釈）
#networrkxで可視化

In [8]:
from transformers import BertJapaneseTokenizer, BertModel
import torch

In [9]:
class SentenceBertJapanese:
    def __init__(self, model_name_or_path, device=None):
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name_or_path)
        self.model = BertModel.from_pretrained(model_name_or_path)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(device)

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    @torch.no_grad()
    def encode(self, sentences, batch_size=8):
        all_embeddings = []
        iterator = range(0, len(sentences), batch_size)
        for batch_idx in iterator:
            batch = sentences[batch_idx:batch_idx + batch_size]

            encoded_input = self.tokenizer.batch_encode_plus(batch, padding="longest", 
                                           truncation=True, return_tensors="pt").to(self.device)
            model_output = self.model(**encoded_input)
            sentence_embeddings = self._mean_pooling(model_output, encoded_input["attention_mask"]).to('cpu')

            all_embeddings.extend(sentence_embeddings)

        # return torch.stack(all_embeddings).numpy()
        return torch.stack(all_embeddings)


In [10]:
MODEL_NAME = "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
model = SentenceBertJapanese(MODEL_NAME)

In [11]:
# 一つの行列に結合
def marge_text(temp_df):
    marge = []
    for cau in temp_df["原因"]:
        marge.append(cau)

    for res in temp_df["結果"]:
        marge.append(res)
    return marge

In [12]:
marge_rev = marge_text(temp)

## クラスタリング ##

In [13]:
from sklearn.cluster import KMeans 
import pandas as pd

# 文章リストをベクトル化してクラスタリング
def sentence_encode_and_clustering(sentences, n_clusters=100):
    sentence_embeddings = model.encode(sentences, batch_size=8)
    print("Sentence embeddings:", sentence_embeddings.shape)

    kmeans_model = KMeans(n_clusters = n_clusters).fit(sentence_embeddings)
    print(kmeans_model.labels_)

    # 元の文章、埋め込み、クラスタラベルを含むDataFrameを作成
    return pd.DataFrame({
        '文章': sentences,
        # '埋め込み': sentence_embeddings.tolist(),
        'クラスタ': kmeans_model.labels_
    })

# クラスタ番号ごとに分けて格納
def divide_into_clusters(df_results):
    # ユニークなクラスタ番号を取得
    unique_clusters = df_results['クラスタ'].unique()

    dfs = {}

    for cluster in unique_clusters:
        # クラスタごとにDataFrameをフィルタリングし、新しい変数に格納
        cluster_df = df_results[df_results['クラスタ'] == cluster].copy()
        
        # 変数名を動的に生成（例：cluster_0, cluster_1, ...）
        var_name = f"cluster_{cluster}"
        
        # グローバル変数として設定
        globals()[var_name] = cluster_df
        
        # 辞書にも格納（後で使用する場合）
        dfs[cluster] = cluster_df
    
    return dfs


In [14]:
# 文章リストをベクトル化してクラスタリング
df_results = sentence_encode_and_clustering(marge_rev, n_clusters=100)
cluster_dfs = {}
# クラスタ番号ごとに分けて格納
cluster_dfs = divide_into_clusters(df_results)
# DataFrameの要素数（行数）を取得し、少ない順にソート
sorted_dfs = [df for _, df in sorted(cluster_dfs.items(), key=lambda x: len(x[1]))]

Sentence embeddings: torch.Size([6696, 768])
[64 16  4 ... 67 42 15]


In [15]:
sorted_dfs[1]

Unnamed: 0,文章,クラスタ
1441,駅から近いので,48
1544,仕上がりもよきでした駅近なので,48
1761,地下歩出口からも近いので,48
1934,地下歩行空間の出口からはすぐなので,48
2259,駅の近くなので,48
2289,札駅に近いので,48
2359,お店の雰囲気もすごくおしゃれで駅からも徒歩で近いので,48
2412,駅からも近いので,48
2496,会社から近いので,48
2568,駅からも近いため,48


In [16]:
# !pip install keybert
# !pip install --upgrade keybert sentence-transformers transformers

## クラスタごとに、内容を一言でまとめる ##

In [17]:
# from sentence_transformers import SentenceTransformer
# from keybert import KeyBERT
# import pandas as pd

# # クラスタが同じものから文章だけ取得し、一文に繋げる
# def connect_text_list(short_index_box_list_df):
#     text_list = short_index_box_list_df["文章"].tolist()
#     # print(text_list)

#     text = '。'.join(text_list)
#     return text

# # # キーワードを抽出
# # def text_extract_keywords(text, text_num, model_name='cl-tohoku/bert-base-japanese-whole-word-masking', top_n=3, keyphrase_ngram_range=(1, 1)):
# #     # モデルの初期化
# #     model = SentenceTransformer(model_name)
# #     kw_model = KeyBERT(model=model)

# #     # 結果を格納するリスト
# #     kw_results = []

# #     try:
# #         # キーワードを抽出
# #         keywords = kw_model.extract_keywords(text, top_n=top_n, keyphrase_ngram_range=keyphrase_ngram_range, diversity=0, nr_candidates=0)
# #     except Exception as e:
# #         # print(f"文章 {index + 1} でエラーが発生しました: {e}")
# #         keywords = []

# #     # スコアの低い順にソート
# #     sorted_keywords = sorted(keywords, key=lambda x: x[1])

# #     # ワースト3を取得
# #     worst_3 = sorted_keywords[:3]
    
# #     # 結果を格納
# #     kw_results.append({
# #         "文章番号": text_num,
# #         "文章": text,
# #         **{f"キーワード{i + 1}": worst_3[i][0] if i < len(worst_3) else "" for i in range(len(worst_3))},
# #         **{f"スコア{i + 1}": worst_3[i][1] if i < len(worst_3) else 0 for i in range(len(worst_3))}
# #     })

# #     # 結果をデータフレームに変換
# #     return pd.DataFrame(kw_results)

# # GPUメモリの使用量を制限
# torch.mps.set_per_process_memory_fraction(0.8)  # 80%のGPUメモリを使用

# # モデルを一度だけ初期化
# model_name = 'sonoisa/sentence-bert-base-ja-mean-tokens-v2'
# model = SentenceTransformer(model_name)
# kw_model = KeyBERT(model=model)

# def text_extract_keywords(text, text_num, kw_model, top_n=3, keyphrase_ngram_range=(1, 1)):
#     kw_results = []

#     try:
#         keywords = kw_model.extract_keywords(text, top_n=top_n, keyphrase_ngram_range=keyphrase_ngram_range, diversity=0, nr_candidates=0)
#     except Exception as e:
#         keywords = []

#     # sorted_keywords = sorted(keywords, key=lambda x: x[1])
#     # worst_3 = sorted_keywords[:3]
    
#     kw_results.append({
#         "文章番号": text_num,
#         "文章": text,
#         **{f"キーワード{i + 1}": keywords[i][0] if i < len(keywords) else "" for i in range(len(keywords))},
#         **{f"スコア{i + 1}": keywords[i][1] if i < len(keywords) else 0 for i in range(len(keywords))}
#     })

#     return pd.DataFrame(kw_results)


In [18]:
sorted_dfs[0]["文章"].tolist()

['ツルツルツヤツヤが持続しているので、',
 'ツヤツヤになるので、',
 'ツヤのある綺麗な状態がキープできるので',
 'ツヤツヤな仕上がりになるので',
 '大満足ですトリートメントでもツヤツヤになったので',
 'しっかり伸びてツヤも出たので',
 '見た目にもツヤが出たので',
 'ツルツルです',
 'ツルツルツヤツヤが持続しているので、',
 '大満足ですトリートメントでもツヤツヤになったので',
 'しっかり伸びてツヤも出たので']

### クラスタごとに、命名したい ###

In [19]:
# import os
# os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

In [20]:
def replace_text(replace_text_df, new_text):
    replace_text_df['文章'] = new_text
    return replace_text_df

In [21]:
# df_results_list = []
# batch_size = 10  # バッチサイズを設定

# for i in range(0, len(sorted_dfs), batch_size):
#     batch = sorted_dfs[i:i+batch_size]
    
#     for j, cl_df in enumerate(batch):
#         combined_text = connect_text_list(cl_df)  # 一文に繋げる
#         df_results_keywords = text_extract_keywords(combined_text, int(cl_df['クラスタ'].iloc[0]), kw_model, top_n=len(cl_df))  # クラスタリング
#         df_results_list.append(df_results_keywords)
        
#         keyword = df_results_keywords["キーワード1"].iloc[0]
#         sorted_dfs[i+j] = replace_text(sorted_dfs[i+j], keyword)
    
#     # バッチ処理後にメモリをクリア
#     torch.mps.empty_cache()
    
#     if i % 100 == 0:
#         print(f"Processed {i} items")

# print(df_results_list)

In [22]:
sorted_dfs[1]["文章"].tolist()

['駅から近いので',
 '仕上がりもよきでした駅近なので',
 '地下歩出口からも近いので',
 '地下歩行空間の出口からはすぐなので',
 '駅の近くなので',
 '札駅に近いので',
 'お店の雰囲気もすごくおしゃれで駅からも徒歩で近いので',
 '駅からも近いので',
 '会社から近いので',
 '駅からも近いため',
 '駅から近く大きくお店の名前も書かれていましたので、',
 '駅近なので',
 '駅から近いので',
 '少し駅から歩くので']

In [23]:
# コピーを取っておく
sorted_dfs_copy = [df.copy() for df in sorted_dfs]

In [24]:
sorted_dfs_copy[0]["文章"].tolist()

['ツルツルツヤツヤが持続しているので、',
 'ツヤツヤになるので、',
 'ツヤのある綺麗な状態がキープできるので',
 'ツヤツヤな仕上がりになるので',
 '大満足ですトリートメントでもツヤツヤになったので',
 'しっかり伸びてツヤも出たので',
 '見た目にもツヤが出たので',
 'ツルツルです',
 'ツルツルツヤツヤが持続しているので、',
 '大満足ですトリートメントでもツヤツヤになったので',
 'しっかり伸びてツヤも出たので']

In [25]:
len(sorted_dfs_copy[0])

11

## トピックモデリング ##

In [26]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer, AutoModel
import MeCab
import ipadic

In [27]:
# MeCabの設定
mecab = MeCab.Tagger(ipadic.MECAB_ARGS)
# BERTopicモデルの設定
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
model = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

# テキストの前処理関数
def preprocess(text):
    node = mecab.parseToNode(text)
    words = []
    while node:
        if node.feature.split(",")[0] in ["名詞", "動詞", "形容詞"]:
            words.append(node.surface)
        node = node.next
    return " ".join(words)

def topic_modeling(sentences):

    # 前処理を適用
    preprocessed_sentences = [preprocess(sentence) for sentence in sentences]

    vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize, max_features=10000)

    topic_model = BERTopic(
        language="japanese",
        calculate_probabilities=True,
        verbose=True,
        vectorizer_model=vectorizer,
        embedding_model=model
    )

    # トピックモデリングの実行
    topics, probs = topic_model.fit_transform(preprocessed_sentences)

    # # 結果の表示
    # print(topic_model.get_topic_info())

    # # 各トピックの上位単語を表示
    # for topic in topic_model.get_topics():
    #     print(f"トピック {topic}: {topic_model.get_topic(topic)}")

    return topic_model

In [28]:
sorted_dfs_copy[0]["文章"].tolist()

['ツルツルツヤツヤが持続しているので、',
 'ツヤツヤになるので、',
 'ツヤのある綺麗な状態がキープできるので',
 'ツヤツヤな仕上がりになるので',
 '大満足ですトリートメントでもツヤツヤになったので',
 'しっかり伸びてツヤも出たので',
 '見た目にもツヤが出たので',
 'ツルツルです',
 'ツルツルツヤツヤが持続しているので、',
 '大満足ですトリートメントでもツヤツヤになったので',
 'しっかり伸びてツヤも出たので']

In [29]:
topics_list = []
err_df = []

# sorted_dfs_copy のdfリスト全てからトピックを取得
for index, df_copy in enumerate(sorted_dfs_copy):
    try:
        sentences = df_copy["文章"].tolist()
        
        # 文章数が2未満であることを確認
        if len(sentences) < 2:
            print(f"警告: インデックス {index} の文章数が不足しています。スキップします。")
            topics_list.append(index)  # インデックスを格納
            err_df.append(df_copy)
            continue

        topic_model = topic_modeling(sentences)
        topics_list.append(topic_model)
    except (TypeError, ValueError) as e:
        print(f"エラーが発生しました (インデックス {index}):")
        print(str(e))
        print("\nエラーが発生した時のdf_copy:")
        print(df_copy)
        
        # エラーが発生したDataFrameとそのインデックスを保存
        err_df.append(df_copy)
        topics_list.append(index)  # インデックスを格納
        
        # エラーが発生しても次のデータベースの解析を続行

print(f"解析完了。エラーまたは不十分なデータが発生したDataFrame数: {len(err_df)}")



2024-12-18 14:01:54,547 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:00,917 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:00,918 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-12-18 14:02:04,552 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:04,552 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:04,562 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:04,567 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:04,600 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:04,632 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:07,086 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:07,087 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:07,126 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:07,126 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:07,129 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:07,130 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:07,140 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:07,172 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:09,936 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:09,936 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:09,970 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:09,971 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:09,973 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:09,975 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:09,990 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:10,002 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:12,408 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:12,408 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:12,442 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:12,443 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:12,445 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:12,446 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:12,453 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:12,466 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:14,947 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:14,947 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:15,004 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:15,005 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:15,008 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:15,010 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:15,022 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:15,039 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:17,377 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:17,377 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:17,440 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:17,441 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:17,444 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:17,447 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:17,452 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:17,475 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:20,848 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:20,848 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:20,882 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:20,882 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:20,884 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:20,886 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:20,894 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:20,904 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:24,597 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:24,598 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:24,672 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:24,673 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:24,677 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:24,679 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:24,686 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:24,701 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:28,704 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:28,705 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:28,760 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:28,760 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:28,764 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:28,765 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:28,779 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:28,804 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:31,118 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:31,119 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:31,169 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:31,170 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:31,173 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:31,175 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:31,205 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:31,215 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:33,333 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:33,333 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:33,377 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:33,377 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:33,380 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:33,381 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:33,394 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:33,404 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:35,490 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:35,490 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:35,529 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:35,529 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:35,532 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:35,533 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:35,545 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:35,552 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:37,698 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:37,698 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:37,752 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:37,753 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:37,755 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:37,756 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:37,762 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:37,773 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:40,235 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:40,235 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:40,270 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:40,270 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:40,272 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:40,274 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:40,285 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:40,297 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-12-18 14:02:42,780 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:42,780 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:42,827 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:42,828 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:42,831 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:42,832 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:43,038 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:43,084 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:02:47,252 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:47,253 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:47,302 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:47,303 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:47,306 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:47,308 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:47,325 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:47,336 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:02:50,200 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:50,200 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:50,269 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:50,269 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:50,272 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:50,275 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:50,294 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:50,307 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:02:53,413 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:53,413 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:53,451 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:53,452 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:53,458 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:53,459 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:53,471 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:53,480 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:02:55,863 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:55,863 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:55,913 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:55,913 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:55,918 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:55,919 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:55,934 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:55,949 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:02:58,508 - BERTopic - Embedding - Completed ✓
2024-12-18 14:02:58,509 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:02:58,550 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:02:58,551 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:02:58,554 - BERTopic - Cluster - Completed ✓
2024-12-18 14:02:58,555 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:02:58,571 - BERTopic - Representation - Completed ✓
2024-12-18 14:02:58,580 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:01,136 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:01,136 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:01,178 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:01,178 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:01,181 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:01,182 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:01,198 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:01,213 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:03,525 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:03,526 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:03,615 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:03,615 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:03,619 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:03,620 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:03,633 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:03,641 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:06,097 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:06,097 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:06,146 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:06,147 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:06,151 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:06,152 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:06,159 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:06,168 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:08,279 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:08,279 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:08,318 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:08,318 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:08,322 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:08,324 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:08,332 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:08,340 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:10,439 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:10,440 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:10,496 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:10,497 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:10,501 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:10,503 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:10,510 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:10,523 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:13,445 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:13,446 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:13,489 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:13,490 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:13,493 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:13,494 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:13,510 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:13,519 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:15,748 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:15,748 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:15,815 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:15,816 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:15,820 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:15,822 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:15,838 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:15,848 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:19,248 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:19,249 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:19,317 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:19,317 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:19,322 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:19,324 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:19,337 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:19,348 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:22,009 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:22,009 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:22,052 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:22,053 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:22,056 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:22,057 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:22,067 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:22,075 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:24,792 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:24,792 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:24,835 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:24,836 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:24,839 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:24,840 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:24,850 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:24,862 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:27,559 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:27,559 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:27,603 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:27,603 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:27,608 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:27,610 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:27,624 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:27,634 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:29,919 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:29,919 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:29,963 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:29,963 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:29,968 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:29,969 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:29,977 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:29,986 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:32,245 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:32,246 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:32,297 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:32,298 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:32,301 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:32,302 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:32,312 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:32,320 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:34,417 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:34,417 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:34,460 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:34,461 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:34,466 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:34,467 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:34,479 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:34,487 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:36,986 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:36,986 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:37,028 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:37,029 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:37,033 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:37,035 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:37,042 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:37,049 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:39,388 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:39,388 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:39,453 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:39,454 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:39,459 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:39,460 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:39,465 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:39,472 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:42,242 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:42,243 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:42,311 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:42,311 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:42,316 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:42,318 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:42,323 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:42,335 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:44,872 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:44,873 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:44,918 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:44,918 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:44,923 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:44,924 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:44,934 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:44,943 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:47,343 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:47,344 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:47,417 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:47,418 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:47,423 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:47,426 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:47,447 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:47,464 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:51,038 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:51,038 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:51,100 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:51,100 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:51,107 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:51,110 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:51,127 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:51,149 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:53,404 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:53,404 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:53,479 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:53,479 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:53,485 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:53,486 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:53,503 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:53,513 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:56,944 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:56,945 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:56,998 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:56,999 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:57,005 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:57,007 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:57,017 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:57,028 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-18 14:03:59,314 - BERTopic - Embedding - Completed ✓
2024-12-18 14:03:59,315 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:03:59,367 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:03:59,368 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:03:59,373 - BERTopic - Cluster - Completed ✓
2024-12-18 14:03:59,375 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:03:59,387 - BERTopic - Representation - Completed ✓
2024-12-18 14:03:59,397 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:02,958 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:02,959 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:03,024 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:03,025 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:03,030 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:03,032 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:03,104 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:03,121 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:06,649 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:06,650 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:06,708 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:06,708 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:06,712 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:06,714 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:06,722 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:06,730 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:09,276 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:09,276 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:09,352 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:09,352 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:09,358 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:09,359 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:09,366 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:09,376 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:12,905 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:12,906 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:13,005 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:13,006 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:13,015 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:13,025 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:13,042 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:13,052 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:15,891 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:15,892 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:15,998 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:15,999 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:16,007 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:16,009 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:16,016 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:16,054 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:18,631 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:18,632 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:18,698 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:18,699 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:18,706 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:18,709 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:18,727 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:18,744 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:22,663 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:22,664 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:22,745 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:22,746 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:22,758 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:22,767 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:22,801 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:22,815 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:25,145 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:25,145 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:25,195 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:25,196 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:25,202 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:25,203 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:25,212 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:25,228 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:28,230 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:28,230 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:28,324 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:28,325 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:28,335 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:28,338 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:28,360 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:28,378 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:31,317 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:31,318 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:31,379 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:31,380 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:31,386 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:31,387 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:31,405 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:31,416 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:34,472 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:34,473 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:34,565 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:34,566 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:34,572 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:34,574 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:34,587 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:34,600 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:36,891 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:36,891 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:36,944 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:36,944 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:36,948 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:36,949 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:36,962 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:36,971 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:39,672 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:39,673 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:39,729 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:39,729 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:39,733 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:39,735 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:39,746 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:39,758 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:42,336 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:42,336 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:42,437 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:42,437 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:42,441 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:42,442 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:42,455 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:42,465 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:45,276 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:45,276 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:45,331 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:45,332 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:45,338 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:45,339 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:45,354 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:45,365 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:48,530 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:48,531 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:48,648 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:48,649 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:48,658 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:48,660 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:48,684 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:48,698 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:51,353 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:51,354 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:51,413 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:51,413 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:51,423 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:51,426 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:51,447 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:51,464 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:53,914 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:53,915 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:53,977 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:53,978 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:53,984 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:53,985 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:53,995 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:54,006 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:56,269 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:56,270 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:56,318 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:56,318 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:56,324 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:56,325 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:56,340 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:56,349 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:04:58,431 - BERTopic - Embedding - Completed ✓
2024-12-18 14:04:58,431 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:04:58,520 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:04:58,521 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:04:58,527 - BERTopic - Cluster - Completed ✓
2024-12-18 14:04:58,528 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:04:58,534 - BERTopic - Representation - Completed ✓
2024-12-18 14:04:58,543 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:00,739 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:00,740 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:00,798 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:00,799 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:00,805 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:00,807 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:00,816 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:00,827 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:03,878 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:03,878 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:03,941 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:03,941 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:03,945 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:03,947 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:03,956 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:03,964 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:08,128 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:08,128 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:08,193 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:08,193 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:08,200 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:08,201 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:08,213 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:08,224 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:11,173 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:11,174 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:11,227 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:11,228 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:11,235 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:11,236 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:11,267 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:11,280 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:13,345 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:13,345 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:13,400 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:13,400 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:13,406 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:13,407 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:13,419 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:13,434 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:17,557 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:17,557 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:17,623 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:17,624 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:17,631 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:17,633 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:17,645 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:17,661 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:22,093 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:22,093 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:22,150 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:22,150 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:22,157 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:22,158 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:22,175 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:22,188 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:24,955 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:24,955 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:25,009 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:25,010 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:25,016 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:25,017 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:25,031 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:25,042 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:27,724 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:27,725 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:27,783 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:27,783 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:27,789 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:27,791 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:27,800 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:27,810 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:30,568 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:30,568 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:30,625 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:30,625 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:30,632 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:30,633 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:30,644 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:30,654 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:33,337 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:33,337 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:33,421 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:33,421 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:33,427 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:33,429 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:33,437 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:33,447 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:36,378 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:36,378 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:36,442 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:36,443 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:36,449 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:36,451 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:36,461 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:36,476 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:39,037 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:39,037 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:39,112 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:39,113 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:39,119 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:39,121 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:39,134 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:39,146 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:42,431 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:42,431 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:42,503 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:42,504 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:42,510 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:42,512 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:42,524 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:42,536 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:45,111 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:45,111 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:45,173 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:45,174 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:45,180 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:45,182 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:45,196 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:45,211 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:47,825 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:47,826 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:47,931 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:47,932 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:47,939 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:47,941 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:47,954 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:47,966 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:52,183 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:52,184 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:52,249 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:52,249 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:52,257 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:52,258 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:52,272 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:52,282 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:55,401 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:55,401 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:55,473 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:55,473 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:55,481 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:55,482 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:55,488 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:55,498 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:05:59,407 - BERTopic - Embedding - Completed ✓
2024-12-18 14:05:59,407 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:05:59,468 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:05:59,468 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:05:59,475 - BERTopic - Cluster - Completed ✓
2024-12-18 14:05:59,477 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:05:59,502 - BERTopic - Representation - Completed ✓
2024-12-18 14:05:59,514 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:06:04,232 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:04,232 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:04,295 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:04,296 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:04,303 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:04,305 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:04,318 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:04,333 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:06:07,805 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:07,806 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:07,863 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:07,864 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:07,871 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:07,872 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:07,890 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:07,907 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:06:11,464 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:11,465 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:11,538 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:11,538 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:11,546 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:11,548 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:11,564 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:11,579 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:06:15,101 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:15,101 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:15,201 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:15,202 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:15,208 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:15,210 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:15,238 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:15,253 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:06:18,391 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:18,392 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:18,452 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:18,453 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:18,460 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:18,462 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:18,475 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:18,486 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-18 14:06:21,045 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:21,046 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:21,106 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:21,106 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:21,113 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:21,114 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:21,124 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:21,134 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-12-18 14:06:25,354 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:25,355 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:25,462 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:25,462 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:25,469 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:25,471 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:25,491 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:25,506 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-12-18 14:06:29,455 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:29,456 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:29,520 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:29,521 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:29,528 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:29,530 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:29,542 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:29,557 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-12-18 14:06:33,260 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:33,260 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:33,328 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:33,328 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:33,337 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:33,338 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:33,351 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:33,383 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-12-18 14:06:37,143 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:37,143 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:37,207 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:37,208 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:37,214 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:37,216 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:37,229 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:37,239 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-12-18 14:06:40,806 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:40,806 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:40,930 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:40,931 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:40,938 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:40,939 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:40,944 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:40,953 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-12-18 14:06:44,574 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:44,574 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:44,643 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:44,644 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:44,650 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:44,651 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:44,663 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:44,675 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-12-18 14:06:48,429 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:48,429 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:48,505 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:48,505 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:48,513 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:48,514 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:48,527 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:48,541 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-12-18 14:06:52,723 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:52,724 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:52,845 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:52,846 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:52,854 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:52,856 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:52,887 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:52,912 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-12-18 14:06:57,103 - BERTopic - Embedding - Completed ✓
2024-12-18 14:06:57,103 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:06:57,183 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:06:57,183 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:06:57,191 - BERTopic - Cluster - Completed ✓
2024-12-18 14:06:57,192 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:06:57,210 - BERTopic - Representation - Completed ✓
2024-12-18 14:06:57,223 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-12-18 14:07:00,874 - BERTopic - Embedding - Completed ✓
2024-12-18 14:07:00,875 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:07:01,124 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:07:01,125 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:07:01,133 - BERTopic - Cluster - Completed ✓
2024-12-18 14:07:01,135 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:07:01,145 - BERTopic - Representation - Completed ✓
2024-12-18 14:07:01,167 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

2024-12-18 14:07:05,176 - BERTopic - Embedding - Completed ✓
2024-12-18 14:07:05,179 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:07:05,272 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:07:05,273 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:07:05,284 - BERTopic - Cluster - Completed ✓
2024-12-18 14:07:05,287 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:07:05,348 - BERTopic - Representation - Completed ✓
2024-12-18 14:07:05,431 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

2024-12-18 14:07:08,474 - BERTopic - Embedding - Completed ✓
2024-12-18 14:07:08,474 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-18 14:07:08,715 - BERTopic - Dimensionality - Completed ✓
2024-12-18 14:07:08,716 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-18 14:07:08,724 - BERTopic - Cluster - Completed ✓
2024-12-18 14:07:08,726 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-18 14:07:08,734 - BERTopic - Representation - Completed ✓


解析完了。エラーまたは不十分なデータが発生したDataFrame数: 0


In [30]:
# トピックが取れたか確認
if len(topics_list) == len(sorted_dfs_copy):
    print("全てトピックを取得")
else:
    print("一部、解析エラー")
    err_df

全てトピックを取得


In [31]:
sorted_dfs_copy[90]["文章"].tolist()

['かなり長いこと染めていなかったので',
 'カラーだったので',
 '黒傾向になったので、',
 '矯正とカラー同時にしたからでしょうか?でも、日中はうるうるのサラサラなので',
 '色や仕上がりのイメージを事前に共有していなかった為、',
 'インナーカラーなら隠せるから',
 '初めてのカラーだったので',
 'かなり緑が強かったので',
 '最近カラーばかりでヘッドスパが疎かになってたので、',
 'もう根本が黒くなってきたので',
 'はじめてのカラーだったので',
 '顔の印象も少し明るくなったので',
 '初めての利用で初めてのインナーカラーだったので、',
 'カラー剤などが染みてないかどうかの確認も何度もしていただいたので、',
 'カラーは、全体的な明るさをなくしてもらえたので',
 '元々かなり明るい髪色だったため、',
 'カラー前に保護するためのスプレーをかけてもらえたので',
 '今回は明るめのインナーカラーを入れたかったので、',
 'ブラウンも少し入れてるから',
 '普段あまり使わない色だったので',
 'のがことのないカラーを提案していただけたので',
 '初カラーだったので',
 'ハイライトを入れたので',
 '全体が鏡でしたので',
 '半年ぶりにカラーしたので',
 'ハイライトを入れたので',
 'カラーはこちらのお店では初めてだったので、',
 'ブリーチもカラーもしたので',
 '特にカラーは悩んでいたので',
 'それに何回も染め直ししていたようなので、',
 'カラーの希望も特になかったので',
 '初めての美容室でのカラーだったので',
 'ブルーブラックのような色を一度やってみたかったので',
 '明るすぎると職場がNGなので',
 '以前より黒くしたので、',
 '初めてのアッシュカラーなので、',
 '思ってたよりシルバーパープルだったので',
 '人生初カラーだったので、',
 '初めてのカラーだったので、',
 '縮毛矯正と暗染の履歴があったため、',
 'ピンクブラウンを想像していたので',
 '白髪染めで暗くなったカラーを明るくしたいので',
 '注文した色と全く違う色にされたこともあったため',
 '染まって無いところがチラホラと‥初めて来店の時はとてもキレイに染めて頂いただけに',
 '久々

In [32]:
topics_list[90].get_topic_info()["Representative_Docs"]

0                                      [前回 青 身 強かっ]
1    [今回 カラー, インナー カラー し もらっ の, 今回 明るめ インナー カラー 入れ]
2              [黒 染め し い ため, 黒 染め し もらい, 今回 黒 染め し]
3                                   [カラー, カラー, カラー]
Name: Representative_Docs, dtype: object

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [34]:
sorted_dfs_copy[90]

Unnamed: 0,文章,クラスタ
153,かなり長いこと染めていなかったので,6
189,カラーだったので,6
220,黒傾向になったので、,6
261,矯正とカラー同時にしたからでしょうか?でも、日中はうるうるのサラサラなので,6
265,色や仕上がりのイメージを事前に共有していなかった為、,6
...,...,...
5840,薬剤が染みたのですが、,6
6247,カラーのムラは覚悟していましたし、,6
6263,派手過ぎないようにカラーしてもらいました,6
6305,今回は少し明るいカラーにしました,6


In [35]:
sorted_dfs_copy[90]["文章"].tolist()

['かなり長いこと染めていなかったので',
 'カラーだったので',
 '黒傾向になったので、',
 '矯正とカラー同時にしたからでしょうか?でも、日中はうるうるのサラサラなので',
 '色や仕上がりのイメージを事前に共有していなかった為、',
 'インナーカラーなら隠せるから',
 '初めてのカラーだったので',
 'かなり緑が強かったので',
 '最近カラーばかりでヘッドスパが疎かになってたので、',
 'もう根本が黒くなってきたので',
 'はじめてのカラーだったので',
 '顔の印象も少し明るくなったので',
 '初めての利用で初めてのインナーカラーだったので、',
 'カラー剤などが染みてないかどうかの確認も何度もしていただいたので、',
 'カラーは、全体的な明るさをなくしてもらえたので',
 '元々かなり明るい髪色だったため、',
 'カラー前に保護するためのスプレーをかけてもらえたので',
 '今回は明るめのインナーカラーを入れたかったので、',
 'ブラウンも少し入れてるから',
 '普段あまり使わない色だったので',
 'のがことのないカラーを提案していただけたので',
 '初カラーだったので',
 'ハイライトを入れたので',
 '全体が鏡でしたので',
 '半年ぶりにカラーしたので',
 'ハイライトを入れたので',
 'カラーはこちらのお店では初めてだったので、',
 'ブリーチもカラーもしたので',
 '特にカラーは悩んでいたので',
 'それに何回も染め直ししていたようなので、',
 'カラーの希望も特になかったので',
 '初めての美容室でのカラーだったので',
 'ブルーブラックのような色を一度やってみたかったので',
 '明るすぎると職場がNGなので',
 '以前より黒くしたので、',
 '初めてのアッシュカラーなので、',
 '思ってたよりシルバーパープルだったので',
 '人生初カラーだったので、',
 '初めてのカラーだったので、',
 '縮毛矯正と暗染の履歴があったため、',
 'ピンクブラウンを想像していたので',
 '白髪染めで暗くなったカラーを明るくしたいので',
 '注文した色と全く違う色にされたこともあったため',
 '染まって無いところがチラホラと‥初めて来店の時はとてもキレイに染めて頂いただけに',
 '久々

In [36]:
# sorted_dfs_copyの全てのdfに列名"トピック"を追加
for df in sorted_dfs_copy:
    if 'トピック' not in df.columns:
        df['トピック'] = 'トピック名'

In [37]:
# トピックと各dfの文章のベクトルを取得して、近いトピックを割り当てる
def assign_topics(df, topics_info):
    # 文章とトピックをベクトル化
    vectorizer = TfidfVectorizer()
    sentences = df["文章"].tolist()
    sentence_vectors = vectorizer.fit_transform(sentences)

    # 元のデータ
    representative_docs = pd.Series(topics_info.get_topic_info()["Representative_Docs"], name='Representative_Docs')

    # 指定されたフォーマットに変換
    topics = [', '.join(topic) for topic in representative_docs]
    topic_vectors = vectorizer.transform(topics)

    # 余弦類似度を計算
    similarity_matrix = cosine_similarity(sentence_vectors, topic_vectors)

    # 各文章を最も近いトピックに割り当て
    assigned_topics = [topics[i] for i in similarity_matrix.argmax(axis=1)]

    # データフレームに"トピック"列を追加し、assigned_topicsを代入
    df["トピック"] = assigned_topics

    return df

In [199]:
sorted_dfs_copy[62]["文章"].tolist()

['大変助かります',
 '助かります',
 '大変助かりました',
 '毎回助かってます',
 '助かっています',
 '助かります',
 'とても助かっています',
 '凄く助かりました',
 '重くて助かりました',
 '本当に助かってます',
 '今回は助かっています',
 '助かります',
 '助かりますし',
 'また、非常に助かります',
 '助かります',
 '疎い私はとても助かっていますし、',
 'とても助かりました',
 '助かりました',
 '助かります笑サラサラにになったし、',
 '助かってますますmm',
 '助かってます',
 '今回は助かりましたまた宜しくお願いします',
 '大変助かりました',
 '大変助かりました',
 'とても助かります',
 '大変助かりました',
 '助かります',
 '助かります',
 'とても助かりました',
 '助かっています',
 '助かりました',
 'とても助かってます',
 '助かります',
 '助かります',
 '凄く助かっています',
 'とても助かりました',
 '助かっております',
 'とても助かっています',
 'とても助かりました',
 '助かりました',
 '助かっています',
 '助かります',
 'とても助かりましたありがとうございます',
 '助かります',
 '助かりました',
 '助かってます',
 'とても助かりました',
 'やく助かりました',
 'とても助かっています',
 '助かります',
 'とても助かりました',
 'とても助かります',
 '助かりましたまたお願いしたいです',
 '助かりました',
 'とても助かっています',
 'とても助かりました',
 '助かります',
 '本当に助かります',
 '助かっています',
 'とても助かりました',
 '助かっています',
 '助かります',
 '助かります',
 'ありがとうございます私には助かります',
 'とても助かりました',
 '助かっています',
 '助かっています',
 '助かったなと思いました',
 '助かります',
 '助かっています',
 '助かってます',
 'とても助かりました',
 '助かりました',
 '助かりました',
 'とても助かりました']

In [203]:
manual_topics_list = [
    "ツヤツヤになった",
    "駅から近い",
    "札幌での美容サービス体験",
    "外が寒い",
    "サラサラになった",
    "信用している",
    "外・心が暖かい",
    "外が暑い",
    "リラックスできた",
    "明るい、丁寧な接客",
    "ありがたい",
    "落ち着いた施術・放置された",
    "よろしくお願いします",
    "香りや雰囲気がいい",
    "様々な動画配信サービスが見れた",
    "口コミが良かった",
    "切られすぎた",
    "立地がいい",
    "今後も通う",
    "丁寧な説明/実践的なアドバイス",
    "短期的なリピート",
    "話すのが苦手/名前が分からない",
    "継続利用",
    "残念",
    "適度な距離感",
    "伸ばしている/伸ばしたい",
    "普段の感謝",
    "接客が丁寧",
    "指名します/指名しなかった",
    "メニューの豊富さ/ドリンクサービス",
    "来店した",
    "パーマをかけたい/かけた",
    "パサつき・色抜けうねりが気になる",
    "楽しかった/楽しみ",
    "緊張した",
    "感謝",
    "楽/スムーズ",
    "あっという間",
    "オーダー・希望通り",
    "金額が高い・安い",
    "感動した",
    "仕上がりに満足・不満足",
    "色がいい/色落ちが綺麗",
    "安定した技術・接客",
    "不安・心配",
    "ネット予約ができる/当日予約ができる",
    "嬉しい",
    "お店に関する内容",
    "予期しないクオリティ",
    "安心の施術",
    "美容室・スタイリストへの不安/あまり美容室に行かない",
    "アイロンやヘアケア方法に関する学び",
    "縮毛矯正の予定/縮毛矯正への満足感",
    "肌が弱い",
    "失敗経験",
    "特別なイベント",
    "施術中の経験や安心感",
    "挑戦/前向き",
    "担当者が違う/担当者の異動",
    "行きます",
    "施術やケアに関する感謝・満足",
    "助かっています",
    "予約がスムーズ",
    "カットの技術が高い",
    "カラーリングがきれい",
    "新しいヘアスタイル",
    "カット後の扱いやすさ",
    "仕上がりに満足",
    "スタイリングが簡単になった",
    "セットがしやすい",
    "頑固なクセが改善された",
    "シャンプーが気持ちいい",
    "トリートメントが良かった",
    "笑顔で対応してくれた",
    "スムーズなカウンセリング",
    "無理なく会話ができた",
    "早く仕上がった",
    "仕上がりが想像以上",
    "髪の傷みが改善した",
    "再来店する予定",
    "施術の後のアフターケア",
    "ヘアケア商品の紹介",
    "優れたカットライン",
    "サロンの雰囲気が良かった",
    "美容師さんが親切だった",
    "丁寧なカット",
    "選べるシャンプーの種類",
    "清潔感のあるサロン",
    "親身な対応",
    "アクセスが良い",
    "静かな空間",
    "丁寧な仕上がり",
    "希望通りのヘアスタイル",
    "価格が納得",
    "ヘアアレンジの提案",
    "長時間の施術でも快適",
    "髪が元気を取り戻した",
    "落ち着いたインテリア",
    "心地よい音楽",
    "リピート決定"
]


In [204]:
len(manual_topics_list)

100

In [205]:
for i, df in enumerate(sorted_dfs_copy):
    df["トピック"] = manual_topics_list[i]

In [None]:
# print(manual_topics_list[len(manual_topics_list)])
# # sorted_dfs_copy[len(manual_topics_list)-1]["文章"].tolist()

IndexError: list index out of range

In [206]:
# 必要


# # 取得したトピックと各dfの文章のベクトルを取得して、近いトピックを割り当てる
# for i in range(len(sorted_dfs_copy)):
#     sorted_dfs_copy[i] = assign_topics(sorted_dfs_copy[i], topics_list[i])

# # 結果の確認（オプション）
# for i in range(len(sorted_dfs_copy)):
#     print(f"DataFrame {i}:")
#     print(sorted_dfs_copy[i][["文章", "トピック"]].head())
#     print("\n")

In [207]:
sorted_dfs_copy

[                            文章  クラスタ      トピック
 775         ツルツルツヤツヤが持続しているので、    62  ツヤツヤになった
 868                 ツヤツヤになるので、    62  ツヤツヤになった
 987        ツヤのある綺麗な状態がキープできるので    62  ツヤツヤになった
 996             ツヤツヤな仕上がりになるので    62  ツヤツヤになった
 1102  大満足ですトリートメントでもツヤツヤになったので    62  ツヤツヤになった
 2517            しっかり伸びてツヤも出たので    62  ツヤツヤになった
 2522              見た目にもツヤが出たので    62  ツヤツヤになった
 3736                    ツルツルです    62  ツヤツヤになった
 4122        ツルツルツヤツヤが持続しているので、    62  ツヤツヤになった
 4449  大満足ですトリートメントでもツヤツヤになったので    62  ツヤツヤになった
 5864            しっかり伸びてツヤも出たので    62  ツヤツヤになった,
                               文章  クラスタ   トピック
 1441                     駅から近いので    48  駅から近い
 1544             仕上がりもよきでした駅近なので    48  駅から近い
 1761                地下歩出口からも近いので    48  駅から近い
 1934           地下歩行空間の出口からはすぐなので    48  駅から近い
 2259                     駅の近くなので    48  駅から近い
 2289                     札駅に近いので    48  駅から近い
 2359  お店の雰囲気もすごくおしゃれで駅からも徒歩で近いので    48  駅から近い
 2412                    駅からも近いので    48  駅から近い


In [208]:
# すべてのDataFrameを連結
combined_df = pd.concat(sorted_dfs_copy, ignore_index=False)
# index番号でソート
df_results = combined_df.sort_index()

In [209]:
# dfを前半と後半で分ける
total_rows = len(df_results)

# データフレームを半分に分割
df_first_half = df_results.iloc[:total_rows//2]
df_second_half = df_results.iloc[total_rows//2:]

df_first_half = df_first_half.rename(columns={"文章": "理由", "クラスタ": "理由_クラスタ", "トピック": "理由_トピック"})
df_second_half = df_second_half.rename(columns={"文章": "結果", "クラスタ": "結果_クラスタ", "トピック": "結果_トピック"})

# 後半のデータセットのインデックスを振り直す
df_second_half = df_second_half.reset_index(drop=True)

# 結果を確認
print("前半のデータセット:")
print(df_first_half)
print("\n後半のデータセット（インデックス振り直し後）:")
print(df_second_half)

前半のデータセット:
                          理由  理由_クラスタ       理由_トピック
0     今回もカラーも今どきな雰囲気に仕上がったので       64         親身な対応
1          シャンプーを使っていただいたので、       16    無理なく会話ができた
2           とても大事な友達の式だったので、        4  伸ばしている/伸ばしたい
3         とって前髪は顔のパーツの一部なので、       20   美容師さんが親切だった
4                 今後も通いたいので、       76         今後も通う
...                      ...      ...           ...
3343      トリートメントもとても良かったので、        1    髪が元気を取り戻した
3344       色味のアドバイスをいただけるので、       47     安定した技術・接客
3345       のは、担当スタイリストさんなので、       92     仕上がりが想像以上
3346        つけっぱなしの上でのカラーなので        6         静かな空間
3347             またお願いしたいので、       15        リピート決定

[3348 rows x 3 columns]

後半のデータセット（インデックス振り直し後）:
                                      結果  結果_クラスタ        結果_トピック
0                    良かったですまたよろしくお願いします♪       81         丁寧なカット
1                         来月またよろしくお願いします       46       短期的なリピート
2     アレンジが不慣れなら他のスタッフの方にもサポートいただきたかったです       68           行きます
3                    もっと研究していただきたいと思いました       27  担当者が

In [210]:
# df_first_halfとdf_second_halfを横方向に連結
concatenated_df_horizontal = pd.concat([df_first_half, df_second_half], axis=1)

# 結果を表示
print(concatenated_df_horizontal)

                          理由  理由_クラスタ       理由_トピック  \
0     今回もカラーも今どきな雰囲気に仕上がったので       64         親身な対応   
1          シャンプーを使っていただいたので、       16    無理なく会話ができた   
2           とても大事な友達の式だったので、        4  伸ばしている/伸ばしたい   
3         とって前髪は顔のパーツの一部なので、       20   美容師さんが親切だった   
4                 今後も通いたいので、       76         今後も通う   
...                      ...      ...           ...   
3343      トリートメントもとても良かったので、        1    髪が元気を取り戻した   
3344       色味のアドバイスをいただけるので、       47     安定した技術・接客   
3345       のは、担当スタイリストさんなので、       92     仕上がりが想像以上   
3346        つけっぱなしの上でのカラーなので        6         静かな空間   
3347             またお願いしたいので、       15        リピート決定   

                                      結果  結果_クラスタ        結果_トピック  
0                    良かったですまたよろしくお願いします♪       81         丁寧なカット  
1                         来月またよろしくお願いします       46       短期的なリピート  
2     アレンジが不慣れなら他のスタッフの方にもサポートいただきたかったです       68           行きます  
3                    もっと研究していただきたいと思いました       27  担当者が違う/担当者の異動  
4   

In [211]:
concatenated_df_horizontal

Unnamed: 0,理由,理由_クラスタ,理由_トピック,結果,結果_クラスタ,結果_トピック
0,今回もカラーも今どきな雰囲気に仕上がったので,64,親身な対応,良かったですまたよろしくお願いします♪,81,丁寧なカット
1,シャンプーを使っていただいたので、,16,無理なく会話ができた,来月またよろしくお願いします,46,短期的なリピート
2,とても大事な友達の式だったので、,4,伸ばしている/伸ばしたい,アレンジが不慣れなら他のスタッフの方にもサポートいただきたかったです,68,行きます
3,とって前髪は顔のパーツの一部なので、,20,美容師さんが親切だった,もっと研究していただきたいと思いました,27,担当者が違う/担当者の異動
4,今後も通いたいので、,76,今後も通う,改善していただければいいなと思います,1,髪が元気を取り戻した
...,...,...,...,...,...,...
3343,トリートメントもとても良かったので、,1,髪が元気を取り戻した,次回もよろしくお願いします,69,笑顔で対応してくれた
3344,色味のアドバイスをいただけるので、,47,安定した技術・接客,大変有り難いです,42,セットがしやすい
3345,のは、担当スタイリストさんなので、,92,仕上がりが想像以上,改めて感謝したいなと思いました,67,楽/スムーズ
3346,つけっぱなしの上でのカラーなので,6,静かな空間,マスクを仕方ないのかもしれませんが,42,セットがしやすい


In [212]:
# import fugashi
# from transformers import BertTokenizer, BertForMaskedLM
# import torch
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer

# # BERTモデルとトークナイザーの準備
# tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v2')
# model = BertForMaskedLM.from_pretrained('cl-tohoku/bert-base-japanese-v2')

# # fugashiのトークナイザーを準備
# tagger = fugashi.Tagger()

# # 入力文のリスト
# input_sentences = topics_list[80].get_topic_info()["Representative_Docs"][0]

# # 学習データ
# training_data = marge_rev

# def tokenize_ja(text):
#     return [word.surface for word in tagger(text)]

# def complete_sentence(input_sentence):
#     tokens = tokenize_ja(input_sentence)
    
#     masked_tokens = []
#     for i, token in enumerate(tokens):
#         if i == 0 or i == len(tokens) - 1 or np.random.rand() < 0.2:  # マスクの確率を下げる
#             masked_tokens.append(token)
#         else:
#             masked_tokens.append('[MASK]')
    
#     input_ids = tokenizer.convert_tokens_to_ids(masked_tokens)
#     input_ids = torch.tensor([input_ids])
    
#     with torch.no_grad():
#         outputs = model(input_ids)
#         predictions = outputs[0]
    
#     predicted_tokens = []
#     for i, token in enumerate(masked_tokens):
#         if token == '[MASK]':
#             predicted_ids = torch.argsort(predictions[0, i], descending=True)[:5]  # 上位5候補を取得
#             predicted_tokens_candidates = tokenizer.convert_ids_to_tokens(predicted_ids)
#             # 文脈に合う最適な候補を選択（ここでは簡単のため最初の候補を選択）
#             predicted_tokens.append(predicted_tokens_candidates[0])
#         else:
#             predicted_tokens.append(token)
    
#     completed_sentence = ''.join(predicted_tokens).replace('##', '')
#     return completed_sentence

# # 各入力文に対して文章を生成
# generated_sentences = [complete_sentence(sentence) for sentence in input_sentences]

# # 生成された文章と学習データの類似度を計算
# vectorizer = TfidfVectorizer(tokenizer=tokenize_ja)
# tfidf_matrix = vectorizer.fit_transform(training_data + generated_sentences)
# cosine_similarities = cosine_similarity(tfidf_matrix[-len(generated_sentences):], tfidf_matrix[:len(training_data)])

# # 最も類似度の高い文章を選択
# best_match_index = np.argmax(cosine_similarities.max(axis=1))
# best_match = generated_sentences[best_match_index]

# print("生成された文章:")
# for sentence in generated_sentences:
#     print(sentence)

# print("\n最も内容が近い生成文:", best_match)


Unnamed: 0,文章,クラスタ
3,旦那さんにみせることができませんでした風呂上りに自然と外ハネになるので,13
7,仕事上全頭は奇抜にできないので、,13
21,塗ってる人は担当者じゃないので、,13
25,混んでいてみなさん担当されていて誰にも声をかけられずだったので、,13
34,従業員の顔写真がないので,13
45,顔剃りの時に、担当の人でない人が急に来て始めたので,13
55,あまりアシスタントさんが入ることがなかったのですが、,13


In [None]:
# topics = get_topics_from_texts(sorted_dfs_copy[30]["文章"].tolist())
# for topic, words in topics.items():
#     print(f"トピック {topic}: {words}")

Unnamed: 0,文章,クラスタ
0,今回もカラーも今どきな雰囲気に仕上がったので,7
1,シャンプーを使っていただいたので、,31
2,とても大事な友達の式だったので、,17
3,とって前髪は顔のパーツの一部なので、,28
4,今後も通いたいので、,15
...,...,...
6691,次回もよろしくお願いします,94
6692,大変有り難いです,69
6693,改めて感謝したいなと思いました,49
6694,マスクを仕方ないのかもしれませんが,69


In [None]:
# df_results_list = []
# batch_size = 10  # バッチサイズを設定

# for i in range(0, len(sorted_dfs), batch_size):
#     batch = sorted_dfs[i:i+batch_size]
    
#     for j, cl_df in enumerate(batch):
#         # combined_text = connect_text_list(cl_df)  # 一文に繋げる
#         # df_results_keywords = text_extract_keywords(combined_text, int(cl_df['クラスタ'].iloc[0]), kw_model, top_n=len(cl_df))  # クラスタリング
#         # df_results_list.append(df_results_keywords)
        
#         keyword = df_results_keywords["キーワード1"].iloc[0]
#         sorted_dfs[i+j] = replace_text(sorted_dfs[i+j], keyword)
    
#     # バッチ処理後にメモリをクリア
#     torch.mps.empty_cache()
    
#     if i % 100 == 0:
#         print(f"Processed {i} items")

# print(df_results_list)

In [None]:
# df_results_list[192]

In [149]:
# # 要素数が一番多いクラスタ
# max_size = max([df.shape[0], i] for i, df in enumerate(cluster_dfs.values()))
# max_size

In [150]:
# cluster_dfs[max_size[1]]

In [151]:
# # すべてのDataFrameを連結
# combined_df = pd.concat(sorted_dfs, ignore_index=False)

# # index番号でソート
# df_results = combined_df.sort_index()

In [None]:
# # データフレームの行数を取得
# total_rows = len(df_results)

# # データフレームを半分に分割
# df_first_half = df_results.iloc[:total_rows//2]
# df_second_half = df_results.iloc[total_rows//2:]

# df_first_half = df_first_half.rename(columns={"文章": "理由", "クラスタ": "理由_クラスタ"})
# df_second_half = df_second_half.rename(columns={"文章": "結果", "クラスタ": "結果_クラスタ"})

# # 後半のデータセットのインデックスを振り直す
# df_second_half = df_second_half.reset_index(drop=True)

# # 結果を確認
# print("前半のデータセット:")
# print(df_first_half)
# print("\n後半のデータセット（インデックス振り直し後）:")
# print(df_second_half)

In [None]:
# # df_first_halfとdf_second_halfを横方向に連結
# concatenated_df_horizontal = pd.concat([df_first_half, df_second_half], axis=1)

# # 結果を表示
# print(concatenated_df_horizontal)

In [None]:
# concatenated_df_horizontal

## networrkxで可視化 ##

In [155]:
# !pip install networkx
# !pip install japanize_matplotlib

In [156]:
# brew install graphviz
# !pip install --global-option=build_ext --global-option="-I$(brew --prefix graphviz)/include/" --global-option="-L$(brew --prefix graphviz)/lib/" pygraphviz

In [157]:
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib as mpl
# 日本語フォントの設定
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Hiragino Sans', 'Yu Gothic', 'Meiryo', 'Takao', 'IPAexGothic', 'IPAPGothic', 'VL PGothic', 'Noto Sans CJK JP']
import japanize_matplotlib
import pygraphviz
from networkx.drawing.nx_agraph import graphviz_layout

In [None]:
# # グラフの作成
# G = nx.Graph()

# # ノードの追加
# for index, row in concatenated_df_horizontal.iterrows():
#     G.add_node(row['理由'], type='理由')
#     G.add_node(row['結果'], type='結果')

# # エッジの追加（インデックス番号が同じもの同士）
# for index, row in concatenated_df_horizontal.iterrows():
#     G.add_edge(row['理由'], row['結果'], type='index')

# # エッジの追加（クラスタ番号が同じもの同士）
# 理由クラスタグループ = concatenated_df_horizontal.groupby('理由_クラスタ')
# 結果クラスタグループ = concatenated_df_horizontal.groupby('結果_クラスタ')

# for _, group in 理由クラスタグループ:
#     nodes = group['理由'].tolist()
#     for i in range(len(nodes)):
#         for j in range(i + 1, len(nodes)):
#             G.add_edge(nodes[i], nodes[j], type='理由クラスタ')

# for _, group in 結果クラスタグループ:
#     nodes = group['結果'].tolist()
#     for i in range(len(nodes)):
#         for j in range(i + 1, len(nodes)):
#             G.add_edge(nodes[i], nodes[j], type='結果クラスタ')

# # グラフの描画
# plt.figure(figsize=(20, 16))

# # graphviz_layoutを使用してノードの配置を最適化
# pos = graphviz_layout(G, prog='neato')

# # spring_layoutを使用して更に最適化
# pos = nx.spring_layout(G, pos=pos, iterations=50)

# # ノードの描画
# nx.draw_networkx_nodes(G, pos, 
#                        node_color=['skyblue' if G.nodes[node]['type'] == '理由' else 'lightgreen' for node in G.nodes],
#                        node_size=300)

# # エッジの描画
# edge_colors = ['red' if G[u][v]['type'] == 'index' else 'blue' if G[u][v]['type'] == '理由クラスタ' else 'green' for u, v in G.edges()]
# nx.draw_networkx_edges(G, pos, edge_color=edge_colors, alpha=0.3)

# # ラベルの描画
# nx.draw_networkx_labels(G, pos, font_size=8)

# plt.title("Network Visualization")
# plt.axis('off')
# plt.tight_layout()
# plt.show()

## pyvisで可視化 ##

In [159]:
# !pip install pyvis

In [213]:
from pyvis.network import Network

In [214]:
len(concatenated_df_horizontal)

3348

In [218]:
net = Network(height="1200px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='remote', select_menu=True)

In [219]:
net.set_options('''
var options = {
    "interaction": {
        "hover": true
    },
    "manipulation": {
        "enabled": true
    }
}
''')

net.set_edge_smooth('dynamic')

net.add_js_callback("selectNode", '''
function(params) {
    var selectedNodeId = params.nodes[0];
    var edges = network.getConnectedEdges(selectedNodeId);
    edges.forEach(function(edgeId) {
        network.updateEdge(edgeId, {color: {color: '#FF0000'}});
    });
    network.redraw();
}
''')

net.add_js_callback("deselectNode", '''
function(params) {
    network.edges.forEach(function(edge) {
        network.updateEdge(edge.id, {color: {color: edge.color.color}});
    });
    network.redraw();
}
''')


AttributeError: 'dict' object has no attribute 'edges'

In [221]:
# PyvisのNetworkオブジェクトを作成
net = Network(height="1200px", width="100%", bgcolor="#222222", font_color="white", notebook=True, cdn_resources='remote')
# ノードの追加
for index, row in concatenated_df_horizontal.iterrows():
    net.add_node(row['理由_トピック'], label=row['理由_トピック'][:20] + '...', title=row['理由_トピック'], color='#97C2FC')
    net.add_node(row['結果_トピック'], label=row['結果_トピック'][:20] + '...', title=row['結果_トピック'], color='#FB7E81')

# エッジの追加（インデックス番号が同じもの同士）
for index, row in concatenated_df_horizontal.iterrows():
    net.add_edge(row['理由_トピック'], row['結果_トピック'], color='#FFFF00')

# エッジの追加（クラスタ番号が同じもの同士）
理由クラスタグループ = concatenated_df_horizontal.groupby('理由_クラスタ')
結果クラスタグループ = concatenated_df_horizontal.groupby('結果_クラスタ')

for _, group in 理由クラスタグループ:
    nodes = group['理由_トピック'].tolist()
    for i in range(len(nodes)):
        for j in range(i + 1, len(nodes)):
            net.add_edge(nodes[i], nodes[j], color='#00FFFF')

for _, group in 結果クラスタグループ:
    nodes = group['結果_トピック'].tolist()
    for i in range(len(nodes)):
        for j in range(i + 1, len(nodes)):
            net.add_edge(nodes[i], nodes[j], color='#FF00FF')

# グラフの設定
net.toggle_physics(True)
net.show_buttons(filter_=['physics'])

# HTMLファイルとして保存
net.show('network_graph_2.html')

network_graph_2.html


# テスト実行ゾーン #

In [None]:
import unicodedata
from pyknp import Juman

def preprocess_text(text):
    # 全角文字を半角に変換し、テキストを正規化
    normalized_text = unicodedata.normalize('NFKC', text)
    
    # Juman++を使用して形態素解析
    jumanpp = Juman()
    result = jumanpp.analysis(normalized_text)
    
    # 形態素に分割されたテキストを作成
    morphemes = [mrph.midasi for mrph in result.mrph_list()]
    segmented_text = ' '.join(morphemes)
    
    return segmented_text

# テスト
input_text = "ホットペッパーの口コミが良い雰囲気だった"
processed_text = preprocess_text(input_text)
print(processed_text)


In [None]:
from transformers import BertJapaneseTokenizer, BertModel
import torch

def prepare_bert_model():
    # 正しいモデル名を指定
    model_name = "tohoku-nlp/bert-base-japanese-char-v2"

    # トークナイザーの準備
    tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)

    # モデルの準備
    model = BertModel.from_pretrained(model_name)

    # GPUが利用可能な場合はGPUを使用
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # 評価モードに設定
    model.eval()

    return tokenizer, model, device

# モデルの準備
tokenizer, model, device = prepare_bert_model()
print("BERTモデルの準備が完了しました。")


In [None]:
from transformers import BertJapaneseTokenizer

def tokenize_and_encode(text, tokenizer, max_length=128):
    # テキストをトークン化し、特殊トークンを追加
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    # トークンIDとアテンションマスクを取得
    input_ids = encoded['input_ids'].squeeze(0)
    attention_mask = encoded['attention_mask'].squeeze(0)
    
    # トークン化されたテキストを取得（デバッグ用）
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    
    return input_ids, attention_mask, tokens

# トークナイザーの準備
tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char-v2")

# テキストの例
text = "ホットペッパーの口コミが良い雰囲気だった"

# トークン化とエンコーディングを実行
input_ids, attention_mask, tokens = tokenize_and_encode(text, tokenizer)

# 結果の表示
print("トークン化されたテキスト:")
print(" ".join(tokens))
print("\nInput IDs:", input_ids)
print("\nAttention Mask:", attention_mask)


In [None]:
import torch
from transformers import BertJapaneseTokenizer, BertModel

def get_contextualized_embeddings(text, model, tokenizer, device):
    # テキストをトークン化
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # モデルに入力を渡し、隠れ状態を取得
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # 最後の層の隠れ状態を取得
    last_hidden_states = outputs.last_hidden_state

    # トークンと対応する埋め込みを取得
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    embeddings = last_hidden_states[0]

    return tokens, embeddings

# モデルとトークナイザーの準備
model_name = "cl-tohoku/bert-base-japanese-char-v2"
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# テキストの例
text = "ホットペッパーの口コミが良い雰囲気だった"

# 文脈化された表現を取得
tokens, embeddings = get_contextualized_embeddings(text, model, tokenizer, device)

# 結果の表示
print("トークン:")
print(tokens)
print("\n最初の3トークンの埋め込み:")
for token, embedding in zip(tokens[:3], embeddings[:3]):
    print(f"{token}: {embedding[:5]}...")  # 最初の5次元のみ表示


In [212]:
from transformers import BertJapaneseTokenizer, BertModel
import torch
import numpy as np
from scipy.special import softmax

def dependency_parse(text, model, tokenizer, device):
    # テキストをトークン化
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # BERTモデルで文脈化された表現を取得
    with torch.no_grad():
        outputs = model(**inputs)
    
    last_hidden_states = outputs.last_hidden_state[0]
    
    # 係り受け行列の計算
    scores = torch.matmul(last_hidden_states, last_hidden_states.transpose(0, 1))
    scores = scores.cpu().numpy()

    # マスクの作成（自身より後ろの単語にのみ係るようにする）
    mask = np.tril(np.ones_like(scores), k=-1).T
    scores = scores * mask + -1e9 * (1 - mask)

    # スコアを確率に変換
    probs = softmax(scores, axis=1)

    # 最も確率の高い係り先を選択
    heads = np.argmax(probs, axis=1)

    # トークンと係り受け関係を取得
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    dependencies = []
    for i, head in enumerate(heads):
        if i == 0 or tokens[i].startswith('##'):  # [CLS]トークンやサブワードは無視
            continue
        if head == 0:  # rootの場合
            dependencies.append((tokens[i], 'ROOT'))
        else:
            dependencies.append((tokens[i], tokens[head]))

    return dependencies

In [None]:
from transformers import BertJapaneseTokenizer, BertModel
import torch
import numpy as np
from scipy.special import softmax

def dependency_parse(text, model, tokenizer, device):
    # テキストをトークン化
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # BERTモデルで文脈化された表現を取得
    with torch.no_grad():
        outputs = model(**inputs)
    
    last_hidden_states = outputs.last_hidden_state[0]
    
    # 係り受け行列の計算
    scores = torch.matmul(last_hidden_states, last_hidden_states.transpose(0, 1))
    scores = scores.cpu().numpy()

    # マスクの作成（自身より後ろの単語にのみ係るようにする）
    mask = np.tril(np.ones_like(scores), k=-1).T
    scores = scores * mask + -1e9 * (1 - mask)

    # スコアを確率に変換
    probs = softmax(scores, axis=1)

    # 最も確率の高い係り先を選択
    heads = np.argmax(probs, axis=1)

    # トークンと係り受け関係を取得
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    dependencies = []
    for i, head in enumerate(heads):
        if i == 0 or tokens[i].startswith('##'):  # [CLS]トークンやサブワードは無視
            continue
        if head == 0:  # rootの場合
            dependencies.append((tokens[i], 'ROOT'))
        else:
            dependencies.append((tokens[i], tokens[head]))

    return dependencies

# BERTモデルとトークナイザーの準備
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# テキストの例
text = "ホットペッパーの口コミが良い雰囲気だった"

# 構文解析の実行
dependencies = dependency_parse(text, model, tokenizer, device)

# 結果の表示
print("係り受け関係:")
for dep in dependencies:
    print(f"{dep[0]} -> {dep[1]}")

# 特定の文法構造の抽出
for i, dep in enumerate(dependencies):
    if dep[0] == 'が' and i+1 < len(dependencies):
        subject = dependencies[i-1][0]
        predicate = dependencies[i+1][0]
        print(f"\n抽出された「〇〇が〇〇」構造: {subject}が{predicate}")


# 逆説の分析 #

In [None]:
data[100:200]

In [221]:
paradox_temp = data[data["関係"]=="原因・理由"].copy()
paradox_temp.reset_index(drop=True, inplace=True)
paradox_temp_unique = temp.drop_duplicates()
paradox_temp = paradox_temp_unique

In [None]:
paradox_temp[200:300]

In [None]:
import pandas as pd
from textblob import TextBlob

# ポジティブ・ネガティブの分析を行う関数
def analyze_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

# 各行の原因と結果のポジティブ・ネガティブを分析
paradox_temp['原因_ポジティブ度'] = paradox_temp['原因'].apply(analyze_sentiment)
paradox_temp['結果_ポジティブ度'] = paradox_temp['結果'].apply(analyze_sentiment)

# 結果を保存するDataFrameを作成
result_df = pd.DataFrame(columns=['index', 'ポジティブ', 'ネガティブ'])

# 各行の原因と結果のポジティブ・ネガティブを比較し、結果を保存
for index, row in paradox_temp.iterrows():
    if row['原因_ポジティブ度'] > row['結果_ポジティブ度']:
        positive = row['原因']
        negative = row['結果']
    else:
        positive = row['結果']
        negative = row['原因']
    
    # appendの代わりにconcatを使用
    new_row = pd.DataFrame({'index': [index], 'ポジティブ': [positive], 'ネガティブ': [negative]})
    result_df = pd.concat([result_df, new_row], ignore_index=True)

# 結果を表示
print(result_df)