In [None]:
import sys
# Make the project's evaluation code and the TSCTM plugin importable by
# adding their directories to sys.path (machine-specific absolute paths).
sys.path.extend([
    "/home/joe/projects/tm-projects/topic-modeling/evaluation",
    "/home/joe/projects/tm-projects/topic-modeling/plugins/TSCTM/",
])

In [None]:
from eva_data import DataLoader
from eva_evaluation import Trainer

## **LDA**

In [None]:
# Datasets to evaluate with LDA.
dataset_name = ["20NewsGroup"]

for name in dataset_name:
    # Every dataset other than "20NewsGroup" is passed to the Trainer with
    # custom_dataset=True.
    if name == "20NewsGroup":
        dataset, custom = name, False
    else:
        dataset, custom = name, True

    params =  {"num_topics": [20, 30, 50]}
    trainer = Trainer(dataset=dataset,
                    model_name="LDA",
                    params=params,
                    custom_dataset=custom,
                    verbose=True)
    # Use an LDA-specific file prefix: the former "malletLDA_" prefix is the
    # same path the malletLDA experiment saves to, so the two runs collided.
    results = trainer.train(save=f"./extension_results/LDA_{name}")

## **malletLDA**

In [None]:
# Datasets to evaluate with malletLDA, once per random seed.
dataset_name = ["20NewsGroup"]
for i, random_state in enumerate([0]):
    for name in dataset_name:
        # Only "20NewsGroup" is passed with custom_dataset=False.
        dataset = name
        custom = name != "20NewsGroup"

        params = dict(num_topics=[20, 30, 50], random_seed=random_state)
        trainer = Trainer(
            dataset=dataset,
            model_name="malletLDA",
            params=params,
            custom_dataset=custom,
            verbose=True,
            malletLDA_path="",  # update this path
        )
        results = trainer.train(save=f"./extension_results/malletLDA_{name}")

## **BERTopic (MPNET)**

To speed up BERTopic, we can generate the embeddings before passing them to the `Trainer`. This way, the same embeddings do not have to be generated 5 times, which speeds up evaluation quite a bit.

In [None]:
%%capture
from sentence_transformers import SentenceTransformer
# Prepare data
dataset_name = ["20NewsGroup"]
for name in dataset_name:
    # Only "20NewsGroup" is passed with custom_dataset=False.
    dataset = name
    custom = name != "20NewsGroup"

    data_loader = DataLoader(dataset)
    _, timestamps = data_loader.load_docs()
    # Re-join each tokenized document into a single string for the encoder.
    data = [" ".join(words) for words in data_loader.load_octis(custom).get_corpus()]

    # Extract embeddings once, up front, and hand them to the Trainer.
    # Alternative previously tried here: SentenceTransformer("stsb-roberta-base-v2").
    model = SentenceTransformer("all-mpnet-base-v2")
    embeddings = model.encode(data, show_progress_bar=True)

    # A Doc2Vec-based alternative was also sketched here:
    # documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(data)]
    # model = Doc2Vec(vector_size=300, min_count=2)
    # embeddings = model.build_vocab(documents)

    # Other "embedding_model" values used before: "stsb-roberta-base-v2", "doc2vec".
    params = dict(
        embedding_model="all-mpnet-base-v2",
        nr_topics=[20, 30, 50, 75, 100],
        min_topic_size=20,
        diversity=None,
        verbose=True,
    )

    trainer = Trainer(
        dataset=dataset,
        model_name="BERTopic",
        params=params,
        bt_embeddings=embeddings,
        custom_dataset=custom,
        verbose=True,
    )
    results = trainer.train(save=f"./extension_results/BERTopic_{name}_mpnet")

## **CTM**

In [None]:
# Datasets to evaluate with the CTM_CUSTOM model.
dataset_name = ["20NewsGroup", "agnews", "reuters8", "StackOverflow", "yahoo_answer"]
for name in dataset_name:
    # Only "20NewsGroup" is passed with custom_dataset=False.
    dataset = name
    custom = name != "20NewsGroup"

    params = dict(n_components=[20, 30, 50, 75, 100], contextual_size=768)

    trainer = Trainer(
        dataset=dataset,
        model_name="CTM_CUSTOM",
        params=params,
        custom_dataset=custom,
        verbose=True,
    )
    results = trainer.train(save=f"./extension_results/CTM_Results_{dataset}")

## **Top2Vec**
Aside from its Doc2Vec backend, we also want to explore its performance using the `"all-mpnet-base-v2"` SBERT model as that was used in BERTopic. To do so, we make a very slight change to the core code of Top2Vec, namely replacing all instances of `"distiluse-base-multilingual-cased"` with `"all-mpnet-base-v2"`.

In [None]:
import logging
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
import umap
import hdbscan
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.cluster import dbscan
import tempfile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from scipy.special import softmax
from top2vec import Top2Vec

try:
    import hnswlib

    _HAVE_HNSWLIB = True
except ImportError:
    _HAVE_HNSWLIB = False

try:
    import tensorflow as tf
    import tensorflow_hub as hub
    import tensorflow_text

    _HAVE_TENSORFLOW = True
except ImportError:
    _HAVE_TENSORFLOW = False

try:
    from sentence_transformers import SentenceTransformer

    _HAVE_TORCH = True
except ImportError:
    _HAVE_TORCH = False

# Configure the shared 'top2vec' logger used by Top2VecNew below.
logger = logging.getLogger('top2vec')
logger.setLevel(logging.WARNING)
# logging.getLogger returns the same logger object on every call, so adding a
# handler unconditionally would stack one more StreamHandler each time this
# cell is run and every message would be printed multiple times. Only attach
# a handler when the logger does not have one yet.
if not logger.handlers:
    sh = logging.StreamHandler()
    sh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(sh)


def default_tokenizer(doc):
    """Default tokenizer: strip tags from ``doc`` and split it into tokens.

    The text is first passed through ``strip_tags`` and then through
    ``simple_preprocess`` with ``deacc=True``, which also drops words that are
    too long or too short.
    """
    without_tags = strip_tags(doc)
    tokens = simple_preprocess(without_tags, deacc=True)
    return tokens


class Top2VecNew(Top2Vec):
    """
    Top2Vec
    Creates jointly embedded topic, document and word vectors.

    This subclass overrides ``__init__``, ``_check_import_status`` and
    ``_check_model_status`` so that the ``all-mpnet-base-v2`` sentence
    transformer is accepted as an embedding model (in place of
    ``distiluse-base-multilingual-cased``).

    Parameters
    ----------
    embedding_model: string
        This will determine which model is used to generate the document and
        word embeddings. The valid string options are:
            * doc2vec
            * universal-sentence-encoder
            * universal-sentence-encoder-multilingual
            * all-mpnet-base-v2
        For large data sets and data sets with very unique vocabulary doc2vec
        could produce better results. This will train a doc2vec model from
        scratch. This method is language agnostic. However multiple languages
        will not be aligned.
        Using the universal sentence encoder options will be much faster since
        those are pre-trained and efficient models. The universal sentence
        encoder options are suggested for smaller data sets. They are also
        good options for large data sets that are in English or in languages
        covered by the multilingual model. It is also suggested for data sets
        that are multilingual.
        For more information on universal-sentence-encoder visit:
        https://tfhub.dev/google/universal-sentence-encoder/4
        For more information on universal-sentence-encoder-multilingual visit:
        https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
        The all-mpnet-base-v2 option loads the pre-trained sentence
        transformer of that name through ``SentenceTransformer``.
        For more information on all-mpnet-base-v2 visit:
        https://www.sbert.net/docs/pretrained_models.html
    embedding_model_path: string (Optional)
        Pre-trained embedding models will be downloaded automatically by
        default. However they can also be loaded from a file that is in the
        location of embedding_model_path.
        Warning: the model at embedding_model_path must match the
        embedding_model parameter type.
    documents: List of str
        Input corpus, should be a list of strings.
    min_count: int (Optional, default 50)
        Ignores all words with total frequency lower than this. For smaller
        corpora a smaller min_count will be necessary. Note that for the
        pre-trained embedding models a word is kept only when its count is
        strictly greater than min_count.
    speed: string (Optional, default 'learn')
        This parameter is only used when using doc2vec as embedding_model.
        It will determine how fast the model takes to train. The
        fast-learn option is the fastest and will generate the lowest quality
        vectors. The learn option will learn better quality vectors but take
        a longer time to train. The deep-learn option will learn the best
        quality vectors but will take significant time to train. The valid
        string speed options are:

            * fast-learn
            * learn
            * deep-learn

        (A single-epoch ``test-learn`` value is also accepted by the code.)
    use_corpus_file: bool (Optional, default False)
        This parameter is only used when using doc2vec as embedding_model.
        Setting use_corpus_file to True can sometimes provide speedup for
        large datasets when multiple worker threads are available. Documents
        are still passed to the model as a list of str, the model will create
        a temporary corpus file for training.
    document_ids: List of str, int (Optional)
        A unique value per document that will be used for referring to
        documents in search results. If ids are not given to the model, the
        index of each document in the original corpus will become the id.
    keep_documents: bool (Optional, default True)
        If set to False documents will only be used for training and not saved
        as part of the model. This will reduce model size. When using search
        functions only document ids will be returned, not the actual
        documents.
    workers: int (Optional)
        The amount of worker threads to be used in training the model. Larger
        amount will lead to faster training.

    tokenizer: callable (Optional, default None)
        Override the default tokenization method. If None then
        default_tokenizer (gensim.utils.simple_preprocess on the tag-stripped
        text) will be used.
    use_embedding_model_tokenizer: bool (Optional, default False)
        Only used with an embedding model other than doc2vec. If True the raw
        documents are handed to the embedding model for document embedding.
        If False the documents are first tokenized with ``tokenizer`` and
        re-joined with spaces before being embedded. In both cases the
        tokenizer is used to extract the vocabulary for word embedding.
    umap_args: dict (Optional, default None)
        Pass custom arguments to UMAP.
    hdbscan_args: dict (Optional, default None)
        Pass custom arguments to HDBSCAN.

    verbose: bool (Optional, default True)
        Whether to print status data during training.

    Raises
    ------
    ValueError
        If documents or document_ids are malformed, if speed, workers or
        embedding_model is invalid, or if min_count filters out every word.
    ImportError
        If the libraries required by the chosen pre-trained embedding model
        are not installed.
    """

    def __init__(self,
                 documents,
                 min_count=50,
                 embedding_model='doc2vec',
                 embedding_model_path=None,
                 speed='learn',
                 use_corpus_file=False,
                 document_ids=None,
                 keep_documents=True,
                 workers=None,
                 tokenizer=None,
                 use_embedding_model_tokenizer=False,
                 umap_args=None,
                 hdbscan_args=None,
                 verbose=True
                 ):
        # NOTE: the parent __init__ is intentionally not called; this method
        # performs the whole training pipeline itself.

        if verbose:
            logger.setLevel(logging.DEBUG)
            self.verbose = True
        else:
            logger.setLevel(logging.WARNING)
            self.verbose = False

        if tokenizer is None:
            tokenizer = default_tokenizer

        # validate documents
        if not (isinstance(documents, list) or isinstance(documents, np.ndarray)):
            raise ValueError("Documents need to be a list of strings")
        if not all((isinstance(doc, str) or isinstance(doc, np.str_)) for doc in documents):
            raise ValueError("Documents need to be a list of strings")
        if keep_documents:
            self.documents = np.array(documents, dtype="object")
        else:
            self.documents = None

        # validate document ids
        if document_ids is not None:
            if not (isinstance(document_ids, list) or isinstance(document_ids, np.ndarray)):
                raise ValueError("Documents ids need to be a list of str or int")

            if len(documents) != len(document_ids):
                raise ValueError("Document ids need to match number of documents")
            elif len(document_ids) != len(set(document_ids)):
                raise ValueError("Document ids need to be unique")

            if all((isinstance(doc_id, str) or isinstance(doc_id, np.str_)) for doc_id in document_ids):
                self.doc_id_type = np.str_
            elif all((isinstance(doc_id, int) or isinstance(doc_id, np.int_)) for doc_id in document_ids):
                self.doc_id_type = np.int_
            else:
                raise ValueError("Document ids need to be str or int")

            self.document_ids_provided = True
            self.document_ids = np.array(document_ids)
            self.doc_id2index = dict(zip(document_ids, list(range(0, len(document_ids)))))
        else:
            # no ids given: the position of each document becomes its id
            self.document_ids_provided = False
            self.document_ids = np.array(range(0, len(documents)))
            self.doc_id2index = dict(zip(self.document_ids, list(range(0, len(self.document_ids)))))
            self.doc_id_type = np.int_

        # pre-trained models accepted in addition to 'doc2vec'
        acceptable_embedding_models = ["universal-sentence-encoder-multilingual",
                                       "universal-sentence-encoder",
                                       "all-mpnet-base-v2"]

        self.embedding_model_path = embedding_model_path

        if embedding_model == 'doc2vec':

            # validate training inputs
            if speed == "fast-learn":
                hs = 0
                negative = 5
                epochs = 40
            elif speed == "learn":
                hs = 1
                negative = 0
                epochs = 40
            elif speed == "deep-learn":
                hs = 1
                negative = 0
                epochs = 400
            elif speed == "test-learn":
                hs = 0
                negative = 5
                epochs = 1
            else:
                raise ValueError("speed parameter needs to be one of: fast-learn, learn or deep-learn")

            if workers is not None and not isinstance(workers, int):
                raise ValueError("workers needs to be an int")

            doc2vec_args = {"vector_size": 300,
                            "min_count": min_count,
                            "window": 15,
                            "sample": 1e-5,
                            "negative": negative,
                            "hs": hs,
                            "epochs": epochs,
                            "dm": 0,
                            "dbow_words": 1}

            if workers is not None:
                doc2vec_args["workers"] = workers

            logger.info('Pre-processing documents for training')

            temp = None
            if use_corpus_file:
                processed = [' '.join(tokenizer(doc)) for doc in documents]
                lines = "\n".join(processed)
                temp = tempfile.NamedTemporaryFile(mode='w+t')
                temp.write(lines)
                # Doc2Vec opens the file by name, so the buffered text must be
                # pushed to disk first or it may read a truncated/empty corpus.
                temp.flush()
                doc2vec_args["corpus_file"] = temp.name
            else:
                train_corpus = [TaggedDocument(tokenizer(doc), [i]) for i, doc in enumerate(documents)]
                doc2vec_args["documents"] = train_corpus

            logger.info('Creating joint document/word embedding')
            self.embedding_model = 'doc2vec'
            try:
                self.model = Doc2Vec(**doc2vec_args)
            finally:
                # closing the NamedTemporaryFile also deletes it; do this even
                # if training fails so the file is not leaked
                if temp is not None:
                    temp.close()

        elif embedding_model in acceptable_embedding_models:

            self.embed = None
            self.embedding_model = embedding_model

            self._check_import_status()

            logger.info('Pre-processing documents for training')

            # preprocess documents
            tokenized_corpus = [tokenizer(doc) for doc in documents]

            def return_doc(doc):
                # identity function: the corpus is already tokenized
                return doc

            # preprocess vocabulary
            vectorizer = CountVectorizer(tokenizer=return_doc, preprocessor=return_doc)
            doc_word_counts = vectorizer.fit_transform(tokenized_corpus)
            # get_feature_names() was removed from recent scikit-learn
            # releases; prefer get_feature_names_out() and fall back for
            # older installations.
            if hasattr(vectorizer, "get_feature_names_out"):
                words = vectorizer.get_feature_names_out()
            else:
                words = vectorizer.get_feature_names()
            word_counts = np.array(np.sum(doc_word_counts, axis=0).tolist()[0])
            vocab_inds = np.where(word_counts > min_count)[0]

            if len(vocab_inds) == 0:
                raise ValueError(f"A min_count of {min_count} results in "
                                 f"all words being ignored, choose a lower value.")
            self.vocab = [words[ind] for ind in vocab_inds]

            # loads the embedding model into self.embed
            self._check_model_status()

            logger.info('Creating joint document/word embedding')

            # embed words
            self.word_indexes = dict(zip(self.vocab, range(len(self.vocab))))
            self.word_vectors = self._l2_normalize(np.array(self.embed(self.vocab)))

            # embed documents
            if use_embedding_model_tokenizer:
                self.document_vectors = self._embed_documents(documents)
            else:
                train_corpus = [' '.join(tokens) for tokens in tokenized_corpus]
                self.document_vectors = self._embed_documents(train_corpus)

        else:
            raise ValueError(f"{embedding_model} is an invalid embedding model.")

        # create 5D embeddings of documents
        logger.info('Creating lower dimension embedding of documents')

        if umap_args is None:
            umap_args = {'n_neighbors': 15,
                         'n_components': 5,
                         'metric': 'cosine'}

        umap_model = umap.UMAP(**umap_args).fit(self._get_document_vectors(norm=False))

        # find dense areas of document vectors
        logger.info('Finding dense areas of documents')

        if hdbscan_args is None:
            hdbscan_args = {'min_cluster_size': 15,
                            'metric': 'euclidean',
                            'cluster_selection_method': 'eom'}

        cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)

        # calculate topic vectors from dense areas of documents
        logger.info('Finding topics')

        # create topic vectors
        self._create_topic_vectors(cluster.labels_)

        # deduplicate topics
        self._deduplicate_topics()

        # find topic words and scores
        self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)

        # assign documents to topic
        self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors,
                                                                      self._get_document_vectors())

        # calculate topic sizes
        self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)

        # re-order topics
        self._reorder_topics(hierarchy=False)

        # initialize variables for hierarchical topic reduction
        self.topic_vectors_reduced = None
        self.doc_top_reduced = None
        self.doc_dist_reduced = None
        self.topic_sizes_reduced = None
        self.topic_words_reduced = None
        self.topic_word_scores_reduced = None
        self.hierarchy = None

        # initialize document indexing variables
        self.document_index = None
        self.serialized_document_index = None
        self.documents_indexed = False
        self.index_id2doc_id = None
        self.doc_id2index_id = None

        # initialize word indexing variables
        self.word_index = None
        self.serialized_word_index = None
        self.words_indexed = False

    def _check_import_status(self):
        """Raise ImportError when the backend for self.embedding_model is missing.

        Every model other than 'all-mpnet-base-v2' needs the TensorFlow
        packages; 'all-mpnet-base-v2' needs sentence_transformers.
        """
        if self.embedding_model != 'all-mpnet-base-v2':
            if not _HAVE_TENSORFLOW:
                raise ImportError(f"{self.embedding_model} is not available.\n\n"
                                  "Try: pip install top2vec[sentence_encoders]\n\n"
                                  "Alternatively try: pip install tensorflow tensorflow_hub tensorflow_text")
        else:
            if not _HAVE_TORCH:
                raise ImportError(f"{self.embedding_model} is not available.\n\n"
                                  "Try: pip install top2vec[sentence_transformers]\n\n"
                                  "Alternatively try: pip install torch sentence_transformers")

    def _check_model_status(self):
        """Load the pre-trained embedding model into self.embed if not loaded yet.

        The model is taken from self.embedding_model_path when that is set,
        otherwise from the default location for self.embedding_model.
        """
        if self.embed is None:
            # temporarily raise verbosity so the download/load message is
            # emitted even when verbose is False
            if self.verbose is False:
                logger.setLevel(logging.DEBUG)

            if self.embedding_model != "all-mpnet-base-v2":
                if self.embedding_model_path is None:
                    logger.info(f'Downloading {self.embedding_model} model')
                    if self.embedding_model == "universal-sentence-encoder-multilingual":
                        module = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
                    else:
                        module = "https://tfhub.dev/google/universal-sentence-encoder/4"
                else:
                    logger.info(f'Loading {self.embedding_model} model at {self.embedding_model_path}')
                    module = self.embedding_model_path
                self.embed = hub.load(module)

            else:
                if self.embedding_model_path is None:
                    logger.info(f'Downloading {self.embedding_model} model')
                    module = 'all-mpnet-base-v2'
                else:
                    logger.info(f'Loading {self.embedding_model} model at {self.embedding_model_path}')
                    module = self.embedding_model_path
                model = SentenceTransformer(module)
                self.embed = model.encode

        # restore the quiet log level
        if self.verbose is False:
            logger.setLevel(logging.WARNING)

We can then use this `Top2VecNew` class to run our experiments including the `"all-mpnet-base-v2"` model. 

In [None]:
# Run Top2Vec (through the Top2VecNew subclass) with the all-mpnet-base-v2 model.
for name in ["20NewsGroup", "reuters8"]:
    # Only "20NewsGroup" is passed with custom_dataset=False.
    dataset = name
    custom = name != "20NewsGroup"

    params = dict(
        nr_topics=[20, 30, 50, 75, 100],
        embedding_model="all-mpnet-base-v2",
        min_count=20,
        speed="deep-learn",
        hdbscan_args=dict(
            min_cluster_size=5,
            metric='euclidean',
            cluster_selection_method='eom',
        ),
    )

    trainer = Trainer(
        dataset=dataset,
        custom_dataset=custom,
        custom_model=Top2VecNew,
        model_name="Top2Vec",
        params=params,
        verbose=True,
    )
    results = trainer.train(save=f"./extension_results/Top2Vec_{name}_mpnet")

## TSCTM ##

In [None]:
# Datasets to evaluate with the "tsctm" model, once per random seed.
dataset_name = ["20NewsGroup", "agnews", "reuters8", "StackOverflow", "yahoo_answer"]
for i, random_state in enumerate([0]):
    for name in dataset_name:
        # Only "20NewsGroup" is passed with custom_dataset=False.
        dataset = name
        custom = name != "20NewsGroup"

        params = dict(num_topics=[20], random_seed=random_state)
        trainer = Trainer(
            dataset=dataset,
            model_name="tsctm",
            params=params,
            custom_dataset=custom,
            verbose=True,
        )
        results = trainer.train(save=f"./extension_results/tsctm_{name}")

## BERTopic (MPNET) + LDA ##

In [None]:
from sentence_transformers import SentenceTransformer
import gensim
import gensim.corpora as corpora
import numpy as np

def sent_to_words(sentences):
    """Lazily tokenize each sentence.

    Every item of ``sentences`` is converted with ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one token list is
    yielded per input item, in order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

def get_vec_lda(model, corpus, k):
    """
    Build the LDA vector representation (topic weights for all documents).

    :param model: object exposing ``load_document_topics()``, which yields, per
        document, an iterable of entries whose first element is a topic index
        and whose second element is that topic's weight
    :param corpus: the documents; only its length is used, to size the output
    :param k: number of topics (number of output columns)
    :return: array with dimension (n_doc * n_topic); topics not listed for a
        document stay 0
    """
    # A model without load_document_topics() would need a different accessor,
    # e.g. model.get_document_topics(corpus[i]) per document.
    doc_topic_matrix = np.zeros((len(corpus), k))
    for row, topic_entries in enumerate(model.load_document_topics()):
        for entry in topic_entries:
            topic_id, weight = entry[0], entry[1]
            doc_topic_matrix[row, topic_id] = weight
    return doc_topic_matrix

In [None]:
%%capture
# Prepare data
dataset_name = ["reuters8"]
for name in dataset_name:
    # Every dataset other than "20NewsGroup" is passed with custom_dataset=True.
    if name == "20NewsGroup":
        dataset, custom = name, False
    else:
        dataset, custom = name, True

    data_loader = DataLoader(dataset)
    _, timestamps = data_loader.load_docs()
    sentences = data_loader.load_octis(custom)
    # re-join each tokenized document into a single string
    sentences = [" ".join(words) for words in sentences.get_corpus()]

    data_words = list(sent_to_words(sentences))
    # turn tokenized documents into a id <-> term dictionary
    id2word = corpora.Dictionary(data_words)
    # convert tokenized documents into a document-term matrix
    corpus = [id2word.doc2bow(text) for text in data_words]

    # weight applied to the LDA topic vector before it is concatenated with
    # the sentence embedding
    gamma = 30

    # The sentence embeddings depend only on the documents, not on the number
    # of topics, so load the model and encode once per dataset instead of once
    # per ktop value.
    print('Getting vector representations for BERT ...')
    # Extract embeddings
    model = SentenceTransformer("all-mpnet-base-v2")
    # model = SentenceTransformer("stsb-roberta-base-v2")
    vec_bert = np.array(model.encode(sentences, show_progress_bar=True))
    print('Getting vector representations for BERT. Done!')

    for ktop in [20, 30, 50, 75, 100]:
        print('Getting vector representations for LDA ...')

        # NOTE(review): machine-specific path to the MALLET binary; update for
        # your environment.
        mallet_path = '/mnt/raid1/gzhou/mallet-2.0.8/bin/mallet'
        ldamodel = gensim.models.wrappers.LdaMallet(mallet_path=mallet_path, corpus=corpus,
                                                            num_topics=ktop, id2word=id2word, random_seed=42)

        # ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=ktop, id2word=id2word,
        #                                                 passes=20)
        vec_lda = get_vec_lda(ldamodel, corpus, ktop)
        print('Getting vector representations for LDA. Done!')

        print(f'vec_lda.shape is {vec_lda.shape}')
        print(f'vec_bert.shape is {vec_bert.shape}')
        # column-wise concatenation: [gamma * LDA topic weights | sentence embedding]
        vec_ldabert = np.c_[vec_lda * gamma, vec_bert]
        print(f'vec_ldabert.shape is {vec_ldabert.shape}')

        params = {
            # "embedding_model": "stsb-roberta-base-v2",
            "embedding_model": "all-mpnet-base-v2",
            # "embedding_model": "doc2vec",
            "nr_topics": ktop,
            "min_topic_size": 20,
            "diversity": None,
            "verbose": True
        }

        trainer = Trainer(dataset=dataset,
                            model_name="BERTopic",
                            params=params,
                            bt_embeddings=vec_ldabert,
                            custom_dataset=custom,
                            verbose=True)

        # Include ktop in the file name: previously every topic count was
        # saved to the same path, so the runs were not kept separately.
        results = trainer.train(save=f"./extension_results/BERTopic+LDA_{name}_mpnet_{gamma}_{ktop}")