# Init

In [None]:
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from 

In [None]:
from google.colab import files
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import fetch_20newsgroups
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from hdbscan import HDBSCAN
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from statistics import mean
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.cluster import AgglomerativeClustering, KMeans
from nltk.stem.porter import *
import pandas as pd
import numpy as np
from datetime import datetime
from umap import UMAP
import re
import pprint
np.random.seed(2018)
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

# Preprocess
def preprocess(text):
    """Turn a raw document into a list of normalised tokens.

    The text is tokenised with ``gensim.utils.simple_preprocess``; tokens
    that are gensim stop words or have three characters or fewer are
    dropped, and each remaining token is lemmatised with WordNet and then
    stemmed with the English Snowball stemmer.

    Args:
        text: the raw document string.

    Returns:
        A list of processed token strings, in document order.
    """
    snowball = SnowballStemmer(language='english')
    lemmatizer = WordNetLemmatizer()
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        snowball.stem(lemmatizer.lemmatize(word))
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stop_words
    ]

# N gram maker
def make_n_grams(texts, n, min_count=5, threshold=100):
    """Merge frequent token sequences of up to ``n`` words into single tokens.

    A gensim ``Phrases`` model is trained and applied ``n - 1`` times; each
    round is trained on the output of the previous one, so every round can
    extend the phrases found so far by one more token.

    Args:
        texts: list of tokenised documents (each a list of strings).
        n: maximum phrase length; ``n <= 1`` performs no rounds and returns
            ``texts`` unchanged.
        min_count: forwarded to ``gensim.models.Phrases``.
        threshold: forwarded to ``gensim.models.Phrases``.

    Returns:
        ``texts`` itself when no round runs, otherwise a new list of
        tokenised documents with the detected phrases merged.
    """
    res = texts
    for _ in range(n - 1):
        # Train on the current corpus, then materialise the transformed
        # documents once so the next round trains on exactly this output.
        phrases = gensim.models.Phrases(res, min_count=min_count, threshold=threshold)
        res = [phrases[doc] for doc in res]
    return res

# Augment data manually
def data_augment_manual(wordstocks, times):
    """Return ``wordstocks`` concatenated with itself ``times`` times over.

    Every round doubles the sequence, so the result holds ``2 ** times``
    back-to-back copies of the input. The input object is not modified;
    with ``times`` of zero or less it is returned as is.
    """
    doubled = wordstocks
    for _ in range(times):
        # Build a new sequence each round instead of extending in place.
        doubled = doubled + doubled
    return doubled

# Count how many times the documents must be doubled before BERTopic can fit them
def data_augment(wordstocks, max_doublings=16):
    """Count how many doublings BERTopic needs before it can fit the documents.

    ``BERTopic(nr_topics='auto')`` is fitted on the documents; each time the
    fit raises, the working copy is doubled and the fit is retried.

    NOTE(review): the doubled working copy is only used for probing. The
    function returns the original ``wordstocks``, exactly as before this
    rewrite. Confirm that this is intended.

    Args:
        wordstocks: list of preprocessed document strings.
        max_doublings: upper bound on the number of doublings. Without a
            bound, a failure unrelated to dataset size would double the
            data forever.

    Returns:
        ``(wordstocks, data_len, counter)``: the untouched input, its
        length, and the number of doublings that were needed.

    Raises:
        ValueError: if ``wordstocks`` is empty (doubling could never help).
        RuntimeError: if the fit still fails after ``max_doublings``
            doublings; the last fit error is chained as the cause.
    """
    data_len = len(wordstocks)
    if data_len == 0:
        raise ValueError("data_augment() needs at least one document")

    result = wordstocks
    counter = 0
    while True:
        try:
            BERTopic(nr_topics='auto').fit_transform(result)
            break
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C still interrupts.
            if counter >= max_doublings:
                raise RuntimeError(
                    "BERTopic could not fit the documents after "
                    "{} doublings".format(counter)
                ) from err
            result = result + result
            counter += 1
    return (wordstocks, data_len, counter)

# Topic modelling
# Input : wordstocks, data length, number of topics
# Output : topics and coherences for 5 models
def topic_modelling(wordstocks, data_len, topic_num):
    """Fit five topic models on ``wordstocks`` and score each with c_v coherence.

    The models are BERTopic with its default clustering (``nr_topics`` set
    to ``topic_num``), BERTopic with AgglomerativeClustering, BERTopic with
    KMeans, LDA on bigram-merged tokens and LDA on trigram-merged tokens.

    Args:
        wordstocks: list of preprocessed documents (space-separated tokens).
            It may contain repeated copies of the real documents.
        data_len: number of leading documents for which labels are returned.
        topic_num: requested number of topics / clusters.

    Returns:
        A 10-tuple ``(topicLDA, topicLDA_trigram, topicBTAUT, topicBTAC,
        topicBTKM, coherence_lda, coherence_lda_trigram, coherence_aut,
        coherence_ac, coherence_km)``. The ``topic*`` items are lists of
        ``data_len`` topic-name strings. ``topicBTAUT`` and ``coherence_aut``
        are both ``None`` when the default-clustering BERTopic run fails.
    """
    # BERTopic with default clustering: best effort, because it can fail on
    # small datasets. Catch Exception only, so Ctrl-C is not swallowed.
    topicBTAUT = None
    coherence_aut = None
    try:
        topic_model_aut = BERTopic(nr_topics=topic_num)
        topics_aut, _ = topic_model_aut.fit_transform(wordstocks)
        labels_aut = _bertopic_doc_labels(topic_model_aut, wordstocks, data_len, drop_hyphen=True)
        score_aut = _bertopic_coherence(topic_model_aut, wordstocks, topics_aut)
    except Exception:
        topicBTAUT = None
        coherence_aut = None
    else:
        topicBTAUT = labels_aut
        coherence_aut = score_aut

    # Agglomerative Cluster
    topic_model_ac = BERTopic(hdbscan_model=AgglomerativeClustering(n_clusters=topic_num))
    topics_ac, _ = topic_model_ac.fit_transform(wordstocks)
    topicBTAC = _bertopic_doc_labels(topic_model_ac, wordstocks, data_len)

    # KMeans Clustering
    topic_model_km = BERTopic(hdbscan_model=KMeans(n_clusters=topic_num))
    topics_km, _ = topic_model_km.fit_transform(wordstocks)
    topicBTKM = _bertopic_doc_labels(topic_model_km, wordstocks, data_len)

    # LDA on bigrams, then on trigrams
    topicLDA, lda_model, doc_lists, id2word = _fit_lda(wordstocks, 2, topic_num, data_len)
    topicLDA_trigram, lda_model_trigram, doc_lists_trigram, id2word_trigram = _fit_lda(wordstocks, 3, topic_num, data_len)

    # Coherence scores, computed in the same order as before the refactor.
    coherence_lda = CoherenceModel(model=lda_model, texts=doc_lists, dictionary=id2word, coherence='c_v').get_coherence()
    coherence_lda_trigram = CoherenceModel(model=lda_model_trigram, texts=doc_lists_trigram, dictionary=id2word_trigram, coherence='c_v').get_coherence()
    coherence_ac = _bertopic_coherence(topic_model_ac, wordstocks, topics_ac)
    coherence_km = _bertopic_coherence(topic_model_km, wordstocks, topics_km)

    return (topicLDA, topicLDA_trigram, topicBTAUT, topicBTAC, topicBTKM, coherence_lda, coherence_lda_trigram, coherence_aut, coherence_ac, coherence_km)


def _bertopic_doc_labels(topic_model, docs, data_len, drop_hyphen=False):
    """Return the cleaned BERTopic topic name of the first ``data_len`` docs."""
    info = topic_model.get_document_info(docs)
    name_by_topic = {}
    for topic_id, name in zip(info["Topic"], info["Name"]):
        # Strip every "<digits>_" run from the generated topic name.
        cleaned = re.sub(r'\d+_', '', name)
        if drop_hyphen:
            # Removes the "-" left behind by the "-1_" prefix of topic -1.
            cleaned = cleaned.replace("-", "")
        name_by_topic[topic_id] = cleaned
    return [name_by_topic[topic_id] for topic_id in info["Topic"]][:data_len]


def _bertopic_coherence(topic_model, docs, topics):
    """Return the c_v coherence of a fitted BERTopic model over ``docs``."""
    documents = pd.DataFrame({"Document": docs,
                              "ID": range(len(docs)),
                              "Topic": topics})
    # One concatenated pseudo-document per topic.
    docs_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(docs_per_topic.Document.values)

    # Tokenise with the analyzer of the model's own vectorizer.
    analyzer = topic_model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = gensim.corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Score every real topic id. The old ``range(len(set(topics)) - 1)``
    # assumed an outlier topic -1 was present and silently dropped the last
    # topic when it was not (as with KMeans / AgglomerativeClustering).
    topic_ids = sorted(topic_id for topic_id in set(topics) if topic_id != -1)
    topic_words = [[word for word, _ in topic_model.get_topic(topic_id)] for topic_id in topic_ids]

    return CoherenceModel(topics=topic_words,
                          texts=tokens,
                          corpus=corpus,
                          dictionary=dictionary,
                          coherence='c_v').get_coherence()


def _fit_lda(docs, n, topic_num, data_len):
    """Fit LDA on n-gram-merged ``docs``; return (labels, model, texts, dictionary)."""
    doc_lists = make_n_grams([doc.split(" ") for doc in docs], n)
    id2word = gensim.corpora.Dictionary(doc_lists)
    corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in doc_lists]
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=topic_num, random_state=100, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True)

    # Topic name = its top four words joined by "_", with weights, quotes
    # and "*" removed. num_topics=-1 asks for every topic; the default of
    # 20 would leave topics unnamed when topic_num > 20.
    topic_names = {
        topic_id: re.sub(r'\d+\.\d+|\d+', '', words.replace('"', '').replace("*", "").replace(' + ', '_'))
        for topic_id, words in lda_model.print_topics(num_topics=-1, num_words=4)
    }

    labels = []
    for bow in corpus:
        # Most probable topic of the document; the first one wins on ties.
        best_topic, _ = max(lda_model.get_document_topics(bow), key=lambda pair: pair[1])
        labels.append(topic_names[best_topic])
    return labels[:data_len], lda_model, doc_lists, id2word

# Automated topic modelling for research purposes: runs topic_modelling() on the original and on the augmented datasets
# Input : dataframe, number of topics, and maximum number of augmentations
# Output : a list of dataframes (document, timestamp, and the topic assigned by each model), a dataframe of coherence scores, and a combined list of per-method records
def auto_topic_modelling(df, topic_num=3, MAX_AUGMENT=4):
    """Run ``topic_modelling`` on the documents at each augmentation level.

    Level 0 uses the preprocessed documents as they are; level ``j`` uses
    ``data_augment_manual(wordstocks, j)``. Level 0 always runs, and levels
    ``1..MAX_AUGMENT`` follow.

    Args:
        df: dataframe with a ``'text'`` and a ``'timestamp'`` column.
        topic_num: number of topics passed to ``topic_modelling``.
        MAX_AUGMENT: highest augmentation level to run.

    Returns:
        ``(returned, coherence_df, list_of_both)``:
        * ``returned``: one dataframe per level with the document, its
          timestamp and the topic name assigned by each of the five models.
        * ``coherence_df``: one row per level with the five coherence scores
          and the time taken.
        * ``list_of_both``: ``[method, level, coherence, rows]`` records,
          added only for levels where the HDBScan-based BERTopic run
          succeeded; ``rows`` holds ``(Document, topic, Timestamp)`` arrays.
    """
    too_small = "Error:Dataset too small"
    columns = ["Document", "Timestamp", "BERTopic with HDBScan", "BERTopic with Agglomerative Topic", "BERTopic with KMeans", "LDA bigram", "LDA trigram"]

    returned = []
    coherence_np = []
    list_of_both = []
    dataset = np.array(df['text'])
    timestamp = np.array(df['timestamp'])
    wordstocks, data_len, _ = data_augment([" ".join(preprocess(text)) for text in dataset])

    # max(..., 0) keeps the guarantee that level 0 is always evaluated.
    for level in range(max(MAX_AUGMENT, 0) + 1):
        docs = data_augment_manual(wordstocks, level)
        start_time = datetime.now()
        (topicLDA, topicLDA_trigram, topicBTAUT, topicBTAC, topicBTKM,
         coherence_LDA, coherence_LDA_trigram, coherence_BTAUT,
         coherence_BTAC, coherence_BTKM) = topic_modelling(docs, data_len, topic_num)

        hdbscan_ok = topicBTAUT is not None
        # A failed HDBScan run is reported as an error marker, not dropped.
        hdbscan_topics = topicBTAUT if hdbscan_ok else [too_small] * data_len
        hdbscan_score = coherence_BTAUT if hdbscan_ok else too_small

        frame = pd.DataFrame(
            data=[[dataset[i], timestamp[i], hdbscan_topics[i], topicBTAC[i], topicBTKM[i], topicLDA[i], topicLDA_trigram[i]] for i in range(data_len)],
            columns=columns)
        end_time = datetime.now()
        returned.append(frame)
        coherence_np.append([coherence_LDA, coherence_LDA_trigram, hdbscan_score, coherence_BTAC, coherence_BTKM, str('{} Second(s)'.format(end_time - start_time))])

        if not hdbscan_ok:
            continue
        per_method = (
            ("LDA bigram", "LDA bigram", coherence_LDA),
            # Fixed: this record used to carry the bigram coherence.
            ("LDA trigram", "LDA trigram", coherence_LDA_trigram),
            ("HDBScan", "BERTopic with HDBScan", coherence_BTAUT),
            ("Agglomerative", "BERTopic with Agglomerative Topic", coherence_BTAC),
            ("KMeans", "BERTopic with KMeans", coherence_BTKM),
        )
        for method, column, score in per_method:
            rows = [row for row in np.array(frame[['Document', column, 'Timestamp']])]
            list_of_both.append([method, level, score, rows])

    coherence_df = pd.DataFrame(data=coherence_np, columns=["coherence_LDA", "coherence_LDA_trigram", "coherence_BTAUT", "coherence_BTAC", "coherence_BTKM", "Time Taken"])
    return (returned, coherence_df, list_of_both)


# Runs auto_topic_modelling() for every topic number from 2 to MAX_TOPIC_NUM (5 by default); for research purposes
# Input : dataframe, maximum number of augmentations, and maximum topic number
# Output : a list of lists of dataframes (document, timestamp, and the topic assigned by each model), a list of coherence dataframes, and a flat list of per-method records tagged with the topic number
def topic_model_auto_iter(df, MAX_AUGMENT = 4, MAX_TOPIC_NUM = 5):
    """Run ``auto_topic_modelling`` for each topic number in 2..MAX_TOPIC_NUM.

    Returns:
        A 3-tuple: the per-topic-number lists of result dataframes, the
        per-topic-number coherence dataframes, and one flat list holding
        every ``[method, augmentation, coherence, rows]`` record with the
        topic number appended as a string in fifth position.
    """
    frames_by_topic_count = []
    coherence_by_topic_count = []
    tagged_records = []
    for topic_count in range(2, MAX_TOPIC_NUM + 1):
        frames, coherence_table, records = auto_topic_modelling(df, topic_count, MAX_AUGMENT)
        frames_by_topic_count.append(frames)
        coherence_by_topic_count.append(coherence_table)
        topic_tag = str(topic_count)
        tagged_records.extend(
            [rec[0], rec[1], rec[2], rec[3], topic_tag] for rec in records
        )
    return (frames_by_topic_count, coherence_by_topic_count, tagged_records)


# To find the highest coherence scores
def run_tm(df, topic_num=5, MAX_AUGMENT=4):
    """Run every configuration and pick the record with the top coherence.

    ``topic_num`` is forwarded to ``topic_model_auto_iter`` as its maximum
    topic number.

    Returns:
        ``(pairing, summary, [res, coh, ttl])`` where ``pairing`` is the
        last record whose coherence equals the maximum, ``summary`` lists
        ``[method, augmentation, coherence, topic number]`` for every
        record, and the final item bundles the raw outputs of
        ``topic_model_auto_iter``.
    """
    res, coh, ttl = topic_model_auto_iter(df, MAX_AUGMENT, topic_num)
    best_score = max([record[2] for record in ttl])
    top_records = [record for record in ttl if record[2] == best_score]
    # On ties the record that appears last is kept.
    pairing = top_records[-1] if top_records else []
    summary = [[record[0], record[1], record[2], record[4]] for record in ttl]
    return (pairing, summary, [res, coh, ttl])

# Saving the result
def save(result, filename, col=["Document", "Timestamp", "Topics"]):
    """Write one result record to a tab-separated file and download it.

    ``result`` is indexed as ``[method, augmentation, coherence, rows,
    topic number]``; each row is ``(document, topic, timestamp)`` and is
    written in ``(document, timestamp, topic)`` order under the ``col``
    headers. The file name encodes method, augmentation, topic number and
    score, and the file is handed to ``files.download``.
    """
    name_parts = [
        str(filename),
        "_Method_", str(result[0]),
        "_Augmentation_", str(result[1]),
        "_Topic_", str(result[4]),
        "_Score_", str(result[2]),
        "_", ".csv",
    ]
    out_path = "".join(name_parts)
    table = pd.DataFrame(data=[[row[0], row[2], row[1]] for row in result[3]], columns=col)
    table.to_csv(out_path, sep='\t', index=False)
    files.download(out_path)

# Saving all possible results
def save_all_coherence(results_list, filename, col=["Document", "Timestamp", "Topics"]):
    """Write and download one tab-separated file per record in ``results_list``.

    Each record is indexed as ``[method, augmentation, coherence, rows,
    topic number]``; rows are written in ``(document, timestamp, topic)``
    order under the ``col`` headers, and the file name encodes method,
    augmentation, topic number and score.
    """
    for record in results_list:
        out_path = "".join([
            str(filename),
            "_Method_", str(record[0]),
            "_Augmentation_", str(record[1]),
            "_Topic_", str(record[4]),
            "_Score_", str(record[2]),
            "_", ".csv",
        ])
        table = pd.DataFrame(data=[[row[0], row[2], row[1]] for row in record[3]], columns=col)
        table.to_csv(out_path, sep='\t', index=False)
        files.download(out_path)

# Run and save
def run_save(df, filename, save_best_only = True, save_seer = True, topic_num=5, MAX_AUGMENT=4, col=["Document", "Timestamp", "Topics"]):
    """Run the full comparison via ``run_tm`` and save the results.

    Args:
        df: input dataframe, passed to ``run_tm``.
        filename: prefix of every file that is written.
        save_best_only: when true only the best record is saved; otherwise
            every record returned by ``run_tm`` is saved.
        save_seer: when true the coherence overview built by ``seer`` is
            written to ``<filename>_coherences_seer.csv`` and downloaded.
        topic_num: forwarded to ``run_tm``.
        MAX_AUGMENT: forwarded to ``run_tm``.
        col: column headers used by ``save``.
    """
    best_record, coherence_rows, raw_outputs = run_tm(df, topic_num, MAX_AUGMENT)
    all_records = raw_outputs[2]
    records_to_save = [best_record] if save_best_only else all_records
    for record in records_to_save:
        save(record, filename, col)
    if not save_seer:
        return
    overview_path = str(filename) + "_coherences_seer.csv"
    overview = seer(coherence_rows)
    overview.to_csv(overview_path, sep='\t', index=False)
    files.download(overview_path)

# Getting the result for the dataset with the highest score
def seer(coherence_lst):
    """Pivot per-method coherence rows into one row per configuration.

    Args:
        coherence_lst: rows of ``[method, augmentation, score, topic num]``.

    Returns:
        A dataframe with one row per (topic number, augmentation) pair and
        one score column per method. Scores are assigned to the method
        columns by their position within each group, in the order LDA
        bigram, LDA trigram, HDBScan, Agglomerative, KMeans; the "Method"
        value itself is not consulted.
    """
    long_table = pd.DataFrame(coherence_lst, columns=["Method", "Augmentation", "Score", "Topic Num"])
    grouped = long_table.groupby(["Topic Num", "Augmentation"]).agg({"Score": list}).reset_index()

    wide_columns = {
        "Topic Num": grouped["Topic Num"],
        "Augmentation": grouped["Augmentation"],
    }
    method_order = ("LDA bigram", "LDA trigram", "HDBScan", "Agglomerative", "KMeans")
    for position, method in enumerate(method_order):
        # Bind position as a default so each lambda keeps its own index.
        wide_columns[method] = grouped["Score"].apply(lambda scores, position=position: scores[position])
    return pd.DataFrame(wide_columns)



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
