In [None]:
!pip install bertopic
!pip install transformers

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from bertopic import BERTopic 
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import numpy as np

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP

### Useful links

Documentation:

* BERTopic GitHub: https://github.com/MaartenGr/BERTopic
* Documentation: https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.approximate_distribution
* Visualization: https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html#visualize-topics-per-class 
* Algorithm explanation: https://maartengr.github.io/BERTopic/algorithm/algorithm.html
* BERT multilingual supports Estonian: https://github.com/google-research/bert/blob/master/multilingual.md
* EstBERT: https://huggingface.co/tartuNLP/EstBERT

 


Official examples:
* Topic Modeling with BERT: https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6
* Interactive Topic Modeling with BERTopic: https://towardsdatascience.com/interactive-topic-modeling-with-bertopic-1ea55e7d73d8
* Using Whisper and BERTopic to model Kurzgesagt’s videos: https://towardsdatascience.com/using-whisper-and-bertopic-to-model-kurzgesagts-videos-7d8a63139bdf

other:
*  https://medium.com/data-reply-it-datatech/bertopic-topic-modeling-as-you-have-never-seen-it-before-abb48bbab2b2
https://python.plainenglish.io/topic-modeling-for-beginners-using-bertopic-and-python-aaf1b421afeb 

Useful:
* https://stackoverflow.com/questions/64320883/the-size-of-tensor-a-707-must-match-the-size-of-tensor-b-512-at-non-singleto
*   https://stackoverflow.com/questions/61708486/whats-difference-between-tokenizer-encode-and-tokenizer-encode-plus-in-hugging

* EstBERT: https://github.com/Tlepsh64/UT_NaturalLanguageProcessing/blob/0499d016515dced624710277b804bec4b84b93a5/homework6.ipynb 
* Why results are not consistent between runs: https://maartengr.github.io/BERTopic/faq.html#why-are-the-results-not-consistent-between-runs --> *Using custom embeddings allows you to try out BERTopic several times until you find the topics that suit you best.*
* https://github.com/MaartenGr/BERTopic/issues/763

sentence transformer, multilingual models --> https://www.sbert.net/docs/pretrained_models.html, (selected) models, They have been extensively evaluated for their quality to embedded sentences --> https://maartengr.github.io/BERTopic/algorithm/algorithm.html#detailed-overview, multilingual model

## Multilingual BERT, language='Estonian'

In [4]:
# data
#df = pd.read_excel('leaders_bert.xlsx')
df = pd.read_excel('/content/drive/MyDrive/MSc/MAKATÖÖ/leaders_bert.xlsx')
df.head()

Unnamed: 0,nimi,erakond,kuupäev,postitus,clean_text,estnltk_text,lemma_text,bert_text
0,E200,E200,2023-03-14 08:47:15,Me ei tohi väsida Ukraina toetamisest!,tohi väsida ukraina toetamisest,Text(text='tohi väsida ukraina toetamisest'),tohtima väsima ukraina toetamine,ei tohi väsida ukraina toetamisest!
1,E200,E200,2023-02-24 12:25:09,Meie iseseisvuspäev ja Ukraina on kurval moel ...,iseseisvuspäev ukraina kurval moel seotud täna...,Text(text='iseseisvuspäev ukraina kurval moel ...,iseseisvuspäev ukraina kurb mood seotud täna m...,meie iseseisvuspäev ukraina kurval moel seotud...
2,E200,E200,2023-02-24 00:55:51,105 aastat tagasi algas Eesti ja aasta tagasi ...,aastat eesti aasta ukraina teekond euroopasse ...,Text(text='aastat eesti aasta ukraina teekond ...,aasta eesti aasta ukraina teekond euroopasse e...,105 aastat tagasi algas eesti aasta tagasi ukr...
3,E200,E200,2023-02-21 17:36:03,"Vladimir Putini tänane suur kõne, mis pidi löö...",putini tänane suur kõne lööma laineid juhtima ...,Text(text='putini tänane suur kõne lööma laine...,putin tänane suur kõne lööma laine juhtima täh...,"putini tänane suur kõne, mis pidi lööma lainei..."
4,E200,E200,2023-02-13 10:06:46,"Sinu kandidaadid Hiiu-, Lääne- ja Saaremaal. T...",kandidaadid hiiu lääne saaremaal tutvu lähemal...,Text(text='kandidaadid hiiu lääne saaremaal tu...,kandidaat hiid lääs saaremaa tutvuma lähemalt ...,"sinu kandidaadid hiiu -, lääne - saaremaal. tu..."


In [5]:
len(df)

3223

In [6]:
texts = df["bert_text"].tolist()

In [7]:
with open('/content/drive/MyDrive/MSc/MAKATÖÖ/estonian-stopwords.txt', encoding='utf-8') as f:
#with open('../estonian-stopwords.txt', encoding='utf-8') as f:
    stop_words_est = [word for line in f for word in line.split()]
    #stop_words_est = stop_words_est + ['eesti', 'eestis', 'venemaa', 'vene', 'ukraina', 'ukrainas', 'ukrainale']

# https://maartengr.github.io/BERTopic/faq.html#how-do-i-remove-stop-words
# https://github.com/MaartenGr/BERTopic/issues/181

In [32]:
# # check if word in word list 
# if any("ukraina" in word for word in stop_words_est):
#     print('is there')
# else:
#     print(' is not')

is there


In [8]:
# initialize models

vectorizer_model = CountVectorizer(stop_words=stop_words_est)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

umap_model = UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=5, random_state = 6)

cluster_model = KMeans(n_clusters=50)

#hdbscan_model = HDBSCAN(min_cluster_size=15, prediction_data=True)

In [9]:
bert_topic_model = BERTopic(language="Estonian", 
                             vectorizer_model=vectorizer_model,
                             ctfidf_model=ctfidf_model,
                             umap_model = umap_model,
                             hdbscan_model=cluster_model,
                             #nr_topics= 'auto', 
                             #min_topic_size = 15,
                             calculate_probabilities = True, 
                             verbose = True) 

topics, probabilities = bert_topic_model.fit_transform(texts)
bert_topic_model.save("bert_topic_model")

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

2023-04-23 13:19:45,881 - BERTopic - Transformed documents to Embeddings
2023-04-23 13:20:13,442 - BERTopic - Reduced dimensionality
2023-04-23 13:20:14,038 - BERTopic - Clustered reduced embeddings


In [10]:
bert_topic_model.get_params()

{'calculate_probabilities': True,
 'ctfidf_model': ClassTfidfTransformer(reduce_frequent_words=True),
 'embedding_model': <bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x7f8034662d60>,
 'hdbscan_model': KMeans(n_clusters=50),
 'language': 'Estonian',
 'low_memory': False,
 'min_topic_size': 10,
 'n_gram_range': (1, 1),
 'nr_topics': None,
 'representation_model': None,
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=5, random_state=6, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 'vectorizer_model': CountVectorizer(stop_words=['minutaolisteks', 'veeres', 'samadeks',
                             'karkääksti', 'mihukeste', 'ii-ha-ha', 'milliseist',
                             'selleks', 'mõlemate', 'praeguseiks', 'prõmm',
                             'mingisugustest'

In [11]:
bert_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,144,0_vabadussõja_aastapäeva_vabadussõjas_vabariigi
1,1,127,1_lisaeelarve_miljonit_eurot_eelarve
2,2,123,2_hiina_taiwani_putini_has
3,3,105,3_kooli_kultuuri_õppima_keele
4,4,104,4_putini_kremli_seredenko_kriminaalkohtu
5,5,104,5_kaitseliidu_riigikaitse_malev_elanikkonnakaitse
6,6,97,6_elektri_mwh_energiasõjas_elektrit
7,7,90,7_ülemraada_visiidil_ruslan_välisminister
8,8,89,8_tanke_ukrainale_saksamaa_relvi
9,9,84,9_poola_etv_parlamendi_kohtusin


In [31]:
bert_topic_model.get_topic(10)

[('intervjuu', 0.6321517699974806),
 ('rääkisin', 0.5702795494441283),
 ('mõjudest', 0.5689623884643088),
 ('mõjust', 0.539765115155018),
 ('gümnaasiumi', 0.5166008886572814),
 ('kõnelesin', 0.4897111949772623),
 ('raadio', 0.4832358827646249),
 ('agressioonist', 0.47925011685787045),
 ('põhjustest', 0.4785881406924286),
 ('vastase', 0.4567669892859677)]

In [12]:
bert_topic_model.visualize_topics()

In [13]:
bert_topic_model.visualize_barchart(top_n_topics = 15, n_words=7)


### Topic reduction

In [14]:
# Topic Reduction after Training
# https://maartengr.github.io/BERTopic/getting_started/topicreduction/topicreduction.html

bert_topic_model.reduce_topics(texts, nr_topics=15)

# Access updated topics
topics = bert_topic_model.topics_


2023-04-23 13:21:07,029 - BERTopic - Reduced number of topics from 50 to 15


In [15]:
bert_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,1571,0_ukraina_venemaa_vene_ukrainas
1,1,422,1_euroopa_parlamendi_liidu_ukraina
2,2,169,2_valimistel_eesti_kodanikele_päästame
3,3,162,3_riigikaitse_kaitseliidu_kaitseväe_elanikkonn...
4,4,144,4_vabadussõja_aastapäeva_vabariigi_vabadussõjas
5,5,127,5_lisaeelarve_eurot_miljonit_eelarve
6,6,110,6_epp_ecr_sportlased_sportlaste
7,7,105,7_kooli_kultuuri_keele_laste
8,8,97,8_elektri_mwh_energiasõjas_elektrit
9,9,75,9_õigeusu_karjala_kiriku_patriarhaadi


In [19]:
bert_topic_model.visualize_barchart(top_n_topics=15,n_words=7)


In [22]:
# manual reduction
topics_to_merge = [[3, 14],
                   [8, 12],
                   [0, 11]]

bert_topic_model.merge_topics(texts, topics_to_merge)

In [23]:
# Access updated topics
topics = bert_topic_model.topics_
bert_topic_model.visualize_barchart(top_n_topics=15,n_words=7)



In [24]:
bert_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,1627,0_ukraina_venemaa_vene_eesti
1,1,422,1_euroopa_parlamendi_ukraina_liidu
2,2,199,2_riigikaitse_kaitseliidu_kaitseväe_eesti
3,3,169,3_eesti_valimistel_kodanikele_päästame
4,4,144,4_vabadussõja_aastapäeva_vabariigi_vabadussõjas
5,5,139,5_elektri_gaasi_mwh_energiasõjas
6,6,127,6_lisaeelarve_eurot_miljonit_eelarve
7,7,110,7_epp_ecr_valgevene_sportlaste
8,8,105,8_kooli_kultuuri_keele_laste
9,9,75,9_õigeusu_karjala_kiriku_patriarhaadi


## Viz

In [None]:
from scipy.cluster import hierarchy as sch

linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)

hierarchical_topics = bert_topic_model.hierarchical_topics(texts, linkage_function=linkage_function)
bert_topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

# https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html#linkage-functions

100%|██████████| 49/49 [00:01<00:00, 29.79it/s]


In [None]:
# Visualize similarity using heatmap
bert_topic_model.visualize_heatmap()

#### Time

In [None]:
timestamps = df.kuupäev.to_list()

def convertTimestamp(value):
    curr_timestamp = pd.Timestamp(value)
    curr_timestamp = curr_timestamp.replace(day=1,hour=0, minute=0, second=0)
    return curr_timestamp

In [None]:
timestamps = [convertTimestamp(timestamp) for timestamp in timestamps]

In [None]:
# Create topics over time
topics_over_time = bert_topic_model.topics_over_time(texts, timestamps)

16it [00:07,  2.07it/s]


In [None]:
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"kiirtestide, valitsuse, bürokraatlik, delegats...",24,2022-01-01
1,0,"malis, raudsepp, väide, operatsioonidel, opera...",20,2022-01-01
2,1,"vabadussõjas, 102, vabadussõja, võidelnute, är...",10,2022-01-01
3,2,"teataja, haridustöötajate, õpetajate, haridust...",3,2022-01-01
4,3,"väärtuses, 000, 305, 752, pees",4,2022-01-01
...,...,...,...,...
181,8,"rumjantsev, rumjantsevi, afganistanis, sillamä...",1,2023-03-01
182,9,"tudengitele, ühendriikidega, asenda, transatla...",1,2023-03-01
183,13,"saginas, piirangut, boycottog2024saksa, tagasi...",2,2023-03-01
184,-1,"ilusad, kaunis, tahtis, haridusministriks, aja...",7,2023-04-01


In [None]:
bert_topic_model.visualize_topics_over_time(topics_over_time, topics=topics) 

#### Class

In [None]:
classes = df['erakond']

topics_per_class = bert_topic_model.topics_per_class(texts, classes=classes)


6it [00:01,  4.46it/s]


In [None]:
bert_topic_model.visualize_topics_per_class(topics_per_class, normalize_frequency=True)


In [None]:
topics_per_class[:20]

Unnamed: 0,Topic,Words,Frequency,Class,Name
0,-1,"venemaa, ukraina, euroopa, eesti, lähemalt",489,REF,-1_ukraina_venemaa_eesti_euroopa
1,0,"ukraina, euroopa, rootsi, venemaa, soome",266,REF,0_ukraina_riigikogu_euroopa_venemaa
2,1,"aastapäeva, rahu, vabadussõja, vabariigi, 105",31,REF,1_vabadussõja_aastapäeva_vabariigi_vabad...
3,2,"õpilastele, mõjudest, gümnaasiumi, mõjust, agr...",40,REF,2_kooli_kultuuri_laste_keele
4,3,"kross, ukrainas, lahkuma, kaugemale, vabasse",21,REF,3_tanke_ukrainale_ukraina_ukrainlased
5,4,"kodanikud, turismiviisade, kodanikele, agresso...",19,REF,4_kodanikud_kodanike_kodanikele_schengen...
6,5,"lisaeelarve, miljonit, eurot, miljoni, rahandu...",32,REF,5_lisaeelarve_eurot_miljonit_miljoni
7,6,"energiasõjas, energeetika, energia, universaal...",19,REF,6_elektri_energiasõjas_elektrit_uuendusr...
8,7,"julgeolekupoliitika, schvede, lähemalt, alused...",18,REF,7_kaitseliidu_malev_elanikkonnakaitse_ju...
9,8,"116, vaimse, tervise, telefon, 123",14,REF,8_116_vaimse_tervise_pärnu


In [None]:
bert_topic_model.get_document_info(texts)

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,ei tohi väsida ukraina toetamisest!,-1,-1_ukraina_venemaa_eesti_euroopa,ukraina - venemaa - eesti - euroopa - vene - t...,0.000000,False
1,meie iseseisvuspäev ukraina kurval moel seotud...,-1,-1_ukraina_venemaa_eesti_euroopa,ukraina - venemaa - eesti - euroopa - vene - t...,0.000000,False
2,105 aastat tagasi algas eesti aasta tagasi ukr...,-1,-1_ukraina_venemaa_eesti_euroopa,ukraina - venemaa - eesti - euroopa - vene - t...,0.000000,False
3,"putini tänane suur kõne, mis pidi lööma lainei...",0,0_ukraina_riigikogu_euroopa_venemaa,ukraina - riigikogu - euroopa - venemaa - eest...,1.000000,False
4,"sinu kandidaadid hiiu -, lääne - saaremaal. tu...",-1,-1_ukraina_venemaa_eesti_euroopa,ukraina - venemaa - eesti - euroopa - vene - t...,0.000000,False
...,...,...,...,...,...,...
3218,""" ärme kruti sellisel kombel ise pingeid ärme ...",0,0_ukraina_riigikogu_euroopa_venemaa,ukraina - riigikogu - euroopa - venemaa - eest...,0.984159,False
3219,esmaspäeval võeti riigikogus vastu välismaalas...,-1,-1_ukraina_venemaa_eesti_euroopa,ukraina - venemaa - eesti - euroopa - vene - t...,0.000000,False
3220,riigikogus võeti vastu välismaalaste seaduse v...,-1,-1_ukraina_venemaa_eesti_euroopa,ukraina - venemaa - eesti - euroopa - vene - t...,0.000000,False
3221,narvas tähistati üleeile vabadussõja relvarahu...,1,1_vabadussõja_aastapäeva_vabariigi_vabadussõjas,vabadussõja - aastapäeva - vabariigi - vabadus...,0.990629,False


## Coherence
- https://medium.com/@hajar.zankadi/using-bertopic-and-bertweet-transformer-to-predict-interest-tag-from-tweets-67189f11b992

In [25]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

docs = texts

# Preprocess Documents
documents = pd.DataFrame({"Document": docs,
                          "ID": range(len(docs)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = bert_topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = bert_topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

topic_words = [[words for words, _ in bert_topic_model.get_topic(topic) if words!=''] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence = coherence_model.get_coherence()
coherence

0.5925163415560033

## Add topic to df

In [28]:
topic_ids = bert_topic_model.get_document_info(texts)['Topic']
df['bert_topic_id'] = topic_ids

#topic_proba = bert_topic_model.get_document_info(texts)['probabilities']
#df['bert_topic_probability'] = topic_proba

# labels = ['eesti','ukraina',...]
# label_column = [labels[t_id] for t_id in topic_ids]
# df['bert_topic_labels'] = label_column

In [None]:
df['bert_topic_id'] = df['bert_topic_id'].replace([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
                                ['Ukraine war', 'Allies', 'Defence', 'Elections', 'History', 'Energy security', 'Economy', 'Sports', 'Education', 'Religion', 'Discussion', 'Environment'])
df.head()


In [33]:
# save file
df.to_excel('leaders_topics_bertopic.xlsx', index = False)

- https://github.com/MaartenGr/BERTopic/issues/428
- https://github.com/MaartenGr/BERTopic/issues/126

In [None]:
my_model = BERTopic.load("bert_topic_model")
my_model.get_params()

{'calculate_probabilities': False,
 'ctfidf_model': ClassTfidfTransformer(reduce_frequent_words=True),
 'embedding_model': <bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x7f20a682edf0>,
 'hdbscan_model': HDBSCAN(min_cluster_size=15, prediction_data=True),
 'language': 'Estonian',
 'low_memory': False,
 'min_topic_size': 15,
 'n_gram_range': (1, 1),
 'nr_topics': 'auto',
 'representation_model': None,
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=5, random_state=6, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 'vectorizer_model': CountVectorizer(stop_words=['minutaolisteks', 'veeres', 'samadeks',
                             'karkääksti', 'mihukeste', 'ii-ha-ha', 'milliseist',
                             'selleks', 'mõlemate', 'praeguseiks', 'prõmm',
             

In [None]:
my_model.get_topic_info()