In [None]:
!pip install bertopic
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m69.8 MB/s[0m 

In [None]:
# Mount Google Drive at /content/drive; the corpus and the stop-word list
# are read from there further down. Requires the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from bertopic import BERTopic 
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import numpy as np

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP

### Useful links

Documentation:

* BERTopic GitHub: https://github.com/MaartenGr/BERTopic
* Documentation: https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.approximate_distribution
* Visualization: https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html#visualize-topics-per-class 
* Algorithm explanation: https://maartengr.github.io/BERTopic/algorithm/algorithm.html
* BERT multilingual supports Estonian: https://github.com/google-research/bert/blob/master/multilingual.md
* EstBERT: https://huggingface.co/tartuNLP/EstBERT

 


Official examples:
* Topic Modeling with BERT: https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6
* Interactive Topic Modeling with BERTopic: https://towardsdatascience.com/interactive-topic-modeling-with-bertopic-1ea55e7d73d8
* Using Whisper and BERTopic to model Kurzgesagt’s videos: https://towardsdatascience.com/using-whisper-and-bertopic-to-model-kurzgesagts-videos-7d8a63139bdf

Other:
* https://medium.com/data-reply-it-datatech/bertopic-topic-modeling-as-you-have-never-seen-it-before-abb48bbab2b2
* https://python.plainenglish.io/topic-modeling-for-beginners-using-bertopic-and-python-aaf1b421afeb

Useful:
* https://stackoverflow.com/questions/64320883/the-size-of-tensor-a-707-must-match-the-size-of-tensor-b-512-at-non-singleto
*   https://stackoverflow.com/questions/61708486/whats-difference-between-tokenizer-encode-and-tokenizer-encode-plus-in-hugging

* EstBERT: https://github.com/Tlepsh64/UT_NaturalLanguageProcessing/blob/0499d016515dced624710277b804bec4b84b93a5/homework6.ipynb 
* Why results are not consistent between runs: https://maartengr.github.io/BERTopic/faq.html#why-are-the-results-not-consistent-between-runs --> *Using custom embeddings allows you to try out BERTopic several times until you find the topics that suit you best.*
* https://github.com/MaartenGr/BERTopic/issues/763

Sentence-transformers multilingual models --> https://www.sbert.net/docs/pretrained_models.html (the (selected) models listed there "have been extensively evaluated for their quality to embedded sentences"). Where the multilingual model fits into BERTopic --> https://maartengr.github.io/BERTopic/algorithm/algorithm.html#detailed-overview

## Multilingual BERT, language='Estonian'

In [None]:
# data
# The corpus is read from the mounted Google Drive; the commented line is the
# alternative input file kept for local runs.
#df = pd.read_excel('leaders_bert.xlsx')
df = pd.read_excel('/content/drive/MyDrive/MSc/MAKATÖÖ/public_bert.xlsx')
# Preview the first rows only.
df.head()

In [None]:
# Number of rows loaded from the spreadsheet.
len(df)

16927

In [None]:
# One entry per row of df, in row order; this list is what the topic model is
# fitted on and what every later BERTopic call receives.
texts = df["bert_text"].tolist()

In [None]:
# Estonian stop words: every whitespace-separated token of the file becomes
# one entry of the list.
# Local alternative to the Drive path:
#with open('../estonian-stopwords.txt', encoding='utf-8') as f:
with open('/content/drive/MyDrive/MSc/estonian-stopwords.txt', encoding='utf-8') as stop_word_file:
    stop_words_est = stop_word_file.read().split()
    # Optional corpus-specific additions, currently disabled:
    #stop_words_est = stop_words_est + ['eesti', 'eestis', 'venemaa', 'vene', 'ukraina', 'ukrainas', 'ukrainale']

# https://maartengr.github.io/BERTopic/faq.html#how-do-i-remove-stop-words
# https://github.com/MaartenGr/BERTopic/issues/181

In [None]:
# # check if word in word list 
# if any("ukraina" in word for word in stop_words_est):
#     print('is there')
# else:
#     print(' is not')

In [None]:
# initialize models

# One seed for every stochastic component. Later cells hard-code topic ids
# (manual merging, label mapping), which only makes sense if a re-run yields
# the same clustering.
RANDOM_STATE = 6

# Token counting for the topic representations, ignoring Estonian stop words.
vectorizer_model = CountVectorizer(stop_words=stop_words_est)

# Class-based TF-IDF weighting of the counted tokens.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Dimensionality reduction of the document embeddings before clustering.
umap_model = UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=5, random_state=RANDOM_STATE)

# KMeans is passed to BERTopic as the clustering step (see the BERTopic(...) call).
# It was previously unseeded, so cluster ids changed from run to run.
cluster_model = KMeans(n_clusters=50, random_state=RANDOM_STATE)

#hdbscan_model = HDBSCAN(min_cluster_size=15, prediction_data=True)

In [None]:
# Assemble the topic model from the components configured above.
# NOTE(review): hdbscan_model receives the KMeans instance. min_topic_size and
# calculate_probabilities are presumably only honoured by HDBSCAN-style
# clusterers -- confirm against the BERTopic docs whether they do anything here.
bertopic_settings = dict(
    language="Estonian",
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    umap_model=umap_model,
    hdbscan_model=cluster_model,
    nr_topics='auto',
    min_topic_size=15,
    calculate_probabilities=True,
    verbose=True,
)
bert_topic_model = BERTopic(**bertopic_settings)

# Fit on the full corpus; `topics` holds one topic id per document.
topics, probabilities = bert_topic_model.fit_transform(texts)
# Persist the fitted model under a relative path (current working directory).
bert_topic_model.save("bert_topic_model")

Downloading (…)0fe39/.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)83e900fe39/README.md:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading (…)e900fe39/config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading (…)900fe39/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/529 [00:00<?, ?it/s]

2023-04-27 18:55:13,233 - BERTopic - Transformed documents to Embeddings
2023-04-27 18:55:54,584 - BERTopic - Reduced dimensionality
2023-04-27 18:55:55,975 - BERTopic - Clustered reduced embeddings
2023-04-27 18:55:57,616 - BERTopic - Reduced number of topics from 50 to 26


In [None]:
# Show the parameters and sub-models the fitted topic model is configured with.
bert_topic_model.get_params()

{'calculate_probabilities': True,
 'ctfidf_model': ClassTfidfTransformer(reduce_frequent_words=True),
 'embedding_model': <bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x7f1fcf576f80>,
 'hdbscan_model': KMeans(n_clusters=50),
 'language': 'Estonian',
 'low_memory': False,
 'min_topic_size': 15,
 'n_gram_range': (1, 1),
 'nr_topics': 'auto',
 'representation_model': None,
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=5, random_state=6, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 'vectorizer_model': CountVectorizer(stop_words=['minutaolisteks', 'veeres', 'samadeks',
                             'karkääksti', 'mihukeste', 'ii-ha-ha', 'milliseist',
                             'selleks', 'mõlemate', 'praeguseiks', 'prõmm',
                             'mingisugustes

In [None]:
# Topic table right after fitting: topic id, Count and generated Name.
bert_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,9052,0_putin_eesti_putini_nato
1,1,1956,1_sõda_lääs_sõja_sõdi
2,2,545,2_raketid_armee_väed_vene
3,3,453,3_gaasi_gaasist_gaas_naftat
4,4,411,4_põgenike_põgenikud_põgenikke_põgenikele
5,5,373,5_hiina_rubla_raha_dollari
6,6,339,6_uudiseid_video_ajakirjanikud_uudised
7,7,313,7_kooli_lastele_startups_keele
8,8,297,8_raha_annetusi_mündikaardi_eurot
9,9,279,9_ukrainlane_emotsioonid_varsti_valus


In [None]:
# (word, score) pairs that represent topic 1.
# NOTE(review): the saved output of this cell does not match the name of
# topic 1 in the topic table above, so it looks stale -- re-run the notebook
# top to bottom before relying on it.
bert_topic_model.get_topic(1)

[('inimesed', 0.1531379074606337),
 ('vaja', 0.15161847551171795),
 ('lihtsalt', 0.14427725526410162),
 ('abi', 0.14384256971905135),
 ('inimesi', 0.14373747209578194),
 ('riik', 0.14086041627008847),
 ('ajal', 0.13994824163876693),
 ('suur', 0.13617113915198795),
 ('üro', 0.13486205480097996),
 ('venemaa', 0.13410710539714746)]

In [None]:
# (word, score) pairs that represent topic 10.
bert_topic_model.get_topic(10)

[('raha', 0.426962152072894),
 ('annetusi', 0.3562885310141624),
 ('mtü', 0.30263658090503237),
 ('annetada', 0.30158580501680565),
 ('annetuse', 0.2959027455447002),
 ('annetused', 0.29255316931008096),
 ('toetuseks', 0.2849952208945215),
 ('mündikaardi', 0.2748458844643811),
 ('eurot', 0.26081908713045127),
 ('annetuste', 0.25172007870552654)]

In [None]:
# Visual overview of the topics found by the initial fit.
bert_topic_model.visualize_topics()

In [None]:
# Bar charts of 7 words each for the top 15 topics.
bert_topic_model.visualize_barchart(top_n_topics = 15, n_words=7)


### Topic reduction

In [None]:
# Topic Reduction after Training
# https://maartengr.github.io/BERTopic/getting_started/topicreduction/topicreduction.html

# Reduce the fitted model to 22 topics. The model object itself is updated;
# nothing is returned into a new variable.
bert_topic_model.reduce_topics(texts, nr_topics=22)

# Access updated topics (one topic id per document, after the reduction)
topics = bert_topic_model.topics_


2023-04-27 18:57:29,522 - BERTopic - Reduced number of topics from 26 to 22


In [None]:
# Topic table after the reduction to 22 topics.
bert_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,9331,0_eesti_putin_putini_nato
1,1,1956,1_sõda_lääs_sõja_rahu
2,2,1054,2_raketid_armee_väed_rakett
3,3,563,3_uudiseid_ajakirjanikud_meedia_video
4,4,453,4_gaasi_gaasist_euroopa_gaas
5,5,411,5_põgenike_põgenikud_põgenikke_põgenikele
6,6,373,6_hiina_rubla_raha_dollari
7,7,313,7_kooli_lastele_startups_keele
8,8,297,8_raha_annetusi_eurot_mündikaardi
9,9,273,9_terroristlikuks_terroristlik_terrorismi_riigiks


In [None]:
# (word, score) pairs that represent topic 0 after the reduction.
bert_topic_model.get_topic(0)

[('eesti', 0.19628929220849964),
 ('putin', 0.19590064109449537),
 ('putini', 0.17723946885436845),
 ('vene', 0.1668844107244746),
 ('eestis', 0.1583432507149676),
 ('sõda', 0.15388665734954532),
 ('venemaa', 0.15343910673016067),
 ('euroopa', 0.1523392013285039),
 ('ukrainas', 0.1519088454480187),
 ('ukrainat', 0.14706364050281737)]

In [None]:
# Bar charts of 7 words per topic; top_n_topics=25 is above the 22 topics
# left after the reduction, so every topic is requested.
bert_topic_model.visualize_barchart(top_n_topics=25,n_words=7)


In [None]:
# manual reduction
# Earlier grouping, kept for reference:
# topics_to_merge = [[8, 13, 7],
#                    [12, 18]]
# Each inner list is a group of topic ids handed to merge_topics together.
# NOTE(review): these ids refer to the topic numbering of one particular fit;
# confirm they still point at the intended topics after any re-run.
topics_to_merge = [[15, 3], [17, 1], [18, 2]]
bert_topic_model.merge_topics(texts, topics_to_merge)

In [None]:
# Access updated topics: refresh the per-document topic ids after the manual merge
topics = bert_topic_model.topics_
# Bar charts of 7 words per topic for up to 21 topics.
bert_topic_model.visualize_barchart(top_n_topics=21,n_words=7)

## Topics after manual merging

In [None]:
# Topic table after the manual merge.
bert_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,5235,0_putin_putini_venemaa_lihtsalt
1,1,4248,1_eesti_eestis_euroopa_ukraina
2,2,2710,2_sõda_sõja_lääs_vene
3,3,1021,3_twitteri_uudiseid_twitteris_meedia
4,4,660,4_raketid_rakett_linnade_päev
5,5,427,5_põgenike_põgenikud_põgenikke_põgenikele
6,6,409,6_gaasi_euroopa_gaasist_gaas
7,7,385,7_raha_annetusi_mtü_annetada
8,8,355,8_eurovision_laul_eurovisiooni_võitis
9,9,304,9_kooli_lastele_keele_keelt


In [None]:
# (word, score) pairs that represent topic 12. Topic id 12 is the one that is
# labelled 'droppida' and removed from the data set near the end of the notebook.
bert_topic_model.get_topic(12)

[('viina', 0.40780812214972745),
 ('koerad', 0.3689843744384369),
 ('kass', 0.3527890678224714),
 ('tellida', 0.32467161135697026),
 ('disainerid', 0.32446666118626427),
 ('neste', 0.32446666118626427),
 ('liha', 0.32446666118626427),
 ('restorane', 0.32446666118626427),
 ('köök', 0.32446666118626427),
 ('koer', 0.31987910183165746)]

In [None]:
# Visual overview of the topics after reduction and manual merging.
bert_topic_model.visualize_topics()

## Visualization

In [None]:
from scipy.cluster import hierarchy as sch


def linkage_function(x):
    """Single-linkage hierarchical clustering of ``x`` with optimal leaf ordering."""
    return sch.linkage(x, 'single', optimal_ordering=True)


# Build the topic hierarchy with the custom linkage, then plot it.
hierarchical_topics = bert_topic_model.hierarchical_topics(
    texts,
    linkage_function=linkage_function,
)
bert_topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

# https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html#linkage-functions

100%|██████████| 32/32 [00:00<00:00, 72.66it/s]


In [None]:
# Visualize the similarity between topics as a heatmap
bert_topic_model.visualize_heatmap()

#### Time

In [None]:
# Raw creation time of every row of df, in row order (the same order `texts`
# was taken in).
timestamps = df.created_at.to_list()

def convertTimestamp(value):
    """Truncate a timestamp to the first instant of its month.

    Parameters
    ----------
    value
        Anything ``pd.Timestamp`` accepts (string, datetime, Timestamp, ...).

    Returns
    -------
    pd.Timestamp
        Same year, month and timezone as ``value``, with the day set to 1 and
        every time-of-day field, down to nanoseconds, set to zero.
    """
    # Sub-second fields are cleared as well: if they were left in place, two
    # timestamps from the same month would still compare as different values.
    return pd.Timestamp(value).replace(
        day=1, hour=0, minute=0, second=0, microsecond=0, nanosecond=0
    )

In [None]:
# Replace every timestamp by the start of its month, so that documents from
# the same month share one timestamp value.
timestamps = [convertTimestamp(timestamp) for timestamp in timestamps]

In [None]:
# Create topics over time: texts and timestamps are parallel lists, one
# (month-truncated) timestamp per document
topics_over_time = bert_topic_model.topics_over_time(texts, timestamps)

13it [00:04,  3.14it/s]


In [None]:
# Inspect the result: one row per topic and timestamp, with Words and Frequency.
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,0,"kaevandamise, akadeemik, centralbank, memorial...",60,2022-01-01 00:00:00+00:00
1,1,"blinkeni, önnetuid, kaitseotstarbelist, eesti,...",57,2022-01-01 00:00:00+00:00
2,2,"lucas, poroshenko, petro, vandenõuteooria, vaga",36,2022-01-01 00:00:00+00:00
3,3,"käki, annekteerisin, ettekirjutisi, sümpaatias...",7,2022-01-01 00:00:00+00:00
4,4,"õhupiiri, lennuk, rikkus, ventikasse, vöimalikku",5,2022-01-01 00:00:00+00:00
...,...,...,...,...
198,11,"seksima, saare, hiv, naised, korrust",8,2023-01-01 00:00:00+00:00
199,12,"joon, komplekt, sardellid, sale, harjumuse",11,2023-01-01 00:00:00+00:00
200,13,"sanktsioonide, sanktsioonid, tuumaenergiale, r...",9,2023-01-01 00:00:00+00:00
201,14,"ptui, lippu, leegion, boikoteerigegi, kommionud",5,2023-01-01 00:00:00+00:00


In [None]:
# The `topics=` argument selects which topic ids are drawn. The variable
# `topics` holds one assignment per document, so it is reduced to the distinct
# ids first; the set of topics shown stays the same.
bert_topic_model.visualize_topics_over_time(topics_over_time, topics=sorted(set(topics)))

In [None]:
# Per-document view: the text, its topic id and the topic's name / top words.
# NOTE(review): the saved output of this cell has row indices up to 18160,
# while len(df) earlier in the notebook reports 16927 rows, so the output
# appears to come from a different run -- re-run before relying on it.
bert_topic_model.get_document_info(texts)

Unnamed: 0,Document,Topic,Name,Top_n_words,Representative_document
0,Ei Ants! Gerassimov oma tegevusega samasugune ...,2,2_sõda_lääs_sõdi_sõdu,sõda - lääs - sõdi - sõdu - sõjakuritegu - rah...,False
1,"Ükskõik kui palju Putin sissetungi õigustaks, ...",4,4_putin_putini_putinit_putinile,putin - putini - putinit - putinile - sõda - p...,False
2,Vene agressioon toonud kaasa Eesti viisade tao...,1,1_eesti_eestis_eestit_estonia,eesti - eestis - eestit - estonia - soome - ee...,False
3,""" Hiinale oleks meelepärane, kui Venemaad saad...",7,7_sanktsioonid_sanktsioonide_hiina_sanktsioone,sanktsioonid - sanktsioonide - hiina - sanktsi...,False
4,Heino Enden väga tabavalt Betsafe LIVE podcast...,2,2_sõda_lääs_sõdi_sõdu,sõda - lääs - sõdi - sõdu - sõjakuritegu - rah...,False
...,...,...,...,...,...
18157,Venemaa saadab relvi Süüriasse - kas Oudekki t...,0,0_slava_vabaks_elagu_vene,slava - vabaks - elagu - vene - nato - lihtsal...,False
18158,toimetaja Kai Vare: Kasahstani lähevad appi Ve...,8,8_meedia_propagandat_kirik_õigeusu,meedia - propagandat - kirik - õigeusu - saatk...,False
18159,KASAHSTANI MÄSS JA TUUMAENERGIA Euroopa Liit k...,12,12_gaasi_gaasist_inflatsiooni_gaas,gaasi - gaasist - inflatsiooni - gaas - naftat...,False
18160,mul siiani prantsusmaa venemaa,3,3_veneimperialism_venemaa_tsivilisatsioon_suur,veneimperialism - venemaa - tsivilisatsioon - ...,False


## Coherence
- https://medium.com/@hajar.zankadi/using-bertopic-and-bertweet-transformer-to-predict-interest-tag-from-tweets-67189f11b992

In [None]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

docs = texts

# Preprocess Documents: concatenate all documents of a topic into one string,
# so that each topic is represented by a single long document.
documents = pd.DataFrame({"Document": docs,
                          "ID": range(len(docs)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
# NOTE(review): _preprocess_text is a private BERTopic method and may change
# between releases.
cleaned_docs = bert_topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic so tokenisation matches the model
vectorizer = bert_topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Evaluate every topic id that actually occurs, skipping only an outlier topic
# -1 if one is present. The previous range(len(set(topics)) - 1) assumed that
# -1 always exists; when it does not, the last real topic was left out.
evaluated_topics = sorted(topic for topic in set(topics) if topic != -1)
topic_words = [[word for word, _ in bert_topic_model.get_topic(topic) if word != '']
               for topic in evaluated_topics]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence = coherence_model.get_coherence()
coherence

0.6856018513583672

## Add topic to df

In [None]:
# Attach the topic id of every document to the source DataFrame.
document_info = bert_topic_model.get_document_info(texts)
topic_ids = document_info['Topic']
df['bert_topic_id'] = topic_ids

#topic_proba = bert_topic_model.get_document_info(texts)['probabilities']
#df['bert_topic_probability'] = topic_proba

# labels = ['eesti','ukraina',...]
# label_column = [labels[t_id] for t_id in topic_ids]
# df['bert_topic_labels'] = label_column

In [None]:
# Human-readable label for each topic id.
# NOTE(review): after this cell the column is still called bert_topic_id but
# holds labels; the id -> label pairs are only valid for the fit they were
# written for.
topic_labels = {
    0: 'Russia',
    1: 'Estonia',
    2: 'War',
    3: 'News/Social Media',
    4: 'Combat',
    5: 'Refugees',
    6: 'Gas',
    7: 'Donations',
    8: 'Eurovision',
    9: 'Education',
    10: 'Nuclear',
    11: 'Women',
    12: 'droppida',
    13: 'Sanctions',
    14: 'Flag & Colors',
    15: 'Sports',
}
df['bert_topic_id'] = df['bert_topic_id'].replace(topic_labels)
df


In [None]:
# Keep only the rows whose topic label is something other than 'droppida'.
keep_mask = df['bert_topic_id'].ne('droppida')
df = df[keep_mask]
df

In [None]:
# save file: relative path, so it lands in the current working directory;
# the DataFrame index is not written
df.to_excel('public_topics_bertopic.xlsx', index = False)

In [None]:
# Sanity check: the labels still present; 'droppida' should no longer appear.
df['bert_topic_id'].unique()

array(['War', 'Russia', 'Estonia', 'Nuclear', 'Gas', 'Education',
       'News/Social Media', 'Women', 'Refugees', 'Combat', 'Sanctions',
       'Donations', 'Eurovision', 'Flag & Colors', 'Sports'], dtype=object)