# Dynamic Bert

In [2]:
%%capture
!pip install bertopic

In [None]:
import sys
if 'google.colab' in sys.modules:
  from google.colab import drive
  drive.mount('/content/drive')
  %cd /content/drive/MyDrive/yourrepository

In [None]:
import os
import sys
import re
import nltk
nltk.download('punkt')

parent_directory = os.path.abspath('..')
sys.path.append(parent_directory)

from gensim.test.utils import common_texts
from bertopic import BERTopic
from gensim.corpora.dictionary import Dictionary
from Preprocess import preprocess
from Plot import plot_df
df= preprocess('yourrepository/Data/articles.json')

# Create a dictionary from the tokenized documents
dictionary = Dictionary(df['content'])

common_corpus = [','.join(text).replace(',',' ') for text in df['content']]

# Hyperparameter Tuning

In [None]:
from bertopic import BERTopic
from sklearn.model_selection import GridSearchCV

# Define your data and other parameters
docs = [...]  # Your input documents
params = {
    'language': ['english', 'multilingual'],
    'top_n_words': [10, 20, 30],
    'n_gram_range': [(1, 1), (1, 2), (1, 3)],
    'min_topic_size': [10, 100, 500],
    'nr_topics': [None, 20, 50],
    'low_memory': [True, False]
}

# Initialize BERTopic model
model = BERTopic()

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid=params, cv=5)
grid_search.fit(docs)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Print the best hyperparameters
print("Best Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


Best Hyperparameters:
language: english
top_n_words: 20
n_gram_range: (1, 2)
min_topic_size: 100
nr_topics: None
low_memory: True


# Train model

In [None]:
language = "english"
top_n_words = 20
n_gram_range = (1, 2)
min_topic_size = 100
nr_topics = None
low_memory = True

# Initialize and train the BERTopic model
topic_model = BERTopic(language,top_n_words,n_gram_range,nr_topics,low_memory)

topics, probs = topic_model.fit_transform(common_corpus)

topic_model.save("my_model", serialization="safetensors")

BERTopic.load("my_model")

In [7]:
topic_model = BERTopic.load("my_model")

# Coherence Score

In [8]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

topic_words = topic_model.get_topics(10)['Main']
topics_proportions = [topic_words[i] for i in range(len(topic_words)-1)]

words_list = [[word[0] for word in sublist] for sublist in topics_proportions]
dictionary = Dictionary(df['content'])

from Plot import print_coherence
dic = dictionary
print_coherence(dic, words_list, df['content'])

Topic: flood,risk,scheme,coastal,agenc,defenc,warn,e	Coherence: 0.73
Topic: use,govern,technolog,live,improv,open,support	Coherence: 0.41
Topic: uk,trade,govern,research,space,support,intern	Coherence: 0.41
Topic: water,compani,pollut,sewag,environ,treatment,	Coherence: 0.71
Topic: fish,river,fisheri,licenc,angler,pa,agenc,sea	Coherence: 0.79
Topic: appoint,chair,board,director,member,committe,	Coherence: 0.71
Topic: wast,crime,illeg,carrier,oper,vehicl,agenc,po	Coherence: 0.72
Topic: agenc,environ,wast,permit,site,illeg,use,crim	Coherence: 0.52
Topic: need,want,speech,heritag,year,world,work,uk,p	Coherence: 0.39
Topic: health,care,nh,patient,mental,research,social	Coherence: 0.79
Topic: food,drink,export,farm,industri,product,gin,b	Coherence: 0.56
Topic: fish,angler,licenc,fisheri,fine,illeg,rod,wit	Coherence: 0.88
Topic: tree,woodland,plant,forestri,forest,commiss,n	Coherence: 0.74
Topic: green,environment,natur,environ,govern,protec	Coherence: 0.46
Topic: train,forc,arm,soldier,armi

# Dynamic BERTopic

In [10]:
%cd /content/drive/MyDrive/yourrepository/Dynamic_BERTopic
docs = df['content'].tolist()

# Your topic distribution matrix
topic_distr, _ = topic_model.approximate_distribution(common_corpus)

# Your topics dictionary
topics = topic_model.get_topics()

selected_topics = topics
# Appending topic distributions to dataframe
for topic_id, topic_keywords in selected_topics:
    topic_name = ' '.join([keyword for keyword, _ in topic_keywords])
    df[topic_name] = [distr[topic_id] for distr in topic_distr]


In [11]:
%cd /content/drive/MyDrive/yourrepository
df_distribution= df

topic_means = df_distribution.groupby('year-month').mean()

plot_df(topic_means, 'BERTopic', add_error_bars = False)