# Top2Vec

In [1]:
%%capture
!pip install top2vec

In [None]:
import sys
# Mount Google Drive and change into the repository folder, but only when the
# google.colab module is already loaded (i.e. the notebook runs inside Colab).
if 'google.colab' in sys.modules:
  from google.colab import drive
  drive.mount('/content/drive')
  # NOTE(review): 'yourrepository' looks like a placeholder — replace it with the real folder name.
  %cd /content/drive/MyDrive/yourrepository

In [None]:
import os
import sys
import re
import nltk

# Fetch the NLTK 'punkt' resource (presumably required by the preprocessing step — verify).
nltk.download('punkt')

# Put the directory above the current working directory on the import path
# before the project-local modules are imported below.
parent_directory = os.path.abspath('..')
sys.path.append(parent_directory)

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from top2vec import Top2Vec
from Preprocess import preprocess
from Plot import plot_df

# Load and preprocess the articles.
# NOTE(review): the hyperparameter-tuning cell loads 'Data/articles.json'
# instead — confirm which of the two paths is the correct one.
df = preprocess('yourrepository/Data/articles.json')

# Gensim dictionary built from the per-document entries of df['content'].
dictionary = Dictionary(df['content'])

# One string per document: the items of each document separated by single
# spaces, with any comma inside an item turned into a space as well.
common_corpus = [' '.join(','.join(doc).split(',')) for doc in df['content']]

# Hyperparameter Tuning

In [None]:
import itertools
import os
import sys
import re
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from top2vec import Top2Vec
from Preprocess import preprocess
from Plot import plot_df

# Run the preprocessing
df = preprocess('Data/articles.json')

# Create a dictionary from the tokenized documents
dictionary = Dictionary(df['content'])

common_corpus = [','.join(text).replace(',', ' ') for text in df['content']]


def _mean_topic_coherence(top2vec_model, texts, dictionary, topn=10):
    """Return the c_v coherence of a trained Top2Vec model.

    Top2Vec does not offer a `get_topic_coherence` method, so the score is
    computed with gensim's CoherenceModel from the model's topic words.

    Parameters
    ----------
    top2vec_model : trained Top2Vec model.
    texts : list of tokenized documents used as the reference corpus.
    dictionary : gensim Dictionary built from `texts`.
    topn : maximum number of words per topic taken into account.

    Raises
    ------
    ValueError
        If no topic keeps at least two words known to `dictionary`.
    """
    topic_words = top2vec_model.get_topics()[0]
    known_tokens = dictionary.token2id

    topics = []
    for words in topic_words:
        # Keep only words the dictionary knows; CoherenceModel cannot score unknown tokens.
        kept = [word for word in map(str, words) if word in known_tokens][:topn]
        # A topic needs at least two words to form a word pair.
        if len(kept) > 1:
            topics.append(kept)

    if not topics:
        raise ValueError("no topic shares at least two words with the dictionary")

    scorer = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary,
                            coherence='c_v', topn=topn)
    return scorer.get_coherence()


# Define the hyperparameters to tune
min_count = [10, 50, 100]
topic_merge_delta = [0.05, 0.1, 0.2]
embedding_model = ['doc2vec', 'universal-sentence-encoder', 'distiluse-base-multilingual-cased']
# NOTE(review): chunk_length presumably only takes effect together with
# split_documents=True — confirm in the Top2Vec docs; if so, this dimension
# triples the search without changing the models.
chunk_length = [50, 100, 200]

best_coherence = float('-inf')
best_params = None
reference_texts = list(df['content'])

# Iterate over all combinations of hyperparameters
for count, delta, embedding_name, length in itertools.product(
        min_count, topic_merge_delta, embedding_model, chunk_length):
    params = {
        'min_count': count,
        'topic_merge_delta': delta,
        'embedding_model': embedding_name,
        'chunk_length': length,
    }
    try:
        # Create and train the Top2Vec model, then score it
        top2vec_model = Top2Vec(common_corpus, verbose=False, **params)
        coherence = _mean_topic_coherence(top2vec_model, reference_texts, dictionary)
    except Exception as error:
        # Report which combination failed and why instead of hiding the cause;
        # KeyboardInterrupt is deliberately not caught so the search can be aborted.
        print(f"Skipping {params}: {type(error).__name__}: {error}")
        continue

    # Keep the combination with the highest coherence seen so far
    if coherence > best_coherence:
        best_coherence = coherence
        best_params = params

# Print the best parameters
if best_params is None:
    print("No hyperparameter combination could be trained and scored.")
else:
    print("min_count:", best_params['min_count'])
    print("topic_merge_delta:", best_params['topic_merge_delta'])
    print("embedding_model:", best_params['embedding_model'])
    print("chunk_length:", best_params['chunk_length'])


min_count: 100
topic_merge_delta: 0.1
embedding_model: doc2vec
chunk_length: 50


# Train Model

In [None]:
# Hyperparameters used for the final model
min_count = 100
topic_merge_delta = 0.1
embedding_model = 'doc2vec'
chunk_length = 50

# Pass every hyperparameter by keyword: the Top2Vec constructor has further
# parameters between these, so positional passing would bind
# `embedding_model` and `chunk_length` to the wrong arguments.
model = Top2Vec(
    common_corpus,
    min_count=min_count,
    topic_merge_delta=topic_merge_delta,
    embedding_model=embedding_model,
    chunk_length=chunk_length,
)

# Persist the trained model to disk
model.save("Top2vec_Model")

# Coherence Score

In [1]:
from Plot import print_coherence

# Word lists of all (non-reduced) topics of the trained model; the first
# element returned by get_topics holds the words of each topic.
topic_words = model.get_topics(num_topics=model.get_num_topics(), reduced=False)[0].tolist()

dic = dictionary
# Score the model's own topics (the list computed above) rather than a
# variable that is only created by a later cell.
# NOTE(review): assumes print_coherence accepts a list of word lists — confirm in Plot.
print_coherence(dic, topic_words, df['content'])

Topic: 'wast,legitim,topicwast,crime,dump,anonym,impa'	Coherence: 0.79
Topic: 'sewag,discharg,effluent,pollut,silag,leak,sew'	Coherence: 0.74
Topic: 'comment,onshor,permit,variat,drill,stringent'	Coherence: 0.77
Topic: 'packag,undertak,oblig,complianc,turnov,sancti'	Coherence: 0.8
Topic: 'singleus,stirrer,plastic,straw,cotton,bud,bag'	Coherence: 0.89
Topic: 'violat,yorki,locationsuk,genevai,peac,osc,vio'	Coherence: 0.74
Topic: 'cma,mislead,consum,merger,unfair,urgenc,marke'	Coherence: 0.62
Topic: 'prison,probat,hmp,rehabilit,offend,jail,recor'	Coherence: 0.68
Topic: 'litter,tidi,pick,bin,rubbish,flytip,highway,b'	Coherence: 0.66
Topic: 'boater,waterway,boat,lock,nene,registr,thame'	Coherence: 0.72
Topic: 'rod,intelligencel,topicfish,angler,cheat,conc'	Coherence: 0.87
Topic: 'cefa,aquat,aquacultur,shellfish,herpesviru,ca'	Coherence: 0.82
Topic: 'sellafield,decommiss,nda,magnox,nuclear,ltdpu'	Coherence: 0.79
Topic: 'rod,bailiff,angler,unlicens,coars,cheat,still'	Coherence: 0.87
Topic: 'n

# Dynamic Topic Modelling

In [10]:
import numpy as np
import pandas as pd

# Number of highest-scoring words used to label a topic column
TOP_WORDS_PER_LABEL = 4

num_docs = len(df)  # Get the number of documents in the DataFrame

# reset_index() returns a new frame, so `df` itself stays untouched; the
# former index becomes the 'doc_id' column.
df_distribution = df.reset_index().rename(columns={'index': 'doc_id'})

# For every document: topic numbers, topic scores, topic words and word scores
topic_distribution = model.get_documents_topics(list(range(num_docs)), num_topics=model.get_num_topics())
topic_nums, topic_scores, topics_words, word_distribution = topic_distribution

# Positions of the best-scoring words for every (document, topic) pair.
# A stable sort keeps tied scores in their original order, so a label never
# repeats the same word (a value lookup with list.index would).
top_word_positions = np.argsort(-np.asarray(word_distribution), axis=-1, kind='stable')[..., :TOP_WORDS_PER_LABEL]
label_words = np.take_along_axis(np.asarray(topics_words), top_word_positions, axis=-1)

# Collect one score vector per topic label; documents without a score for a label keep NaN.
score_columns = {}
for doc_idx in range(num_docs):
    for rank, topic_score in enumerate(topic_scores[doc_idx]):
        # The column name is the string form of the list of label words
        column_name = f'{label_words[doc_idx, rank].tolist()}'

        if column_name not in score_columns:
            score_columns[column_name] = np.full(num_docs, np.nan)

        score_columns[column_name][doc_idx] = np.float32(topic_score)

# Attach all topic columns in one step instead of inserting them one by one
# and filling them cell by cell.
df_distribution = pd.concat(
    [df_distribution, pd.DataFrame(score_columns, index=df_distribution.index)],
    axis=1,
)

In [23]:
# Average every numeric column per 'year-month' group. numeric_only=True
# skips non-numeric columns (such as the token lists in 'content'), on which
# recent pandas versions raise a TypeError instead of silently dropping them.
topic_means = df_distribution.groupby('year-month').mean(numeric_only=True)

plot_df(topic_means,'Top2vec', add_error_bars =False)

Output hidden; open in https://colab.research.google.com to view.