# Top2Vec

In [2]:
%%capture
!pip install top2vec

In [6]:
import sys
# Only when the notebook runs on Google Colab (the google.colab module is loaded):
if 'google.colab' in sys.modules:
  from google.colab import drive
  # Mount Google Drive under /content/drive
  drive.mount('/content/drive')
  # Make the repository folder on Drive the working directory (IPython magic)
  %cd /content/drive/MyDrive/yourrepository

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/yourrepository


In [7]:
import os
import sys
import re
import nltk
# Download the NLTK 'punkt' package
# NOTE(review): presumably needed by the local preprocessing code — confirm.
nltk.download('punkt')


# Add the parent of the current working directory to the module search path
# NOTE(review): presumably so the local Preprocess / Plot modules can be imported — confirm.
parent_directory = os.path.abspath('..')
sys.path.append(parent_directory)

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from top2vec import Top2Vec
# Project-local helpers
from Preprocess import preprocess
from Plot import plot_df

# Run the preprocessing
# NOTE(review): the hyperparameter-tuning cell further down loads 'Data/articles.json';
# confirm which of the two relative paths matches the working directory.
df = preprocess('yourrepository/Data/articles.json')

# Create a dictionary from the tokenized documents
dictionary = Dictionary(df['content'])

# Top2Vec is fed one string per document, so join each token list with spaces.
# (Equivalent to the former ','.join(...).replace(',', ' ') as long as tokens contain no commas.)
common_corpus = [' '.join(text) for text in df['content']]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Hyperparameter Tuning

In [None]:
import itertools
import os
import sys
import re
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from top2vec import Top2Vec
from Preprocess import preprocess
from Plot import plot_df

# Run the preprocessing
df = preprocess('Data/articles.json')

# Create a dictionary from the tokenized documents
dictionary = Dictionary(df['content'])

# Tokenized documents as a plain list; used as the reference texts for the coherence score
tokenized_docs = list(df['content'])

# Top2Vec is fed one string per document, so join each token list with spaces
common_corpus = [' '.join(text) for text in df['content']]

# Define the hyperparameters to tune
min_count = [10, 50, 100]
topic_merge_delta = [0.05, 0.1, 0.2]
embedding_model = ['doc2vec', 'universal-sentence-encoder', 'distiluse-base-multilingual-cased']
# NOTE(review): per the Top2Vec docs chunk_length only seems to take effect together with
# split_documents=True, which is not set here — confirm that tuning it changes anything.
chunk_length = [50, 100, 200]

best_coherence = float('-inf')
best_params = None

# Iterate over all combinations of hyperparameters (same order as four nested loops)
for count, delta, embedding, length in itertools.product(
        min_count, topic_merge_delta, embedding_model, chunk_length):
    params = {
        'min_count': count,
        'topic_merge_delta': delta,
        'embedding_model': embedding,
        'chunk_length': length
    }
    try:
        # Create and train the Top2Vec model
        top2vec_model = Top2Vec(common_corpus, min_count=count, topic_merge_delta=delta,
                                embedding_model=embedding, chunk_length=length, verbose=False)

        # Top2Vec has no built-in coherence method, so score the topic words with
        # gensim's CoherenceModel (c_v) against the tokenized documents.
        candidate_topics = top2vec_model.get_topics()[0].tolist()
        coherence = CoherenceModel(topics=candidate_topics, texts=tokenized_docs,
                                   dictionary=dictionary, coherence='c_v').get_coherence()
    except Exception as exc:
        # Report which combination failed and why instead of hiding the cause
        print(f"An error occurred during model training with {params}: {exc!r}")
        continue

    # Keep the combination with the highest coherence seen so far
    if coherence > best_coherence:
        best_coherence = coherence
        best_params = params

# Print the best parameters
if best_params is None:
    # Every combination failed; avoid a TypeError from indexing None
    print("No hyperparameter combination could be trained and scored.")
else:
    print("min_count:", best_params['min_count'])
    print("topic_merge_delta:", best_params['topic_merge_delta'])
    print("embedding_model:", best_params['embedding_model'])
    print("chunk_length:", best_params['chunk_length'])


min_count: 100
topic_merge_delta: 0.1
embedding_model: doc2vec
chunk_length: 50


# Train Model

In [None]:
# Hyperparameter values reported by the tuning run above
min_count, topic_merge_delta, embedding_model, chunk_length = 100, 0.1, 'doc2vec', 50

# Fit Top2Vec on the space-joined documents with those settings
model = Top2Vec(
    common_corpus,
    min_count=min_count,
    topic_merge_delta=topic_merge_delta,
    embedding_model=embedding_model,
    chunk_length=chunk_length,
    verbose=False,
)

# Store the trained model on disk so later cells can reload it
model.save("Top2vec_Model")

In [10]:
# Reload the trained model from disk
model = Top2Vec.load("Top2vec_Model")
Num_topics = model.get_num_topics(reduced=False)

# The first element returned by get_topics holds the words of each topic
all_topic_words = model.get_topics(num_topics=Num_topics, reduced=False)[0]

# Print a 50-character preview of every topic's word list, numbered from 1
for topic_no, words in enumerate(all_topic_words, start=1):
    preview = ', '.join(words)[:50]
    print(f"Topic {topic_no}: {preview}")

Topic 1: wast, legitim, crime, dump, anonym, blight, topicw
Topic 2: employe, employ, career, apprentic, workplac, incl
Topic 3: patient, nh, healthcar, clinic, cancer, health, ca
Topic 4: rain, rainfal, weekend, warn, flood, alert, floodl
Topic 5: nonexecut, appoint, board, truste, chair, chairman
Topic 6: bilater, foreign, relationship, asia, tie, trade, 
Topic 7: embassi, guatemala, englishespanol, hondura, ameri
Topic 8: dropin, flood, resid, session, floodlin, allevi, t
Topic 9: decommiss, nda, sellafield, nuclear, dounreay, rad
Topic 10: beef, export, lamb, dairi, food, meat, topicfood, 
Topic 11: avers, format, reader, file, assist, pageprint, re
Topic 12: cop, sharma, alok, glasgow, pari, climat, presid, 
Topic 13: school, pupil, teacher, educ, child, disadvantag, 
Topic 14: weir, spawn, migrat, upstream, pa, eel, passag, tr
Topic 15: embank, flood, allevi, wall, scheme, town, floodin
Topic 16: repair, wall, embank, flood, mainten, floodingth, 
Topic 17: permit, variat, comment

# Coherence Score

In [11]:
from Plot import print_coherence

# Rebuild the gensim dictionary from the tokenized documents
dictionary = Dictionary(df['content'])
dic = dictionary

# Words of every (non-reduced) topic as nested Python lists
topic_word_matrix = model.get_topics(num_topics=Num_topics, reduced=False)[0]
topic_words = topic_word_matrix.tolist()

# Print the coherence of each topic using the project helper
print_coherence(dic, topic_words, df['content'])

Topic: wast,legitim,crime,dump,anonym,blight,topicwa	Coherence: 0.85
Topic: employe,employ,career,apprentic,workplac,incl	Coherence: 0.69
Topic: patient,nh,healthcar,clinic,cancer,health,car	Coherence: 0.77
Topic: rain,rainfal,weekend,warn,flood,alert,floodli	Coherence: 0.80
Topic: nonexecut,appoint,board,truste,chair,chairman	Coherence: 0.77
Topic: bilater,foreign,relationship,asia,tie,trade,c	Coherence: 0.71
Topic: embassi,guatemala,englishespanol,hondura,amer	Coherence: 0.65
Topic: dropin,flood,resid,session,floodlin,allevi,to	Coherence: 0.65
Topic: decommiss,nda,sellafield,nuclear,dounreay,rad	Coherence: 0.80
Topic: beef,export,lamb,dairi,food,meat,topicfood,dr	Coherence: 0.74
Topic: avers,format,reader,file,assist,pageprint,req	Coherence: 0.89
Topic: cop,sharma,alok,glasgow,pari,climat,presid,eg	Coherence: 0.83
Topic: school,pupil,teacher,educ,child,disadvantag,t	Coherence: 0.84
Topic: weir,spawn,migrat,upstream,pa,eel,passag,trou	Coherence: 0.84
Topic: embank,flood,allevi,wall,sc

# Dynamic Topic modelling

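Some extra preprocessing is required because `get_documents_topics` from the Top2Vec API returns per-document arrays rather than a table. The cell below turns these arrays into one score column per topic so that the scores can be averaged over time.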

'get_documents_topics'

Returns:
topic_nums (array of int, shape(len(doc_ids), num_topics)) – The topic number(s) of the document corresponding to each doc_id.

topic_score (array of float, shape(len(doc_ids), num_topics)) – Semantic similarity of document to topic(s). The cosine similarity of the document and topic vector.

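topics_words (array of shape(len(doc_ids), num_topics, 50)) – For each topic the top 50 words are returned, in order of semantic similarity to topic.

word_scores (array of shape(len(doc_ids), num_topics, 50)) – For each topic the cosine similarity scores of the top 50 words to the topic are returned.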



In [12]:
import numpy as np


def _top_words_label(words, scores, top_n=4):
    """Build the column label of a topic from its highest-scoring words.

    Args:
        words: list of the topic's words.
        scores: list of scores, aligned position by position with ``words``.
        top_n: number of words to keep in the label.

    Returns:
        The string form of the list holding the ``top_n`` words with the highest
        scores, best first (e.g. "['a', 'b', 'c', 'd']").
    """
    # Rank positions rather than score values: looking words up via list.index(score)
    # returns the first match only, so tied scores would repeat the same word.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_n]
    return f'{[words[pos] for pos in ranked]}'


# Work on a re-indexed copy of df; the previous index is kept in a 'doc_id' column.
# The loop below assumes that row position i corresponds to Top2Vec document id i.
df_distribution = df.reset_index().rename(columns={'index': 'doc_id'})

num_docs = len(df_distribution)  # Get the number of documents in the DataFrame

# Call the get_documents_topics function for every document and every topic
topic_distribution = model.get_documents_topics(list(range(num_docs)), num_topics=model.get_num_topics())

topic_nums, topic_scores, topics_words, word_distribution = topic_distribution[:4]

# Collect one score array per topic label (NaN where a document has no score),
# in order of first appearance, and attach them to the frame in a single step.
score_columns = {}
for doc_idx in range(num_docs):
    # Per-document topic scores, topic words and word scores as plain lists
    doc_topic_scores = topic_scores[doc_idx].tolist()
    doc_topic_words = topics_words[doc_idx].tolist()
    doc_word_scores = word_distribution[doc_idx].tolist()

    for topic_score, topic_name, word_distri in zip(doc_topic_scores, doc_topic_words, doc_word_scores):
        column_name = _top_words_label(topic_name, word_distri)

        # Create the column on first use
        if column_name not in score_columns:
            score_columns[column_name] = np.full(num_docs, np.nan)

        # Store the score with float32 precision, as before
        score_columns[column_name][doc_idx] = np.float32(topic_score)

df_distribution = df_distribution.assign(**score_columns)

In [13]:
  %cd /content/drive/MyDrive/yourrepository/Dynamic_Top2Vec
  topic_means = df_distribution.groupby('year-month').mean()

plot_df(topic_means,'Top2vec', add_error_bars =False)

Output hidden; open in https://colab.research.google.com to view.