# BERT+LDA

In [None]:
from stop_words import get_stop_words  # Importing the library for handling stop words
from nltk.stem.porter import PorterStemmer  # Importing Porter's stemmer for reducing words to their base form
import re  # Importing the regular expressions library
import nltk  # Importing the Natural Language Toolkit for text processing (e.g., POS tagging, tokenization)
from nltk.tokenize import word_tokenize  # Importing the function for word tokenization
from language_detector import detect_language  # Importing the function for language detection

import pkg_resources  # Importing to handle package resources
from symspellpy import SymSpell, Verbosity  # Importing SymSpell for typo correction and word suggestions

# Build the spell checker: dictionary edit distance up to 3, prefix length 7
sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
# Resolve the English frequency dictionary shipped inside the symspellpy package
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
# Populate the dictionary only while the checker still reports no words
if not sym_spell.word_count:
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)


def f_base(s):
    """
    Normalize a raw text with a fixed, order-sensitive chain of regex rewrites.

    :param s: string to be processed
    :return: lower-cased, cleaned string without leading/trailing whitespace
    """
    # Case-sensitive step, so it runs before lower-casing: a lowercase letter
    # directly followed by an uppercase one is treated as a sentence boundary.
    # The replacement's `\.` emits a backslash before the period; the
    # "characters before a period" rule below removes it again.
    text = re.sub(r'([a-z])([A-Z])', r'\1\. \2', s).lower()

    # (pattern, replacement) pairs; each rule sees the output of the previous one
    rules = (
        # leftover HTML entities become a space
        (r'&gt|&lt', ' '),
        # a letter repeated three or more times collapses to a single letter
        (r'([a-z])\1{2,}', r'\1'),
        # a non-word character repeated back-to-back collapses to one
        (r'([\W+])\1{1,}', r'\1'),
        # '*' (with an adjacent non-word character, if any) acts as a separator
        (r'\*|\W\*|\*\W', '. '),
        # parenthesised text is dropped and replaced by a separator
        (r'\(.*?\)', '. '),
        # non-word characters directly in front of a period are removed
        (r'\W+?\.', '.'),
        # '.', '?' or '!' glued to a following word gets a space after it
        (r'(\.|\?|!)(\w)', r'\1 \2'),
        # a stand-alone 'ing' token is treated as noise
        (r' ing ', ' '),
        # promotional boilerplate
        (r'product received for free[.| ]', ' '),
        # an immediately repeated run of two or more characters is kept once
        (r'(.{2,}?)\1{1,}', r'\1'),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    return text.strip()


# Language filter
def f_lan(s):
    """
    Tell whether the detected language of a text is one of the accepted ones.

    :param s: string to be processed
    :return: True when detect_language reports 'English' or 'French', else False
    """
    detected = detect_language(s)
    accepted = {'English', 'French'}
    return detected in accepted


# Dropping punctuation and numbers
def f_punct(w_list):
    """
    Keep only the tokens that consist entirely of alphabetic characters.

    :param w_list: list of words to be processed
    :return: new list without tokens that contain digits, punctuation or are empty
    """
    alphabetic = []
    for token in w_list:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic


# Keeping nouns only
def f_noun(w_list):
    """
    Keep the words whose NLTK part-of-speech tag starts with 'NN'.

    :param w_list: list of words to be processed
    :return: list containing only the words tagged as nouns
    """
    nouns = []
    for token, tag in nltk.pos_tag(w_list):
        if tag.startswith('NN'):
            nouns.append(token)
    return nouns


# Spelling correction
def f_typo(w_list):
    """
    Replace every word by its closest SymSpell suggestion.

    Words for which SymSpell returns no suggestion (within edit distance 3)
    are dropped from the result.

    :param w_list: list of words to be processed
    :return: list with typos corrected
    """
    corrected = []
    for token in w_list:
        candidates = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=3)
        if not candidates:
            # nothing close enough in the dictionary: the word is left out
            continue
        corrected.append(candidates[0].term)
    return corrected


# Single Porter stemmer instance shared by every f_stem call
p_stemmer = PorterStemmer()


# Stemming
def f_stem(w_list):
    """
    Apply the Porter stemmer to each word.

    :param w_list: list of words to be processed
    :return: list of stems, in the same order as the input
    """
    return list(map(p_stemmer.stem, w_list))


# English stop words, extended with game-related terms
en_stop = get_stop_words('en')
en_stop.extend(['game', 'play', 'player', 'time'])


# Removing stop words
def f_stopw(w_list):
    """
    Drop every word that appears in the module-level stop word list.

    :param w_list: list of words to be processed
    :return: list without the words found in en_stop
    """
    kept = []
    for token in w_list:
        if token in en_stop:
            continue
        kept.append(token)
    return kept


# Sentence-level preprocessing
def preprocess_sent(rw):
    """
    Normalize a raw text and reject it when it is not in an accepted language.

    :param rw: text to be processed
    :return: the normalized text, or None when f_lan rejects it
    """
    normalized = f_base(rw)
    return normalized if f_lan(normalized) else None


# Word-level preprocessing
def preprocess_word(s):
    """
    Turn a preprocessed sentence into a list of cleaned word tokens.

    After tokenization the words go through, in this order: punctuation/number
    removal, noun selection, typo correction, stemming and stop word removal.

    :param s: sentence to be processed
    :return: list of processed words, or None when s is empty or None
    """
    if not s:
        return None

    words = word_tokenize(s)
    # each stage consumes the output of the previous one
    for stage in (f_punct, f_noun, f_typo, f_stem, f_stopw):
        words = stage(words)
    return words


In [None]:
import keras  # Importing the Keras library for deep learning model building
from keras.layers import Input, Dense  # Importing the Input and Dense layers for constructing neural network models
from keras.models import Model  # Importing the Model class for creating the autoencoder model
from sklearn.model_selection import train_test_split  # Importing function to split data into training and testing sets
import warnings  # Importing the warnings library to manage warnings in code execution
warnings.filterwarnings('ignore')  # Suppressing warnings for cleaner output
import matplotlib.pyplot as plt  # Importing the Matplotlib library for plotting graphs

# Autoencoder used to compress document vectors
class Autoencoder:
    """
    Autoencoder with a single hidden layer that learns a latent representation.

    The Keras models are created lazily on the first call to fit(), once the
    input dimension is known.
    """

    def __init__(self, latent_dim=32, activation='relu', epochs=200, batch_size=128):
        """
        Store the hyper-parameters; no Keras model is built yet.

        :param latent_dim: Dimension of the latent space
        :param activation: Activation function used by both dense layers
        :param epochs: Number of training epochs
        :param batch_size: Batch size used during training
        """
        self.latent_dim, self.activation = latent_dim, activation
        self.epochs, self.batch_size = epochs, batch_size
        # full model, encoder half and decoder half; filled in by _compile()
        self.autoencoder = self.encoder = self.decoder = None
        # Keras History object of the last training run
        self.his = None

    def _compile(self, input_dim):
        """
        Build the autoencoder, encoder and decoder models and compile the former.

        :param input_dim: Dimension of the input data
        """
        inputs = Input(shape=(input_dim,))
        bottleneck = Dense(self.latent_dim, activation=self.activation)(inputs)
        reconstruction = Dense(input_dim, activation=self.activation)(bottleneck)

        # input -> reconstruction
        self.autoencoder = Model(inputs, reconstruction)
        # input -> latent code
        self.encoder = Model(inputs, bottleneck)

        # latent code -> reconstruction, reusing the autoencoder's last layer
        latent_inputs = Input(shape=(self.latent_dim,))
        output_layer = self.autoencoder.layers[-1]
        self.decoder = Model(latent_inputs, output_layer(latent_inputs))

        self.autoencoder.compile(optimizer='adam', loss=keras.losses.mean_squared_error)

    def fit(self, X):
        """
        Train the autoencoder to reconstruct X and plot the loss curves.

        The data is split with train_test_split; the held-out part is used as
        validation data. The training history is kept in self.his.

        :param X: Input data to train the autoencoder
        """
        if not self.autoencoder:
            # first call: the feature count of X fixes the layer sizes
            self._compile(X.shape[1])

        X_train, X_test = train_test_split(X)

        fit_options = dict(
            epochs=self.epochs,
            batch_size=self.batch_size,
            shuffle=True,
            validation_data=(X_test, X_test),
            verbose=0,
        )
        self.his = self.autoencoder.fit(X_train, X_train, **fit_options)

        # training vs. validation loss per epoch
        curves = self.his.history
        plt.figure(figsize=(10, 6), dpi=350)
        for key, caption in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
            plt.plot(curves[key], label=caption)
        plt.title('Autoencoder Training Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer  # Importing TF-IDF vectorizer for text feature extraction
from sklearn.cluster import KMeans  # Importing KMeans for clustering
from gensim import corpora  # Importing Gensim's corpora for creating a dictionary from the tokenized texts
import gensim  # Importing Gensim for topic modeling (LDA)
from datetime import datetime  # Importing datetime for timestamping model identifiers
from sentence_transformers import SentenceTransformer  # Importing the SentenceTransformer for BERT embeddings
import numpy as np  # Importing NumPy for numerical operations
from sklearn.model_selection import train_test_split  # Importing train_test_split for splitting datasets
from sklearn.metrics import silhouette_score  # Importing silhouette_score for clustering evaluation
import keras  # Importing Keras for deep learning models
from keras.layers import Input, Dense  # Importing Input and Dense layers for neural network models
from keras.models import Model  # Importing Model for defining Keras models

# Preprocessing function for text documents
def preprocess(docs, samp_size=None):
    """
    Preprocess a random sample of the input documents.

    Documents are drawn without replacement, so no document is processed (or
    returned) twice; at most len(docs) documents are sampled. A document is
    dropped when its word-level preprocessing yields no tokens (including the
    case where sentence-level preprocessing rejected it).

    :param docs: Indexable collection of documents (raw text data)
    :param samp_size: Number of documents to sample (100 when None or 0)
    :return: Tuple (sentences, token_lists, idx_in): processed sentences,
             their token lists, and the indices of the kept documents
    """
    if not samp_size:  # None or 0 falls back to the default sample size
        samp_size = 100

    print('Preprocessing raw texts ...')
    n_docs = len(docs)
    sentences = []    # sentence-level results
    token_lists = []  # word-level results
    idx_in = []       # indices (into docs) of the documents that were kept

    if n_docs == 0:
        # np.random.choice cannot draw from an empty population
        print('Preprocessing raw texts. Done!')
        return sentences, token_lists, idx_in

    # replace=False: each document can be picked at most once
    samp = np.random.choice(n_docs, min(samp_size, n_docs), replace=False)
    n_samp = len(samp)

    for done, idx in enumerate(samp, start=1):
        sentence = preprocess_sent(docs[idx])
        token_list = preprocess_word(sentence)

        if token_list:
            idx_in.append(idx)
            sentences.append(sentence)
            token_lists.append(token_list)

        # progress as a percentage, rewritten in place
        print('{} %'.format(str(np.round(done / n_samp * 100, 2))), end='\r')

    print('Preprocessing raw texts. Done!')
    return sentences, token_lists, idx_in

# Class for topic modeling
class Topic_Model:
    """
    Topic model that clusters document vectors (TF-IDF, BERT or LDA+BERT
    compressed by an autoencoder) or uses a plain LDA model.

    State learned from the training data (dictionary, LDA model, TF-IDF
    vectorizer, autoencoder) is created once and reused afterwards, so vectors
    computed for new documents live in the same space as the training vectors.
    """

    def __init__(self, k=10, method='TFIDF'):
        """
        Initializes the topic modeling object.

        :param k: Number of topics to be generated
        :param method: Method for topic modeling ('TFIDF', 'LDA', 'BERT', 'LDA_BERT')
        :raises ValueError: if method is not one of the supported names
        """
        if method not in {'TFIDF', 'LDA', 'BERT', 'LDA_BERT'}:
            raise ValueError('Invalid method!')

        self.k = k  # Number of topics
        self.dictionary = None  # gensim Dictionary built from the training tokens
        self.corpus = None  # Bag-of-words representation of the training documents
        self.cluster_model = None  # Clustering model
        self.ldamodel = None  # LDA model
        self.tfidf = None  # Fitted TF-IDF vectorizer (for 'TFIDF')
        self.vec = {}  # Vector representations, keyed by method name
        self.gamma = 15  # Weight of the LDA part in the 'LDA_BERT' concatenation
        self.method = method  # Selected method for topic modeling
        self.AE = None  # Autoencoder model (for 'LDA_BERT')
        self.id = method + '_' + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")  # Identifier with creation time

    def _ensure_corpus(self, token_lists):
        """Build the dictionary and training corpus from token_lists unless a dictionary already exists."""
        if not self.dictionary:
            self.dictionary = corpora.Dictionary(token_lists)
            self.corpus = [self.dictionary.doc2bow(text) for text in token_lists]

    def _lda_vectors(self, corpus):
        """Return an (n_docs x k) array holding the LDA topic probabilities of each bag-of-words document."""
        vec_lda = np.zeros((len(corpus), self.k))
        for i, bow in enumerate(corpus):
            # topics missing from the returned list keep probability 0
            for topic, prob in self.ldamodel.get_document_topics(bow):
                vec_lda[i, topic] = prob
        return vec_lda

    def vectorize(self, sentences, token_lists, method=None):
        """
        Generates vector representations using the selected method.

        The first call builds the dictionary/corpus (and, depending on the
        method, fits the TF-IDF vectorizer, the LDA model or the autoencoder)
        from the given data. Later calls reuse them and only transform the
        given documents.

        :param sentences: List of preprocessed sentences
        :param token_lists: List of preprocessed word tokens
        :param method: Method for vectorization (default is the object's method)
        :return: Vector representation of the documents
        :raises ValueError: if method is not one of the supported names
        """
        if method is None:
            method = self.method

        # Never rebuild an existing dictionary: the LDA model's word ids refer to it
        self._ensure_corpus(token_lists)

        if method == 'TFIDF':
            print('Getting vector representations for TF-IDF ...')
            if getattr(self, 'tfidf', None) is None:
                self.tfidf = TfidfVectorizer()
                vec = self.tfidf.fit_transform(sentences)
            else:
                # keep the vocabulary (and so the vector width) of the fitted model
                vec = self.tfidf.transform(sentences)
            print('Getting vector representations for TF-IDF. Done!')
            return vec

        if method == 'LDA':
            print('Getting vector representations for LDA ...')
            if not self.ldamodel:
                self.ldamodel = gensim.models.ldamodel.LdaModel(
                    self.corpus, num_topics=self.k, id2word=self.dictionary, passes=20
                )
            # bag-of-words of the documents passed in, expressed in the training dictionary
            bow = [self.dictionary.doc2bow(text) for text in token_lists]
            vec = self._lda_vectors(bow)
            print('Getting vector representations for LDA. Done!')
            return vec

        if method == 'BERT':
            print('Getting vector representations for BERT ...')
            model = SentenceTransformer('bert-base-nli-max-tokens')
            vec = np.array(model.encode(sentences, show_progress_bar=True))
            print('Getting vector representations for BERT. Done!')
            return vec

        if method == 'LDA_BERT':
            vec_lda = self.vectorize(sentences, token_lists, method='LDA')
            vec_bert = self.vectorize(sentences, token_lists, method='BERT')
            # LDA part is scaled by gamma before being placed next to the BERT part
            vec_ldabert = np.c_[vec_lda * self.gamma, vec_bert]
            self.vec['LDA_BERT_FULL'] = vec_ldabert

            if not self.AE:
                self.AE = Autoencoder()
                print('Fitting Autoencoder ...')
                self.AE.fit(vec_ldabert)
                print('Fitting Autoencoder Done!')
            # the encoder half gives the reduced representation
            return self.AE.encoder.predict(vec_ldabert)

        raise ValueError('Invalid method!')

    def fit(self, sentences, token_lists, method=None, m_clustering=None):
        """
        Fits the topic model using the selected method and data.

        :param sentences: List of preprocessed sentences
        :param token_lists: List of preprocessed word tokens
        :param method: Method for fitting (default is the object's method)
        :param m_clustering: Clustering class, called with k (default is KMeans)
        """
        if method is None:
            method = self.method
        if m_clustering is None:
            m_clustering = KMeans

        self._ensure_corpus(token_lists)

        if method == 'LDA':
            if not self.ldamodel:
                print('Fitting LDA ...')
                self.ldamodel = gensim.models.ldamodel.LdaModel(
                    self.corpus, num_topics=self.k, id2word=self.dictionary, passes=20
                )
                print('Fitting LDA Done!')
        else:
            print('Clustering embeddings ...')
            self.cluster_model = m_clustering(self.k)
            self.vec[method] = self.vectorize(sentences, token_lists, method)
            self.cluster_model.fit(self.vec[method])
            print('Clustering embeddings. Done!')

    def predict(self, sentences, token_lists, out_of_sample=None):
        """
        Predicts topic labels.

        :param sentences: List of preprocessed sentences
        :param token_lists: List of preprocessed word tokens
        :param out_of_sample: Truthy to predict for the documents passed in;
                              None/False to return labels for the training data
        :return: Predicted topic labels
        """
        out_of_sample = bool(out_of_sample)

        vec = None
        if out_of_sample:
            corpus = [self.dictionary.doc2bow(text) for text in token_lists]
            if self.method != 'LDA':
                vec = self.vectorize(sentences, token_lists)
        else:
            corpus = self.corpus
            vec = self.vec.get(self.method, None)

        if self.method == "LDA":
            # label = id of the most probable topic of each document
            lbs = np.array([
                sorted(self.ldamodel.get_document_topics(bow),
                       key=lambda topic_prob: topic_prob[1], reverse=True)[0][0]
                for bow in corpus
            ])
        else:
            lbs = self.cluster_model.predict(vec)
        return lbs


In [None]:
from typing import Counter  # Importing Counter for counting occurrences of items
from gensim.models.coherencemodel import CoherenceModel  # Importing CoherenceModel for evaluating topic coherence
from sklearn.metrics import silhouette_score  # Importing silhouette_score for cluster quality evaluation
import matplotlib.pyplot as plt  # Importing Matplotlib for data visualization
from wordcloud import WordCloud  # Importing WordCloud for generating word cloud images
import umap  # Importing UMAP for dimensionality reduction and visualization

# Creates a 10x10 inch, 200 dpi Matplotlib figure as soon as this cell/module runs.
# NOTE(review): presumably meant as the canvas for visualize()/plot_proj below, but
# it is created here rather than where the plot is drawn - confirm this is intended.
plt.figure(figsize=(10, 10), dpi=200)

# Function for plotting UMAP embeddings
def plot_proj(embedding, lbs):
    """
    Plots a 2D embedding, one series per cluster label.

    Each label that actually occurs in lbs gets its own series; the legend
    shows the share of points that carry the label. Labels do not need to be
    the consecutive integers 0..n-1.

    :param embedding: 2D array whose first two columns are the x/y coordinates.
    :param lbs: Sequence of labels, one per row of embedding.
    """
    lbs = np.asarray(lbs)  # boolean masks below need an array, not a list
    n = len(embedding)
    for label in np.unique(lbs):
        mask = lbs == label
        share = mask.sum() / n * 100  # percentage of all points in this cluster
        plt.plot(embedding[:, 0][mask], embedding[:, 1][mask], '.', alpha=0.5,
                 label='cluster {}: {:.2f}%'.format(label, share))
    plt.legend()

# Helper: most frequent words of every cluster
def _cluster_top_words(token_lists, labels, k, topn=10):
    """
    Return, for each non-empty cluster, the list of its topn most frequent tokens.

    :param token_lists: Tokenized documents.
    :param labels: Cluster label (integer in 0..k-1) of each document.
    :param k: Number of clusters.
    :param topn: Number of words kept per cluster.
    :return: List of token lists, one per cluster that contains any token.
    :raises ValueError: if token_lists and labels differ in length.
    """
    from collections import Counter

    if len(token_lists) != len(labels):
        raise ValueError('token_lists and cluster labels must have the same length')

    counts = [Counter() for _ in range(k)]
    for tokens, label in zip(token_lists, labels):
        counts[label].update(tokens)
    return [[word for word, _ in c.most_common(topn)] for c in counts if c]


# Function to compute coherence score for a topic model
def get_coherence(model, token_lists, measure='c_v'):
    """
    Computes the coherence score for a given topic model.

    For the 'LDA' method the topics come from the LDA model itself. For the
    clustering-based methods the topics are the most frequent words of each
    cluster, so token_lists must be the documents the model was fitted on, in
    the same order.

    :param model: Topic_Model object.
    :param token_lists: Tokenized documents.
    :param measure: Coherence metric to be used (default is 'c_v').
    :return: Coherence score.
    """
    if model.method == 'LDA':
        cm = CoherenceModel(model=model.ldamodel, texts=token_lists, corpus=model.corpus,
                            dictionary=model.dictionary, coherence=measure)
    else:
        # Topic_Model has no show_topic(); build the topics from the cluster assignments
        topics = _cluster_top_words(token_lists, model.cluster_model.labels_, model.k)
        cm = CoherenceModel(topics=topics, texts=token_lists, corpus=model.corpus,
                            dictionary=model.dictionary, coherence=measure)
    return cm.get_coherence()

# Silhouette score of the fitted clusters
def get_silhouette(model):
    """
    Computes the silhouette score of the model's clusters.

    :param model: Topic_Model object.
    :return: Silhouette score, or the string 'N/A' when the method is 'LDA'.
    """
    if model.method != 'LDA':
        cluster_ids = model.cluster_model.labels_
        vectors = model.vec[model.method]
        return silhouette_score(vectors, cluster_ids)
    # no clustering model exists for plain LDA
    return 'N/A'

# UMAP-based visualization of the clustered document vectors
def visualize(model):
    """
    Projects the model's document vectors with UMAP and plots them by cluster.

    Prints a message and does nothing else when the method is 'LDA'.

    :param model: Topic_Model object.
    """
    if model.method == 'LDA':
        print("LDA model does not support this type of visualization.")
        return

    projection = umap.UMAP().fit_transform(model.vec[model.method])
    cluster_ids = model.cluster_model.labels_

    # one series per cluster, with a legend
    plot_proj(projection, cluster_ids)

    # the same points again, colour-coded by cluster, plus a discrete colour bar
    xs, ys = projection[:, 0], projection[:, 1]
    plt.scatter(xs, ys, c=cluster_ids, cmap='Spectral', s=5)
    colorbar = plt.colorbar(boundaries=np.arange(model.k + 1) - 0.5)
    colorbar.set_ticks(np.arange(model.k))
    plt.show()

# Top words of every LDA topic
def get_topic_words(lda_model, num_topics):
    """
    Collects the top words of topics 0..num_topics-1 of an LDA model.

    :param lda_model: Object providing show_topic(topic_id) -> (word, weight) pairs.
    :param num_topics: Number of topics.
    :return: List with one space-separated string of top words per topic.
    """
    return [
        ' '.join(term for term, _weight in lda_model.show_topic(topic_id))
        for topic_id in range(num_topics)
    ]

# Word cloud of a single topic
def get_wordcloud(topic_words, topic_num):
    """
    Renders and shows a word cloud built from one topic's words.

    :param topic_words: List of strings, one string of words per topic.
    :param topic_num: Index of the topic to visualize.
    """
    cloud = WordCloud(width=500, height=560, background_color='white', collocations=False)
    image = cloud.generate(topic_words[topic_num])

    plt.figure(figsize=(10, 7))
    plt.imshow(image, interpolation='bilinear')
    plt.axis("off")  # the axes carry no information for an image
    plt.show()


In [None]:
import pandas as pd  # Importing pandas for data manipulation and analysis

# Loading the data from a CSV file
data_path = 'bbc-text.csv'  # CSV file with the raw documents in its 'text' column
print("Загрузка данных...")  # "Loading data..."
data = pd.read_csv(data_path)
print("Данные загружены.")  # "Data loaded."

# Preprocessing the text data.
# preprocess() returns three values; idx_in holds the indices of the sampled
# documents that survived preprocessing.
sentences, token_lists, idx_in = preprocess(data['text'])
print("Предварительная обработка текстов завершена.")  # "Text preprocessing finished."

# Method for topic modeling: 'LDA', 'TFIDF', 'BERT' or 'LDA_BERT'
method = 'LDA_BERT'

# Creating and training the topic model
print("Начало обучения модели тематического моделирования...")  # "Starting topic model training..."
model = Topic_Model(k=5, method=method)
model.fit(sentences, token_lists)
print("Модель тематического моделирования обучена.")  # "Topic model trained."

# Saving the trained model using pickle
import pickle  # Importing pickle for saving and loading Python objects

# The file name carries the method, e.g. 'LDA_BERT_model.pkl'
with open(f'{method}_model.pkl', 'wb') as file:
    pickle.dump(model, file)


In [None]:
# Analyzing topic coherence and other metrics
coherence = get_coherence(model, token_lists, measure='c_v')  # 'c_v' coherence of the model's topics
print(f"Когерентность тем: {coherence}")  # "Topic coherence"

# Visualizing results (UMAP projection of the document vectors)
print("Визуализация результатов...")  # "Visualizing results..."
visualize(model)

# Importing necessary modules for further analysis
from gensim.models.coherencemodel import CoherenceModel  # Importing CoherenceModel for evaluating coherence
from sklearn.metrics import silhouette_score  # Importing silhouette_score for evaluating clustering quality

# Coherence of the underlying LDA model (exists for 'LDA' and 'LDA_BERT').
# CoherenceModel needs the gensim LDA model itself, not the Topic_Model wrapper.
lda_coherence = lda_cv = None
if model.ldamodel is not None:
    lda_coherence = CoherenceModel(model=model.ldamodel, texts=token_lists, dictionary=model.dictionary, coherence='u_mass').get_coherence()
    lda_cv = CoherenceModel(model=model.ldamodel, texts=token_lists, dictionary=model.dictionary, coherence='c_v').get_coherence()

# Clustering quality; get_silhouette() returns 'N/A' for plain LDA, which has no cluster model
labels = model.cluster_model.labels_ if model.cluster_model is not None else None
silhouette_avg = get_silhouette(model)

# Display results
if model.ldamodel is not None:
    print(f"LDA U-Mass Coherence: {lda_coherence}")
    print(f"LDA C_V Coherence: {lda_cv}")
print(f"Silhouette Score: {silhouette_avg}")

# Number of topics for extracting top words: the number the model was built with
num_topics = model.k

if model.ldamodel is not None:
    # Top words of each LDA topic, then a word cloud for the first topic
    topic_words = get_topic_words(model.ldamodel, num_topics)
    get_wordcloud(topic_words, topic_num=0)


In [None]:
from gensim.corpora import Dictionary  # Importing Gensim's Dictionary for handling the mapping between words and their IDs
from gensim.models import LdaModel  # Importing LdaModel for topic modeling

# Mapping from topic index to a human-readable label.
# NOTE(review): the indices assigned by a trained LDA model are not tied to these
# names by anything in this file - verify the mapping against the trained topics.
topic_names = {0: 'tech', 1: 'business', 2: 'sport', 3: 'entertainment', 4: 'politics'}

# Creating a set of test texts categorized into different themes
test_texts = [
    # Tech
    "The rapid advancement in quantum computing has the potential to revolutionize industries by making data processing significantly faster.",
    "Emerging technologies such as blockchain and IoT are becoming pivotal in shaping the future landscape of digital transactions and smart homes.",
    
    # Business
    "Global markets are increasingly volatile, with trade tensions and geopolitical uncertainties affecting investor sentiment.",
    "Startups are finding it more challenging to secure funding as venture capitalists tighten their criteria in a post-pandemic economy.",
    
    # Sport
    "The sports world is eagerly anticipating the upcoming Olympics, where new records are expected to be set in various disciplines.",
    "Major League Baseball sees a historic season as a young rookie breaks the long-standing home run record.",
    
    # Entertainment
    "The film industry is seeing a shift towards streaming platforms, which are now premiering blockbuster movies directly to consumers at home.",
    "Virtual reality concerts are gaining popularity, offering an immersive experience for fans to see their favorite artists perform live.",
    
    # Politics
    "Election campaigns are increasingly relying on social media to engage with voters, raising concerns about misinformation and data privacy.",
    "International relations are tense as negotiations stall on climate change initiatives, with major countries failing to agree on emissions targets."
]

# Step 1: Preprocessing the test texts (sentence level, then word level).
# Both steps can return None (rejected language / empty text); such texts are
# represented by an empty token list so that doc2bow below still works.
processed_test_texts = [preprocess_word(preprocess_sent(text)) or [] for text in test_texts]

# Step 2: Bag-of-words representation of each text, using the model's dictionary
test_corpus = [model.dictionary.doc2bow(text) for text in processed_test_texts]

# Step 3: Topic distribution of each test text; only available when an LDA model exists
if model.method in ('LDA', 'LDA_BERT'):
    test_topics = [model.ldamodel.get_document_topics(bow) for bow in test_corpus]
else:
    raise NotImplementedError("Әдістер тек LDA және LDA_BERT арналған")  # "Methods only for LDA and LDA_BERT"

# Print the topic distribution for each test text
for i, topics_distribution in enumerate(test_topics):
    print(f"\nМәтін {i+1}:")  # "Text {i+1}:"
    for topic, prob in topics_distribution:
        # Fall back to an "unknown" label for indices missing from topic_names
        topic_name = topic_names.get(topic, f"Белгісіз мәтін {topic}")  # "Unknown text"
        print(f"Тема '{topic_name}': {prob:.4f}")  # "Topic '<name>': <probability>"
