# ProdLDA

In [1]:
from octis.models.LDA import LDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [2]:
dataset_save = 'total_dataset'
file_path = '../DICTIONARYTOTALE.txt' 
my_corpus = 'my_corpus_folder/my_corpus.txt' 

In [3]:
import json

In [4]:
# load the text file as a JSON object
with open(file_path, 'r') as f:
    data = json.load(f)

# extract the descriptions under the key 'wikipedia'
descriptions = [data[key]['wikipedia'] for key in data]

with open(my_corpus, 'w') as f:
  for description in descriptions:
    description = description.replace('\n', ' ')
    f.write(description + '\n')

## Preprocessing


In [None]:
from octis.dataset.dataset import Dataset

class MyDataset(Dataset):
    def __init__(self, data_folder):
        super(MyDataset, self).__init__(data_folder)

    def _load_dataset(self):
        with open(self.data_folder, 'r') as f:
            documents = f.read().splitlines()
        return documents

In [6]:
from octis.dataset.dataset import Dataset
dataset = MyDataset('my_corpus_folder')

In [7]:
dataset.get_corpus()

'my_corpus_folder'

In [8]:
dataset.get_vocabulary()

In [9]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/MindHard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/MindHard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('italian'))
stopwords_list = stopwords.words('italian')

In [11]:
!ls

ProdLDA.ipynb  my_corpus_folder  results_npmi  results_td  total_dataset


In [12]:
import os
import string
from octis.preprocessing.preprocessing import Preprocessing

# Change the current working directory to the parent directory
os.chdir(os.path.pardir)

# Initialize preprocessing with specified parameters
preprocessor = Preprocessing(
    vocabulary=None,                  # No predefined vocabulary is used
    max_features=None,                # No limit on the maximum number of features
    remove_punctuation=True,          # Remove punctuation from the text
    punctuation=string.punctuation,   # Use default punctuation marks
    remove_numbers=True,              # Remove numeric values from the text
    lemmatize=True,                   # Enable lemmatization to reduce words to their base form
    stopword_list=stopwords_list,     # Use a custom stopword list
    language='italian',               # Specify the language for preprocessing (Italian)
    split=False,                      # Do not split the dataset
    lowercase=True,                   # Convert all text to lowercase
    min_chars=3,                      # Keep words with at least 3 characters
    min_words_docs=0,                 # Allow documents with any number of words
    verbose=True                      # Enable detailed output for the preprocessing steps
)

# Preprocess the dataset located at the specified file path
dataset = preprocessor.preprocess_dataset('/home/MindHard/fpulera/topic extraction/Prod_LDA/my_corpus_folder/my_corpus.txt')

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 4141/4141 [05:28<00:00, 12.62it/s]


created vocab
67363
words filtering done


In [13]:
dataset.save("/home/MindHard/fpulera/topic extraction/Prod_LDA/total_dataset")

In [14]:
len(dataset.get_vocabulary())
len(dataset.get_corpus())   # number of POI

4141

## Optimization

In [15]:
!ls

CTM		      DICTIONARY_GPT.txt      Prod_LDA		setup.ipynb
DICTIONARY.txt	      DICTIONARY_IGENIUS.txt  README.txt
DICTIONARYTOTALE.txt  LDA		      dataset_poi.xlsx


In [16]:
# path
dataset_save = 'Prod_LDA/total_dataset'
risultati_td = 'Prod_LDA/results_td'
risultati_npmi = 'Prod_LDA/results_npmi'

In [18]:
print(f"Number of documents: {len(dataset.get_corpus())}")
print(f"Vocabulary size: {len(dataset.get_vocabulary())}")

Number of documents: 4141
Vocabulary size: 67363


In [19]:
from pathlib import Path

class CustomDataset(Dataset):
    def _load_vocabulary(self, file_name):
        """
        Loads vocabulary from a file.
        
        Parameters
        ----------
        file_name : str
            Name of the file to read.
        """
        vocabulary = []  # Initialize an empty list to store vocabulary words
        file = Path(file_name)
        
        # Check if the file exists and is accessible
        if file.is_file():
            with open(file_name, 'r', encoding='utf-8') as vocabulary_file:
                for line in vocabulary_file:
                    vocabulary.append(line.strip())  # Add each word, stripped of whitespace, to the list
            # Update the vocabulary attribute in the Dataset class
            self._Dataset__vocabulary = vocabulary
        else:
            # Raise an exception if the file is not found or cannot be accessed
            raise Exception(f"Error: File {file_name} not found or inaccessible")
        
        # Print the first 5 lines of the vocabulary file for inspection
        with open(file_name, 'r', encoding='utf-8') as f:
            print("First 5 lines of vocabulary file:")
            for _ in range(5):
                print(f.readline().strip())

# Use your CustomDataset class
dataset = CustomDataset()
dataset.load_custom_dataset_from_folder(dataset_save)

First 5 lines of vocabulary file:
aas
aavv
aba
abaco
abacucre


In [20]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder(dataset_save)

In [21]:
print("Vocabulary size:", len(dataset.get_vocabulary()))

Vocabulary size: 67363


In [22]:
print(dataset.get_vocabulary())

['aas', 'aavv', 'aba', 'abaco', 'abacucre', 'abamonte', 'abamore', 'abare', 'abas', 'abase', 'abate', 'abatecollocare', 'abatedi', 'abategio', 'abateo', 'abateultimato', 'abateè', 'abato', 'abazia', 'abb', 'abba', 'abbacchio', 'abbaci', 'abbacomites', 'abbadere', 'abbadessa', 'abbadesso', 'abbadia', 'abbadiare', 'abbado', 'abbagliante', 'abballeo', 'abbamano', 'abbande', 'abbandere', 'abbandonandoso', 'abbandonare', 'abbandonarere', 'abbandonarire', 'abbandonarla', 'abbandonarono', 'abbandonata', 'abbandonatigramigna', 'abbandonato', 'abbandone', 'abbandono', 'abbandonorudere', 'abbandonò', 'abbandonòre', 'abbarbicare', 'abbarbicato', 'abbare', 'abbas', 'abbassamento', 'abbassare', 'abbassato', 'abbastanza', 'abbate', 'abbateccio', 'abbateggiano', 'abbateggio', 'abbatejo', 'abbatia', 'abbatino', 'abbatis', 'abbatiæ', 'abbato', 'abbattere', 'abbatterere', 'abbattessero', 'abbattimento', 'abbattire', 'abbattuta', 'abbattuto', 'abbatté', 'abbatèggio', 'abbazia', 'abbaziale', 'abbazialo', 

In [23]:
from octis.models.pytorchavitm.AVITM import AVITM
from skopt.space.space import Real

# Initialize the AVITM model
model = AVITM(model_type='prodLDA')
# Specify that the model type is 'prodLDA', a variant of Latent Dirichlet Allocation (LDA)
# that uses a Product of Experts model for topic generation.

# Turn off data partitioning
model.partitioning(False)
# Disables partitioning of the dataset into training, validation, and test subsets.
# This setting might be useful when pre-partitioned data or specific configurations are used.

In [24]:
model_output = model.train_model(dataset)   # train the model

Epoch: [1/100]	Samples: [4141/414100]	Train Loss: 4246.394405789664	Time: 0:00:02.847143
Epoch: [2/100]	Samples: [8282/414100]	Train Loss: 4115.705668165902	Time: 0:00:02.887862
Epoch: [3/100]	Samples: [12423/414100]	Train Loss: 4093.5801700978022	Time: 0:00:03.082640
Epoch: [4/100]	Samples: [16564/414100]	Train Loss: 4094.436105786646	Time: 0:00:02.505742
Epoch: [5/100]	Samples: [20705/414100]	Train Loss: 4076.614038728568	Time: 0:00:02.734266
Epoch: [6/100]	Samples: [24846/414100]	Train Loss: 4074.812692435402	Time: 0:00:02.489648
Epoch: [7/100]	Samples: [28987/414100]	Train Loss: 4065.6010512255493	Time: 0:00:02.611264
Epoch: [8/100]	Samples: [33128/414100]	Train Loss: 4063.7984899480803	Time: 0:00:02.923253
Epoch: [9/100]	Samples: [37269/414100]	Train Loss: 4065.4237918075346	Time: 0:00:02.513799
Epoch: [10/100]	Samples: [41410/414100]	Train Loss: 4054.4403978507607	Time: 0:00:02.483833
Epoch: [11/100]	Samples: [45551/414100]	Train Loss: 4063.4506761651774	Time: 0:00:03.188445
Epoc

In [25]:
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')
topic_diversity = TopicDiversity(topk=10)

In [26]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_output)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_output)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.93


Coherence: -0.06156345315672973


### Hyperparameter Optimization

In [27]:
for t in model_output['topics']:
  print(" ".join(t))

pesco vitello portella bensì montepagano tavernello cartignano corvo rilasciare sangeminese
fure odierna cannara arischiese caprafico mosciare schiavo rituale nascente tennero
puntone faraone branconio sacrario domizio tinario santacroce beuys umbre sangeminese
chiesa essere secolo due trovare interno avere altro facciata venire
viaggiatore binario rfi fabbricato fascicolo attesa movimento fermata trenitalia ferroviario
descrizione esternare organo tabernacolo descriziorre architetturare beweb cattolico ambone episcopale
straordinario visitatore offrire nettuno icona rilevanza credenza testimone colosseo visione
comune territorio anno paese avere venire centro primo città altro
riserva avezzare naturale monte marsicano versante velino marsica specie iccu
visitatore straordinario offrire didattico capolavore capodimonte laboratorio collezione temporaneo raffaello


In [28]:
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

In [33]:
# Define the search space for hyperparameter optimization
search_space = {
    "num_layers": Categorical([1, 2, 3]),  # Number of layers in the model, can be 1, 2, or 3
    "num_neurons": Categorical([100, 200, 300]),  # Number of neurons in each layer, options are 100, 200, or 300
    "activation": Categorical(['sigmoid', 'relu', 'softplus']),  # Activation functions to choose from
    "dropout": Real(0.0, 0.95),  # Dropout rate, a continuous value between 0.0 and 0.95
    "num_topics": Integer(low=5, high=15)  # Number of topics, an integer between 5 and 15
}

# Number of optimization runs to search for the best hyperparameter configuration
optimization_runs = 15

# Number of runs to train and evaluate the model for each configuration
model_runs = 1

### Topic Diversity

In [31]:
!ls

CTM		      DICTIONARY_GPT.txt      Prod_LDA		setup.ipynb
DICTIONARY.txt	      DICTIONARY_IGENIUS.txt  README.txt
DICTIONARYTOTALE.txt  LDA		      dataset_poi.xlsx


In [32]:
save_path_TD = 'Prod_LDA/results_td'
save_path_npmi = 'Prod_LDA/results_npmi'

In [34]:
# Initialize an optimizer instance
optimizer = Optimizer()

# Perform optimization to tune the model's hyperparameters
optimization_result = optimizer.optimize(
    model,                   # The topic model instance (e.g., AVITM or another topic model)
    dataset,                 # The dataset to be used for training and evaluation
    topic_diversity,         # The metric to optimize (e.g., Topic Diversity in this case)
    search_space,            # The hyperparameter search space defined earlier
    number_of_call=optimization_runs,  # Number of optimization iterations
    model_runs=model_runs,   # Number of times the model is run per configuration
    save_models=False,       # Whether to save each model instance during the optimization process
    extra_metrics=None,      # Optionally track additional metrics (None in this case)
    save_path=save_path_TD   # Path to save the results of the optimization process
)

Current call:  0
Epoch: [1/100]	Samples: [4141/414100]	Train Loss: 4385.14968455687	Time: 0:00:04.031753
Epoch: [2/100]	Samples: [8282/414100]	Train Loss: 4286.299248369959	Time: 0:00:02.952836
Epoch: [3/100]	Samples: [12423/414100]	Train Loss: 4265.598602390727	Time: 0:00:02.616438
Epoch: [4/100]	Samples: [16564/414100]	Train Loss: 4254.081777499397	Time: 0:00:02.697867
Epoch: [5/100]	Samples: [20705/414100]	Train Loss: 4249.134146341464	Time: 0:00:02.597026
Epoch: [6/100]	Samples: [24846/414100]	Train Loss: 4234.927732960034	Time: 0:00:02.710860
Epoch: [7/100]	Samples: [28987/414100]	Train Loss: 4213.322857552524	Time: 0:00:02.560481
Epoch: [8/100]	Samples: [33128/414100]	Train Loss: 4205.473160921275	Time: 0:00:02.651545
Epoch: [9/100]	Samples: [37269/414100]	Train Loss: 4214.267681417532	Time: 0:00:02.882434
Epoch: [10/100]	Samples: [41410/414100]	Train Loss: 4215.6335199830955	Time: 0:00:02.623840
Epoch: [11/100]	Samples: [45551/414100]	Train Loss: 4209.539860540932	Time: 0:00:02.

In [35]:
optimization_result.save_to_csv("Prod_LDA/results_td/results.csv")

In [36]:
import pandas as pd

# Load the CSV file containing the results of the topic diversity optimization
df = pd.read_csv("Prod_LDA/results_td/results.csv")

# Find the index of the maximum value in the "Mean(model_runs)" column
indice_max_mean = df['Mean(model_runs)'].idxmax()

# Extract the values of the relevant parameters for the row with the maximum "Mean(model_runs)"
parametri_max_mean = df.loc[indice_max_mean, [
    'Mean(model_runs)',      # Mean value of the topic diversity metric across model runs
    'num_iteration',         # Iteration number in the optimization process
    'activation',            # Activation function used (e.g., 'relu', 'sigmoid')
    'num_layers',            # Number of layers in the model
    'num_neurons',           # Number of neurons per layer
    'num_topics'             # Number of topics in the model
]]

# Print the parameters corresponding to the maximum topic diversity score
print("Parameters associated with the highest mean value in the topic diversity column:")
print(parametri_max_mean)

Parameters associated with the highest mean value in the topic diversity column:
Mean(model_runs)         1.0
num_iteration             13
activation          softplus
num_layers                 1
num_neurons              100
num_topics                 7
Name: 13, dtype: object


We create the model by inserting the parameter values ​​that I obtained through optimization:

In [37]:
# Initialize the AVITM model with specified parameters
model = AVITM(
    model_type='prodLDA',  # Use Product of Dirichlet Distributions Latent Dirichlet Allocation (prodLDA)
    activation='softplus',    # Set the activation function to ReLU
    num_topics=7,         # Number of topics to be generated by the model
    num_neurons=100,      # Number of neurons per layer
    num_layers=1          # Number of layers in the neural network
)

# Disable partitioning of the dataset (e.g., no train-test split is performed)
model.partitioning(False)

In [38]:
output_TD = model.train_model(dataset)

Epoch: [1/100]	Samples: [4141/414100]	Train Loss: 4283.580360646583	Time: 0:00:02.349927
Epoch: [2/100]	Samples: [8282/414100]	Train Loss: 4171.545433621106	Time: 0:00:02.434179
Epoch: [3/100]	Samples: [12423/414100]	Train Loss: 4151.907849855107	Time: 0:00:02.359185
Epoch: [4/100]	Samples: [16564/414100]	Train Loss: 4136.183800335064	Time: 0:00:02.407886
Epoch: [5/100]	Samples: [20705/414100]	Train Loss: 4130.015711784593	Time: 0:00:02.324711
Epoch: [6/100]	Samples: [24846/414100]	Train Loss: 4120.063360299445	Time: 0:00:02.361460
Epoch: [7/100]	Samples: [28987/414100]	Train Loss: 4110.703238197295	Time: 0:00:02.325103
Epoch: [8/100]	Samples: [33128/414100]	Train Loss: 4106.964331532238	Time: 0:00:02.424083
Epoch: [9/100]	Samples: [37269/414100]	Train Loss: 4088.88738378411	Time: 0:00:02.481348
Epoch: [10/100]	Samples: [41410/414100]	Train Loss: 4104.083392447476	Time: 0:00:02.598859
Epoch: [11/100]	Samples: [45551/414100]	Train Loss: 4088.059866276262	Time: 0:00:02.426637
Epoch: [12/

In [46]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(output_TD)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(output_TD)
print("Coherence: "+str(npmi_score))

Topic diversity: 1.0
Coherence: 0.030744574801329378


In [39]:
for t in output_TD['topics']:
  print(" ".join(t))

meteorologico trentennale coordinata coordinare climatologico cannara fumone sulmo abbassare telescopio
rfi viaggiatore banchina fascicolo trenitalia binario scheda circolazione mandamento stazionidelmondo
essere chiesa avere secolo anno parte venire altro due primo
monte specie lago marsica naturale riserva versante torrente avezzare fucino
partita spettatore stadio capienza testimone olimpico milanese siro nettuno milan
organo culto cappella immagine descrizione attribuire ligneo absido campata abside
capolavore tiziano temporaneo caravaggio raffaello spaziare esposto manoscritto conoscenza collezione


In [40]:
topics = pd.DataFrame(output_TD['topics'])
topics.to_excel('Prod_LDA/results_td/topics.xlsx', index=False)  

In [41]:
vocabolario = []
with open('Prod_LDA/total_dataset/vocabulary.txt', 'r') as file:
    for word in file:
        vocabolario.append(word)

In [42]:
# Carica la matrice topic-word in un DataFrame pandas
topic_word_matrix = pd.DataFrame(output_TD['topic-word-matrix'])

# Imposta i nomi delle colonne come parole

# Ottieni i primi k termini da ciascuna riga come nomi di colonna
topic_word_matrix.columns = [vocabolario[i] for i in range(topic_word_matrix.shape[1])]

# inverto il colonne e righe
inverted_topic_word_matrix = topic_word_matrix.transpose()

# Stampa il DataFrame
print(inverted_topic_word_matrix)

inverted_topic_word_matrix.to_excel('Prod_LDA/results_td/topic_word_matrix.xlsx', index=False)

                   0         1         2         3         4         5  \
aas\n       0.266096  0.204399 -0.245217 -0.053632  0.246267 -0.038927   
aavv\n      0.226484  0.161124 -0.150746 -0.063741  0.200190 -0.068242   
aba\n       0.278481  0.214533 -0.199918 -0.082086  0.247787 -0.022826   
abaco\n     0.297148  0.153010 -0.138826 -0.169358  0.233337  0.061209   
abacucre\n  0.277163  0.223098 -0.153938 -0.098192  0.241288 -0.018140   
...              ...       ...       ...       ...       ...       ...   
ἥλιος\n     0.227104  0.184210 -0.178021 -0.038273  0.213023 -0.009806   
ἰχθύς\n     0.295077  0.230004 -0.237918 -0.064075  0.256648  0.014141   
ἱστόνιον\n  0.233616  0.179127 -0.163997 -0.074446  0.203898 -0.031090   
ὑπὸ\n       0.239352  0.194253 -0.183845 -0.044638  0.217568 -0.015980   
ﬁume\n      0.283344  0.229062 -0.255379 -0.042092  0.257754 -0.003779   

                   6  
aas\n       0.228156  
aavv\n      0.178972  
aba\n       0.225904  
abaco\n     0.20458

In [43]:
import pandas as pd

corpus_processed = pd.read_csv('Prod_LDA/total_dataset/corpus.tsv', sep='\t', header=None)
corpus_processed
new_df = pd.DataFrame(corpus_processed[0])

In [44]:
num = 1
for topic in output_TD['topic-document-matrix']:
  for i in range(0,len(topic)):
    new_df.loc[i,num] = topic[i]
  num = num+1
new_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,stazione acquasparta essere stazione ferroviar...,0.042423,0.653775,0.029486,0.046850,0.058890,0.082364,0.086212
1,scoppio essere frazione disabitato comune acqu...,0.223053,0.099850,0.052552,0.066572,0.231023,0.119651,0.207298
2,chiesa essere parrocchiale acquasparta provinc...,0.104374,0.276390,0.083541,0.096323,0.048732,0.176417,0.214223
3,palazzo cese essere situare luogo antico rocca...,0.060373,0.123063,0.056505,0.285809,0.059437,0.378094,0.036718
4,acquasparto essere comune italiano abitante pr...,0.067999,0.137980,0.145489,0.313812,0.100093,0.147057,0.087569
...,...,...,...,...,...,...,...,...
4136,museo arte moderno bologna conoscere mambo ess...,0.062860,0.069005,0.034686,0.029433,0.084695,0.102948,0.616372
4137,parco archeologico paeste situare cuore essere...,0.138137,0.035317,0.084394,0.040619,0.596802,0.046579,0.058153
4138,biblioteca storico comune italiano gioiello ar...,0.085757,0.038229,0.053988,0.148939,0.130058,0.159118,0.383911
4139,parco archeologico paeste situare splendido re...,0.151449,0.124923,0.043975,0.014051,0.429429,0.092067,0.144106


In [None]:
new_df.to_excel('Prod_LDA/results_td/topic-document-matrix.xlsx', index=False)  

### Topic Coherence

In [47]:
# Initialize the optimizer object
optimizer = Optimizer()

# Run the optimization process
optimization_result = optimizer.optimize(
    model,               # The topic model to be optimized (e.g., AVITM with prodLDA)
    dataset,             # The dataset on which the model will be trained
    npmi,                # The evaluation metric to optimize (Normalized Pointwise Mutual Information)
    search_space,        # The hyperparameter search space to explore
    number_of_call=optimization_runs,  # The number of optimization iterations
    model_runs=model_runs,             # The number of times each model configuration will be run
    save_models=False,   # Do not save the intermediate models during optimization
    extra_metrics=None,  # No additional metrics to track during optimization
    save_path=save_path_npmi  # Path to save the optimization results
)


Current call:  0
Epoch: [1/100]	Samples: [4141/414100]	Train Loss: 4509.812960335667	Time: 0:00:02.498512
Epoch: [2/100]	Samples: [8282/414100]	Train Loss: 4378.902684285197	Time: 0:00:02.656480
Epoch: [3/100]	Samples: [12423/414100]	Train Loss: 4344.654846353538	Time: 0:00:02.665862
Epoch: [4/100]	Samples: [16564/414100]	Train Loss: 4338.694389942043	Time: 0:00:02.493132
Epoch: [5/100]	Samples: [20705/414100]	Train Loss: 4321.66807534412	Time: 0:00:02.589683
Epoch: [6/100]	Samples: [24846/414100]	Train Loss: 4318.989804696933	Time: 0:00:02.548093
Epoch: [7/100]	Samples: [28987/414100]	Train Loss: 4313.054572416083	Time: 0:00:02.571204
Epoch: [8/100]	Samples: [33128/414100]	Train Loss: 4297.637202668438	Time: 0:00:02.589026
Epoch: [9/100]	Samples: [37269/414100]	Train Loss: 4307.080928519681	Time: 0:00:02.517418
Epoch: [10/100]	Samples: [41410/414100]	Train Loss: 4292.664483216614	Time: 0:00:02.542660
Epoch: [11/100]	Samples: [45551/414100]	Train Loss: 4256.837380765515	Time: 0:00:02.4

In [48]:
optimization_result.save_to_csv("Prod_LDA/results_npmi/results.csv")

In [49]:
import pandas as pd

# Load the CSV file containing optimization results into a DataFrame
df = pd.read_csv("Prod_LDA/results_npmi/results.csv")

# Find the index of the row with the maximum value in the "Mean(model_runs)" column
indice_max_mean = df['Mean(model_runs)'].idxmax()

# Extract the parameters corresponding to the row with the maximum "Mean(model_runs)" value
parametri_max_mean = df.loc[indice_max_mean, ['Mean(model_runs)', 'num_iteration', 'activation', 'num_layers', 'num_neurons', 'num_topics']]

# Print the parameters associated with the best "Mean(model_runs)" value
print("Parameters associated with the maximum mean value in the topic coherence column:")
print(parametri_max_mean)

Parameters associated with the maximum mean value in the topic coherence column:
Mean(model_runs)    0.024456
num_iteration              5
activation              relu
num_layers                 1
num_neurons              300
num_topics                12
Name: 5, dtype: object


In [50]:
# Initialize the AVITM model with the specified hyperparameters
model = AVITM(model_type='prodLDA',  # Use the 'prodLDA' model type (a variant of LDA)
              activation='relu',  # Set the activation function to 'relu'
              num_topics=12,         # Define the number of topics to be extracted 
              num_neurons=300,       # Set the number of neurons in the model's layers 
              num_layers=1)          # Define the number of layers in the neural network 

# Disable partitioning of the dataset, meaning the model will treat the entire dataset as a single entity
model.partitioning(False)


In [51]:
output_npmi = model.train_model(dataset)

Epoch: [1/100]	Samples: [4141/414100]	Train Loss: 4271.283972394953	Time: 0:00:04.387287
Epoch: [2/100]	Samples: [8282/414100]	Train Loss: 4172.672859816469	Time: 0:00:04.417102
Epoch: [3/100]	Samples: [12423/414100]	Train Loss: 4152.79947476455	Time: 0:00:04.535719
Epoch: [4/100]	Samples: [16564/414100]	Train Loss: 4112.530291596233	Time: 0:00:04.361750
Epoch: [5/100]	Samples: [20705/414100]	Train Loss: 4108.7544071480315	Time: 0:00:04.446843
Epoch: [6/100]	Samples: [24846/414100]	Train Loss: 4096.511293316832	Time: 0:00:04.304403
Epoch: [7/100]	Samples: [28987/414100]	Train Loss: 4087.2284019560493	Time: 0:00:04.334698
Epoch: [8/100]	Samples: [33128/414100]	Train Loss: 4070.951917562183	Time: 0:00:04.331053
Epoch: [9/100]	Samples: [37269/414100]	Train Loss: 4062.5518688873462	Time: 0:00:04.439537
Epoch: [10/100]	Samples: [41410/414100]	Train Loss: 4060.546516541898	Time: 0:00:04.344352
Epoch: [11/100]	Samples: [45551/414100]	Train Loss: 4060.012598858971	Time: 0:00:04.411080
Epoch: [

In [59]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(output_npmi)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(output_npmi)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.95
Coherence: -0.06790454897776624


In [52]:
for t in output_npmi['topics']:
  print(" ".join(t))

chiesa secolo due cappella essere trovare realizzare opera conservare interno
istat censimento sterlich sagra corciare marsciare manifestazione marsciano aliprando pamphile
spoleto perugia umbro lago umbria acqua terno tempo foligno sala
riserva corno flora fauna vetta orfento uccello acero mammifere cresta
organo aula absido cattolico descrizione esternare abside tabernacolo ecclesiastico beweb
raffaello capolavore caravaggio uffizio galleria conoscenza temporaneo manoscritto vinci inestimabile
avezzare situare mare valle monte località borgo territorio geografia marsica
carolis fure fresa montepagano canadese mosciano comizio monografia etnografico messer
essere comune avere anno secolo centro primo venire chiesa territorio
montecalvello comizio commerciante gaudioso carolis raspa avvocato ridimensionare lubriano serratura
partita pompeo quotidiano stadio spettatore capienza icona nettuno siro paestum
meteorologico fermata viaggiatore trenitalia rfi fascicolo binario scartamento fabb

In [53]:
topics = pd.DataFrame(output_npmi['topics'])
topics.to_excel('Prod_LDA/results_npmi/topics.xlsx', index=False) 

In [55]:
# Load the topic-word matrix from the output into a pandas DataFrame
topic_word_matrix = pd.DataFrame(output_npmi['topic-word-matrix'])

# Set the column names as words from the vocabulary
# Get the first k terms from each row as column names
topic_word_matrix.columns = [vocabolario[i] for i in range(topic_word_matrix.shape[1])]

# Transpose the matrix to invert rows and columns (so words become rows and topics become columns)
inverted_topic_word_matrix = topic_word_matrix.transpose()

# Print the transposed DataFrame to view it
print(inverted_topic_word_matrix)

# Save the transposed matrix to an Excel file for further analysis or use
inverted_topic_word_matrix.to_excel('Prod_LDA/results_npmi/topic_word_matrix.xlsx', index=False)

                  0         1         2         3         4         5   \
aas\n      -0.293553  0.104734 -0.078251  0.124035  0.070566  0.218282   
aavv\n     -0.110254  0.048537 -0.042953  0.114419 -0.030603  0.192617   
aba\n      -0.142660  0.098978 -0.144142  0.097773  0.040194  0.192659   
abaco\n    -0.010658 -0.000173 -0.087907  0.060947  0.083135  0.195569   
abacucre\n -0.043853  0.122923 -0.168542  0.130908  0.024729  0.211800   
...              ...       ...       ...       ...       ...       ...   
ἥλιος\n    -0.222521  0.106754 -0.137610  0.110039  0.065950  0.194571   
ἰχθύς\n    -0.211464  0.100691 -0.138699  0.113656  0.086915  0.195870   
ἱστόνιον\n -0.196306  0.109304 -0.157406  0.115745  0.057184  0.199397   
ὑπὸ\n      -0.257232  0.104861 -0.063801  0.110648  0.070322  0.206282   
ﬁume\n     -0.264089  0.109855 -0.087866  0.133970  0.071382  0.224300   

                  6         7         8         9         10        11  
aas\n      -0.166686  0.293056 -0.3523

In [56]:
import pandas as pd

corpus_processed = pd.read_csv('Prod_LDA/total_dataset/corpus.tsv', sep='\t', header=None)
corpus_processed
new_df = pd.DataFrame(corpus_processed[0])

In [57]:
num = 1
for topic in output_npmi['topic-document-matrix']:
  for i in range(0,len(topic)):
    new_df.loc[i,num] = topic[i]
  num = num+1
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,stazione acquasparta essere stazione ferroviar...,0.018144,0.014359,0.035127,0.036222,0.037762,0.025351,0.022264,0.010937,0.020236,0.030944,0.051389,0.697263
1,scoppio essere frazione disabitato comune acqu...,0.033161,0.150576,0.041338,0.102116,0.038105,0.069873,0.081467,0.138222,0.042261,0.147695,0.060993,0.094193
2,chiesa essere parrocchiale acquasparta provinc...,0.052605,0.044345,0.014832,0.028549,0.510118,0.023894,0.052591,0.079161,0.045770,0.034499,0.038754,0.074883
3,palazzo cese essere situare luogo antico rocca...,0.057753,0.242908,0.068921,0.034620,0.222168,0.044713,0.073571,0.039440,0.063075,0.015252,0.034650,0.102929
4,acquasparto essere comune italiano abitante pr...,0.034594,0.523876,0.104283,0.034010,0.053074,0.016441,0.014174,0.060621,0.035321,0.026844,0.038710,0.058052
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4136,museo arte moderno bologna conoscere mambo ess...,0.062853,0.074179,0.021351,0.085038,0.036130,0.364388,0.079605,0.058760,0.052537,0.073378,0.064734,0.027047
4137,parco archeologico paeste situare cuore essere...,0.052228,0.047664,0.028952,0.057232,0.027838,0.052593,0.062217,0.033282,0.057137,0.100783,0.433926,0.046149
4138,biblioteca storico comune italiano gioiello ar...,0.020417,0.034542,0.061240,0.102899,0.025697,0.379447,0.033573,0.083721,0.043206,0.039255,0.137192,0.038812
4139,parco archeologico paeste situare splendido re...,0.040870,0.046282,0.027937,0.063965,0.032679,0.106999,0.018684,0.034535,0.026507,0.072249,0.421294,0.108000


In [58]:
new_df.to_excel('Prod_LDA/results_npmi/topic-document-matrix.xlsx', index=False)  