# CTM

## Preprocessing

We apply different preprocessing than LDA as CTM requires much less of it.

In [1]:
import json

In [2]:
dataset_save = 'total_dataset'
file_path = '../DICTIONARYTOTALE.txt' 
my_corpus = 'my_corpus_folder/my_corpus.txt' 

In [3]:
# load the text file as a JSON object
with open(file_path, 'r') as f:
    data = json.load(f)

# extract the descriptions under the key 'wikipedia'
descriptions = [data[key]['wikipedia'] for key in data]

with open(my_corpus, 'w') as f:
  for description in descriptions:
    description = description.replace('\n', ' ')
    f.write(description + '\n')

In [4]:
len(descriptions)

4141

In [5]:
from octis.dataset.dataset import Dataset

class MyDataset(Dataset):
    def __init__(self, data_folder):
        super(MyDataset, self).__init__(data_folder)

    def _load_dataset(self):
        with open(self.data_folder, 'r') as f:
            documents = f.read().splitlines()
        return documents

In [6]:
from octis.dataset.dataset import Dataset
dataset = MyDataset('my_corpus_folder')

This code:
- Parses a corpus of documents, breaking them down into individual sentences using spaCy.
- Filters out short sentences (3 words or less) to get only the most potentially meaningful ones.
- Calculates and prints the total number of sentences remaining after filtering.
This approach is useful, for example, to pre-process data for NLP tasks like sentiment analysis or topic modeling.

In [7]:
import spacy

# Load the Italian spaCy language model
nlp = spacy.load("it_core_news_sm")

# Function to split a document into sentences
def split_into_sentences(document):
    doc = nlp(document)  # Process the document with the spaCy model
    return [sent.text.strip() for sent in doc.sents]  # Extract and clean sentences

# Apply the sentence splitting function to all documents in the corpus
split = []
total_docs = len(descriptions)  # Total number of documents in the corpus

for idx, doc in enumerate(descriptions, start=1):
    split.extend(split_into_sentences(doc))  # Add the sentences from each document to the list
    print(f"Processing document {idx}/{total_docs}")  # Show progress
    
# Filter out short or insignificant sentences (e.g., those with less than 10 words)
split = [sentence for sentence in split if len(sentence.split()) > 9]

# Verify and display the number of sentences obtained
print(f"Number of sentences: {len(split)}")

# Number of sentences: 104815



Processing document 1/4141
Processing document 2/4141
Processing document 3/4141
Processing document 4/4141
Processing document 5/4141
Processing document 6/4141
Processing document 7/4141
Processing document 8/4141
Processing document 9/4141
Processing document 10/4141
Processing document 11/4141
Processing document 12/4141
Processing document 13/4141
Processing document 14/4141
Processing document 15/4141
Processing document 16/4141
Processing document 17/4141
Processing document 18/4141
Processing document 19/4141
Processing document 20/4141
Processing document 21/4141
Processing document 22/4141
Processing document 23/4141
Processing document 24/4141
Processing document 25/4141
Processing document 26/4141
Processing document 27/4141
Processing document 28/4141
Processing document 29/4141
Processing document 30/4141
Processing document 31/4141
Processing document 32/4141
Processing document 33/4141
Processing document 34/4141
Processing document 35/4141
Processing document 36/4141
P

In [8]:
from octis.dataset.dataset import Dataset

# Custom Dataset class inheriting from the base `Dataset` class in OCTIS
class MyDataset(Dataset):
    def __init__(self, data_folder):
        # Initialize the parent `Dataset` class with the provided folder path
        super(MyDataset, self).__init__(data_folder)

    # Override the method to load the dataset
    def _load_dataset(self):
        # Open the file containing the corpus in read mode with UTF-8 encoding
        with open(self.data_folder, 'r', encoding='utf-8') as f:
            # Read the file line by line and split it into individual documents
            documents = f.read().splitlines()
        # Return the list of documents as the dataset
        return documents

In [9]:
my_corpus_folder = '/home/MindHard/fpulera/topic extraction/CTM/my_corpus_folder'

In [10]:
dataset = MyDataset(my_corpus_folder)

In [11]:
# We select about half of the corpus for a question of computational power
split = split[:52400]

In [29]:
split

['La stazione di Acquasparta è una stazione ferroviaria a servizio del comune di Acquasparta sulla ferrovia Centrale Umbra nel tratto fra Terni e Perugia.',
 'La gestione degli impianti è affidata Ferrovia Centrale Umbra s.r.l..\n\nStruttura ed impianti\nIl fabbricato viaggiatori si sviluppa su due livelli di cui solo il piano terra è fruibile dai viaggiatori.',
 'Dal fabbricato principale si diramano poi due corpi di minori dimensioni ad un solo piano.',
 "Solo parte del piano terra è fruibile per i viaggiatori il resto ospita l'ufficio tecnico di FCU mentre il piano superiore è una abitazione privata.",
 "Le condizioni del fabbricato sono piuttosto buone tuttavia non mancano graffiti (sia all'esterno che all'interno) e piccole ma numerose perdite di intonaco.",
 'oggi il magazzino è stato completamente ristrutturato ed utilizzato come centro di assistenze del pronto soccorso.',
 'Lo scalo merci invece è stato completamente smantellato ad eccezione del binario servito dal piano carica

In [12]:
import os
import string
from octis.preprocessing.preprocessing import Preprocessing

# Change the current working directory to the parent directory
# os.chdir(os.path.pardir)

# Initialize the preprocessor with the desired configuration
preprocessor = Preprocessing(
    vocabulary=None,            # No pre-defined vocabulary
    max_features=None,          # No restriction on the maximum number of features
    remove_punctuation=True,    # Remove punctuation from the text
    lemmatize=True,             # Lemmatize words to their base form
    split=False,                # Do not split the dataset into train/test subsets
    language="italian",         # Specify the language for processing
    lowercase=True,             # Convert text to lowercase
    min_chars=3,                # Keep only words with at least 3 characters
    min_words_docs=10            # Retain only documents with at least 10 words
)


split_non_empty_corpus = [doc for doc in split if doc.strip() != ""]

# Save the split corpus into a file for pre-processing
with open("/home/MindHard/fpulera/topic extraction/CTM/my_corpus_folder/split_corpus.txt", 'w', encoding='utf-8') as f:
    f.write("\n".join(split_non_empty_corpus))  # Write each sentence as a new line

# Preprocess the dataset from the saved file
dataset = preprocessor.preprocess_dataset(
    "/home/MindHard/fpulera/topic extraction/CTM/my_corpus_folder/split_corpus.txt"
)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 83111/83111 [06:34<00:00, 210.78it/s]


created vocab
46654


In [13]:
# save the preprocessed dataset
dataset.save('/home/MindHard/fpulera/topic extraction/CTM/total_dataset')

In [14]:
print(len(dataset.get_vocabulary()))
print(len(dataset.get_corpus()))   # number of POIs 37512

46654
37512


## Optimization

In [16]:
from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

In [32]:
dataset_save = '/home/MindHard/fpulera/topic extraction/CTM/total_dataset'

In [33]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder(dataset_save)

In [None]:
# Import the CTM (Contextualized Topic Models) class
model = CTM(
    num_epochs=10,  # Set the number of training epochs to 10
    inference_type='zeroshot',  # Specify the inference type as "zeroshot" for predicting topics without labeled data
    bert_model="distiluse-base-multilingual-cased-v2",  # Use a pre-trained multilingual BERT model for embeddings
    use_partitions=False  # Disable partitioning of the dataset for training
)

In [35]:
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')
topic_diversity = TopicDiversity(topk=10)

In [36]:
model_output = model.train_model(dataset)   # train the model

Batches: 100%|██████████| 376/376 [03:46<00:00,  1.66it/s]


In [37]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_output)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_output)
print("NPMI: "+str(npmi_score))


Topic diversity: 0.97
NPMI: 0.031855996525374346


In [38]:
type(model_output['topics'])

list

In [39]:
# Define the search space for hyperparameter optimization
search_space = {
    "num_layers": Categorical([1, 2, 3]),            # Number of layers in the model (1, 2, or 3)
    "num_neurons": Categorical([100, 200, 300]),     # Number of neurons per layer (100, 200, or 300)
    "activation": Categorical(['sigmoid', 'relu', 'softplus']),  # Activation functions to choose from
    "dropout": Real(0.0, 0.95),                      # Dropout rate as a continuous value between 0.0 and 0.95
    "num_topics": Integer(low=5, high=15)            # Number of topics, an integer value between 5 and 15
}

In [40]:
optimization_runs=10
model_runs=1

In [41]:
for t in model_output['topics']:
  print(" ".join(t))

internazionale addetto stagione attivo impresa lavorazione sportivo minimo artigianale azienda
che per non avere anno paese suo questo quello nome
raffigurare pittore tela bambino vergine altare tavola cristo madonna affresco
secolo chiesa antico costruire dedicare palazzo monastero edificare xii xiii
parroco solenne menzionare cattolico religione consacrazione celebrare citazione inventario notarile
cura iccu editore pescarare giuliare edizione isbn gonfalone sbn topografico
feudo ducato impero imperatore contea regno invasione entrare potente dominio
chilometro monte sud nord situare ovest frazione valle versante sasso
arco facciata portale finestra tre due pianta ingresso pietra navata
che più acqua per quello essere uno quale lago cui


### Topic Diversity

In [42]:
optimizer=Optimizer()
optimization_result = optimizer.optimize(
    model, dataset, topic_diversity, search_space, number_of_call=optimization_runs,
    model_runs=model_runs, save_models=False,
    extra_metrics=None, # to keep track of other metrics
    save_path='/home/MindHard/fpulera/topic extraction/CTM/results_td')


Current call:  0
Current call:  1
Current call:  2
Current call:  3
Current call:  4
Current call:  5
Current call:  6
Current call:  7
Current call:  8
Current call:  9


In [43]:
optimization_result.save_to_csv("/home/MindHard/fpulera/topic extraction/CTM/results_td/results.csv")

In [44]:
import pandas as pd

# Load the results CSV file into a pandas DataFrame
df = pd.read_csv("/home/MindHard/fpulera/topic extraction/CTM/results_td/results.csv")

# Find the index of the maximum value in the "Mean(model_runs)" column (represents topic diversity)
indice_max_mean = df['Mean(model_runs)'].idxmax()

# Extract the values of the corresponding parameters for the row with the maximum mean value
parametri_max_mean = df.loc[indice_max_mean, ['Mean(model_runs)', 'num_iteration', 'activation', 'num_layers', 'num_neurons', 'num_topics']]

# Print the parameters associated with the maximum mean in the topic diversity column
print("Parameters associated with the maximum mean in the topic diversity column:")
print(parametri_max_mean)


Parameters associated with the maximum mean in the topic diversity column:
Mean(model_runs)     1.0
num_iteration          8
activation          relu
num_layers             1
num_neurons          300
num_topics             6
Name: 8, dtype: object


In [45]:
# Initialize a CTM (Combined Topic Model) instance with specific parameters
model = CTM(
    num_epochs=50,  # Number of training epochs for the model
    inference_type='zeroshot',  # Inference type, 'zeroshot' indicates no prior topic distribution is assumed
    bert_model="distiluse-base-multilingual-cased-v2",  # Pre-trained BERT model to use for embedding text
    num_topics=6,  # Number of topics to generate
    num_neurons=300,  # Number of neurons in each layer of the neural network
    num_layers=1,  # Number of layers in the neural network
    activation="relu",  # Activation function for the neural network layers
    use_partitions=False  # Disable dataset partitioning for the training process
)

In [46]:
output_TD = model.train_model(dataset)

In [48]:
for t in output_TD['topics']:
  print(" ".join(t))

stampa comitato conferenza convegno direttore pilota sera categoria attesa palio
feudo ducato dominio possedimento duca figlio conte cardinale giurisdizione diocesi
esemplare neve precipitazione raro media uccello autunno coltivare orso paleolitico
che avere essere per quale non questo cui anche come
nord ovest capoluogo statale est frazione situare provincia chilometro strada
destra sormontare sinistra croce sesto destro ligneo rosone tela arco


In [52]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(output_TD)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(output_TD)
print("Coherence: "+str(npmi_score))

Topic diversity: 1.0
Coherence: -0.06988342389333631


In [49]:
topics = pd.DataFrame(output_TD['topics'])
topics.to_excel('/home/MindHard/fpulera/topic extraction/CTM/results_td/topics.xlsx', index=False)

In [53]:
vocabolario = []
with open('/home/MindHard/fpulera/topic extraction/CTM/total_dataset/vocabulary.txt', 'r', encoding='utf-8') as file:
    for word in file:
        vocabolario.append(word)

In [54]:
# Load the topic-word matrix into a pandas DataFrame
topic_word_matrix = pd.DataFrame(output_TD['topic-word-matrix'])

# Set column names as words
# Assign the vocabulary terms to the columns of the topic-word matrix
topic_word_matrix.columns = [vocabolario[i] for i in range(topic_word_matrix.shape[1])]

# Invert the rows and columns of the DataFrame (transpose)
inverted_topic_word_matrix = topic_word_matrix.transpose()

# Print the transposed DataFrame for verification
print(inverted_topic_word_matrix)

# Save the transposed topic-word matrix to an Excel file
inverted_topic_word_matrix.to_excel(
    '/home/MindHard/fpulera/topic extraction/CTM/results_td/topic_word_matrix.xlsx',
    index=False  # Do not include the index column in the Excel file
)

                     0         1         2         3         4         5
aas\n         0.499561 -0.009614  0.354232 -0.755484 -0.126610 -0.058383
aavv\n        0.281603 -0.060526  0.145802 -0.467328 -0.123038 -0.083871
aba\n         0.576316 -0.082661  0.407671 -0.880255 -0.216697 -0.053976
abaco\n       0.446411 -0.115132  0.419484 -0.591058 -0.279079 -0.066398
abamonte\n    0.472345 -0.003682  0.336097 -0.804069 -0.151784 -0.078110
...                ...       ...       ...       ...       ...       ...
όλυμπος\n     0.380212 -0.013201  0.268041 -0.575642 -0.090428 -0.041187
ἐκπεριάγων\n  0.430534 -0.026804  0.326662 -0.609484 -0.114571 -0.061574
ἰχθύς\n       0.454699 -0.017244  0.318794 -0.729906 -0.123957 -0.055993
ὑπὸ\n         0.427910 -0.026668  0.324767 -0.605563 -0.113885 -0.061292
ﬁume\n        0.344239 -0.028227  0.297799 -0.543475 -0.108893 -0.052335

[46654 rows x 6 columns]


In [55]:
import pandas as pd

corpus_processed = pd.read_csv('/home/MindHard/fpulera/topic extraction/CTM/total_dataset/corpus.tsv', sep='\t', header=None)
corpus_processed
new_df = pd.DataFrame(corpus_processed[0])

In [56]:
num = 1
for topic in output_TD['topic-document-matrix']:
  for i in range(0,len(topic)):
    new_df.loc[i,num] = topic[i]
  num = num+1
new_df

Unnamed: 0,0,1,2,3,4,5,6
0,stazione acquasparta essere uno stazione ferro...,0.050502,0.054071,0.019507,0.036044,0.816559,0.023317
1,fabbricato viaggiatore sviluppare due livello ...,0.303326,0.035609,0.368569,0.046658,0.094680,0.151158
2,fabbricato principale diramare poi due corpo m...,0.180764,0.052445,0.225828,0.064456,0.075447,0.401061
3,solo parte piano terra essere fruibile per via...,0.120927,0.096736,0.113702,0.389687,0.130761,0.148187
4,condizione fabbricato essere piuttosto buono t...,0.065393,0.079745,0.297034,0.299632,0.103135,0.155060
...,...,...,...,...,...,...,...
37507,casalbordino cuasàlë abruzzese essere uno comu...,0.206972,0.137339,0.067834,0.105489,0.362710,0.119656
37508,trovare circa chilometro costa adriatica uno c...,0.049050,0.034960,0.119862,0.037417,0.733020,0.025691
37509,lido casalbordino attrezzare frequentare local...,0.019582,0.044954,0.034970,0.053604,0.805272,0.041618
37510,nome volere risalga uno condottiero quello per...,0.138089,0.597970,0.077834,0.059573,0.099490,0.027044


In [57]:
new_df.to_excel('/home/MindHard/fpulera/topic extraction/CTM/results_td/topic-document-matrix.xlsx', index=False)

### Topic Coherence

In [58]:
dataset_save = '/home/MindHard/fpulera/topic extraction/CTM/total_dataset/'


In [59]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder(dataset_save)

In [None]:
# Import the CTM (Contextualized Topic Models) class
model = CTM(
    num_epochs=30,  # Train the model for 30 epochs (iterations over the dataset)
    inference_type='zeroshot',  # Set inference type to "zeroshot" for topic prediction without labeled data
    bert_model="distiluse-base-multilingual-cased-v2",  # Use a multilingual BERT-based embedding model for contextualized embeddings
    use_partitions=False  # Do not partition the dataset, use the entire dataset as a single block
)

In [61]:
# Initialize metric
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')


In [62]:
search_space = {"num_layers": Categorical({1, 2, 3}),
                "num_neurons": Categorical({100, 200, 300}),
                "activation": Categorical({'sigmoid', 'relu', 'softplus'}),
                "dropout": Real(0.0, 0.95),
                "num_topics": Integer(low=5,high=15)
}

In [63]:
optimization_runs=15
model_runs=1

In [64]:
optimizer=Optimizer()
optimization_result = optimizer.optimize(
    model, dataset, npmi, search_space, number_of_call=optimization_runs,
    model_runs=model_runs, save_models=False,
    extra_metrics=None, # to keep track of other metrics
    save_path='/home/MindHard/fpulera/topic extraction/CTM/results_npmi')
optimization_result.save_to_csv("/home/MindHard/fpulera/topic extraction/CTM/results_npmi/results.csv")

Current call:  0
Current call:  1
Current call:  2
Current call:  3
Current call:  4
Current call:  5
Current call:  6
Current call:  7
Current call:  8
Current call:  9
Current call:  10
Current call:  11
Current call:  12
Current call:  13
Current call:  14


In [None]:
import pandas as pd  # Import the pandas library for data manipulation and analysis

# Load the CSV file containing the results of the topic modeling optimization into a DataFrame
df = pd.read_csv("/home/MindHard/fpulera/topic extraction/CTM/results_npmi/results.csv")

# Find the index of the row with the maximum value in the 'Mean(model_runs)' column
indice_max_mean = df['Mean(model_runs)'].idxmax()

# Extract the parameters corresponding to the maximum mean value
# Includes columns: 'Mean(model_runs)', 'num_iteration', 'activation', 'num_layers', 'num_neurons', 'num_topics'
parametri_max_mean = df.loc[indice_max_mean, ['Mean(model_runs)', 'num_iteration', 'activation', 'num_layers', 'num_neurons', 'num_topics']]

# Print the parameters associated with the maximum mean value for topic diversity
print("Parameters associated with the maximum mean value in the topic diversity column:")
print(parametri_max_mean)

Parametri associati al valore massimo di mean nella colonna topic diversity:
Mean(model_runs)    0.066437
num_iteration              9
activation              relu
num_layers                 1
num_neurons              300
num_topics                13
Name: 9, dtype: object


In [None]:
from octis.models.CTM import CTM  # Import the CTM class from OCTIS for topic modeling

# Define the CTM (Contextualized Topic Model) with the following parameters:
model = CTM(
    num_epochs=30,  # Train the model for 30 epochs
    inference_type='zeroshot',  # Use zeroshot inference type, which works without training data labels
    bert_model="distiluse-base-multilingual-cased-v2",  # Specify the pre-trained BERT model for contextual embeddings
    num_topics=13,  # Set the number of topics to discover in the dataset
    num_neurons=300,  # Number of neurons in the hidden layer(s) of the model
    num_layers=1,  # Use a single hidden layer in the model's architecture
    activation="relu",  # Use ReLU (Rectified Linear Unit) as the activation function
    use_partitions=False  # Disable the use of data partitions for training and validation
)

In [67]:
output_npmi = model.train_model(dataset)

In [69]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(output_npmi)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(output_npmi)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.9769230769230769
Coherence: 0.09425529019977531


In [68]:
for t in output_npmi['topics']:
  print(" ".join(t))

festa ogni mondiale agosto evento tedesco calcio svolgere guerra settembre
servizio ferrovia fabbricato ferroviario viaggiatore servire binario treno fermata linea
uno che più città tra con cui altro parte essere
media autunno minimo fresco caldo parola distribuire rigido dolce talvolta
bambino tela madonna raffigurare affresco dipinto cristo altare vergine statua
cura istituto stampa accademia guida conferenza pinacoteca civico seminario università
chiesa essere questo che luogo convento nome dedicare cui culto
provincia comune lazio aquila frazione sasso confine meridionale parco situare
arco finestra portale sesto facciata tre rettangolare pianta pietra semplice
feudo famiglia ducato regno pontificio papa possedimento dominio duca imperatore
xii medievale xix secolo risalire epoca fortificato xiii resto villa
strada torrente via sorgente fiume percorso attraversare verso sentiero lungo
produzione attività numero agricolo particolare interesse numeroso economico sviluppo agricoltura


In [70]:
topics = pd.DataFrame(output_npmi['topics'])
topics.to_excel('/home/MindHard/fpulera/topic extraction/CTM/results_npmi/topics.xlsx', index=False)

In [71]:
vocabolario = []
with open('/home/MindHard/fpulera/topic extraction/CTM/total_dataset/vocabulary.txt', 'r', encoding= 'utf-8') as file:
    for word in file:
        vocabolario.append(word)

In [None]:
# Import the pandas library for handling data
import pandas as pd

# Load the topic-word matrix from the output dictionary, assumed to contain the results from the model
topic_word_matrix = pd.DataFrame(output_npmi['topic-word-matrix'])

# Assign column names to the matrix based on the vocabulary terms
# Each column index is replaced by the corresponding word from the vocabulary
topic_word_matrix.columns = [vocabolario[i] for i in range(topic_word_matrix.shape[1])]

# Transpose the topic-word matrix to switch rows and columns
# This allows words to become rows and topics to become columns for easier readability
inverted_topic_word_matrix = topic_word_matrix.transpose()

# Print the transposed matrix to visually inspect the results
print(inverted_topic_word_matrix)

# Save the transposed topic-word matrix to an Excel file for external analysis or reporting
inverted_topic_word_matrix.to_excel(
    '/home/MindHard/fpulera/topic extraction/CTM/results_npmi/topic_word_matrix.xlsx',  # Specify the file path
    index=False  # Exclude row indices in the saved Excel file
)

                    0         1         2         3         4         5   \
aas\n        -0.060464  0.189404 -0.801429  0.290872 -0.108897  0.249331   
aavv\n       -0.060145  0.097203 -0.530242  0.124059 -0.067684  0.346267   
aba\n        -0.054489  0.132721 -0.572458  0.220971 -0.107348  0.190358   
abaco\n      -0.233753  0.204590 -0.749343  0.396925 -0.259049  0.309459   
abamonte\n   -0.085941  0.160350 -0.783521  0.264296 -0.119038  0.221579   
...                ...       ...       ...       ...       ...       ...   
όλυμπος\n    -0.039391  0.162848 -0.630472  0.271110 -0.110032  0.219312   
ἐκπεριάγων\n -0.046403  0.122908 -0.534966  0.189217 -0.080340  0.166193   
ἰχθύς\n      -0.055477  0.194726 -0.913647  0.340350 -0.144077  0.245176   
ὑπὸ\n        -0.045289  0.120458 -0.524359  0.185097 -0.078941  0.162861   
ﬁume\n       -0.071806  0.156424 -0.761235  0.254440 -0.108296  0.222539   

                    6         7         8         9         10        11  \
aas\n      

In [73]:
import pandas as pd

corpus_processed = pd.read_csv('/home/MindHard/fpulera/topic extraction/CTM/total_dataset/corpus.tsv', sep='\t', header=None)
corpus_processed
new_df = pd.DataFrame(corpus_processed[0])

In [None]:
num = 1
for topic in output_npmi['topic-document-matrix']:
  for i in range(0,len(topic)):
    new_df.loc[i,num] = topic[i]
  num = num+1
new_df

Unnamed: 0,0,1,2,3,4,5,6
0,stazione acquasparta essere uno stazione ferro...,0.050502,0.054071,0.019507,0.036044,0.816559,0.023317
1,fabbricato viaggiatore sviluppare due livello ...,0.303326,0.035609,0.368569,0.046658,0.094680,0.151158
2,fabbricato principale diramare poi due corpo m...,0.180764,0.052445,0.225828,0.064456,0.075447,0.401061
3,solo parte piano terra essere fruibile per via...,0.120927,0.096736,0.113702,0.389687,0.130761,0.148187
4,condizione fabbricato essere piuttosto buono t...,0.065393,0.079745,0.297034,0.299632,0.103135,0.155060
...,...,...,...,...,...,...,...
37507,casalbordino cuasàlë abruzzese essere uno comu...,0.206972,0.137339,0.067834,0.105489,0.362710,0.119656
37508,trovare circa chilometro costa adriatica uno c...,0.049050,0.034960,0.119862,0.037417,0.733020,0.025691
37509,lido casalbordino attrezzare frequentare local...,0.019582,0.044954,0.034970,0.053604,0.805272,0.041618
37510,nome volere risalga uno condottiero quello per...,0.138089,0.597970,0.077834,0.059573,0.099490,0.027044


In [None]:
import pickle

new_df.to_excel('/home/MindHard/fpulera/topic extraction/CTM/results_npmi/topic-document-matrix.xlsx', index=False)

## Fine-tuned model integration

CTM generates output in the form of topic distributions for each document, which are the contextualized embeddings.

The topic distribution that we saved in an Excel file represents a version of the document representations with respect to the topics extracted from the CTM model. This can be considered as a form of embedding, since each document is represented by a vector of probabilities (or scores) for each topic.

In [15]:
from octis.dataset.dataset import Dataset
from transformers import BertForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

In [16]:
# we take the dataset and save it in a variable
dataset_saved = '/home/MindHard/fpulera/topic extraction/CTM/total_dataset'
dataset = Dataset()
dataset.load_custom_dataset_from_folder(dataset_saved)

In [17]:
print(f"Number of documents in the dataset: {len(dataset.get_corpus())}")
print(f"Example of document: {dataset.get_corpus()[0]}")

Number of documents in the dataset: 37512
Example of document: ['stazione', 'acquasparta', 'essere', 'uno', 'stazione', 'ferroviario', 'servizio', 'acquasparta', 'ferrovia', 'centrale', 'umbrare', 'tratto', 'fra', 'terno', 'perugia']


* We load the model **"dbmdz/bert-base-italian-uncased"**, a version of BERT pre-trained on Italian texts. The "uncased" suffix indicates that the model is not case-sensitive (for example, "casa" and "Casa" are treated the same).
* With **BertForMaskedLM.from_pretrained(model_name)**, we are loading a pre-trained version of the BERT model designed for Masked Language Modeling (MLM) tasks.
    * The MLM task is to predict missing (or masked) words in a text, for example transforming the sentence "The cat [MASK] on the roof" into "The cat sleeps on the roof".
    * When using a pre-trained BERT model on a general dataset (such as Wikipedia or books), the model may not fully capture the nuances or specific domain of your data.
    * Fine-tuning BERT with MLM allows the model to refine semantic representations, making them more suitable for your POI corpus.
    * Fine-tuning with MLM does not require labels. It is perfect for problems like ours, where we want to improve the representation without directly supervising a task.
    * After refining the representations, we can use the embeddings produced by BERT as input to our Combined Topic Model (CTM). A BERT model fine-tuned on our data provides embeddings that better reflect the topics present in our corpus.
    * MLM is one of the most common techniques for unsupervised fine-tuning and has been shown to significantly improve results in downstream tasks (such as topic modeling or classification).
* With **AutoTokenizer.from_pretrained(model_name)**, we are loading the tokenizer associated with the specified BERT model.
The tokenizer is responsible for transforming the raw text into a numeric representation (called input IDs) that the model can understand. For BERT, this includes tokenizing into subwords and adding special tokens (such as [CLS] and [SEP]).

In [18]:
# Let's load the template and tokenizer for Italian
model_name = "dbmdz/bert-base-italian-uncased"
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at dbmdz/bert-base-italian-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Let's set up a data collator to prepare the data to be fed to the model during the training or fine-tuning process on a Masked Language Modeling (MLM) task. Here are the details:

* **DataCollatorForLanguageModeling**: This is a tool provided by the Hugging Face library to automate data preparation. When we pass a batch of data to the model, the data collator:
    * Masks some tokens in the input according to a certain probability.
    * Ensures that the data format is correct for the MLM task.
* The value 0.15 means that 15% of the tokens in the text will be masked, which is a standard choice for BERT training.

**How ​​does masking work?**
When the data collator masks a token:
* 80% of the time, the token is replaced with [MASK].
* 10% of the time, the token is replaced with another random token.
* 10% of the time, the original token is kept unchanged.
This approach makes the model more robust by preventing it from relying too much on the [MASK] token during training.

In [19]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # Percentage of masked tokens (15%)
)

We prepare the text corpus for further processing, i.e. fine-tuning the model:

In [20]:
corpus = dataset.get_corpus()
corpus_strings = [' '.join(doc) for doc in corpus] 

This code defines a function to tokenize the corpus data so that it is ready to be used by the model.

Specifically:
* examples["text"]: the list of strings to tokenize.
* padding=True: adds padding to each tokenized sequence to have all sequences of the same length, useful when working with batch inputs.
* truncation=True: truncates each tokenized sequence to a specified maximum length (here max_length=512) to avoid it being too long for the model.
* max_length=512: specifies the maximum length of the sequence (512 is the limit for many BERT models).

This function prepares data to be fed to the model:

* Transforms text into numbers that the model can use.
* Handles variable-length sequences with padding and truncation, which is essential for working with batches of data.
* It is often used with methods like map to efficiently tokenize the entire dataset.

In [21]:
# We prepare the data (corpus already preprocessed, so no further preprocessing operations are needed)
def tokenize_function(examples):
    print(examples["text"])  # To check if it contains a list of strings
    return tokenizer(examples["text"], padding=True, truncation=True, max_length=512)

In [22]:
from datasets import Dataset

# Let's convert the corpus into a format compatible with HuggingFace
data = {"text": corpus_strings}  # corpus_strings is the list of descriptions as strings
d = Dataset.from_dict(data)

In [23]:
# Split data in training and validation set (80% train, 20% validation)
d = d.train_test_split(test_size=0.2)

In [24]:
# Data preprocessing
train_dataset = d["train"].map(tokenize_function, batched=True)
val_dataset = d["test"].map(tokenize_function, batched=True)

Map:   0%|          | 0/30009 [00:00<?, ? examples/s]

Map:   7%|▋         | 2000/30009 [00:00<00:01, 16438.48 examples/s]

['borgo medievale venire progressivamente abbandonare partire xviii secolo popolazione spostare più monte uno località che prendere stesso nome', 'confluivare piccolo comunità religioso soppresso per problema spazio per problema economico', 'necessità approvvigionamento materia primo avere portare tutto casualmente scoperta tronco fossile dunarobba', 'corpo buono stato conservazione rinvenuto essere molto seguito causa disinteresse iniziale mutato condizione climatico maldestre operazione mantenimento conservazione numero corpo ridurre circa', 'territorio controllare formazione brigata partigiano garibaldo comprendere fino inizio mucciafora comune cascia norcia monteleone spoleto comune alto valnerino loro frazione fino limite preci ruscio castiglione arrone valle ferentillo', 'stalagmire stalattito rare colonna mammellone rivestimento cristallo pseudo macro pisolito vasca velo velo cascato gur ecc', 'base media trentennale riferimento temperatura medio mese più freddo gennaio attestar

Map:  20%|█▉        | 6000/30009 [00:00<00:01, 14479.36 examples/s]

['questo recente scoprire rivesgere notevole importanza artistico sentimentale per comunità frate quanto unico segno visibile presenza spagna questo convento per quale realizzare suo opera maggiore', 'estensione domini monastero allargare fino distretto perugia cortonare toccare località muccignare badia casciano valle fiume nestore aggia', 'per quanto riguardare corredo tomba femminile continuare contenere gioielli mentre quello maschile arma', 'influenza umbro marchigiane dialetto monterealese essere evidenziare uso frequente articolo maschile posto aquilano', 'opera con madonna con bambino santo apostolo biagio vescovo sebastiano essere dipingere niccolò circignano rappresentare alto vergine beatitudina con bambino circondare angelo mentre parte inferiore quattro santo', 'rimanere tuttora visibile sala ottagonale mosaico che ricoprire pavimento essere rimanere intatto alcuno pezzo', 'iniziativa essere essere promuovere fine valorizzare luogo con peculiarità carattere storico cultura

Map:  27%|██▋       | 8000/30009 [00:00<00:01, 12017.15 examples/s]

['essere essere eseguire uno pittore anonimo ambito scuola perugino rappresentare padre eterno angelo madonna con bambino tra santo cristoforo rocco san romano', 'misura avere obiettivo ridurre intervento trasformazione semplificazione ambientale attraverso regolamentazione particolare attività selvo colturale agricolo', 'uno lettera novembre scrittare superiore padre cavare bucchianico parlare uno immagine', 'quinto ultimo libro denudato comminare mugliere ogni comune cui cittadino avere fare sconfinare proprio bestiame territorio gemino', 'monumento caduto lavoro cui scultura essere opera lustro essere inaugurare dicembre', 'lenanolenare essere uno frazione posta lungo strada provinciale pettino costa montagna campello', 'toponimo compare documento archivio fino xiv secolo come insula romano romanesco', 'xix secolo lago avere presentare massima variazione che conoscare metro escursione venti anno', 'questo ritratto eseguire olio tela vescovo essere raffigurare sedere con cotta bianco

Map:  40%|███▉      | 12000/30009 [00:00<00:01, 13480.81 examples/s]

['ancora sacro convento essere legare convento chiesa arce secolo xiii poco fuori abitato rocca angelo comune assisi che essere uno primo insediamento osservante conservare prezioso documento arte pittorico umbrare soprattutto quattrocentesco', 'abate benedettino grimoaldo che ottenere protezione pontificiare papa urbano ricevendone per primo bacolo pastorale che andare sostituire scettro imperiale come insegna politico abbazia intraprendere ricostruzione chiesa che essere riconsacrare solennemente', 'interessante essere cappella santissimo sacramento stile gotico con uno prezioso trittico con uno madonna con bambino tra santo pietro vittorino realizzare tra matteo inoltre potere ammirare uno annunciazione destra uno madonna trono parete destro san vittorino', 'tode quello epoca essere molto influente sia per alleanza con perugia sia per domini proprio che andare terno amelia terra conto marsciare con uno forte dominazione anche foligno', 'interno estremamente spoglio conservare uno qu

Map:  53%|█████▎    | 16000/30009 [00:01<00:01, 11002.64 examples/s]

['noto passato con nome castrum cupparum poi poggio coppa essere dominio ducato spoleto cui pagare dazio signore narnese cardole', 'altro parte via per tutto lunghezza essere accompagnare uno filare platano medio grandezza', 'castello poggio castello fondare come feudo imperiale uno preesistente torre bizantino vii sec', 'qualche anno uno cuoco avere presentare uno gara culinario internazionale vincendola con uno nome assai altisonante che depurare ricercatezza linguistico significare semplicemente polpette uovo formaggio sugo', 'inizio xix secolo essere anche preparare uno adeguato documentazione con scopo motivare richiesta autonomia caldaro tre villa ossia torre rogatto iubatto', 'fino smantellamento campo sportivo collestatto piano avere militare campionato dilettantistico umbro calcio promuovere anche uno florido settore giovanile che vedere protagonista non solo ragazzo bambino locale frazione collestatta torre orsino anche frazione valnerina città terno', 'tornare pace tutto pen

Map:  60%|█████▉    | 18000/30009 [00:01<00:00, 12110.91 examples/s]

['frazione essere lambire ferrovia aquila linea secondario binario unico trazione diesel che servire con fermata aquila gregorio porre circa centro storico', 'numeroso importante intervento restauro edificio sacro avere con tempo fare perdere ulteriore elemento databile xiii secolo tuttavia essere certo che già xvi secolo santuario essere già metare pellegrinaggio essere presente uno altare lapideo fare dono vergine popolazione', 'monumento presumere essere essere rinnovare secolo successivo suo costruzione abbandonare seguito decadenza amiternum', 'occasione termine grande guerra essere essere porre uno grande lapide con decorazione allegorio vittoria alato con uno lastra marmo centrale che mostrare nome caduto', 'alcuno osservazione potere fare struttura presente sfondo che non essere certo classico capanna presepe probabilmente divin pittore avere scegliere realizzare così per non coprire eccessivamente paesaggio che per lui avere uno notevole importanza probabilmente essere per ste

Map:  73%|███████▎  | 22000/30009 [00:01<00:00, 12373.65 examples/s]

['frate avere obbligo suonare campana convento per apertura chiusura porta ingresso castrum', 'stazione collo monte bove essere scalo ferroviario porre comune carsole che essere essere costruire servizio frazione collo monte bove', 'questo animale alloctone distruggere habitat specie autoctone mettere pericolo sopravvivenza anche predandolo privandola risorsa necessario', 'fortezza essere risultato vario intervento restauro ampliamengere protrattisi inizio suo elevazione fino termine secolo considerare come modifica necessario per adeguare migliorare scopo sviluppo funzionale', 'principale strada che interessare territorio paese essere via monte falterona via maceratolo che collegare località rispettivamente frazione foligno', 'per ottimo posizione alto collina trasimeno paese essere meto turismo presentare uno forte sviluppo agriturismo', 'epoca alto medievale essere qui documentare presenza uno chiesa intitolare giovanne battista località prugneto sede parrocchia partire colle avere 

Map:  87%|████████▋ | 26000/30009 [00:02<00:00, 13785.20 examples/s]

['località torre essere occupare uno lago artificiale dire angelo per rifornire città risorsa idroelettrico anche successivamente favorire uno turismo escursionistico cambiare clima solitamente rigido rendendolo più mito durante inverno', 'casolo benché sotto protezione britannico subire alcuno cannoneggiamente tedesco giugno provenire piana guardiagrele', 'nome paese derivare probabilmente quello dunnia legionare cui ottaviano distribuire questo territorio intorno secolo uno spiegazione alternativo dire invece che trattare apposizione termine longobardo duna suo significato latino roba', 'storico medievale indicare questo sito come quello uno tempio romano dedicare bona mater', 'stazione caprareccia essere uno stazione ferroviario porre lungo linea ferroviario scartamento ridurre spoleto norcia uno area territorio comunale spoleto', 'origariamente collocare lato scala che portare tomba santo essere ora sistemare ordine inferiore chiostro sisto ridosso parete est', 'qui sala attigue tr

Map: 100%|██████████| 30009/30009 [00:02<00:00, 12946.45 examples/s]


['avere impianto rettangolare con esterno mattone vista facciare dividere cornicione ornare portale architravare con timpano triangolare lunetta rilievo', 'paese distendere pianura tra strada statale flaminia riva destro fiume topino uno altitudine chilometro foligno essere popolare circa abitante', 'rocca essere edificare inglobare uno antichissimo luogo culto dedicare michela arcangelo fondare epoca longobardare secolo viii chiamare santi angelo flea', 'scoperta due vaso cinerare conservare antiquarium palazzo comunale segnalare presenza umano uno periodo compreso tra viii secolo tra iii secolo formarere numeroso nucleo abitato genere piccolo dimensione dedire prevalentemente attività agricolo quello artigianale', 'imperatore ottone iii fare dono paese conte longobardo offredo figlio conte nocera discendere monaldesco orvieto come ricompensa per avere militare con lui', 'chiesa marco pomeriis essere uno piccolo chiesa origine molto antico situare spoleto fondo via felice primo suburb

Map:  27%|██▋       | 2000/7503 [00:00<00:00, 10090.34 examples/s]

['giocatore dovere suo tiro tenere conto sasso pendenza buca etc venire venire praticare via rurale domenica come svago divertimento', 'lettura affresco parte alto lunettone andare sinistra verso destra per arrivare fino altezza uomo con ultimo scena', 'costruzione chiesa nuovo campanile essere completare giugno essere celebrare primo funzione', 'collegiata santo essere uno luogo culto cattolico centro storico orvieto provincia terno sire piazza repubblica', 'abitato essere lotta con rocca mezzo infeudazione volere filippo chalon principe essere concedere benagalzer', 'giare però passare tode essere fare edificare rocca regia rocca tode per volere gregorio', 'qui essere situare copia famoso statua michela arcangelo quanto originale essere custodire presso museo gente abruzzo', 'cappella orsino giovanni battista essere uno cappella situare fondo transetto sinistro basilica inferiore assisi', 'vedere grande boom avere mercato sempre più spesso organizzare anche mercato fuori classico gio

Map:  53%|█████▎    | 4000/7503 [00:00<00:00, 12884.05 examples/s]

['tettonicail centro abitato poggiare uno sinclinale proseguimento fossa tettonico valle orta', 'società essere nicola becchetto quinto europeo faddo settimo europeo oltre ottimo atleta come sonaglia lascialfario che detengere record mondiale atterramento uno minuto', 'chiesa avere pianta rettangolare movimentato facciare includere base uno portico arco che precedere ingresso dare portale romanico tutto sesto con strombatura', 'con suo chilometro lunghezza chilometro larghezza pari chilometro quadrato superficie marsile rappresentare uno vulcano più estese', 'chiamare essere cappella tega cognome suo vecchio proprietario scopritore affresco che cappellere essere internamente costituire uno ambiente unico pianta rettangolare volta crociera', 'abbandono attività molitorio fiorente medioevo avere fare che mulino subissero uno inevitabile degrado accompagnare crescita uno rigoglioso vegetazione', 'unico beneficio che paese ricavò essere introduzione uno maestro fino terzo elementare creazi

Map: 100%|██████████| 7503/7503 [00:00<00:00, 11721.13 examples/s]

['con uno lettera papa giovanno xxii tra primo testimonianza scrivere tale proprietà vescovo simone bagnoregio rivendicare possedimento condannare decisione uno suo predecessore rustico circa vendere oltre uno secolo prima templare', 'primo metà cinquecento chiesa avere uno solo navata con uno altare maggiore rialzato rispetto piano pavimento due altario laterale uno cornu epistolae altro cornu evangelo dedicare rispettivamente sacro cuore madonna rosario', 'agosto ora essere essere riaprire con uno senso unico alternato regolare semaforo', 'tra altario laterale segnalare quello giovanni battista cui pala attribuire fiorentino bernardino lorenzo essere donare chiesa principe antonio medico quando visitare stato capestrare che suo famiglia avere acquistare piccolomine', 'luogo prediletgere turista per suo posizione incantevole essere anche sede uno cava', 'grazie anche sviluppo industriale essere divenire sede fabbrica come arrow produzione impianto scarico essere sede anche nardo stori




In [25]:
train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])

In [26]:
print(train_dataset[0])  # Let's check that it contains input_ids, token_type_ids, attention_mask

{'input_ids': [102, 6497, 10265, 2698, 16839, 255, 10507, 2039, 10237, 1863, 2585, 14221, 796, 2965, 552, 2695, 115, 159, 1785, 935, 985, 103, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


We start training a fine-tuned HuggingFace model on the custom dataset.

**TrainingArguments:**
* output_dir="./results": La directory in cui verranno salvati i risultati del modello, inclusi i checkpoint.
* evaluation_strategy="epoch": La valutazione verrà eseguita ad ogni epoca.
* learning_rate=2e-5: Imposta il tasso di apprendimento.
* per_device_train_batch_size=16: Imposta la dimensione del batch per ogni dispositivo (GPU o CPU).
* num_train_epochs=3: Numero di epoche per l'addestramento.
* weight_decay=0.01: Decadimento del peso per evitare l'overfitting.
* save_strategy="epoch": Salva il modello ad ogni epoca.
* logging_dir="./logs": La directory in cui verranno salvati i log dell'addestramento.

In [27]:
# Let's set up training
training_args = TrainingArguments(
    output_dir="./fine_tuned_bert_POI",           # Directory to save templates
    evaluation_strategy="epoch",     
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
)



**Trainer:**
* model=model: The model to train.
* args=training_args: The training parameters previously defined.
* train_dataset=train_dataset: The training dataset.
* eval_dataset=val_dataset: The validation dataset.
* tokenizer=tokenizer: The tokenizer to use for tokenizing the text.
* data_collator=data_collator: A collator for batch handling during training, which helps manage padding.

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tuning
trainer.train()

In [32]:
import torch

# We save the model by making the tensors contiguous
state_dict = trainer.model.state_dict()
for key, param in state_dict.items():
    if not param.is_contiguous():
        state_dict[key] = param.contiguous()

In [34]:
tokenizer.save_pretrained("./fine_tuned_bert_POI")
torch.save(state_dict, "./fine_tuned_bert_POI/pytorch_model.bin")

In [35]:
# Define the path to the fine-tuned BERT model
bert_model_path = "./fine_tuned_bert_POI"  # Path to the directory containing the fine-tuned BERT model

# Initialize a CTM (Contextualized Topic Model) using the fine-tuned BERT model
model_finetuned = CTM(
    bert_path=bert_model_path,  # Specify the fine-tuned BERT model for the CTM
    num_epochs=50,             # Set the number of training epochs to 50
    num_topics=10,             # Define the number of topics to extract
    inference_type='zeroshot', # Use zero-shot inference, which doesn't require labeled data
    use_partitions=False       # Disable partitioning to work with the entire dataset at once
)

In [36]:
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')
topic_diversity = TopicDiversity(topk=10)
model_outputs_finetuned = model_finetuned.train_model(dataset)   # train the model

Batches: 100%|██████████| 376/376 [09:53<00:00,  1.58s/it]


In [37]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_outputs_finetuned)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_outputs_finetuned)
print("NPMI: "+str(npmi_score))

Topic diversity: 1.0
NPMI: -0.020117144899126853


In [38]:
for t in model_outputs_finetuned['topics']:
  print(" ".join(t))

specie temperatura agricoltura media animale freddo impresa medio economia diffuso
monastero feudo abbazia papa xii benedettino iii monace viii famiglia
ferrovia fermata stazione circoscrizione viterbo unione treno dialetto ferroviario linea
domenica patronale girare guida pagina annibale tipografia iccu festeggiare giuliare
madonna bambino raffigurare affresco tela santo vergine altare cristo dipinto
nota registro travertino fabbricato angolare leggermente fila ciascuno acere suddividere
uno essere interno arco piccolo cui mentre chiesa resto con
monte est valle nord fiume sud alto ovest torrente sasso
per che questo avere non anche suo come quello tutto
museo culturale biblioteca civico archeologico museale studio arte reperto archivio


### Topic Coherence Optimization

In [39]:
search_space = {"num_layers": Categorical({1, 2, 3}),
                "num_neurons": Categorical({100, 200, 300}),
                "activation": Categorical({'sigmoid', 'relu', 'softplus'}),
                "dropout": Real(0.0, 0.95),
                "num_topics": Integer(low=5,high=15)
}

In [40]:
optimization_runs=15
model_runs=1

In [42]:
optimizer=Optimizer()
optimization_result = optimizer.optimize(
    model_finetuned, dataset, npmi, search_space, number_of_call=optimization_runs,
    model_runs=model_runs, save_models=False,
    extra_metrics=None, # to keep track of other metrics
    save_path='/home/MindHard/fpulera/topic extraction/CTM/results_finetuned')
optimization_result.save_to_csv("/home/MindHard/fpulera/topic extraction/CTM/results_finetuned/results.csv")

Current call:  0
Current call:  1
Current call:  2
Current call:  3
Current call:  4
Current call:  5
Current call:  6
Current call:  7
Current call:  8
Current call:  9
Current call:  10
Current call:  11
Current call:  12
Current call:  13
Current call:  14


In [43]:
import pandas as pd  # Import the pandas library for data manipulation and analysis

# Load the CSV file containing the results of the topic modeling optimization into a DataFrame
df = pd.read_csv("/home/MindHard/fpulera/topic extraction/CTM/results_finetuned/results.csv")

# Find the index of the row with the maximum value in the 'Mean(model_runs)' column
indice_max_mean = df['Mean(model_runs)'].idxmax()

# Extract the parameters corresponding to the maximum mean value
# Includes columns: 'Mean(model_runs)', 'num_iteration', 'activation', 'num_layers', 'num_neurons', 'num_topics'
parametri_max_mean = df.loc[indice_max_mean, ['Mean(model_runs)', 'num_iteration', 'activation', 'num_layers', 'num_neurons', 'num_topics']]

# Print the parameters associated with the maximum mean value for topic diversity
print("Parameters associated with the maximum mean value in the topic diversity column:")
print(parametri_max_mean)

Parameters associated with the maximum mean value in the topic diversity column:
Mean(model_runs)    0.080905
num_iteration              7
activation          softplus
num_layers                 1
num_neurons              300
num_topics                15
Name: 7, dtype: object


In [44]:
from octis.models.CTM import CTM  # Import the CTM class from OCTIS for topic modeling

# Define the CTM (Contextualized Topic Model) with the following parameters:
finetuned = CTM(
    num_epochs=30,  # Train the model for 30 epochs
    inference_type='zeroshot',  # Use zeroshot inference type, which works without training data labels
    bert_path=bert_model_path,  # Specify the pre-trained BERT model for contextual embeddings
    num_topics=15,  # Set the number of topics to discover in the dataset
    num_neurons=300,  # Number of neurons in the hidden layer(s) of the model
    num_layers=1,  # Use a single hidden layer in the model's architecture
    activation="softplus",  
    use_partitions=False  # Disable the use of data partitions for training and validation
)

In [45]:
finetuned_npmi = finetuned.train_model(dataset)

In [46]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(finetuned_npmi)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(finetuned_npmi)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.9666666666666667
Coherence: 0.025738422017605907


In [47]:
for t in finetuned_npmi['topics']:
  print(" ".join(t))

castello città fino abbazia romano suo quando poi fare questo
area alto lago parco valle naturale sud tra orientale nord
aula tetto ciascuno nota triangolare registro cornico organo mattone soffitto
cura guida stampa edizione pescarare conferenza ufficio episcopale iccu isbn
uno piano piccolo due pietra centrale ingresso arco sopra presentare
diffuso artigianale condizione causare lavorazione coltivazione grave sismico danno economia
bambino vergine angelo raffigurare tela caterina attribuire maestro madonna opera
che non quello questo essere avere potere molto per più
per teatro archeologico locale periodo realizzazione opera reperto anno avere
xii risalire xix sec secolo preesistente xiv viii risalge xiii
affluente raggiungibile collina vetta pianura bove distanza dominare nestore panorama
medico figlio duca cardinale marchese conte papa titolo nipote fratello
squadra calcio campionato festa categoria svolgere agosto mondiale settimana domenica
chiesa convento essere dedicare palazzo

In [49]:
topics = pd.DataFrame(finetuned_npmi['topics'])
topics.to_excel('/home/MindHard/fpulera/topic extraction/CTM/results_finetuned/topics.xlsx', index=False)

In [51]:
vocabolario = []
with open('/home/MindHard/fpulera/topic extraction/CTM/total_dataset/vocabulary.txt', 'r', encoding='utf-8') as file:
    for word in file:
        vocabolario.append(word)

In [52]:
# Import the pandas library for handling data
import pandas as pd

# Load the topic-word matrix from the output dictionary, assumed to contain the results from the model
topic_word_matrix = pd.DataFrame(finetuned_npmi['topic-word-matrix'])

# Assign column names to the matrix based on the vocabulary terms
# Each column index is replaced by the corresponding word from the vocabulary
topic_word_matrix.columns = [vocabolario[i] for i in range(topic_word_matrix.shape[1])]

# Transpose the topic-word matrix to switch rows and columns
# This allows words to become rows and topics to become columns for easier readability
inverted_topic_word_matrix = topic_word_matrix.transpose()

# Print the transposed matrix to visually inspect the results
print(inverted_topic_word_matrix)

# Save the transposed topic-word matrix to an Excel file for external analysis or reporting
inverted_topic_word_matrix.to_excel(
    '/home/MindHard/fpulera/topic extraction/CTM/results_finetuned/topic_word_matrix.xlsx',  # Specify the file path
    index=False  # Exclude row indices in the saved Excel file
)

                    0         1         2         3         4         5   \
aas\n        -0.474970 -0.436221  0.117872  0.188375 -0.343957  0.068956   
aavv\n       -0.454619 -0.375474  0.045820  0.172344 -0.340137  0.016513   
aba\n        -0.527001 -0.485624  0.171582  0.163997 -0.226910  0.041067   
abaco\n      -0.734229 -0.681137  0.100066  0.250640 -0.354185  0.116954   
abamonte\n   -0.466051 -0.449265  0.120388  0.177584 -0.344603  0.073326   
...                ...       ...       ...       ...       ...       ...   
όλυμπος\n    -0.472407 -0.444008  0.145022  0.221812 -0.339362  0.089982   
ἐκπεριάγων\n -0.426525 -0.412013  0.099106  0.159357 -0.310009  0.054073   
ἰχθύς\n      -0.477525 -0.444306  0.155568  0.157094 -0.361132  0.057064   
ὑπὸ\n        -0.425898 -0.411389  0.098950  0.159109 -0.309557  0.053964   
ﬁume\n       -0.472349 -0.404042  0.104350  0.201090 -0.336174  0.053653   

                    6         7         8         9         10        11  \
aas\n      

In [53]:
import pandas as pd

corpus_processed = pd.read_csv('/home/MindHard/fpulera/topic extraction/CTM/total_dataset/corpus.tsv', sep='\t', header=None)
corpus_processed
new_df = pd.DataFrame(corpus_processed[0])

In [54]:
num = 1
for topic in finetuned_npmi['topic-document-matrix']:
  for i in range(0,len(topic)):
    new_df.loc[i,num] = topic[i]
  num = num+1
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,stazione acquasparta essere uno stazione ferro...,0.002442,0.008499,0.008433,0.004648,0.007279,0.005415,0.002249,0.003716,0.014422,0.004267,0.004649,0.001983,0.001500,0.002626,0.927871
1,fabbricato viaggiatore sviluppare due livello ...,0.013940,0.010078,0.067213,0.114659,0.410038,0.033829,0.026232,0.025990,0.030273,0.037029,0.032145,0.053111,0.032926,0.044474,0.068063
2,fabbricato principale diramare poi due corpo m...,0.010893,0.008271,0.090539,0.091143,0.456140,0.034805,0.018377,0.021038,0.020162,0.038421,0.023066,0.045293,0.037134,0.038211,0.066509
3,solo parte piano terra essere fruibile per via...,0.010100,0.013677,0.023056,0.063137,0.492502,0.017117,0.016817,0.037110,0.175241,0.023550,0.015661,0.023874,0.020774,0.037473,0.029910
4,condizione fabbricato essere piuttosto buono t...,0.024626,0.038266,0.290221,0.029117,0.077790,0.100343,0.050992,0.041032,0.054734,0.037286,0.056443,0.041234,0.062457,0.071992,0.023465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37507,casalbordino cuasàlë abruzzese essere uno comu...,0.016275,0.030045,0.013096,0.014986,0.012244,0.005271,0.005299,0.004897,0.020887,0.010421,0.022451,0.003743,0.007585,0.025653,0.807148
37508,trovare circa chilometro costa adriatica uno c...,0.002642,0.021910,0.011588,0.029645,0.016016,0.009380,0.016480,0.010726,0.005351,0.013942,0.797817,0.007598,0.012495,0.005977,0.038435
37509,lido casalbordino attrezzare frequentare local...,0.062639,0.595576,0.010677,0.018091,0.029250,0.026486,0.012800,0.028359,0.019649,0.014303,0.101381,0.013802,0.016970,0.024580,0.025436
37510,nome volere risalga uno condottiero quello per...,0.019133,0.040674,0.031503,0.009853,0.015250,0.027630,0.020089,0.022348,0.006757,0.049381,0.015759,0.575413,0.130421,0.016809,0.018979


In [55]:
new_df.to_excel('/home/MindHard/fpulera/topic extraction/CTM/results_finetuned/topic-document-matrix.xlsx', index=False)