In [1]:
import os
# Step up one directory so that the relative "preprocessed_datasets/..." path
# used when loading the dataset (and, presumably, the project-local imports
# below) resolve.
# The guard makes this cell idempotent: an unconditional chdir("..") climbs one
# directory further on every re-run of the cell within the same kernel.
# NOTE(review): assumes "preprocessed_datasets" only exists in the project
# root and not next to this notebook -- confirm.
if not os.path.isdir("preprocessed_datasets"):
    os.chdir(os.path.pardir)
from dataset.dataset import Dataset
from evaluation_metrics.diversity_metrics import Topic_diversity
from evaluation_metrics.topic_significance_metrics import KL_uniform
from skopt import gp_minimize, forest_minimize, dummy_minimize
from optimization.optimizer import Optimizer
from skopt.space.space import Real, Integer
import multiprocessing as mp
from models import TorchETM
import torch
import numpy as np

In [2]:
# Location of the preprocessed corpus, relative to the current working directory.
DATASET_PATH = "preprocessed_datasets/newsgroup/newsgroup_lemmatized_10"

dataset = Dataset()
# Kept as the last expression so the notebook displays the value returned by
# load() (the recorded output of this cell is True).
dataset.load(DATASET_PATH)

True

In [3]:
# Seed NumPy and PyTorch before the model is created so that repeated
# "Restart & Run All" executions start from the same random state.
# NOTE(review): whether ETM_Wrapper or the optimizer re-seed internally is not
# visible from this notebook -- confirm.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Create the ETM wrapper; its default hyperparameters are adjusted in the
# following cells before training.
model = TorchETM.ETM_Wrapper()

In [7]:
# Override two of the wrapper's hyperparameters, leaving every other entry
# untouched. Insertion order of the mapping keeps the assignments in the same
# order as before (num_epochs first, then enc_drop).
hyperparameter_overrides = {'num_epochs': 3, 'enc_drop': 0.1}
for name, value in hyperparameter_overrides.items():
    model.hyperparameters[name] = value

In [8]:
# Switch the wrapper's partitioning option on.
# NOTE(review): presumably this makes training use the dataset's predefined
# partitions (e.g. train/test) -- confirm against ETM_Wrapper.partitioning.
model.partitioning(True)

In [10]:
# Options for a single stand-alone training run: request 10 top words per
# topic and skip both optional matrices in the returned result.
training_options = dict(
    top_words=10,
    topic_word_matrix=False,
    topic_document_matrix=False,
)

# Kept as the last expression so the notebook displays the returned dictionary
# (the recorded output shows a 'topics' entry with the word lists).
model.train_model(dataset, model.hyperparameters, **training_options)

True pre_set_model
True pre_set_default_hyp
model: ETM(
  (t_drop): Dropout(p=0.1, inplace=False)
  (theta_act): ReLU()
  (rho): Linear(in_features=300, out_features=1268, bias=False)
  (alphas): Linear(in_features=300, out_features=10, bias=False)
  (q_theta): Sequential(
    (0): Linear(in_features=1268, out_features=800, bias=True)
    (1): ReLU()
    (2): Linear(in_features=800, out_features=800, bias=True)
    (3): ReLU()
  )
  (mu_q_theta): Linear(in_features=800, out_features=10, bias=True)
  (logsigma_q_theta): Linear(in_features=800, out_features=10, bias=True)
)
True post_set_model
****************************************************************************************************
Epoch----->1 .. LR: 0.005 .. KL_theta: 0.06 .. Rec_loss: 515.16 .. NELBO: 515.22
****************************************************************************************************
****************************************************************************************************
Epoch----->2 .. L

{'topics': [['victim',
   'playoff',
   'card',
   'library',
   'turn',
   'past',
   'license',
   'be',
   'school',
   'generation'],
  ['victim',
   'playoff',
   'license',
   'be',
   'past',
   'school',
   'library',
   'method',
   'turn',
   'card'],
  ['victim',
   'playoff',
   'license',
   'be',
   'past',
   'library',
   'card',
   'school',
   'method',
   'generation'],
  ['victim',
   'playoff',
   'be',
   'license',
   'past',
   'library',
   'card',
   'school',
   'turn',
   'method'],
  ['victim',
   'playoff',
   'turn',
   'past',
   'library',
   'card',
   'license',
   'be',
   'school',
   'generation'],
  ['victim',
   'license',
   'be',
   'playoff',
   'past',
   'school',
   'method',
   'soldier',
   'volume',
   'encryption'],
  ['victim',
   'playoff',
   'past',
   'license',
   'be',
   'library',
   'card',
   'school',
   'turn',
   'method'],
  ['victim',
   'playoff',
   'turn',
   'library',
   'card',
   'past',
   'be',
   'license',
   

In [14]:
# Metric handed to the optimizer as the objective: topic diversity, built with
# its default parameters.
topic_diversity = Topic_diversity()

# Optional extra metric (KL divergence from uniform), currently disabled.
# Re-enable it together with the "extra_metrics" entry of opt_params.
#kl_uniform = KL_uniform()

In [15]:
# Define optimization parameters.
# Built as a single literal (same keys, same order as before) so the whole
# configuration can be read at a glance.
opt_params = {
    "n_calls": 30,
    "minimizer": forest_minimize,
    # NOTE(review): the optimizer log prints "optimization_runs: 3", which
    # presumably corresponds to this entry -- confirm.
    "different_iteration": 3,
    "n_random_starts": 5,
    # "extra_metrics": [kl_uniform],  # List of extra metrics
    # Enable multiprocessing while leaving one core free. Clamped to at least 1
    # because cpu_count() - 1 evaluates to 0 on a single-CPU machine, and 0 is
    # not a valid worker count for joblib-backed estimators.
    "n_jobs": max(1, mp.cpu_count() - 1),
    "verbose": True,
    "save_path": "results",  # create folder if it doesn't exist
}

In [20]:
# Search space for the optimization: only the number of training epochs is
# tuned, as an integer between 5 and 10.
num_epochs_range = Integer(low=5, high=10)
search_space = {"num_epochs": num_epochs_range}
# A further dimension that is currently disabled:
#     "eta": Real(low=0.01, high=5.0)

In [21]:
# Build the optimizer from, in positional order: the model wrapper, the
# dataset, the metric to optimize, the search space and the optimization
# parameters.
optimizer_args = (model, dataset, topic_diversity, search_space, opt_params)
optimizer = Optimizer(*optimizer_args)

In [22]:
# Turn off computation of both the topic-document and the topic-word matrix
# to optimize performance. Chained assignment sets the targets left to right,
# i.e. topic_document_matrix first, then topic_word_matrix.
optimizer.topic_document_matrix = optimizer.topic_word_matrix = False

In [23]:
# Run the hyperparameter optimization.
# NOTE(review): the recorded output of this cell ends with
# "Exception: Words in topics are less than 10", i.e. the saved run did not
# complete. The cause is not visible from this notebook -- investigate before
# relying on the summary printed below.
res = optimizer.optimize()

# Best values for the hyperparameters
best_hyperparameters = res.hyperparameters
print(best_hyperparameters)

# Score of the optimized metric
metric_scores = res.function_values
print(metric_scores)

# Plain concatenation is kept on purpose: it expects optimized_metric to be a str.
print("Optimized metric: " + res.optimized_metric)


------------------------------------------
------------------------------------------
Bayesian optimization parameters:
-n_calls:  30 
-optimization_runs:  3 
-model_runs:  10 
-n_random_starts:  5 
-minimizer:  forest_minimize
-acq_func:  LCB
------------------------------------------
Iteration No: 1 started. Evaluating function at random point.
True pre_set_model
True pre_set_default_hyp
model: ETM(
  (t_drop): Dropout(p=0.1, inplace=False)
  (theta_act): ReLU()
  (rho): Linear(in_features=300, out_features=1268, bias=False)
  (alphas): Linear(in_features=300, out_features=10, bias=False)
  (q_theta): Sequential(
    (0): Linear(in_features=1268, out_features=800, bias=True)
    (1): ReLU()
    (2): Linear(in_features=800, out_features=800, bias=True)
    (3): ReLU()
  )
  (mu_q_theta): Linear(in_features=800, out_features=10, bias=True)
  (logsigma_q_theta): Linear(in_features=800, out_features=10, bias=True)
)
True post_set_model
****************************************************

Exception: Words in topics are less than 10