In [1]:
import os
# Switch the working directory to its parent before the project imports below
# (presumably the notebook lives one level under the project root — confirm).
# NOTE(review): the move is relative to the *current* directory, so re-running
# this cell climbs one more level each time; restart the kernel before re-running.
os.chdir(os.path.pardir)
from dataset.dataset import Dataset
from evaluation_metrics.diversity_metrics import Topic_diversity
from evaluation_metrics.topic_significance_metrics import KL_uniform
from skopt import gp_minimize, forest_minimize, dummy_minimize
from optimization.optimizer import Optimizer
from skopt.space.space import Real, Integer
import multiprocessing as mp
from models.TorchAvitm import TorchAvitm
from gensim.models import Word2Vec

In [2]:
# Build an empty Dataset and fill it from the preprocessed corpus on disk.
# The path is relative to the working directory chosen in the first cell.
newsgroup_path = "preprocessed_datasets/newsgroup/newsgroup_lemmatized_5"
dataset = Dataset()
dataset.load(newsgroup_path)

True

In [3]:
# Instantiate the model with its default constructor arguments
# (nothing is loaded from disk here; hyperparameters are set in the next cell)
model = TorchAvitm()

In [4]:
# Configure the model in a single step: prior variance, number of
# components and number of training epochs (same keys, same order)
model.hyperparameters.update({
    'prior_variance': 0.2,
    'n_components': 5,
    'num_epochs': 1,
})

In [5]:
# Display the hyperparameters currently stored on the model as a sanity check
model.hyperparameters


{'prior_variance': 0.2, 'n_components': 5, 'num_epochs': 1}

In [6]:
# NOTE(review): presumably switches the model to also use the dataset's test
# partition — confirm against TorchAvitm.test_set
model.test_set(True)

In [7]:
# Train once with the hyperparameters set above; the displayed result is a dict
# that includes at least the 'topics' and 'topic-word-matrix' entries
model.train_model(dataset, model.hyperparameters)

Settings: 
               N Components: 5
               Topic Prior Mean: 0.0
               Topic Prior Variance: 0.2
               Model Type: prodLDA
               Hidden Sizes: (100, 100)
               Activation: softplus
               Dropout: 0.2
               Learn Priors: True
               Learning Rate: 0.002
               Momentum: 0.99
               Reduce On Plateau: False
               Save Dir: None
Epoch: [1/1]	Samples: [10840/10840]	Train Loss: 16937.352196725093	Time: 0:00:01.644874


{'topics': [['ability',
   'furthermore',
   'cap',
   'attach',
   'month',
   'rational',
   'flight',
   'serve',
   'mind',
   'public'],
  ['economy',
   'guess',
   'become',
   'guarantee',
   'efficient',
   'clean',
   'loss',
   'stone',
   'always',
   'compression'],
  ['stone',
   'become',
   'always',
   'efficient',
   'guarantee',
   'typical',
   'clean',
   'reliable',
   'moral',
   'price'],
  ['ability',
   'furthermore',
   'cap',
   'rational',
   'have',
   'normal',
   'child',
   'cause',
   'flight',
   'horse'],
  ['conversation',
   'rent',
   'expensive',
   'input',
   'reasonably',
   'fed',
   'programming',
   'far',
   'online',
   'ill']],
 'topic-word-matrix': array([[ 0.0501453 , -0.04953362, -0.06668814, ..., -0.12839419,
         -0.09486026, -0.08121524],
        [ 0.03862135, -0.02269747, -0.03935468, ...,  0.071284  ,
          0.0573212 ,  0.11826298],
        [ 0.04886413, -0.04023396, -0.04842158, ..., -0.00322858,
         -0.00502012,  0

In [8]:
# Run inference on the model; the recorded output is a list of 10-element
# integer tensors on cuda:0
# NOTE(review): what these integers index is not evident from this notebook — TODO document
model.inference()

[tensor([ 309,  457, 1897, 1984,  359,  341,    2, 1625, 1060,  528],
        device='cuda:0'),
 tensor([   0,  120,  457, 2146,  600, 1897, 1097, 1169, 2038,  952],
        device='cuda:0'),
 tensor([1169, 1888,  904, 1526, 1277, 1026,  539,  357,  636, 1399],
        device='cuda:0'),
 tensor([   0,  120, 1169, 1526,  952, 1888,  539,  904,  600, 1277],
        device='cuda:0'),
 tensor([1169,    0,  120, 1526,  904,  539, 1277,  952, 1888,  636],
        device='cuda:0'),
 tensor([1942,   93, 1112, 1546,   12,  765, 1012, 1890, 2050,  638],
        device='cuda:0'),
 tensor([ 357, 1156,  453,  234, 1277, 1276, 1888,  539,  961, 1948],
        device='cuda:0'),
 tensor([ 144, 1311, 1663, 1062, 1521, 1287, 1708, 1444, 2058,   95],
        device='cuda:0'),
 tensor([ 794,  898, 2033,  728, 2069,  936,  924,  909,  741,  278],
        device='cuda:0'),
 tensor([1184,  984, 1120, 2019, 1304, 1846,  680, 1282, 1719,  803],
        device='cuda:0'),
 tensor([   0,  120, 1169, 1526, 1888,  

In [5]:
# Topic diversity: the metric object later passed to the Optimizer
topic_diversity = Topic_diversity()

# KL_Uniform: disabled; re-enable together with the commented-out
# "extra_metrics" entry in the optimization parameters
#kl_uniform = KL_uniform()

In [6]:
# Define optimization parameters.
# The keys are read by the project's Optimizer; n_calls, different_iteration,
# n_random_starts and minimizer are echoed in its start-up banner.
opt_params = {
    "n_calls": 30,
    "minimizer": forest_minimize,
    "different_iteration": 3,
    "n_random_starts": 5,
    #"extra_metrics": [kl_uniform], # List of extra metrics
    # Enable multiprocessing while leaving one core free. Clamp to at least one
    # worker: on a single-core machine cpu_count() - 1 is 0, and joblib-style
    # backends reject n_jobs == 0.
    "n_jobs": max(1, mp.cpu_count() - 1),
    "verbose": True,
    "save_path": "results",
}

# Create the output folder up front instead of relying on a manual step;
# exist_ok=True makes this safe to re-run.
os.makedirs(opt_params["save_path"], exist_ok=True)

In [7]:
# Search space for the optimization: one entry per hyperparameter to tune.
# Only the number of training epochs (an integer from 1 to 50) is searched.
search_space = dict(
    num_epochs=Integer(low=1, high=50),
    #eta=Real(low=0.01, high=5.0)
)

In [8]:
# Build the optimizer; arguments are positional, in this order:
# model, dataset, metric, search space, optimization parameters
optimizer = Optimizer(model, dataset, topic_diversity, search_space, opt_params)

In [9]:
# Switch off both optional matrices on the optimizer (topic-document first,
# then topic-word) to optimize performance
optimizer.topic_document_matrix = optimizer.topic_word_matrix = False

In [10]:
# Optimize
# NOTE(review): in the recorded run this call stops on the first iteration with
# KeyError: 'n_components'. Presumably the optimizer forwards only the keys of
# search_space to the model, so values set directly on model.hyperparameters
# are missing — verify against Optimizer and TorchAvitm.train_model.
res = optimizer.optimize()

# Report the outcome; each line is printed as soon as it is evaluated
print(res.hyperparameters) # Best values for the hyperparameters
print(res.function_values) # Score of the optimized metric
# String concatenation: this requires res.optimized_metric to be a str
print("Optimized metric: "+res.optimized_metric)

------------------------------------------
------------------------------------------
Bayesian optimization parameters:
-n_calls:  30 
-different_iteration:  3 
-n_random_starts:  5 
-minimizer:  forest_minimize 
-acq_func:  LCB 
-kernel:  1**2 * Matern(length_scale=1, nu=1.5)
------------------------------------------
Iteration No: 1 started. Evaluating function at random point.


KeyError: 'n_components'