In [1]:
import os
os.chdir(os.path.pardir)
from dataset.dataset import Dataset
from evaluation_metrics.diversity_metrics import Topic_diversity
from evaluation_metrics.topic_significance_metrics import KL_uniform
from skopt import gp_minimize, forest_minimize, dummy_minimize
from optimization.optimizer import Optimizer
from skopt.space.space import Real, Integer
import multiprocessing as mp
from models.TorchAvitm import TorchAvitm
from gensim.models import Word2Vec

In [16]:
# Load dataset
dataset = Dataset()
dataset.load("preprocessed_datasets/newsgroup/newsgroup_lemmatized_10")

True

In [17]:
# Load model
model = TorchAvitm()

In [18]:

# Set model hyperparameters
model.hyperparameters['prior_variance'] = 0.2
model.hyperparameters['n_components'] = 5
model.hyperparameters['num_epochs'] = 1

In [19]:
model.hyperparameters


{'prior_variance': 0.2, 'n_components': 5, 'num_epochs': 1}

In [39]:
#Set true for splitting the dataset into train and test
model.test_set(True)

In [40]:
#Train the model
model.train_model(dataset, model.hyperparameters)

Settings: 
               N Components: 5
               Topic Prior Mean: 0.0
               Topic Prior Variance: 0.2
               Model Type: prodLDA
               Hidden Sizes: (100, 100)
               Activation: softplus
               Dropout: 0.2
               Learn Priors: True
               Learning Rate: 0.002
               Momentum: 0.99
               Reduce On Plateau: False
               Save Dir: None
Epoch: [1/1]	Samples: [10758/10758]	Train Loss: 572.939630205247	Time: 0:00:01.410117


{'topics': [['think',
   'point',
   'take',
   'time',
   'much',
   'keep',
   'could',
   'want',
   'would',
   'first'],
  ['make',
   'different',
   'first',
   'can',
   'follow',
   'time',
   'life',
   'want',
   'show',
   'go'],
  ['reaction',
   'bug',
   'additional',
   'sick',
   'worry',
   'upgrade',
   'electronic',
   'assumption',
   'notice',
   'relation'],
  ['popular',
   'relatively',
   'external',
   'definitely',
   'component',
   'damn',
   'hate',
   'wave',
   'aware',
   'nee'],
  ['take',
   'point',
   'time',
   'could',
   'play',
   'go',
   'good',
   'may',
   'look',
   'need']],
 'topic-word-matrix': array([[-0.06387644, -0.02884979, -0.12014884, ..., -0.06780745,
         -0.02669834,  0.00446412],
        [ 0.0072318 ,  0.03961246,  0.01168362, ...,  0.02178026,
          0.00681433,  0.00666754],
        [ 0.12584908, -0.12996095,  0.0762203 , ...,  0.13398783,
         -0.06785965,  0.02153581],
        [-0.03130642,  0.07545217,  0.08238

In [41]:
#Apply the predict method for the test set
model.inference()

{'topics': [['think',
   'point',
   'take',
   'time',
   'much',
   'keep',
   'could',
   'want',
   'would',
   'first'],
  ['make',
   'different',
   'first',
   'can',
   'follow',
   'time',
   'life',
   'want',
   'show',
   'go'],
  ['reaction',
   'bug',
   'additional',
   'sick',
   'worry',
   'upgrade',
   'electronic',
   'assumption',
   'notice',
   'relation'],
  ['popular',
   'relatively',
   'external',
   'definitely',
   'component',
   'damn',
   'hate',
   'wave',
   'aware',
   'nee'],
  ['take',
   'point',
   'time',
   'could',
   'play',
   'go',
   'good',
   'may',
   'look',
   'need']],
 'topic-word-matrix': array([[-0.06387644, -0.02884979, -0.12014884, ..., -0.06780745,
         -0.02669834,  0.00446412],
        [ 0.0072318 ,  0.03961246,  0.01168362, ...,  0.02178026,
          0.00681433,  0.00666754],
        [ 0.12584908, -0.12996095,  0.0762203 , ...,  0.13398783,
         -0.06785965,  0.02153581],
        [-0.03130642,  0.07545217,  0.08238

In [30]:
'''
Check vocabolary keys differences
diz_tr = model.X_train.__dict__['idx2token']
inv_map = {v: k for k, v in diz_tr.items()}
diz_da = dataset.get_vocabulary()
set(inv_map.keys()) == set(diz_da.keys())
len(diz_da.keys())
len(inv_map.keys())
list1 = list(inv_map.keys())
list2 = list(diz_da.keys())
set_difference = set(list2) - set(list1)
list_difference = list(set_difference)
list_difference
'''

In [None]:
# Topic diversity
topic_diversity = Topic_diversity()

# KL_Uniform
#kl_uniform = KL_uniform()

In [None]:
# Define optimization parameters
opt_params = {}
opt_params["n_calls"] = 30
opt_params["minimizer"] = forest_minimize
opt_params["different_iteration"] = 3
opt_params["n_random_starts"] = 5
#opt_params["extra_metrics"] = [kl_uniform] # List of extra metrics
opt_params["n_jobs"] = mp.cpu_count() -1 # Enable multiprocessing
opt_params["verbose"] = True
opt_params["save_path"] = "results" #create folder if it doesn't exist 

In [None]:
# Create search space for optimization
search_space = {
    "num_epochs": Integer(low=1, high=50),
    #"eta": Real(low=0.01, high=5.0)
}

In [None]:
# Initialize optimizer
optimizer = Optimizer(
    model,
    dataset,
    topic_diversity,
    search_space,
    opt_params)

In [None]:
# Disable computing of topic document matrix to optimize performance
optimizer.topic_document_matrix = False
optimizer.topic_word_matrix = False

In [None]:
# Optimize
res = optimizer.optimize()

print(res.hyperparameters) # Best values for the hyperparameters
print(res.function_values) # Score of the optimized metric
print("Optimized metric: "+res.optimized_metric)