<a href="https://colab.research.google.com/github/JuanJoseMV/neuraltextgen/blob/main/grid_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

In [1]:
%%capture
# Fetch the project (including its git submodules) and install the Python
# requirements of the bundled texygen checkout plus simpletransformers.
# %%capture hides the (long) clone/pip output of this cell.
!git clone --recursive https://github.com/JuanJoseMV/neuraltextgen.git
!pip install -r /content/neuraltextgen/texygen/requirements.txt
!pip install simpletransformers

In [12]:
import sys
import os
import numpy as np

# Switch to the cloned repository root before importing the generator from it.
os.chdir("/content/neuraltextgen/")
from NeuralTextGenerator import BertTextGenerator

# Forwarded as `use_apex` when the generator is built; a later optional cell
# may set it to True after installing apex.
APEX_AVAILABLE = False
NUM_TEST = 5 # number of runs for each set of parameters

In [13]:
from google.colab import drive
# Mount Google Drive under /content/drive (presumably only needed when results
# are written to the Drive folder instead of /content).
drive.mount('/content/drive')

KeyboardInterrupt: ignored

In [None]:
%%writefile setup.sh
# Clone NVIDIA apex and install it from source with its C++ and CUDA
# extensions requested.
# CUDA_HOME is pinned to the CUDA 10.1 toolkit path (presumably read by the
# extension build -- TODO confirm it matches the runtime's CUDA version).
export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Writing setup.sh


In [None]:
%%capture
!sh setup.sh

APEX_AVAILABLE = True

# Evaluation

## Texygen

In [14]:
import nltk
# Download NLTK's 'punkt' package (presumably needed by the Texygen BLEU
# metrics for tokenization -- TODO confirm).
nltk.download('punkt')

# Import the Texygen metric classes from inside the texygen directory, then
# return to the repository root.
os.chdir("/content/neuraltextgen/texygen")
from utils.metrics.Bleu import Bleu
from utils.metrics.SelfBleu import SelfBleu
os.chdir("/content/neuraltextgen/")

# n-gram order intended for the BLEU evaluation.
n_grams = 4
# Corpora handed to Bleu together with the generated file in evaluate_texygen.
WIKI103_PATH = '/content/neuraltextgen/data/wiki103.5k.txt'
TBC_PATH = '/content/neuraltextgen/data/tbc.5k.txt'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
def evaluate_texygen(file_path, n_grams):
  """Compute BLEU for a generated-text file against both configured corpora.

  Args:
    file_path: path of the file holding the generated text.
    n_grams: value forwarded to ``Bleu`` as its ``gram`` argument.

  Returns:
    A 2-tuple ``(bleu_tbc, bleu_wiki)``: the ``get_bleu()`` result obtained
    with ``TBC_PATH`` first, then the one obtained with ``WIKI103_PATH``.
  """
  # Order matters: callers read index 0 as TBC and index 1 as Wiki103.
  corpora = (TBC_PATH, WIKI103_PATH)
  return tuple(
      Bleu(file_path, corpus_path, gram = n_grams).get_bleu()
      for corpus_path in corpora
  )

# Grid search

In [16]:
# Instantiate the generator from the "bert-base-uncased" checkpoint, forwarding
# the APEX_AVAILABLE flag as `use_apex`.
model = BertTextGenerator("bert-base-uncased", use_apex = APEX_AVAILABLE)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

## Setting up the log file for results

In [17]:
# Directory that receives the generated sample files and the results log.
# To keep them on Google Drive instead (requires the Drive mount), use:
# ROOT_PATH = '/content/drive/MyDrive/neuraltextgen/'
ROOT_PATH = '/content'
LOG_FILE_PATH = os.path.join(ROOT_PATH, 'results.log')

In [18]:
import logging

# One layout shared by the root logger and the results file:
# timestamp <TAB> model version <TAB> message <TAB>
LOG_FORMAT = f'%(asctime)s\t{model.model_version}\t%(message)s\t'
LOG_DATE_FORMAT = '%m/%d/%Y %H:%M:%S'

# NOTE: basicConfig does nothing when the root logger already has handlers,
# so the results logger below must not depend on it for its level.
logging.basicConfig(level=logging.DEBUG,
            format=LOG_FORMAT,
            datefmt=LOG_DATE_FORMAT)

logger = logging.getLogger(__name__)  # generally use __name__
logger.propagate = False  # result lines go to the file only, not to the root logger
# Set the level explicitly: otherwise it is inherited from the root logger and
# INFO result lines are dropped whenever basicConfig above was a no-op.
logger.setLevel(logging.INFO)

# Re-running this cell must not stack a second FileHandler on the same logger,
# otherwise every result line would be written to the log file twice.
for stale_handler in list(logger.handlers):
  logger.removeHandler(stale_handler)
  stale_handler.close()

# File handler that records INFO and above in LOG_FILE_PATH.
file_h = logging.FileHandler(LOG_FILE_PATH)
file_h.setLevel(logging.INFO)

formatter = logging.Formatter(LOG_FORMAT, datefmt=LOG_DATE_FORMAT)
file_h.setFormatter(formatter)

logger.addHandler(file_h)

## Text generation

We define two parameter dictionaries:


*   fixed_parameters: parameters that are held constant and not varied in the grid search
*   parameters_to_test: a dict mapping each parameter name to the list of values to test



In [19]:
#VARIABLE
# max_iter=[100, 500]
# std_len=[0, 5]
# init_mask_prob= [0, 1]
# temperature = [0.1,  1, 10]
# generation_method=["parallel", 'sequential', 'attention']
# masked_portion = [0.15, 1, 1.0]
# sample = [False, True]
# top_k = [0, 100]

# burnin=None

In [20]:
# Alternative generation_method / masked_portion pairings noted for the
# fixed settings below:
#   'generation_method':'parallel',   'masked_portion':0.15
#   'generation_method':'parallel',   'masked_portion':1
#   'generation_method':'parallel',   'masked_portion':1.0
#   'generation_method':'sequential', 'masked_portion':1
#   'generation_method':'attention',  'masked_portion':1

# Settings that stay the same for every combination of the grid.
fixed_parameters = dict(
    n_sentences=50,
    batch_size=10,
    avg_len=40,
    max_len=50,
    generation_method='parallel',
    masked_portion=0.15,
)

# Candidate values for each swept parameter; the grid is their Cartesian
# product, iterated in this key order.
parameters_to_test = dict(
    max_iter=[100, 500],
    std_len=[0, 5],
    init_mask_prob=[0, 1],
    temperature=[0.1, 1, 10],
    sample=[False, True],
    top_k=[0, 100],
)

In [21]:
from itertools import islice, product

# Optional cap on how many parameter combinations are evaluated. Keep None to
# sweep the whole grid; set a small integer (e.g. 1) for a quick smoke test.
# This replaces a leftover `break` that silently ended the search after the
# first combination.
MAX_COMBINATIONS = None

tot_num_tests = np.prod([len(x) for x in parameters_to_test.values()])

# Cartesian product of all candidate values, in the key order of parameters_to_test.
grid = islice(product(*parameters_to_test.values()), MAX_COMBINATIONS)

for i,p in enumerate(grid):

  # Fixed settings first, then the values of the current combination.
  parameters = {**fixed_parameters, **dict(zip(parameters_to_test.keys(), p))}
  print(f'ITERATION - [{i+1}/{tot_num_tests}]')
  print(parameters)

  # "key=value" pairs joined by commas; used both in the output file name and
  # as the leading field of the log line.
  parameters_str = ",".join([f"{k}={v}" for k, v in parameters.items()])

  for j in range(NUM_TEST):
    #change as you prefer
    file_path = os.path.join(ROOT_PATH, f'test{j}-' + parameters_str+".txt")

    # The generated text is evaluated from file_path below, so generate() is
    # presumably what writes it there (via save_to_path).
    bert_sents = model.generate(save_to_path = file_path, **parameters)

    # Use the notebook-wide n_grams setting instead of a hard-coded 4.
    scores = evaluate_texygen(file_path, n_grams=n_grams)
    scores = [f"{x:.3f}" for x in scores]
    print(f'\t Test {j+1}: bleu-tbc '+ scores[0] +' wiki_bleu ' + scores[1])
    scores_str = "\t".join(scores)

    # One tab-separated record per run: parameters, then the BLEU scores.
    logger.info(parameters_str + '\t' + scores_str)

ITERATION - [1/96]
{'n_sentences': 50, 'batch_size': 10, 'avg_len': 40, 'max_len': 50, 'generation_method': 'parallel', 'masked_portion': 0.15, 'max_iter': 100, 'std_len': 0, 'init_mask_prob': 0, 'temperature': 0.1, 'sample': False, 'top_k': 0}


KeyboardInterrupt: ignored