<a href="https://colab.research.google.com/github/JuanJoseMV/neuraltextgen/blob/main/texygen_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!git clone --recursive https://github.com/JuanJoseMV/neuraltextgen.git
!pip install -r /content/neuraltextgen/texygen/requirements.txt
!pip install simpletransformers
! pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [3]:
%%writefile setup.sh
export CUDA_HOME=/usr/local/cuda-10.1
! git clone https://github.com/NVIDIA/apex
! pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Writing setup.sh


In [4]:
%%capture
! sh setup.sh

In [5]:
import nltk
nltk.download('punkt')

import sys, os

os.chdir("/content/neuraltextgen/texygen")
from utils.metrics.Bleu import Bleu
from utils.metrics.SelfBleu import SelfBleu

os.chdir("/content/neuraltextgen/")
from NeuralTextGenerator import BertTextGenerator

os.chdir("/content")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
bert_model = BertTextGenerator('bert-base-uncased')

parameters = {'n_sentences': 100,  # 1000
              'batch_size': 50,  # 50
              'max_len': 40,
              'top_k': 100,
              'temperature': 1,
              'burnin': 250,
              'sample': True,
              'max_iter': 500,
              'seed_text': ""
              }

# "key1=val1_key2=val2_...txt"
file_path = "_".join([f"{k}={v}" for k, v in parameters.items()])+".txt"

bert_sents = bert_model.generate(save_to_path=file_path, **parameters)


In [None]:
print("\nEnglish text generated: ")
for sent in bert_sents:
  print(f"\t{sent}")


English text generated: 
	oh, but there was something wonderful, wonderful about ian mackenzie and his family, and the wonderful people they all were.
	it's not like i will be here all night, or all day. i will just be burrowed inside.
	" and so am i. " " is that so, little one? " the queen asked, fully awake now.
	''i must go, and i will not go back there without you.'' my voice comes out hoarse.
	mr. thomas as the chief medical officer of the st mary's hospital. mr. john thomas as medical officer.
	peter ( peter ) davies, alan bateman, alan davies, alan davies. writer and director : david dench.
	' heven, leave the girls alone. leave the bride and groom alone.'i looked up into his eyes.
	he is an executive and management consultant working in the international divisions of ici group, among others, and companies worldwide.
	the japanese version also features the cover art of the japanese version of the album, along with the japanese version released internationally.
	it was as loud a

#Evaluation - Original

In [None]:
! git clone https://github.com/nyu-dl/bert-gen
wiki103_file = 'bert-gen/data/wiki103.5k.txt'
tbc_file = 'bert-gen/data/tbc.5k.txt'

wiki_data = prepare_wiki(wiki103_file)
tbc_data = prepare_tbc(tbc_file)

In [7]:
from nltk.translate import bleu_score as bleu

def prepare_data(data_file, replacements={}, uncased=True):
    data = [d.strip().split() for d in open(data_file, 'r').readlines()]
    if uncased:
        data = [[t.lower() for t in sent] for sent in data]
        
    for k, v in replacements.items():
        data = [[t if t != k else v for t in sent] for sent in data]
 
    return data

def prepare_wiki(data_file, uncased=True):
    replacements = {"@@unknown@@": "[UNK]"}
    return prepare_data(data_file, replacements=replacements, uncased=uncased)

def prepare_tbc(data_file):        
    replacements = {"``": "\"", "\'\'": "\""}
    return prepare_data(data_file, replacements=replacements)

def corpus_bleu(generated, references):
    """ Compute similarity between two corpora as measured by
    comparing each sentence of `generated` against all sentences in `references` 
    
    args:
        - generated (List[List[str]]): list of sentences (split into tokens)
        - references (List[List[str]]): list of sentences (split into tokens)
        
    returns:
        - bleu (float)
    """    
    return bleu.corpus_bleu([references for _ in range(len(generated))], generated)

Try to evaluate using original functions and no cleaning of wiki-data

In [9]:
print("BERT-TBC BLEU: %.2f" % (100 * corpus_bleu(bert_sents, tbc_data)))
print("BERT-Wiki103 BLEU: %.2f" % (100 * corpus_bleu(bert_sents, wiki_data)))
print("BERT-{TBC + Wiki103} BLEU: %.2f" % (100 * corpus_bleu(bert_sents, tbc_data[:2500] + wiki_data[:2500])))

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BERT-TBC BLEU: 19.98
BERT-Wiki103 BLEU: 27.60
BERT-{TBC + Wiki103} BLEU: 27.31


Try to evaluate after cleaning

In [None]:
def cleaner(data):
  len_mask = []
  
  for i in range(len(data)):
    if len(data[i]) < 4:
      len_mask.append(False)
    else:
      len_mask.append(True)

  data = [b for a, b in zip(len_mask, data) if a]
  return data

wiki_data = cleaner(wiki_data)
tbc_data = cleaner(tbc_data)

print("BERT-TBC BLEU: %.2f" % (100 * corpus_bleu(bert_sents, tbc_data)))
print("BERT-Wiki103 BLEU: %.2f" % (100 * corpus_bleu(bert_sents, wiki_data)))
print("BERT-{TBC + Wiki103} BLEU: %.2f" % (100 * corpus_bleu(bert_sents, tbc_data[:2500] + wiki_data[:2500])))

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BERT-TBC BLEU: 2.09


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BERT-Wiki103 BLEU: 27.48
BERT-{TBC + Wiki103} BLEU: 26.46


## Evaluation - Texygen

In [10]:
bleu_score_tbc = Bleu(file_path, tbc_file)
bleu_score_wiki = Bleu(file_path, wiki103_file)

print("(Texygen) BERT-TBC BLEU: %.2f" % (100 * bleu_score_tbc.get_bleu()))
print("(Texygen) BERT-Wiki103 BLEU: %.2f" % (100 * bleu_score_wiki.get_bleu()))

(Texygen) BERT-TBC BLEU: 30.90
(Texygen) BERT-Wiki103 BLEU: 21.87


In [11]:
self_bleu_score = SelfBleu(file_path)

print("(Texygen) BERT- SelfBLEU: %.2f" % (100 * self_bleu_score.get_bleu())) ## Oddly behaving
print("(Texygen) BERT- SelfBLEU: %.2f" % (100 * self_bleu_score.get_bleu_parallel())) ## Expected results

(Texygen) BERT- SelfBLEU: 100.00
(Texygen) BERT- SelfBLEU: 11.20


In [12]:
## (Texygen) methods testing

import numpy as np
def get_reference(test_data):
    if True:
        reference = list()
        with open(test_data) as real_data:
            for text in real_data:
                text = nltk.word_tokenize(text)
                reference.append(text)
        # self.reference = reference
        return reference

def get_bleu(test_data):
    ngram = 4
    bleu = list()
    reference = get_reference(test_data)
    weight = tuple((1. / ngram for _ in range(ngram)))

    with open(test_data) as test_data:
        for the_hypothesis in test_data:
            the_hypothesis = nltk.word_tokenize(the_hypothesis)
            ## Fix: Exclude hypothesis
            index = reference.index(the_hypothesis)
            cleaned_reference = reference[:index] + reference[index + 1:]
            ##
            score = nltk.translate.bleu_score.sentence_bleu(cleaned_reference, the_hypothesis, weight) # Missing SmoothingFunction
            bleu.append(score)
    return sum(bleu) / len(bleu)

get_bleu(file_path)

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.5311765501135502

In [13]:
## Paper self-bleu scoring
bleu.corpus_bleu([[s for (j, s) in enumerate(bert_sents) if j != i] for i in range(len(bert_sents))], bert_sents)

0.7922691299230896