<a href="https://colab.research.google.com/github/JuanJoseMV/neuraltextgen/blob/main/texygen_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture
!git clone --recursive https://github.com/JuanJoseMV/neuraltextgen.git
!pip install -r /content/neuraltextgen/texygen/requirements.txt
!pip install simpletransformers

In [3]:
%%capture
! pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [4]:
%%writefile setup.sh
export CUDA_HOME=/usr/local/cuda-10.1
! git clone https://github.com/NVIDIA/apex
! pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Writing setup.sh


In [5]:
%%capture
! sh setup.sh

In [None]:
## Attempt
# ! pip install FastBLEU

In [6]:
import nltk
nltk.download('punkt')

import sys, os
import os
os.chdir("/content/neuraltextgen/texygen")
from utils.metrics.Bleu import Bleu
from utils.metrics.SelfBleu import SelfBleu

os.chdir("/content/neuraltextgen/")
from NeuralTextGenerator import BertTextGenerator

os.chdir("/content")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
bert_model = BertTextGenerator('bert-base-uncased')

parameters = {'n_sentences': 100,  # 1000
              'batch_size': 50,  # 50
              'max_len': 40,
              'top_k': 100,
              'temperature': 1,
              'burnin': 250,
              'sample': True,
              'max_iter': 500,
              'seed_text': ""
              }

# "key1=val1_key2=val2_...txt"
file_path = "_".join([f"{k}={v}" for k, v in parameters.items()])+".txt"

bert_sents = bert_model.generate(save_to_path=file_path, **parameters)


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
print("\nEnglish text generated: ")
for sent in bert_sents:
  print(f"\t{sent}")


English text generated: 
	oh, but there was something wonderful, wonderful about ian mackenzie and his family, and the wonderful people they all were.
	it's not like i will be here all night, or all day. i will just be burrowed inside.
	" and so am i. " " is that so, little one? " the queen asked, fully awake now.
	''i must go, and i will not go back there without you.'' my voice comes out hoarse.
	mr. thomas as the chief medical officer of the st mary's hospital. mr. john thomas as medical officer.
	peter ( peter ) davies, alan bateman, alan davies, alan davies. writer and director : david dench.
	' heven, leave the girls alone. leave the bride and groom alone.'i looked up into his eyes.
	he is an executive and management consultant working in the international divisions of ici group, among others, and companies worldwide.
	the japanese version also features the cover art of the japanese version of the album, along with the japanese version released internationally.
	it was as loud a

#Evaluation - Original

In [8]:
from nltk.translate import bleu_score as bleu

def prepare_data(data_file, replacements={}, uncased=True):
    data = [d.strip().split() for d in open(data_file, 'r').readlines()]
    if uncased:
        data = [[t.lower() for t in sent] for sent in data]
        
    for k, v in replacements.items():
        data = [[t if t != k else v for t in sent] for sent in data]
 
    return data

def prepare_wiki(data_file, uncased=True):
    replacements = {"@@unknown@@": "[UNK]"}
    return prepare_data(data_file, replacements=replacements, uncased=uncased)

def prepare_tbc(data_file):        
    replacements = {"``": "\"", "\'\'": "\""}
    return prepare_data(data_file, replacements=replacements)

def corpus_bleu(generated, references):
    """ Compute similarity between two corpora as measured by
    comparing each sentence of `generated` against all sentences in `references` 
    
    args:
        - generated (List[List[str]]): list of sentences (split into tokens)
        - references (List[List[str]]): list of sentences (split into tokens)
        
    returns:
        - bleu (float)
    """    
    return bleu.corpus_bleu([references for _ in range(len(generated))], generated)

In [10]:
! git clone https://github.com/nyu-dl/bert-gen
wiki103_file = 'bert-gen/data/wiki103.5k.txt'
tbc_file = 'bert-gen/data/tbc.5k.txt'

wiki_data = prepare_wiki(wiki103_file)
tbc_data = prepare_tbc(tbc_file)

Cloning into 'bert-gen'...
remote: Enumerating objects: 78, done.[K
remote: Total 78 (delta 0), reused 0 (delta 0), pack-reused 78[K
Unpacking objects: 100% (78/78), done.


Try to evaluate using original functions and no cleaning of wiki-data

In [11]:
print("BERT-TBC BLEU: %.2f" % (100 * corpus_bleu(bert_sents, tbc_data)))
print("BERT-Wiki103 BLEU: %.2f" % (100 * corpus_bleu(bert_sents, wiki_data)))
print("BERT-{TBC + Wiki103} BLEU: %.2f" % (100 * corpus_bleu(bert_sents, tbc_data[:2500] + wiki_data[:2500])))

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BERT-TBC BLEU: 2.09


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BERT-Wiki103 BLEU: 27.48
BERT-{TBC + Wiki103} BLEU: 26.46


Try to evaluate after cleaning

In [12]:
def cleaner(data):
  len_mask = []
  for i in range(len(data)):
    if len(data[i]) < 4:
      len_mask.append(False)
    else:
      len_mask.append(True)

  data = [b for a, b in zip(len_mask, data) if a]
  return data

wiki_data = cleaner(wiki_data)
tbc_data = cleaner(tbc_data)

print("BERT-TBC BLEU: %.2f" % (100 * corpus_bleu(bert_sents, tbc_data)))
print("BERT-Wiki103 BLEU: %.2f" % (100 * corpus_bleu(bert_sents, wiki_data)))
print("BERT-{TBC + Wiki103} BLEU: %.2f" % (100 * corpus_bleu(bert_sents, tbc_data[:2500] + wiki_data[:2500])))

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BERT-TBC BLEU: 2.09


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BERT-Wiki103 BLEU: 27.48
BERT-{TBC + Wiki103} BLEU: 26.46


## Evaluation - Texygen

In [13]:
bleu_score_tbc = Bleu(file_path, tbc_file)
bleu_score_wiki = Bleu(file_path, wiki103_file)

print("(Texygen) BERT-TBC BLEU: %.2f" % (100 * bleu_score_tbc.get_bleu()))
print("(Texygen) BERT-Wiki103 BLEU: %.2f" % (100 * bleu_score_wiki.get_bleu()))

(Texygen) BERT-TBC BLEU: 29.86
(Texygen) BERT-Wiki103 BLEU: 22.26


In [73]:
self_bleu_score = SelfBleu(file_path)

print("(Texygen) BERT- SelfBLEU: %.2f" % (100 * self_bleu_score.get_bleu()))
print("(Texygen) BERT- SelfBLEU: %.2f" % (100 * self_bleu_score.get_bleu_parallel()))

(Texygen) BERT- SelfBLEU: 100.00
(Texygen) BERT- SelfBLEU: 14.49


In [72]:
## (Texygen) methods testing

import numpy as np
def get_reference(test_data):
    if True:
        reference = list()
        with open(test_data) as real_data:
            for text in real_data:
                text = nltk.word_tokenize(text)
                reference.append(text)
        # self.reference = reference
        return reference

def get_bleu(test_data):
    ngram = 4
    bleu = list()
    reference = get_reference(test_data)
    weight = tuple((1. / ngram for _ in range(ngram)))

    with open(test_data) as test_data:
        for the_hypothesis in test_data:
            the_hypothesis = nltk.word_tokenize(the_hypothesis)
            ## Exclude hypothesis
            index = reference.index(the_hypothesis)
            cleaned_reference = reference[:index] + reference[index + 1:]
            ##
            score = nltk.translate.bleu_score.sentence_bleu(cleaned_reference, the_hypothesis, weight) # Missing SmoothingFunction
            bleu.append(score)
    return sum(bleu) / len(bleu)

get_bleu(file_path)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.5701824132971612

In [44]:
## Testing
ref = [['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd'], ['b', 'r', 't', 'f'], ['d','t','f','r']]
matrix = []
weight = tuple((1. / 2 for _ in range(2)))

for index in range(0, len(ref)):
    hypothesis = ref[index]
    other = ref[:index] + ref[index+1:]
    matrix.append([hypothesis])
    print(other, hypothesis, nltk.translate.bleu_score.sentence_bleu(other, hypothesis, weight))

print(ref[1:], ref[0])
print( nltk.translate.bleu_score.sentence_bleu(ref[1:], ref[0], weight) )

[['a', 'b', 'c', 'd'], ['b', 'r', 't', 'f'], ['d', 't', 'f', 'r']] ['a', 'b', 'c', 'd'] 1.0
[['a', 'b', 'c', 'd'], ['b', 'r', 't', 'f'], ['d', 't', 'f', 'r']] ['a', 'b', 'c', 'd'] 1.0
[['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd'], ['d', 't', 'f', 'r']] ['b', 'r', 't', 'f'] 0.5773502691896257
[['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd'], ['b', 'r', 't', 'f']] ['d', 't', 'f', 'r'] 0.5773502691896257
[['a', 'b', 'c', 'd'], ['b', 'r', 't', 'f'], ['d', 't', 'f', 'r']] ['a', 'b', 'c', 'd']
1.0


In [28]:
## Paper self-bleu scoring
bleu.corpus_bleu([[s for (j, s) in enumerate(bert_sents) if j != i] for i in range(len(bert_sents))], bert_sents)

0.7974059181131615

In [19]:
##
## (Texygen) Self-bleu class
##

import os
from multiprocessing import Pool

import nltk
from nltk.translate.bleu_score import SmoothingFunction

from utils.metrics.Metrics import Metrics


class SelfBleu(Metrics):
    def __init__(self, test_text='', gram=3):
        super().__init__()
        self.name = 'Self-Bleu'
        self.test_data = test_text
        self.gram = gram
        self.sample_size = 500
        self.reference = None
        self.is_first = True

    def get_name(self):
        return self.name

    def get_score(self, is_fast=True, ignore=False):
        if ignore:
            return 0
        if self.is_first:
            self.get_reference()
            self.is_first = False
        if is_fast:
            return self.get_bleu_fast()
        return self.get_bleu_parallel()

    def get_reference(self):
        if self.reference is None:
            reference = list()
            with open(self.test_data) as real_data:
                for text in real_data:
                    text = nltk.word_tokenize(text)
                    reference.append(text)
            self.reference = reference
            return reference
        else:
            return self.reference

    def get_bleu(self):
        ngram = self.gram
        bleu = list()
        reference = self.get_reference()
        weight = tuple((1. / ngram for _ in range(ngram)))
        with open(self.test_data) as test_data:
            for hypothesis in test_data:
                hypothesis = nltk.word_tokenize(hypothesis)
                bleu.append(nltk.translate.bleu_score.sentence_bleu(reference, hypothesis, weight,
                                                                    smoothing_function=SmoothingFunction().method1))
        return sum(bleu) / len(bleu)

    def calc_bleu(self, reference, hypothesis, weight):
        return nltk.translate.bleu_score.sentence_bleu(reference, hypothesis, weight,
                                                       smoothing_function=SmoothingFunction().method1)

    def get_bleu_fast(self):
        reference = self.get_reference()
        # random.shuffle(reference)
        reference = reference[0:self.sample_size]
        return self.get_bleu_parallel(reference=reference)

    def get_bleu_parallel(self, reference=None):
        ngram = self.gram
        if reference is None:
            reference = self.get_reference()
        weight = tuple((1. / ngram for _ in range(ngram)))
        pool = Pool(os.cpu_count())
        result = list()
        sentence_num = len(reference)
        for index in range(sentence_num):
            hypothesis = reference[index]
            other = reference[:index] + reference[index+1:]
            result.append(pool.apply_async(self.calc_bleu, args=(other, hypothesis, weight)))

        score = 0.0
        cnt = 0
        for i in result:
            score += i.get()
            cnt += 1
        pool.close()
        pool.join()
        return score / cnt