<a href="https://colab.research.google.com/github/JuanJoseMV/neuraltextgen/blob/main/RNN/RNN_TextGen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalations

###### Apex

In [1]:
%%capture
!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
%%writefile setup.sh
export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex
# Writing setup.sh

Writing setup.sh


In [3]:
%%capture
!sh setup.sh

In [4]:
! git clone --recursive https://github.com/JuanJoseMV/neuraltextgen.git

Cloning into 'neuraltextgen'...
remote: Enumerating objects: 944, done.[K
remote: Counting objects: 100% (458/458), done.[K
remote: Compressing objects: 100% (381/381), done.[K
remote: Total 944 (delta 211), reused 171 (delta 74), pack-reused 486[K
Receiving objects: 100% (944/944), 13.85 MiB | 18.16 MiB/s, done.
Resolving deltas: 100% (375/375), done.
Submodule 'texygen' (https://github.com/geek-ai/Texygen.git) registered for path 'texygen'
Cloning into '/content/neuraltextgen/texygen'...
remote: Enumerating objects: 888, done.        
remote: Total 888 (delta 0), reused 0 (delta 0), pack-reused 888        
Receiving objects: 100% (888/888), 21.85 MiB | 17.20 MiB/s, done.
Resolving deltas: 100% (537/537), done.
Submodule path 'texygen': checked out '3104e22ac75f3cc2070da2bf5e2da6d2bef149ad'


# Imports

In [5]:
# import gensim.models.wrappers.fasttext
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel, BertConfig, AutoConfig
from collections import Counter

# Cleaning the dataset


In [None]:
## When using Wiki.tokens (not wiki.5k)

with open('/content/wiki.train.tokens') as f:
  content = f.readlines()

clean = []
for c in content:
  clean.append(c.replace('\n', '[EOS]'))

In [9]:
with open('/content/neuraltextgen/data/tbc.5k.txt') as f:
  content = f.readlines()

clean_first = []
for c in content:
  clean_first.append(c.replace("``", "\""))

clean = []
for c in clean_first:
  clean.append(c.replace("\'\'", "\""))

In [10]:
! touch '/content/neuraltextgen/data/cleaned_tbc.5k.txt'

In [11]:
with open('/content/neuraltextgen/data/cleaned_tbc.5k.txt', 'w') as f:
    for item in clean:
        f.write("%s\n" % item)

# Downloading pre-trained wordembeddings

In [None]:
# It takes some minutes, avoid if won't use
model = gensim.models.KeyedVectors.load_word2vec_format('/content/wiki-news-300d-1M.vec')
word_vectors = model.wv

weights = torch.FloatTensor(word_vectors.vectors)
embedding = nn.Embedding.from_pretrained(weights)

  This is separate from the ipykernel package so we can avoid doing imports until


# Train the network

In [14]:
os.chdir('/content/neuraltextgen/RNN/')
from RNNGenerator import RNNGenerator

params = {
    "seq_size": 512, 
    "batch_size": 32, 
    "embedding_size": 300, 
    "lstm_size": 128,
    "lstm_num_layers": 3, 
    "lstm_bidirectional": True, 
    "lstm_dropout": 0.5, 
    "gradients_norm": 5,
    "predict_top_k": 10, 
    "training_epocs": 300, 
    "lr": 0.01, 
    "weights": None
}

train_file = '../data/cleaned_tbc.5k.txt'
generator = RNNGenerator(**params)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_net = generator.train(device, train_file)

# list of sentences
sentences = generator.predict(device, trained_net, n_sentences=100)
sentences

Vocabulary size 8678
Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Epoch: 24/300 Iteration: 100 Loss: 2.8042151927948
Epoch: 49/300 Iteration: 200 Loss: 0.7751246690750122
Epoch: 74/300 Iteration: 300 Loss: 0.19095322489738464
Epoch: 99/300 Iteration: 400 Loss: 0.07505667209625244
Epoch: 124/300 Iteration: 500 Loss: 0.039560843259096146
Epoch: 14

['... mid-air to well take say never try at mine against save up tightly on before question right question after since be something all then watch can kill dare probably beside saw doing all yet and far were afternoon and lucas in ophella at bet against become from bet until pleasure dare st. youre bundled youre sigh youre distraught so distraught me distraught about distraught can yuell just afford so earth just landing youve hot how office ,',
 'macbugall smiling said when said if rhys , wait for okay her number that dress is dress her place her under in as line as in about was something . me . him , you something do something ? something " nothing he anything was',
 'camera witnessed human tree figure without meeting except hours somehow coffee somehow hundred flash bowl taking older older school hours older tread hours older older women second human strands',
 "playboy wiggleigh explained said ; said although well if humans because humans something crazy , violet , 'scaped as ash a

# Save text

In [15]:
os.chdir('/content/')

In [16]:
! touch RNN_generated.txt

In [17]:
with open("RNN_generated_TBC.txt", "w") as text_file:
  for sentence in sentences:
    text_file.write(sentence + '\n')

# Evaluate text

In [19]:
%%capture
!pip install -r /content/neuraltextgen/texygen/requirements.txt

In [20]:
import nltk
nltk.download('punkt')

import sys, os
import os
os.chdir("/content/neuraltextgen/texygen")
from utils.metrics.Bleu import Bleu
from utils.metrics.SelfBleu import SelfBleu

os.chdir("/content")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [23]:
from nltk.translate import bleu_score as bleu

def prepare_data(data_file, replacements={}, uncased=True):
    data = [d.strip().split() for d in open(data_file, 'r').readlines()]
    if uncased:
        data = [[t.lower() for t in sent] for sent in data]
        
    for k, v in replacements.items():
        data = [[t if t != k else v for t in sent] for sent in data]
 
    return data

def prepare_wiki(data_file, uncased=True):
    replacements = {"@@unknown@@": "[UNK]"}
    return prepare_data(data_file, replacements=replacements, uncased=uncased)

def prepare_tbc(data_file):        
    replacements = {"``": "\"", "\'\'": "\""}
    return prepare_data(data_file, replacements=replacements)

def corpus_bleu(generated, references):
    """ Compute similarity between two corpora as measured by
    comparing each sentence of `generated` against all sentences in `references` 
    
    args:
        - generated (List[List[str]]): list of sentences (split into tokens)
        - references (List[List[str]]): list of sentences (split into tokens)
        
    returns:
        - bleu (float)
    """    
    return bleu.corpus_bleu([references for _ in range(len(generated))], generated)
    
wiki103_file = './neuraltextgen/data/wiki103.5k.txt'
tbc_file = './neuraltextgen/data/tbc.5k.txt'

wiki_data = prepare_wiki(wiki103_file)
tbc_data = prepare_tbc(tbc_file)

In [24]:
file_path = '/content/RNN_generated_TBC.txt'
bleu_score_tbc = Bleu(file_path, tbc_file)
bleu_score_wiki = Bleu(file_path, wiki103_file)

print("(Texygen) BERT-TBC BLEU: %.2f" % (100 * bleu_score_tbc.get_bleu()))
print("(Texygen) BERT-Wiki103 BLEU: %.2f" % (100 * bleu_score_wiki.get_bleu()))

(Texygen) BERT-TBC BLEU: 7.70
(Texygen) BERT-Wiki103 BLEU: 4.78


In [25]:
self_bleu_score = SelfBleu(file_path)

print("(Texygen) BERT- SelfBLEU: %.2f" % (100 * self_bleu_score.get_bleu_parallel())) ## Expected results

(Texygen) BERT- SelfBLEU: 7.45


In [27]:
os.chdir("/content/neuraltextgen/texygen")
from utils.metrics.UniqueGram import UniqueGram
from utils.metrics.Bleu import Bleu

path = "/content/neuraltextgen/data/tbc.5k.txt"
file = open(path, "r")
tbc = file.readlines()

path = "/content/neuraltextgen/data/wiki103.5k.txt"
file = open(path, "r")
wiki = file.readlines()

from nltk.util import ngrams
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case="uncased" in "bert-base-uncased")

def getGrams(sents, n):
  grams = []
  for line in sents:
    line = tokenizer.convert_tokens_to_ids(line.split(" "))
    grams += UniqueGram(gram=n).get_gram(line)
  dictGrams = Counter(grams)
  return dictGrams

def compareUniqueGrams(pred_ngrams, ref_ngrams, max_n):
  pct_unique={}
  for i in range(2, max_n + 1):
    pred_ngram_counts = set(pred_ngrams[i].keys())
    total = sum(pred_ngrams[i].values())
    ref_ngram_counts = set(ref_ngrams[i].keys())
    pct_unique[i] = len(pred_ngram_counts.difference(ref_ngram_counts)) / total

  return pct_unique

def selfUniqueGrams(pred_ngrams, max_n):
  pct_unique={}
  for i in range(2, max_n+1):
    n_unique = len([k for k, v in pred_ngrams[i].items() if v == 1])
    total = sum(pred_ngrams[i].values())
    pct_unique[i] = n_unique/total

  return pct_unique


maxGrams = 4
wikiGrams={}
tbcGrams={}
for i in range(2, maxGrams+1):
  wikiGrams[i] = getGrams(wiki, i)
  tbcGrams[i] = getGrams(tbc, i)

import itertools

#supporting function
def _split_into_words(sentences):
  """Splits multiple sentences into words and flattens the result"""
  return list(itertools.chain(*[_.split(" ") for _ in sentences]))

#supporting function
def _get_word_ngrams(n, sentences):
  """Calculates word n-grams for multiple sentences.
  """
  assert len(sentences) > 0
  assert n > 0

  words = _split_into_words(sentences)
  return _get_ngrams(n, words)

#supporting function
def _get_ngrams(n, text):
  """Calcualtes n-grams.
  Args:
    n: which n-grams to calculate
    text: An array of tokens
  Returns:
    A set of n-grams
  """
  ngram_set = set()
  text_length = len(text)
  max_index_ngram_start = text_length - n
  for i in range(max_index_ngram_start + 1):
    ngram_set.add(tuple(text[i:i + n]))
  return ngram_set

def rouge_n(reference_sentences, evaluated_sentences, n=2):
  """
  Computes ROUGE-N of two text collections of sentences.
  Source: http://research.microsoft.com/en-us/um/people/cyl/download/
  papers/rouge-working-note-v1.3.1.pdf
  Args:
    evaluated_sentences: The sentences that have been picked by the summarizer
    reference_sentences: The sentences from the referene set
    n: Size of ngram.  Defaults to 2.
  Returns:
    recall rouge score(float)
  Raises:
    ValueError: raises exception if a param has len <= 0
  """
  if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
    raise ValueError("Collections must contain at least 1 sentence.")

  evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
  reference_ngrams = _get_word_ngrams(n, reference_sentences)
  reference_count = len(reference_ngrams)
  evaluated_count = len(evaluated_ngrams)

  # Gets the overlapping ngrams between evaluated and reference
  overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
  overlapping_count = len(overlapping_ngrams)

  # Handle edge case. This isn't mathematically correct, but it's good enough
  if evaluated_count == 0:
    precision = 0.0
  else:
    precision = overlapping_count / evaluated_count

  if reference_count == 0:
    recall = 0.0
  else:
    recall = overlapping_count / reference_count

  f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))

  #just returning recall count in rouge, useful for our purpose
  return [precision,recall,f1_score]

file = open(path, "r")
pred = file.readlines()

modelGrams = {}
for i in range(2, maxGrams+1):
  modelGrams[i] = getGrams(pred, i)

pct_uniques_self = selfUniqueGrams(modelGrams, maxGrams)
pct_uniques_wiki = compareUniqueGrams(modelGrams, wikiGrams, maxGrams)
pct_uniques_tbc = compareUniqueGrams(modelGrams, tbcGrams, maxGrams)

rougeWiki = []
rougeTBC = []
for k in range(1,5):
  rougeWiki += rouge_n(wiki, pred, n=k)
  rougeTBC += rouge_n(wiki, pred, n=k)

NameError: ignored