<a href="https://colab.research.google.com/github/JuanJoseMV/neuraltextgen/blob/main/RNN_TextGen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

###### Apex

In [1]:
%%capture
# Pin the PyTorch stack to the CUDA 10.1 (+cu101) builds, resolved through the
# wheel index passed with -f; %%capture hides pip's output.
!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
%%writefile setup.sh
# Build script for NVIDIA apex; every line after the %%writefile magic is written to setup.sh.
# Point the build at the CUDA 10.1 toolkit (same CUDA version as the +cu101 PyTorch wheels).
export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
# --global-option forwards --cpp_ext / --cuda_ext to apex's setup.py.
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex
# Writing setup.sh

Writing setup.sh


In [3]:
%%capture
# Run the setup.sh written by the previous cell (clones apex and pip-installs it);
# %%capture suppresses the verbose build output.
!sh setup.sh

In [5]:
# Fetch the project repository; --recursive also checks out its submodules (texygen).
! git clone --recursive https://github.com/JuanJoseMV/neuraltextgen.git

Cloning into 'neuraltextgen'...
remote: Enumerating objects: 645, done.[K
remote: Counting objects: 100% (159/159), done.[K
remote: Compressing objects: 100% (152/152), done.[K
remote: Total 645 (delta 47), reused 11 (delta 5), pack-reused 486[K
Receiving objects: 100% (645/645), 7.58 MiB | 17.45 MiB/s, done.
Resolving deltas: 100% (211/211), done.
Submodule 'texygen' (https://github.com/geek-ai/Texygen.git) registered for path 'texygen'
Cloning into '/content/neuraltextgen/texygen'...
remote: Enumerating objects: 888, done.        
remote: Total 888 (delta 0), reused 0 (delta 0), pack-reused 888        
Receiving objects: 100% (888/888), 21.85 MiB | 21.39 MiB/s, done.
Resolving deltas: 100% (537/537), done.
Submodule path 'texygen': checked out '3104e22ac75f3cc2070da2bf5e2da6d2bef149ad'


# Imports

In [4]:
import gensim.models.wrappers.fasttext
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel, BertConfig, AutoConfig
from collections import Counter

# Cleaning the dataset


In [None]:
## Alternative input: the full Wiki.tokens training file (instead of wiki.5k)

# Read every line, keeping the trailing newline so it can be swapped for a marker.
with open('/content/wiki.train.tokens') as tokens_file:
  content = tokens_file.readlines()

# Replace each newline with an explicit '[EOS]' marker.
clean = [line.replace('\n', '[EOS]') for line in content]

In [6]:
# Load the wiki103.5k sample file from the cloned repository.
with open('/content/neuraltextgen/data/wiki103.5k.txt') as corpus_file:
  content = corpus_file.readlines()

# Drop the '@@UNKNOWN@@' placeholder from every line.
# NOTE(review): lines keep their trailing '\n' here, and the cell that writes
# `clean` to disk appends another '\n' per item, so the cleaned file gets a
# blank line after each sentence - confirm this is what RNNGenerator expects.
clean = [line.replace('@@UNKNOWN@@', '') for line in content]

In [7]:
# Pre-create the output file for the cleaned corpus (touch leaves an existing file's contents alone).
! touch '/content/neuraltextgen/data/cleaned_wiki103.5k.txt'

In [8]:
# Persist the cleaned corpus: each item of `clean` is written followed by a newline.
with open('/content/neuraltextgen/data/cleaned_wiki103.5k.txt', 'w') as cleaned_file:
    cleaned_file.writelines("%s\n" % line for line in clean)

# Downloading pre-trained word embeddings

In [None]:
# It takes some minutes, avoid if won't use
# Load the pretrained .vec file (word2vec text format) into a gensim KeyedVectors.
model = gensim.models.KeyedVectors.load_word2vec_format('/content/wiki-news-300d-1M.vec')
# load_word2vec_format already returns the KeyedVectors object. Its `.wv`
# attribute is only a deprecated alias for the object itself (it triggers a
# DeprecationWarning and is removed in gensim 4), so use the object directly.
word_vectors = model

# Wrap the pretrained vectors in an embedding layer
# (nn.Embedding.from_pretrained freezes the weights by default).
weights = torch.FloatTensor(word_vectors.vectors)
embedding = nn.Embedding.from_pretrained(weights)

  This is separate from the ipykernel package so we can avoid doing imports until


# Train the network

In [10]:
# Work from the repository root so the local RNNGenerator module can be imported
# and the relative train_file path below resolves.
os.chdir('/content/neuraltextgen/')
from RNNGenerator import RNNGenerator

# Keyword arguments forwarded to RNNGenerator(**params).
# NOTE(review): the key "training_epocs" is spelled this way here - presumably it
# must match RNNGenerator's parameter name; confirm before renaming it.
params = {
    "seq_size": 512, 
    "batch_size": 32, 
    "embedding_size": 300, 
    "lstm_size": 128,
    "lstm_num_layers": 3, 
    "lstm_bidirectional": True, 
    "lstm_dropout": 0.5, 
    "gradients_norm": 5,
    "predict_top_k": 10, 
    "training_epocs": 300, 
    "lr": 0.01, 
    # NOTE(review): None presumably means no pretrained embedding weights are
    # passed in - confirm against RNNGenerator.
    "weights": None
}

# Cleaned corpus written earlier in this notebook (path relative to the repo root).
train_file = 'data/cleaned_wiki103.5k.txt'
generator = RNNGenerator(**params)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_net = generator.train(device, train_file)

# list of sentences
sentences = generator.predict(device, trained_net, n_sentences=100)
sentences

Vocabulary size 14104
Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Epoch: 14/300 Iteration: 100 Loss: 3.014544725418091
Epoch: 28/300 Iteration: 200 Loss: 0.9625660181045532
Epoch: 42/300 Iteration: 300 Loss: 0.3162761628627777
Epoch: 57/300 Iteration: 400 Loss: 0.15330085158348083
Epoch: 71/300 Iteration: 500 Loss: 0.06945610046386719
Epoch: 85

['kosher Campbell Supervisor Trophy Johann baseman Inoki Central Service general Assistant presented Murray double Murray Service Pop Service Johann editorial Inoki member soon Regiment naval soon highly Inoki riot Inoki wrestling highly bell Thomas Brock watertight Inoki LIKE keep LIKE keep Station Championship Abdul highly HMS strikeout earlier Station earlier Station de Thomas highly More t Don Thomas triple highly triple air triple air Thomas triple highly Station',
 'Me highly Me triple frequently',
 'Adelaide the Philippine production request river Ireland episode treasure in treasure',
 'explicit Peter andesite von Binnie Karl Philipp Gerard Abdul LIKE LIKE Thomas LIKE LIKE Se Roy Razak W. Miguel Ray W. Thomas Ray Fox Miguel LIKE climate Center Miguel Razak highly Center shooting Razak HMS Institute HMS Abdul air Abdul Thomas Abdul falls Fox Razak highly Center air s air Thomas t s advanced t Service Abdul t Citizenship Miguel Center t Miguel Miguel Abdul Thomas Willie air Willi

# Save text

In [None]:
# Switch the working directory to /content; relative paths in later cells resolve from here.
os.chdir('/content/')

In [12]:
# Create an empty RNN_generated.txt in the current working directory if it does not exist yet.
! touch RNN_generated.txt

In [13]:
# Save one generated sentence per line.
# The evaluation cell reads 'neuraltextgen/RNN_generated.txt' relative to
# /content, so write to exactly that location with an absolute path instead of
# relying on whatever the current working directory happens to be (it differs
# depending on whether the preceding os.chdir('/content/') cell was executed).
with open("/content/neuraltextgen/RNN_generated.txt", "w") as text_file:
  text_file.writelines(sentence + '\n' for sentence in sentences)

# Evaluate text

In [14]:
# Install the Python dependencies listed by the texygen submodule.
!pip install -r /content/neuraltextgen/texygen/requirements.txt

Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Installing collected packages: colorama
Successfully installed colorama-0.4.4


In [15]:
import nltk
# Tokenizer data ('punkt') fetched through NLTK's downloader.
nltk.download('punkt')

# `os` was imported twice in the original cell; one import is enough.
import sys, os
# texygen is a plain checkout, not an installed package: import its metric
# classes with the submodule directory as the working directory so that
# `utils.metrics` resolves, then return to /content.
os.chdir("/content/neuraltextgen/texygen")
from utils.metrics.Bleu import Bleu
from utils.metrics.SelfBleu import SelfBleu

os.chdir("/content")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [17]:
from nltk.translate import bleu_score as bleu

def prepare_data(data_file, replacements=None, uncased=True):
    """Read a whitespace-tokenized corpus with one sentence per line.

    Args:
        data_file (str): path of the text file to read.
        replacements (dict | None): token -> substitute mapping. Entries are
            applied one after another in dict order, after lowercasing, so keys
            must be lowercase when `uncased` is True. None means no replacements.
        uncased (bool): lowercase every token when True.

    Returns:
        List[List[str]]: one token list per line of the file (a blank line
        yields an empty list).
    """
    # The context manager closes the handle deterministically; the original
    # opened the file inline and never closed it.
    with open(data_file, 'r') as f:
        data = [line.strip().split() for line in f]

    if uncased:
        data = [[t.lower() for t in sent] for sent in data]

    # `None` replaces the shared mutable `{}` default argument.
    for k, v in (replacements or {}).items():
        data = [[t if t != k else v for t in sent] for sent in data]

    return data

def prepare_wiki(data_file, uncased=True):
    """Load a Wikipedia corpus file, mapping the unknown-word placeholder to "[UNK]".

    Args:
        data_file (str): path of the corpus file, one sentence per line.
        uncased (bool): lowercase every token when True.

    Returns:
        List[List[str]]: tokenized sentences as returned by prepare_data.
    """
    # prepare_data lowercases before it applies replacements, so the lowercase
    # key is the one that matches when uncased=True. The uppercase key covers
    # uncased=False, where the original lowercase-only mapping never matched
    # and the raw placeholder was left in the data.
    replacements = {"@@unknown@@": "[UNK]", "@@UNKNOWN@@": "[UNK]"}
    return prepare_data(data_file, replacements=replacements, uncased=uncased)

def prepare_tbc(data_file):
    """Load a corpus file through prepare_data, normalizing quote tokens.

    Tokens made of two backticks or of two single quotes are each replaced by a
    single double-quote character. Casing follows prepare_data's default.

    Args:
        data_file (str): path of the corpus file, one sentence per line.

    Returns:
        List[List[str]]: tokenized sentences as returned by prepare_data.
    """
    quote_fixes = dict.fromkeys(("``", "\'\'"), "\"")
    return prepare_data(data_file, replacements=quote_fixes)

def corpus_bleu(generated, references):
    """ Compute similarity between two corpora as measured by
    comparing each sentence of `generated` against all sentences in `references` 
    
    args:
        - generated (List[List[str]]): list of sentences (split into tokens)
        - references (List[List[str]]): list of sentences (split into tokens)
        
    returns:
        - bleu (float)
    """    
    return bleu.corpus_bleu([references for _ in range(len(generated))], generated)
    
    !git clone https://github.com/nyu-dl/bert-gen
# Reference corpora located in the bert-gen repository's data directory
# (paths are relative to the current working directory).
wiki103_file = 'bert-gen/data/wiki103.5k.txt'
tbc_file = 'bert-gen/data/tbc.5k.txt'

# Tokenized reference sentences (a list of token lists) for each corpus.
wiki_data = prepare_wiki(wiki103_file)
tbc_data = prepare_tbc(tbc_file)

In [18]:
# Relative path: resolves against /content, the working directory set after the texygen imports.
file_path = 'neuraltextgen/RNN_generated.txt'
# Bleu is texygen's metric class; it is given the generated-text file and one reference corpus file.
# NOTE(review): argument order (test file, reference file) is assumed from usage here - confirm against texygen.
bleu_score_tbc = Bleu(file_path, tbc_file)
bleu_score_wiki = Bleu(file_path, wiki103_file)

# Scores are multiplied by 100 for display.
# NOTE(review): the labels say "BERT" although the scored text is the RNN output - confirm the wording is intended.
print("(Texygen) BERT-TBC BLEU: %.2f" % (100 * bleu_score_tbc.get_bleu()))
print("(Texygen) BERT-Wiki103 BLEU: %.2f" % (100 * bleu_score_wiki.get_bleu()))

(Texygen) BERT-TBC BLEU: 2.64
(Texygen) BERT-Wiki103 BLEU: 5.70
