<a href="https://colab.research.google.com/github/JuanJoseMV/neuraltextgen/blob/main/RNN_TextGen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [1]:
%%capture
# Environment setup; %%capture silences the output of the commands in this cell.
# Download the zipped fastText wiki-news-300d-1M vector file.
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
# Extract the archive (expects the zip to be located under /content).
! unzip '/content/wiki-news-300d-1M.vec.zip'
# Install the transformers package.
! pip install transformers
# Clone the neuraltextgen repository.
! git clone https://github.com/JuanJoseMV/neuraltextgen.git

# Imports

In [2]:
import gensim.models.wrappers.fasttext
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel, BertConfig, AutoConfig
from collections import Counter

# Cleaning the dataset


In [None]:
## When using Wiki.tokens (not wiki.5k)

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (assumes the tokens file is UTF-8 encoded -- TODO confirm).
with open('/content/wiki.train.tokens', encoding='utf-8') as f:
  content = f.readlines()

# Replace every newline character with an explicit '[EOS]' marker, one entry per input line.
clean = [line.replace('\n', '[EOS]') for line in content]

# Loading pre-trained word embeddings

In [None]:
# Loading the vector file takes several minutes; skip this cell if the
# pretrained embeddings will not be used.
model = gensim.models.KeyedVectors.load_word2vec_format('/content/wiki-news-300d-1M.vec')
# load_word2vec_format already returns a KeyedVectors object, so use it directly.
# The `.wv` alias on KeyedVectors was deprecated in gensim 3.x (it returned self)
# and removed in gensim 4.0, where accessing it raises AttributeError.
word_vectors = model

# Wrap the pretrained vector matrix in a torch embedding layer.
weights = torch.FloatTensor(word_vectors.vectors)
embedding = nn.Embedding.from_pretrained(weights)

# Train the network

In [6]:
# Switch into the cloned repository: the relative data path below is resolved
# against this directory (and presumably the RNNGenerator import is too -- verify).
os.chdir('/content/neuraltextgen/')
from RNNGenerator import RNNGenerator

# Settings forwarded to RNNGenerator as keyword arguments.
# NOTE(review): the key spelling `training_epocs` is kept exactly as written,
# since it must match whatever keyword RNNGenerator accepts.
params = dict(
    seq_size=32,
    batch_size=16,
    embedding_size=64,
    lstm_size=64,
    lstm_num_layers=3,
    lstm_bidirectional=True,
    lstm_dropout=0.5,
    gradients_norm=5,
    predict_top_k=5,
    training_epocs=200,
    lr=0.01,
    weights=None,
)

train_file = 'data/wiki103.5k.txt'
generator = RNNGenerator(**params)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
trained_net = generator.train(device, train_file)

# Generate 100 sentences from the trained network and display them as the cell output.
sentences = generator.predict(device, trained_net, n_sentences=100)
sentences

Vocabulary size 3775
Epoch: 4/200 Iteration: 100 Loss: 5.550518989562988
Epoch: 8/200 Iteration: 200 Loss: 4.805933952331543
Epoch: 13/200 Iteration: 300 Loss: 3.967970848083496
Epoch: 17/200 Iteration: 400 Loss: 3.6054868698120117
Epoch: 21/200 Iteration: 500 Loss: 2.920198440551758
Epoch: 26/200 Iteration: 600 Loss: 2.650226354598999
Epoch: 30/200 Iteration: 700 Loss: 1.9219039678573608
Epoch: 34/200 Iteration: 800 Loss: 1.5465679168701172
Epoch: 39/200 Iteration: 900 Loss: 1.1541169881820679
Epoch: 43/200 Iteration: 1000 Loss: 0.7567822337150574
Epoch: 47/200 Iteration: 1100 Loss: 0.6856549978256226
Epoch: 52/200 Iteration: 1200 Loss: 0.5759510397911072
Epoch: 56/200 Iteration: 1300 Loss: 0.47042080760002136
Epoch: 60/200 Iteration: 1400 Loss: 0.3537713885307312
Epoch: 65/200 Iteration: 1500 Loss: 0.3175928294658661
Epoch: 69/200 Iteration: 1600 Loss: 0.24685680866241455
Epoch: 73/200 Iteration: 1700 Loss: 0.2082928568124771
Epoch: 78/200 Iteration: 1800 Loss: 0.1899767965078354
Epo

['justice @ 3rd @,@ 000 may 400 school 500 may 330 placed debt debt debt Mark 96 Mark license Mark much @,@ 48 @. 500 throughout left yd million 96 placed 96 debt placed debt placed 96 Mark 48 48 96 48 cm 48 96 placed debt debt placed debt pointing wanted 96 since 48 wanted 48 69 placed 96 debt pointing debt 96 69 placed 96 debt placed since since since nine debt 70 debt forecast 69 at 96 from Olympia with a two its The a first a convection , also may its bones was claimed also she " named a',
 'sea came of temple in along , along of Manila of history . Throughout , although with 2008 with Craigie that signed that however with WWE , it and however a first " was a " at @@UNKNOWN@@ usually it such have known are may who modern 1 much like much about 367 @,@ 13 @,@ range @. 8 ( 12 is directed ( 11 ( 1 \'s work were discovered a men Marines finds case . 71 @. 000 @. heavily @,@ they been 8 @,@ 000 @. 8 @,@ 0 @,@ 11 – 8 ( 11 were 000 – crossed',
 'implementation Three gun from / win Is have