# RNN Text Generation for SEO Keywords




Generate keywords using a pretrained neural network with a few lines of code, or easily train your own text-generating neural network of any size and complexity, **for free on a GPU using Colaboratory!**



For more about textgenrnn, you can visit [this GitHub repository](https://github.com/minimaxir/textgenrnn).


To get started:

1. Copy this notebook to your Google Drive to keep it and save your changes.
2. Make sure you're running the notebook in Google Chrome.
3. Run the cells below:

In [None]:
# Installation of external packages
!pip install advertools
!pip install emot

# Modules and libs
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import requests
import nltk
import math
import time
import os

# Modules and libs for neural network
from keras.utils import np_utils
from keras.initializers import RandomNormal
from keras.initializers import he_normal
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Dropout

# Processing
from io import BytesIO
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Text processing and text-exploration libs
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.metrics import pairwise_distances_argmin_min
from scipy.sparse import hstack
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from keras.callbacks import ModelCheckpoint

# TensorFlow
import tensorflow as tf
import tensorflow_hub as hub

# Visualisations and warnings
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use(style = "tableau-colorblind10")
plt.rcParams['figure.figsize'] = (15, 10)
%matplotlib inline
import plotly
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot , plot
init_notebook_mode(connected = True)
import warnings
warnings.filterwarnings("ignore")
from google.colab import files

# https://pypi.org/project/advertools/
# https://advertools.readthedocs.io/en/master/advertools.html#module-advertools.serp
# https://pypi.org/project/textgenrnn/
import advertools as adv

Collecting advertools
[?25l  Downloading https://files.pythonhosted.org/packages/4a/1b/5e513f896f6d0fd93fa3cf8fa4735b1e9a854f2ec5d34b25048069abdd47/advertools-0.10.6-py2.py3-none-any.whl (242kB)
[K     |████████████████████████████████| 245kB 3.4MB/s 
Collecting scrapy
[?25l  Downloading https://files.pythonhosted.org/packages/eb/9f/81a270190802cf02d49a495a2ee9291ea1d21f969a900880285dd7444d74/Scrapy-2.2.1-py2.py3-none-any.whl (241kB)
[K     |████████████████████████████████| 245kB 10.6MB/s 
[?25hCollecting twython
  Downloading https://files.pythonhosted.org/packages/24/80/579b96dfaa9b536efde883d4f0df7ea2598a6f3117a6dd572787f4a2bcfb/twython-3.8.2-py3-none-any.whl
Collecting w3lib>=1.17.0
  Downloading https://files.pythonhosted.org/packages/a3/59/b6b14521090e7f42669cafdb84b0ab89301a42f1f1a82fcf5856661ea3a7/w3lib-1.22.0-py2.py3-none-any.whl
Collecting cssselect>=0.9.1
  Downloading https://files.pythonhosted.org/packages/3b/d4/3b5c17f00cce85b9a1e6f91096e1cc8e8ede2e1be8e96b87ce1ed09

Using TensorFlow backend.
  import pandas.util.testing as tm


In [None]:
!pip install -q textgenrnn
from google.colab import files
from textgenrnn import textgenrnn
from datetime import datetime
import os

## Google Drive and file system connection

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authorisation and client connection.
# Sign in from Colab and hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()

# Keep the PyDrive client under its own name and give it the credentials.
# Previously it was created without `gauth` and bound to `drive`, which the
# import on the next line immediately overwrote, so the client was unusable.
pydrive_client = GoogleDrive(gauth)

# `drive` below is the google.colab helper module used to mount Google Drive
# into the VM's file system.
from google.colab import drive
drive.mount('/content/drive')

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Work from the project folder on the mounted Drive so that relative paths
# (e.g. 'mostUsedWords.csv') resolve there. The folder must already exist:
# if it does not, %cd reports "No such file or directory" and the working
# directory is left unchanged.
%cd /content/drive/My\ Drive/Colab\ Notebooks/RNN_neural_Network_for_keywords_generation

[Errno 2] No such file or directory: '/content/drive/My Drive/Colab Notebooks/RNN_neural_Network_for_keywords_generation'
/content


In [None]:
# List the current working directory to check which files are available.
%ls

adc.json  [0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [None]:
# Load the keyword corpus.
# If the CSV is not on the VM yet, it can be uploaded interactively first:
#   from google.colab import files
#   uploaded = files.upload()

# Read the CSV and keep only its 'keywords' column as a plain Python list.
keywords = pd.read_csv('mostUsedWords.csv')['keywords'].values.tolist()

# Preview the first three entries (displayed as the cell's output).
keywords[:3]

['girl', 'call', 'girls']

Set the textgenrnn model configuration here: the default parameters here give good results for most workflows. (see the [demo notebook](https://github.com/minimaxir/textgenrnn/blob/master/docs/textgenrnn-demo.ipynb) for more information about these parameters)

If you are using an input file where documents are line-delimited, make sure to set `line_delimited` to `True`.

In [None]:
# Architecture settings for the textgenrnn model.
model_cfg = dict(
    # True trains a word-level model (requires more data and a smaller max_length).
    word_level=True,
    # Number of LSTM cells in each layer (128/256 recommended).
    rnn_size=256,
    # Number of LSTM layers (>=2 recommended).
    rnn_layers=4,
    # Consider text both forwards and backwards; can give a training boost.
    rnn_bidirectional=True,
    # Number of tokens to consider before predicting the next one
    # (20-40 recommended for characters, 5-10 for words).
    max_length=3,
    # Maximum number of words to model; the rest are ignored (word-level only).
    max_words=1000,
)

# Training-run settings.
train_cfg = dict(
    # True when each text has its own line in the source file.
    line_delimited=True,
    # Set higher to train the model for longer.
    num_epochs=1020,
    # Generate sample text from the model after this many epochs.
    gen_epochs=5,
    # Proportion of the input data to train on; < 1.0 limits the model from
    # learning the data perfectly.
    train_size=0.8,
    # Ignore a random proportion of source tokens each epoch, allowing the
    # model to generalize better.
    dropout=0.0,
    # If train_size < 1.0, test on the holdout dataset; makes training slower.
    validation=False,
    # True if the file is a CSV exported from Excel/BigQuery/pandas.
    is_csv=True,
)

In the Colaboratory Notebook sidebar on the left of the screen, select *Files*. From there you can upload files:

![alt text](https://i.imgur.com/TGcZT4h.png)

Upload **any text file** and update the file name in the cell below, then run the cell.

In [None]:
# Path of the training corpus handed to the trainer.
file_name = 'mostUsedWords.csv'

# Change to set the file name of the resulting trained models/texts.
model_name = "keywordGen"

The next cell will start the actual training. And thanks to the power of Keras's CuDNN layers, training is super-fast when compared to CPU training on a local machine!

Ideally, you want a training loss less than `1.0` in order for the model to create sensible text consistently.

In [None]:
# !pip install tensorflow==1.15.3
import tensorflow as tf
import keras.backend.tensorflow_backend as tfback

print("tf.__version__ is", tf.__version__)
print("tf.keras.__version__ is:", tf.keras.__version__)

def _get_available_gpus():
    """List the logical devices whose name marks them as a GPU.

    The device names are fetched once through
    ``tf.config.list_logical_devices()`` and cached on
    ``tfback._LOCAL_DEVICES``; later calls reuse that cache.

    # Returns
        A list of device-name strings containing ``device:gpu``
        (compared case-insensitively).
    """
    known_devices = tfback._LOCAL_DEVICES
    if known_devices is None:
        # First call: populate the backend's device cache.
        known_devices = [device.name for device in tf.config.list_logical_devices()]
        tfback._LOCAL_DEVICES = known_devices

    gpu_names = []
    for device_name in known_devices:
        if 'device:gpu' in device_name.lower():
            gpu_names.append(device_name)
    return gpu_names

# Monkey-patch the Keras TensorFlow backend so that its GPU discovery uses the
# `_get_available_gpus` helper defined above. Presumably needed because the
# backend's stock helper does not work with the installed TensorFlow version
# (printed above) -- TODO confirm.
tfback._get_available_gpus = _get_available_gpus


from tensorflow.python.compiler.tensorrt import trt_convert as trt

tf.__version__ is 2.2.0
tf.keras.__version__ is: 2.3.0-tf


In [None]:
# Create a fresh textgenrnn instance; `model_name` is the prefix of the
# weights/vocab/config files that later cells download.
textgen = textgenrnn(name=model_name)

# Line-delimited corpora are trained one text per line; otherwise the file is
# trained as a single large text.
if train_cfg['line_delimited']:
    train_function = textgen.train_from_file
else:
    train_function = textgen.train_from_largetext_file

# Start training a new model with the settings from `train_cfg`/`model_cfg`.
# `max_words` is forwarded explicitly: it was configured in `model_cfg` but
# never passed before, so the vocabulary cap was silently ignored.
train_function(
    file_path=file_name,
    new_model=True,
    num_epochs=train_cfg['num_epochs'],
    gen_epochs=train_cfg['gen_epochs'],
    batch_size=1024,
    train_size=train_cfg['train_size'],
    dropout=train_cfg['dropout'],
    validation=train_cfg['validation'],
    is_csv=train_cfg['is_csv'],
    rnn_layers=model_cfg['rnn_layers'],
    rnn_size=model_cfg['rnn_size'],
    rnn_bidirectional=model_cfg['rnn_bidirectional'],
    max_length=model_cfg['max_length'],
    max_words=model_cfg['max_words'],
    dim_embeddings=100,
    word_level=model_cfg['word_level'])

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
new

single

Epoch 346/1020
Epoch 347/1020
Epoch 348/1020
Epoch 349/1020
Epoch 350/1020
####################
Temperature: 0.2
####################
meet girls near

make guy like

meet girls near

####################
Temperature: 0.5
####################
tamil

marriage

kiss first kiss

####################
Temperature: 1.0
####################
girl

years old women

desi

Epoch 351/1020
Epoch 352/1020
Epoch 353/1020
Epoch 354/1020
Epoch 355/1020
####################
Temperature: 0.2
####################
meet celebrity

meet girls near

meet girls near

####################
Temperature: 0.5
####################
dating sites free

meet

make guy like

####################
Temperature: 1.0
####################
kiss sex

service

kerala

Epoch 356/1020
Epoch 357/1020
Epoch 358/1020
Epoch 359/1020
Epoch 360/1020
####################
Temperature: 0.2
####################
meet

dating sites seniors smile

meet

You can download a large amount of generated text from your model with the cell below. Rerun the cell as many times as you want for even more text.

In [None]:
# Temperature schedule: cycles between 1 very unexpected token, 1 unexpected
# token and 2 expected tokens, then repeats. Changing the schedule can result
# in wildly different output!
temperature = [1.0, 0.5, 0.2, 0.2]

# Seed text every generated text should start with (empty: no seed).
prefix = ""

# Line-delimited models produce many short texts; otherwise a single longer
# text is produced. (Alternative lengths noted earlier in this notebook:
# 60 word-level / 300 char-level for line-delimited, 2000 / 10000 otherwise.)
n, max_gen_length = (1000, 10) if train_cfg['line_delimited'] else (1, 150)

# Time-stamp the output file name so reruns do not overwrite earlier results.
timestring = datetime.now().strftime('%Y%m%d_%H%M%S')
gen_file = '{}_gentext_{}.txt'.format(model_name, timestring)

generation_options = dict(
    temperature=temperature,
    prefix=prefix,
    n=n,
    max_gen_length=max_gen_length,
)
textgen.generate_to_file(gen_file, **generation_options)

# Offer the generated file as a browser download.
files.download(gen_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

You can download the weights and configuration files in the cell below, allowing you to recreate the model on your own computer.

In [None]:
# Download the three artifacts that make up the trained model (weights,
# vocabulary and configuration), each named after `model_name`.
for artifact_template in ('{}_weights.hdf5', '{}_vocab.json', '{}_config.json'):
    files.download(artifact_template.format(model_name))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

To recreate the model on your own computer, after installing textgenrnn and TensorFlow, you can create a Python script with:

```
from textgenrnn import textgenrnn
textgen = textgenrnn(weights_path='keywordGen_weights.hdf5',
                       vocab_path='keywordGen_vocab.json',
                       config_path='keywordGen_config.json')
                       
textgen.generate_samples(max_gen_length=1000)
textgen.generate_to_file('textgenrnn_texts.txt', max_gen_length=1000)
```

Have fun with your new model! :)

# Etcetera

If the model fails to load on a local machine due to a model-size-not-matching bug (common in >30MB weights), this is due to a file export bug from Colaboratory. To work around this issue, save the weights to Google Drive with the two cells below and download from there.

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from google.colab import files
from oauth2client.client import GoogleCredentials

# Sign in to Google from Colab, then pass the application-default credentials
# to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, which an earlier cell bound to the
# `google.colab.drive` module, to an authenticated PyDrive client. The upload
# cell below relies on this binding for `drive.CreateFile`.
drive = GoogleDrive(gauth)

In [None]:
# Name of the trained weights file; used both as the Drive title and as the
# local path of the content to upload.
weights_file = '{}_weights.hdf5'.format(model_name)

# Create the Drive file entry, attach the local weights file and upload it.
uploaded = drive.CreateFile({'title': weights_file})
uploaded.SetContentFile(weights_file)
uploaded.Upload()

# Report the ID of the uploaded file.
print('Uploaded file with ID {}'.format(uploaded.get('id')))

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/__init__.py", line 36, in autodetect
    from google.appengine.api import memcache
ModuleNotFoundError: No module named 'google.appengine'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 33, in <module>
    from oauth2client.contrib.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.contrib.locked_file'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 37, in <module>
    from oauth2client.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.locked_file'

During handling of the above exception, another exceptio

Uploaded file with ID 13CjtVnnrhLjYVDaNmULMt9a4zVslpwe7


If the notebook has errors (e.g. GPU Sync Fail), force-kill the Colaboratory virtual machine and restart it with the command below:

In [None]:
# WARNING: sends SIGKILL (-9) to every process this user is allowed to signal
# (pid -1), which force-kills the Colab runtime. Expect in-memory state to be
# lost -- download or upload anything you need before running this.
!kill -9 -1