In [1]:
from textgenrnn import textgenrnn
from datetime import datetime
import os

Using TensorFlow backend.


In [2]:
# Architecture settings for the network.
model_cfg = dict(
    # True trains a word-level model (requires more data and a smaller max_length)
    word_level=False,
    # number of LSTM cells in each layer (128/256 recommended)
    rnn_size=128,
    # number of LSTM layers (>=2 recommended)
    rnn_layers=4,
    # consider text both forwards and backwards; can give a training boost
    rnn_bidirectional=True,
    # number of tokens to consider before predicting the next one
    # (20-40 recommended for characters, 5-10 for words)
    max_length=50,
    # maximum number of words to model; the rest are ignored (word-level model only)
    max_words=10000,
)

# Settings for the training run.
train_cfg = dict(
    # True if each text has its own line in the source file
    line_delimited=True,
    # set higher to train the model for longer
    num_epochs=20,
    # generate sample text from the model after this many epochs
    gen_epochs=5,
    # proportion of input data to train on; < 1.0 limits the model from learning perfectly
    train_size=0.8,
    # ignore a random proportion of source tokens each epoch so the model generalizes better
    dropout=0.0,
    # if train_size < 1.0, test on the holdout dataset; makes overall training slower
    validation=False,
    # True if the file is a CSV exported from Excel/BigQuery/pandas
    is_csv=False,
)

In [3]:
# Source text file to train on.
file_name = "wa2.txt"

# Change to set the file name of the resulting trained models/texts.
model_name = "WA"

In [4]:
# Code for importing a previously trained model from files.
#
# The three paths are derived from model_name so they follow it when it is
# changed; with model_name == 'WA' they are exactly the WA_weights.hdf5 /
# WA_vocab.json / WA_config.json files this cell used to hard-code.
weights_path = '{}_weights.hdf5'.format(model_name)
vocab_path = '{}_vocab.json'.format(model_name)
config_path = '{}_config.json'.format(model_name)

# Only load when every file is present, so the notebook still runs top to
# bottom on a fresh checkout where nothing has been trained yet.
missing_files = [path for path in (weights_path, vocab_path, config_path)
                 if not os.path.isfile(path)]

if missing_files:
    print('Saved model not loaded; missing file(s): {}'.format(', '.join(missing_files)))
else:
    textgen = textgenrnn(weights_path=weights_path,
                         vocab_path=vocab_path,
                         config_path=config_path)

In [4]:
# Create a fresh model object; model_name sets the file name of the
# resulting trained models/texts.
textgen = textgenrnn(name=model_name)

# Pick the training entry point that matches the layout of the source file:
# one text per line vs. a single large text.
train_function = textgen.train_from_file if train_cfg['line_delimited'] else textgen.train_from_largetext_file

train_function(
    file_path=file_name,
    new_model=True,
    # --- training run settings (see train_cfg) ---
    num_epochs=train_cfg['num_epochs'],
    gen_epochs=train_cfg['gen_epochs'],
    batch_size=1024,
    train_size=train_cfg['train_size'],
    dropout=train_cfg['dropout'],
    validation=train_cfg['validation'],
    is_csv=train_cfg['is_csv'],
    # --- architecture settings (see model_cfg) ---
    rnn_layers=model_cfg['rnn_layers'],
    rnn_size=model_cfg['rnn_size'],
    rnn_bidirectional=model_cfg['rnn_bidirectional'],
    max_length=model_cfg['max_length'],
    # max_words was declared in model_cfg but never forwarded, so editing it
    # had no effect; pass it through like the other architecture settings.
    max_words=model_cfg['max_words'],
    dim_embeddings=100,
    word_level=model_cfg['word_level'])

44,792 texts collected.
Training new model w/ 4-layer, 128-cell Bidirectional LSTMs
Training on 1,412,001 character sequences.
Epoch 1/20








Epoch 2/20








Epoch 3/20








Epoch 4/20








Epoch 5/20








####################
Temperature: 0.2
####################
Kevin Algera: Heb je me niet meer verstuurd dan

Kevin Algera: Hahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahah

Kevin Algera: Hahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahah

####################
Temperature: 0.5
####################
Kevin Algera: Dus dan weet ik dan

Kevin Algera: Is het nu te verwaren in een dag?

Koen: Dit is al best best ken ik ze weten en duurder van t weggegokt

####################
Temperature: 1.0
####################
Kevin Algera: En j







Epoch 7/20








Epoch 8/20








Epoch 9/20








Epoch 10/20








####################
Temperature: 0.2
####################
Kevin Algera: Hahahahahahahahahahahahahahahahahahahahahahahahaahhahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahah

Kevin Algera: Hahahahahahahahahahahahahahahahahahhahahahahahahaaahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahaha

Kevin Algera: Hahahahahahahahahahahahahahahahahahahahahahahahaahhahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahahah

####################
Temperature: 0.5








Epoch 12/20








Epoch 13/20








Epoch 14/20








Epoch 15/20








####################
Temperature: 0.2
####################
Kevin Algera: Haha ja dat was een keer op de avond

Kevin Algera: Oh dan kan je niet meer genoeg van gehad idd

Kevin Algera: Ja dat was een paar dagen dan toch?

####################
Temperature: 0.5
####################
Kevin Algera: Hahahahahahahahaha koen

Kevin Algera: Nee maar gelooft heeft de politie verwacht

Thijs van der Putten: Waarom hebben ze dat niet zo aan te doen

####################
Temperature: 1.0
####################
Koen: Te nooi om Verkaart

Gijs Cunnen: ?*

Kevin Algera: SRiAEOEU

Epoch 16/20








Epoch 17/20








Epoch 18/20








Epoch 19/20








Epoch 20/20








####################
Temperature: 0.2
####################
Kevin Algera: Haha ja dat is een beetje aan de bus dan in je moeder te doen komen hebben geen pilsje van je vader terug gebruiken

Kevin Algera: <Media omitted>

Kevin Algera: Heb je die gekke geld

####################
Temperature: 0.5
####################
Kevin Algera: Ja die moest ik m niet meer

Thijs van der Putten: <Media omitted>

Kevin Algera: Of andere plaats t niet werken hoor

####################
Temperature: 1.0
####################
Wescel Manders: Nais

Thijs van der Putten: truste consumptie?

Koen: Goed gef!



In [5]:
# Print sample texts from the trained model at each listed temperature,
# lowest to highest.
textgen.generate_samples(
    temperatures=[0.2, 0.5, 1.0, 1.2, 1.5],
)

####################
Temperature: 0.2
####################
Kevin Algera: Dat is t nog wel een keer dat ik het m niet wat laten want dan komt ook nog wel een goeie tijd hebben

Kevin Algera: Heb je nog een beetje gezegd

Kevin Algera: Dan moet je wel in t vliegtuig wordt t niet

####################
Temperature: 0.5
####################
Kevin Algera: Ja klopt

Kevin Algera: Gewoon andere dagen

Kevin Algera: Haha

####################
Temperature: 1.0
####################
Yoram Carboex: Jep

Kevin Algera: Hz gaat dat dag

Gijs Cunnen: Kan ook mee?

####################
Temperature: 1.2
####################
Kevin Algera: <Media omitted>

Kevin Algera: Casa

Kevin Algera: Das a ahademicsed uit

####################
Temperature: 1.5
####################
Kevin Algera: Hmmm aangevuld

Kevin Algera: Mijn bamre 

Thijs van der Putten: kun je morgen pas collen.



In [5]:
# This temperature schedule cycles between 1 very unexpected token,
# 1 unexpected token, 2 expected tokens, repeat.
# Changing the temperature schedule can result in wildly different output!
temperature = [1.0, 0.5, 0.2, 0.2]

# Speaker names used as generation prefixes; one output file is written per name.
prefix_list = [
    # NOTE(review): the sample output above shows this speaker as "Koen" —
    # confirm the prefix matches the name as it appears in the training file.
    "Koen Niemeijer",
    "Kevin Algera",
    "Willem Smits",
    "Gijs Cunnen",
    "Thijs van der Putten",
    "Wescel Manders",
    "Thomas Bardoel",
    "Michiel Arts",
    "Yoram Carboex"
]

# Size each run by training mode: many short texts when the source has one
# text per line, otherwise a single long text.
if train_cfg['line_delimited']:
    n = 1000
    max_gen_length = 60 if model_cfg['word_level'] else 300
else:
    n = 1
    max_gen_length = 2000 if model_cfg['word_level'] else 10000

# The unprefixed ("everyone") run produces three times as many texts as a
# per-speaker run: 3000 in line-delimited mode, the same as the previous
# hard-coded value. Deriving it from n keeps the large-text mode from
# requesting 3000 documents of up to 10000 tokens each.
n_unprefixed = 3 * n

# (prefix, number of texts) for every run: the unprefixed run first, then one
# run per speaker.
generation_runs = [("", n_unprefixed)] + [(name, n) for name in prefix_list]

for prefix, n_texts in generation_runs:
    # The unprefixed run keeps its previous file name, '<model_name>_gentext_.txt'.
    gen_file = '{}_gentext_{}.txt'.format(model_name, prefix)
    textgen.generate_to_file(gen_file,
                             temperature=temperature,
                             prefix=prefix,
                             n=n_texts,
                             max_gen_length=max_gen_length)



100%|██████████████████████████████████████| 3000/3000 [31:47<00:00,  1.88it/s]
100%|██████████████████████████████████████| 1000/1000 [07:42<00:00,  2.14it/s]
100%|██████████████████████████████████████| 1000/1000 [07:29<00:00,  1.87it/s]
100%|██████████████████████████████████████| 1000/1000 [06:23<00:00,  3.05it/s]
100%|██████████████████████████████████████| 1000/1000 [07:02<00:00,  3.24it/s]
100%|██████████████████████████████████████| 1000/1000 [07:27<00:00,  3.31it/s]
100%|██████████████████████████████████████| 1000/1000 [08:00<00:00,  1.66it/s]
100%|██████████████████████████████████████| 1000/1000 [07:42<00:00,  2.25it/s]
100%|██████████████████████████████████████| 1000/1000 [06:44<00:00,  2.15it/s]
100%|██████████████████████████████████████| 1000/1000 [07:16<00:00,  2.29it/s]
