In [0]:
# Install Keras into the notebook environment (-q keeps pip's output quiet).
!pip install -q keras

In [0]:
# Install NumPy into the notebook environment (-q keeps pip's output quiet).
!pip install -q numpy

In [0]:
# Install pandas into the notebook environment (-q keeps pip's output quiet).
!pip install -q pandas

In [4]:
import tensorflow as tf
# Show the GPU device name TensorFlow reports, as a quick check that a GPU is attached.
tf.test.gpu_device_name()

'/device:GPU:0'

In [5]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import Adam, RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import pandas as pd
import sys
import io
import re
import random

Using TensorFlow backend.


In [6]:
!wget https://raw.githubusercontent.com/ivan-liljeqvist/ailyrics/master/corpus.txt -P "/content/drive/My Drive/app"

--2018-12-19 16:44:18--  https://raw.githubusercontent.com/ivan-liljeqvist/ailyrics/master/corpus.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7709143 (7.4M) [text/plain]
Saving to: ‘/content/drive/My Drive/app/corpus.txt’


2018-12-19 16:44:18 (94.4 MB/s) - ‘/content/drive/My Drive/app/corpus.txt’ saved [7709143/7709143]



In [0]:
# NOTE(review): `df` is not referenced by any later cell visible in this notebook, and
# the same file is read again as plain text in the following cell — confirm whether
# this CSV load is still needed.
df = pd.read_csv("/content/drive/My Drive/app/corpus.txt")

In [8]:
# Read the entire corpus as UTF-8 and lower-case it; `text` is the single string
# that every later cell slices into training windows.
with io.open("/content/drive/My Drive/app/corpus.txt", mode='r', encoding='utf-8') as corpus_file:
  text = corpus_file.read().lower()
print("Corpus length: ", len(text))

Corpus length:  7709143


In [9]:
# Vocabulary: every distinct character in the corpus, in sorted order so that the
# character <-> index mapping derived from it is stable between runs.
chars = sorted(set(text))
print("Total chars: ", len(chars))

Total chars:  40


In [0]:
# Two-way lookup between a character and its position in `chars`.
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [0]:
maxlen = 40  # length, in characters, of each training window cut from `text`
steps = 3  # stride between the start positions of consecutive windows
sentences = []  # will hold the maxlen-character windows
next_chars = []  # will hold the character that immediately follows each window

In [12]:
# Cut the corpus into overlapping maxlen-character windows (one every `steps`
# characters) and record the character that follows each window as its target.
# Both lists are rebuilt by assignment rather than appended to, so re-running
# this cell cannot silently duplicate the training samples.
window_starts = range(0, len(text) - maxlen, steps)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print("nb sentences : ", len(sentences))

nb sentences :  2569701


In [13]:
print("Vectorization...")
# One-hot containers, all False until filled:
#   x[sample, position, char_index] marks the character at each window position
#   y[sample, char_index]           marks the character that follows the window
# The builtin `bool` is used as the dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24 (AttributeError there).
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

Vectorization...


In [0]:
# Fill the one-hot arrays: for each window, flag the vocabulary index of the
# character at every position in x, and the index of the follow-up character in y.
for row, window in enumerate(sentences):
    for pos, ch in enumerate(window):
        x[row, pos, char_indices[ch]] = 1
    target_char = next_chars[row]
    y[row, char_indices[target_char]] = 1

In [15]:
print("Bulding model...")
# Three stacked LSTM layers (128 -> 400 -> 400 units) with 20% dropout after the
# first two. Only the last LSTM collapses the sequence to a single vector, which
# the softmax layer turns into a distribution over the vocabulary.
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars)), return_sequences=True),
    Dropout(0.2),
    LSTM(400, return_sequences=True),
    Dropout(0.2),
    LSTM(400),
    Dense(len(chars), activation='softmax'),
])

Bulding model...


In [0]:
# Next-character prediction is a multi-class problem over one-hot targets, hence
# categorical cross-entropy; accuracy is tracked alongside the loss.
optimizer = RMSprop(lr=0.001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Print the layer-by-layer output shapes and parameter counts of the compiled model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 40, 128)           86528     
_________________________________________________________________
dropout_1 (Dropout)          (None, 40, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 40, 400)           846400    
_________________________________________________________________
dropout_2 (Dropout)          (None, 40, 400)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 400)               1281600   
_________________________________________________________________
dense_1 (Dense)              (None, 40)                16040     
Total params: 2,230,568
Trainable params: 2,230,568
Non-trainable params: 0
_________________________________________________________________


In [0]:
def sample(preds, temperature = 1.0):
  """Pick one class index from a probability vector after temperature scaling.

  The probabilities are re-weighted as ``softmax(log(preds) / temperature)``
  and a single index is drawn from the resulting distribution.

  Args:
    preds: 1-D array-like of non-negative class probabilities.
    temperature: Sampling temperature. Values below 1 sharpen the distribution,
      values above 1 flatten it. ``temperature <= 0`` is treated as the greedy
      limit and returns the most probable index without random sampling.

  Returns:
    The chosen index, as returned by ``np.argmax``.
  """
  preds = np.asarray(preds).astype('float64')

  # Dividing by a zero temperature would produce NaN probabilities; the limit of
  # the rescaled distribution as temperature -> 0 is a point mass on the argmax.
  if temperature <= 0:
    return np.argmax(preds)

  # log(0) is -inf, which is the correct logit for an impossible class, so the
  # divide-by-zero warning it raises is silenced rather than treated as an error.
  with np.errstate(divide='ignore'):
    logits = np.log(preds) / temperature

  # Shift so the largest logit is 0 before exponentiating: the softmax value is
  # unchanged, but small temperatures no longer underflow every term to 0 (0/0).
  logits = logits - np.max(logits)
  exp_preds = np.exp(logits)
  preds = exp_preds/np.sum(exp_preds)

  probas = np.random.multinomial(1, preds, 1)
  return np.argmax(probas)

In [0]:
def on_epoch_end(epoch, logs):
    """Epoch-end hook that prints text generated by the current model.

    One random ``maxlen``-character window of ``text`` is chosen as the seed.
    For each diversity value (passed to ``sample`` as its temperature) the seed
    is extended by 400 characters: the current window is one-hot encoded,
    ``model.predict`` gives the next-character distribution, ``sample`` picks an
    index, and the window slides forward by one character. Everything is
    streamed to stdout.

    Args:
        epoch: Epoch number, interpolated into the header line with ``%d``.
        logs: Not used by this function; accepted so the signature takes the
            two arguments the callback is invoked with.
    """
    print('----- Generating text after Epoch: %d' % epoch)

    # A single seed is drawn up front and reused for every diversity value.
    seed_start = random.randint(0, len(text) - maxlen - 1)
    seed = text[seed_start: seed_start + maxlen]

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        window = seed
        for _ in range(400):
            # One-hot encode the current window as a batch of one.
            encoded = np.zeros((1, maxlen, len(chars)))
            for pos, ch in enumerate(window):
                encoded[0, pos, char_indices[ch]] = 1.

            distribution = model.predict(encoded, verbose=0)[0]
            next_char = indices_char[sample(distribution, diversity)]

            # Slide the window: drop the oldest character, append the new one.
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [20]:
# Wrap the text-generation hook so Keras invokes it at the end of each epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Train on the full one-hot data set; sample text is printed after every epoch.
model.fit(
    x,
    y,
    batch_size=700,
    epochs=30,
    callbacks=[print_callback],
)

Epoch 1/30
  28700/2569701 [..............................] - ETA: 40:25 - loss: 2.9979 - acc: 0.1997

KeyboardInterrupt: ignored

In [0]:
# Write the model's current weights to "model.h5" (a path relative to the working directory).
model.save_weights("model.h5")