In [None]:
# Tensorflow >= 1.12 required
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, Conv1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np


In [None]:
# Show the installed TensorFlow version; as the last expression of the cell,
# its value is displayed as the cell output.
tf.__version__


'2.4.1'

# Text preprocessing 
* The input corpus is read from file.
* The corpus is split into lines.
* The array of lines is tokenized in the next step, meaning each word is extracted and assigned an integer index.


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be opened with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read both poetry corpora from the mounted Drive folder.
# `with` closes each file handle as soon as it has been read, and the encoding
# is pinned to UTF-8 so the Persian text is decoded the same way regardless of
# the platform's default locale.
with open('/content/drive/MyDrive/data/poetry_saadi.txt', encoding='utf-8') as saadi_file:
    data_saadi = saadi_file.read()
with open('/content/drive/MyDrive/data/poetry_rumi.txt', encoding='utf-8') as rumi_file:
    data_rumi = rumi_file.read()

# One list entry per line of the file (empty lines are still present here).
corpus_saadi = data_saadi.lower().split("\n")
corpus_rumi = data_rumi.lower().split("\n")


In [None]:
# Fit a single word-level vocabulary on the lines of both poets.
tokenizer = Tokenizer()
# TODO You can use other tokenizers and preprocessings like clean text algorithms[Optional].
corpus = corpus_saadi + corpus_rumi
# Drop empty strings (e.g. blank lines produced by split("\n")).
corpus = list(filter(None, corpus))
tokenizer.fit_on_texts(corpus)
# +1 because word indices start at 1; index 0 is left free for padding.
total_words = len(tokenizer.word_index) + 1

# Print only a short preview of the vocabulary: dumping the complete
# word -> index mapping floods the cell output with thousands of entries.
print(dict(list(tokenizer.word_index.items())[:20]))
print(total_words)


{'و': 1, 'از': 2, 'را': 3, 'که': 4, 'در': 5, 'آن': 6, 'تو': 7, 'به': 8, 'بر': 9, 'او': 10, 'این': 11, 'چون': 12, 'من': 13, 'تا': 14, 'هر': 15, 'است': 16, 'ز': 17, 'بود': 18, 'ای': 19, 'ما': 20, 'با': 21, 'گر': 22, 'چه': 23, 'جان': 24, 'شد': 25, 'دل': 26, 'خود': 27, 'گفت': 28, 'نه': 29, 'نیست': 30, 'سر': 31, 'اندر': 32, 'پیش': 33, 'دست': 34, 'دوست': 35, 'روی': 36, 'چو': 37, 'آب': 38, 'چشم': 39, 'کرد': 40, 'کند': 41, 'همه': 42, 'هم': 43, 'باشد': 44, 'حق': 45, 'یک': 46, 'عشق': 47, 'یا': 48, 'آمد': 49, 'کو': 50, 'سعدی': 51, 'دو': 52, 'خویش': 53, 'بی': 54, 'آتش': 55, 'شود': 56, 'صد': 57, 'کن': 58, 'اگر': 59, 'سوی': 60, 'کز': 61, 'آید': 62, 'باز': 63, 'جهان': 64, 'ور': 65, 'نظر': 66, 'پس': 67, 'مرد': 68, 'زان': 69, 'مرا': 70, 'اوست': 71, 'یکی': 72, 'یار': 73, 'شیر': 74, 'نور': 75, 'هست': 76, 'کان': 77, 'جمله': 78, 'خوش': 79, 'جز': 80, 'کی': 81, 'غم': 82, 'بد': 83, 'صورت': 84, 'پای': 85, 'خلق': 86, 'وز': 87, 'خاک': 88, 'زین': 89, 'دید': 90, 'وی': 91, 'گوش': 92, 'باد': 93, 'بعد': 94, 'همچو': 9

## Convert texts to the sequence of numbers
* Each line of text must be converted to a sequence of numbers so that the network can process it.
* Commonly `training_sequences = tokenizer.texts_to_sequences(training_sentences)` is enough.

In [None]:
# Convert every line to a list of word indices and record which poet wrote it.
# Label 0 = Saadi, label 1 = Rumi.
input_sequences = []
labels = []
for poet_label, poet_lines in ((0, corpus_saadi), (1, corpus_rumi)):
    # texts_to_sequences accepts the whole list at once and returns one token
    # list per input line, in order.
    for token_list in tokenizer.texts_to_sequences(poet_lines):
        # Skip lines that contain no tokens (blank lines): they would become
        # all-padding samples that carry a label but no information. The
        # vocabulary was likewise fitted on the corpus with empty lines removed.
        if not token_list:
            continue
        input_sequences.append(token_list)
        labels.append(poet_label)


## Pad sequences
Sequences are padded so that they all have the same length. For sequences that have fewer words than the specified threshold, 0s are added to the start of the sequence.

In [None]:
# Left-pad every sequence with zeros up to the length of the longest one.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)


## Split sequences to x and y

In [None]:
# Model inputs are the padded sequences; targets are the poet labels as an array.
xs = input_sequences
ys = np.array(labels)


In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    xs,
    ys,
    test_size=0.2,
    random_state=42,
)



## Define and Train Model

In [None]:
# LSTM + 1-D convolution binary classifier (label 0 = Saadi, label 1 = Rumi).
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len))
# return_sequences=True keeps one output vector per timestep so that the
# convolutions below can slide over the sequence.
model.add(LSTM(units=180, return_sequences=True))
model.add(Dropout(0.2))
# The convolutions use the default 'valid' padding, so the kernel sizes 9 and 3
# shorten the sequence by 8 + 2 steps: max_sequence_len must be at least 11.
model.add(Conv1D(filters=128, kernel_size=9, activation='relu'))
model.add(Dropout(0.2))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(Dropout(0.2))
model.add(Conv1D(filters=32, kernel_size=1, activation='relu'))
# Collapse the remaining time axis. Without this the Dense layer is applied to
# every timestep separately and the model emits several values per sample
# instead of a single class score.
model.add(tf.keras.layers.GlobalMaxPooling1D())

# A single sigmoid unit outputs the probability that the verse is class 1.
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy is the matching loss for a sigmoid binary classifier.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
# NOTE(review): the test split doubles as validation data here, so the final
# test accuracy is not measured on data unseen during model selection.
history = model.fit(X_train, Y_train, epochs=15, batch_size=32, verbose=1, validation_data=(X_test, Y_test))


In [None]:
# Save the trained model to an HDF5 file.
from numpy import loadtxt
# Import load_model from tensorflow.keras: the model is built with
# tensorflow.keras, and mixing it with the standalone `keras` package can fail
# when the two versions differ.
from tensorflow.keras.models import load_model

model.save('RNN_model.h5')
# To restore the saved model later:
# model = load_model('RNN_model.h5')

In [None]:
# Measure loss and accuracy on the held-out test split and report the
# accuracy as a percentage.
loss, acc = model.evaluate(X_test, Y_test, verbose=0)
accuracy_percent = acc * 100
print('Accuracy: %f' % accuracy_percent)

Accuracy: 85.023963


In [None]:
# TODO Evaluate model accuracy
# Run the trained model on the test inputs and print its raw outputs
# (no thresholding into class labels is done here).
predicted = model.predict(X_test)
print(predicted)


In [None]:
import matplotlib.pyplot as plt

# Plot training and validation accuracy per epoch from the fit history.
for history_key, line_color, legend_label in (
    ('accuracy', 'blue', 'train_acc'),
    ('val_accuracy', 'orange', 'val_acc'),
):
    plt.plot(history.history[history_key], color=line_color, label=legend_label)
plt.legend()
plt.show()


In [None]:

## Classify a new line of verse


In [None]:

# Classify a single verse: tokenize it, pad it to the training length and ask
# the model for a class (0 = Saadi, 1 = Rumi).
# Alternative verses to try:
# seed_text = "خلایق در تو حیرانند و جای حیرتست الحق"
# seed_text = "به نام خداوند جان و خرد"
# seed_text = "در خرقه چو آتش زدی ای سالک زاهد جهدی کن و از سرحلقه رندان جهان باش"
# seed_text = " ز دانش دل پیر برنا بود توانا بود هر که دانا بود "
# seed_text = "ای بی خبر بکوش صاحب خبر شوی تا راهرو نباشی کی راهبر شوی"
seed_text = "کان درد به صد هزار درمان ندهم"
token_list = tokenizer.texts_to_sequences([seed_text])[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_len, padding='pre')
# Sequential.predict_classes is deprecated and has been removed from newer
# TensorFlow 2.x releases. For a single-unit output it thresholded the
# prediction at 0.5, which is done explicitly here.
predicted = (model.predict(token_list, verbose=0) > 0.5).astype("int32")
print(predicted)

# Reference output recorded by the original author for this verse: [[1]]


[[[0]
  [0]]]




'\n########### Output ############\n[[1]]\n###############################\n'