In [1]:
import tensorflow as tf
import numpy as np
import time
import os

In [2]:
# Read the whole corpus into memory as one string.
# The file is opened in binary mode and decoded explicitly so that the '\r'
# characters present in the file are kept as-is (text mode would translate
# line endings). The context manager closes the handle as soon as the bytes
# have been read instead of leaving it open for the life of the kernel.
with open('khayyam.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

In [3]:
text[:10]

'|برخیز بتا'

In [4]:
# Vocabulary: every distinct character that occurs in the corpus, in ascending order.
vocabolaries = list(set(text))
vocabolaries.sort()

In [5]:
vocabolaries

['\n',
 '\r',
 ' ',
 '!',
 '|',
 '\xa0',
 '،',
 '؟',
 'آ',
 'أ',
 'ئ',
 'ا',
 'ب',
 'ت',
 'ث',
 'ج',
 'ح',
 'خ',
 'د',
 'ذ',
 'ر',
 'ز',
 'س',
 'ش',
 'ص',
 'ض',
 'ط',
 'ظ',
 'ع',
 'غ',
 'ف',
 'ق',
 'ل',
 'م',
 'ن',
 'ه',
 'و',
 'ٌ',
 'َ',
 'ُ',
 'ِ',
 'ّ',
 'ٔ',
 'پ',
 'چ',
 'ژ',
 'ک',
 'گ',
 'ۀ',
 'ی']

In [6]:
len(vocabolaries)

50

In [7]:
# Forward lookup: character -> integer id (ids follow the sorted vocabulary order).
char2index = dict(zip(vocabolaries, range(len(vocabolaries))))
# Reverse lookup: integer id -> character. Kept as an array so that a whole
# array of ids can be translated with a single indexing operation.
index2char = np.array(vocabolaries)

In [8]:
char2index

{'\n': 0,
 '\r': 1,
 ' ': 2,
 '!': 3,
 '|': 4,
 '\xa0': 5,
 '،': 6,
 '؟': 7,
 'آ': 8,
 'أ': 9,
 'ئ': 10,
 'ا': 11,
 'ب': 12,
 'ت': 13,
 'ث': 14,
 'ج': 15,
 'ح': 16,
 'خ': 17,
 'د': 18,
 'ذ': 19,
 'ر': 20,
 'ز': 21,
 'س': 22,
 'ش': 23,
 'ص': 24,
 'ض': 25,
 'ط': 26,
 'ظ': 27,
 'ع': 28,
 'غ': 29,
 'ف': 30,
 'ق': 31,
 'ل': 32,
 'م': 33,
 'ن': 34,
 'ه': 35,
 'و': 36,
 'ٌ': 37,
 'َ': 38,
 'ُ': 39,
 'ِ': 40,
 'ّ': 41,
 'ٔ': 42,
 'پ': 43,
 'چ': 44,
 'ژ': 45,
 'ک': 46,
 'گ': 47,
 'ۀ': 48,
 'ی': 49}

In [9]:
index2char

array(['\n', '\r', ' ', '!', '|', '\xa0', '،', '؟', 'آ', 'أ', 'ئ', 'ا',
       'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص',
       'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ل', 'م', 'ن', 'ه', 'و', 'ٌ',
       'َ', 'ُ', 'ِ', 'ّ', 'ٔ', 'پ', 'چ', 'ژ', 'ک', 'گ', 'ۀ', 'ی'],
      dtype='<U1')

In [10]:
index2char[4]

'|'

In [11]:
# Encode the full corpus: one integer id per character, in reading order.
text_as_integer = np.array(list(map(char2index.__getitem__, text)))

In [12]:
text_as_integer

array([ 4, 12, 20, ..., 49,  1,  0])

In [13]:
# Wrap the encoded corpus in a tf.data pipeline that yields one character id per element.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_integer)

In [14]:
char_dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [15]:
# Sanity check: decode the first ten ids back to characters, one per line.
for char_id in char_dataset.take(10).as_numpy_iterator():
    print(index2char[char_id])

|
ب
ر
خ
ی
ز
 
ب
ت
ا


In [16]:
# Cut the character stream into consecutive windows of 30 ids each; a trailing
# window shorter than that is dropped so every element has the same length.
chunk_length = 30
sequences = char_dataset.batch(chunk_length, drop_remainder=True)
sequences

<BatchDataset element_spec=TensorSpec(shape=(30,), dtype=tf.int32, name=None)>

In [17]:
# Show the first three windows as readable text.
for window in sequences.take(3):
    window_text = ''.join(index2char[window.numpy()])
    print('--->', window_text)

---> |برخیز بتا بیا ز بهر دل ما
|ح
---> ل کن به جمال خویشتن مشکل ما
|
---> یک کوزه شراب تا به هم نوش کنیم


In [18]:
def sit(batch):
    """Split one window into an (input, target) pair shifted by one position.

    The input is the window without its last element and the target is the
    window without its first element, so target[k] is the element that
    follows input[k] in the original window.

    Args:
        batch: A sliceable sequence (here: one window of character ids).

    Returns:
        A tuple ``(input_text, target_text)``, each one element shorter than
        ``batch``.
    """
    return batch[:-1], batch[1:]
# Turn every window into an (input, target) pair using sit.
dataset = sequences.map(sit)

In [19]:
dataset

<MapDataset element_spec=(TensorSpec(shape=(29,), dtype=tf.int32, name=None), TensorSpec(shape=(29,), dtype=tf.int32, name=None))>

In [20]:
# Decode the first (input, target) pair to verify the one-character shift.
for sample_input, sample_target in dataset.take(1):
    print(''.join(index2char[sample_input.numpy()]))
    print(''.join(index2char[sample_target.numpy()]))

|برخیز بتا بیا ز بهر دل ما
|
برخیز بتا بیا ز بهر دل ما
|ح


In [21]:
# Group 64 (input, target) pairs into each training batch; an incomplete final
# batch is dropped so every batch has the same shape.
# The pipeline is rebuilt from `sequences` rather than from `dataset` itself:
# re-assigning `dataset = dataset.batch(...)` would batch an already batched
# dataset whenever this cell is executed a second time.
dataset = sequences.map(sit).batch(64, drop_remainder=True)
dataset

<BatchDataset element_spec=(TensorSpec(shape=(64, 29), dtype=tf.int32, name=None), TensorSpec(shape=(64, 29), dtype=tf.int32, name=None))>

In [22]:
len(vocabolaries)

50

In [23]:
# Model hyper-parameters.
embedding_dim = 25                    # size of each character embedding vector
rnn_unit = 1024                       # number of units in the recurrent layer
vocabolary_size = len(vocabolaries)   # number of distinct characters (input ids and output logits)

In [24]:
# Character-level language model:
#   Embedding -> GRU (one output per time step) -> Dense (one logit per vocabulary entry).
# The layer sizes come from the hyper-parameter constants instead of repeated
# literals, so changing `embedding_dim` / `rnn_unit` actually changes the model.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocabolary_size, embedding_dim),
    tf.keras.layers.GRU(rnn_unit, return_sequences=True),
    tf.keras.layers.Dense(vocabolary_size)
])

In [25]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 25)          1250      
                                                                 
 gru (GRU)                   (None, None, 1024)        3228672   
                                                                 
 dense (Dense)               (None, None, 50)          51250     
                                                                 
Total params: 3,281,172
Trainable params: 3,281,172
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Run the still-untrained model on one batch and show the logits of its first example.
for input_text, target_text in dataset.take(1):
    output = model.predict(input_text)
    first_example_logits = output[0]
    print(first_example_logits)

[[ 1.1216772e-03 -2.7554780e-03 -9.4512384e-04 ... -4.8349025e-03
   9.6095179e-04  1.9941560e-03]
 [ 1.2400657e-03  9.9574425e-04 -1.3991991e-03 ... -1.9555248e-04
   5.2148928e-03 -1.4674064e-04]
 [-1.1573792e-03  5.4802760e-03  1.6255752e-03 ... -2.2589290e-03
   4.9562617e-03 -2.9845359e-03]
 ...
 [-2.7027156e-04 -4.3290053e-03 -1.1794905e-03 ...  5.6784949e-03
  -5.4286155e-03 -1.7461486e-03]
 [-5.3574331e-05 -2.9755058e-03  2.4312641e-05 ...  7.1092006e-03
  -3.4592347e-04  1.3967545e-04]
 [ 9.9565822e-04 -3.7932012e-03 -2.5426992e-04 ... -8.4391749e-04
   7.6627708e-04  2.1886984e-03]]


In [27]:
# Draw one character id per time step from the logits of the first example.
si = tf.random.categorical(logits=output[0], num_samples=1)
si

<tf.Tensor: shape=(29, 1), dtype=int64, numpy=
array([[38],
       [16],
       [ 6],
       [47],
       [40],
       [ 8],
       [26],
       [22],
       [40],
       [ 9],
       [27],
       [43],
       [13],
       [40],
       [13],
       [45],
       [40],
       [19],
       [29],
       [19],
       [36],
       [ 0],
       [16],
       [30],
       [35],
       [ 2],
       [39],
       [ 8],
       [43]], dtype=int64)>

In [28]:
tf.squeeze(si, axis=-1).numpy()

array([38, 16,  6, 47, 40,  8, 26, 22, 40,  9, 27, 43, 13, 40, 13, 45, 40,
       19, 29, 19, 36,  0, 16, 30, 35,  2, 39,  8, 43], dtype=int64)

In [29]:
''.join(index2char[tf.squeeze(si, axis=-1).numpy()])

'َح،گِآطسِأظپتِتژِذغذو\nحفه ُآپ'

In [30]:
output[0][0]

array([ 1.1216772e-03, -2.7554780e-03, -9.4512384e-04,  1.4137855e-03,
       -5.4788259e-03,  4.4993372e-03,  4.3929080e-03,  2.3201024e-03,
       -3.7120283e-04,  9.4999053e-04,  1.2124276e-03, -8.1880053e-04,
       -1.2441836e-03,  3.5518524e-03, -3.8265083e-03, -2.5469675e-03,
        1.9241204e-03,  3.9603573e-04, -1.9656394e-03,  2.0249197e-03,
        1.7273048e-03,  8.8853622e-04, -5.9465994e-03,  8.4378850e-03,
       -1.5465207e-03,  5.3120581e-03, -6.1906585e-03,  7.8680535e-04,
        3.0195042e-03,  3.2530373e-03, -4.8523210e-03,  2.9584877e-03,
       -1.4797749e-03,  4.1607181e-03,  5.1735959e-04, -2.3956872e-03,
        2.1656197e-03,  4.8482576e-03,  1.5829917e-03, -2.7708930e-04,
       -1.5890987e-03, -3.8435415e-03,  3.4430774e-04,  2.3822922e-03,
        2.9882998e-05, -3.9898711e-03,  2.3829078e-03, -4.8349025e-03,
        9.6095179e-04,  1.9941560e-03], dtype=float32)

In [31]:
def loss_f(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: Integer class ids (the target characters).
        logits: Unnormalised model outputs; ``from_logits=True`` tells the loss
            that no softmax has been applied to them.

    Returns:
        The value returned by ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True
    )


model.compile(loss=loss_f, optimizer='adam')

In [32]:
# Callback that writes the model weights (not the full model) to 'khayyam/checkpoints'.
# The path contains no epoch placeholder, so the same file prefix is used for every save.
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='khayyam/checkpoints', save_weights_only=True)

In [33]:
# Training is commented out: the cells below restore weights from the 'khayyam'
# checkpoint directory instead. Uncomment the next line to (re)train the model.
# history = model.fit(dataset, epochs=10, callbacks=[checkpoint])

In [34]:
tf.train.latest_checkpoint('khayyam')

'khayyam\\checkpoints'

In [35]:
# Second copy of the network, used for generation. Its architecture has to match
# the model that wrote the checkpoint, so it is sized from the same
# hyper-parameter constants instead of duplicating the literals.
model_2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocabolary_size, embedding_dim),
    tf.keras.layers.GRU(rnn_unit, return_sequences=True),
    tf.keras.layers.Dense(vocabolary_size)
])

In [36]:
# Restore the trained weights from the most recent checkpoint under 'khayyam'.
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail with an explicit message in that case instead of passing
# None on to load_weights.
latest_checkpoint_path = tf.train.latest_checkpoint('khayyam')
if latest_checkpoint_path is None:
    raise FileNotFoundError(
        "No checkpoint found under 'khayyam'; run the training cell first."
    )
model_2.load_weights(latest_checkpoint_path)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x15c96bbf9d0>

In [37]:
# Build the model for input shape (1, None): a batch of one sequence of unspecified length.
model_2.build(tf.TensorShape([1, None]))

In [38]:
model_2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 25)          1250      
                                                                 
 gru_1 (GRU)                 (None, None, 1024)        3228672   
                                                                 
 dense_1 (Dense)             (None, None, 50)          51250     
                                                                 
Total params: 3,281,172
Trainable params: 3,281,172
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Number of characters to generate and the seed text that primes the model.
num_generate = 1000
first_string = 'به نام خداوند جان و خرد کزین برتر'
# Encode the seed character by character, then add a leading batch axis
# so the result has shape (1, len(first_string)).
seed_ids = [char2index[s] for s in first_string]
input_eval = tf.expand_dims(seed_ids, axis=0)
input_eval

<tf.Tensor: shape=(1, 33), dtype=int32, numpy=
array([[12, 35,  2, 34, 11, 33,  2, 17, 18, 11, 36, 34, 18,  2, 15, 11,
        34,  2, 36,  2, 17, 20, 18,  2, 46, 21, 49, 34,  2, 12, 20, 13,
        20]])>

In [40]:
# Clear any recurrent state before generating.
# NOTE(review): model_2's GRU is created without stateful=True, so there is
# presumably no state carried between calls and this is likely a no-op — confirm.
model_2.reset_states()

In [None]:
# Autoregressive sampling.
# The model returns logits for every position of its input, but only the logits
# of the LAST position describe the character that follows the text so far.
# Each step therefore samples exactly one id from that last position and
# appends it to the context before the model is queried again. (Sampling from
# every position and feeding all of those samples back, as a replacement input,
# does not continue the seed text.)
text_generated = []          # one generated character per entry
context_ids = input_eval     # shape (1, length); input_eval itself is left untouched so the cell can be re-run
for _ in range(num_generate):
    # Direct call instead of predict(): the input grows by one id per step, and
    # predict() is meant for batch inference rather than single-sample loops.
    # The whole context is re-fed every step because the model keeps no state
    # between calls; the cost per step grows with the amount of text generated.
    logits = model_2(context_ids, training=False)                      # (1, length, vocabulary size)
    next_id = tf.random.categorical(logits[:, -1, :], num_samples=1)   # (1, 1)
    # categorical() returns int64 ids by default; match the context dtype for concat.
    next_id = tf.cast(next_id, context_ids.dtype)
    context_ids = tf.concat([context_ids, next_id], axis=-1)
    text_generated.append(index2char[int(next_id[0, 0])])

# Display the seed followed by the generated continuation.
first_string + ''.join(text_generated)



In [None]:
# Print every entry of text_generated on its own line, joined into a string.
for fragment in map(''.join, text_generated):
    print(fragment)

In [None]:
len(text_generated[1])