In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import numpy as np
import os
import time
import glob

<module 'tensorflow._api.v2.version' from '/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/version/__init__.py'>


In [None]:
# Read the corpus as bytes and decode it explicitly as UTF-8. The context
# manager closes the file handle as soon as the contents are in memory.
with open('/content/sample_data/praise-poems_dataset.txt', 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it (measured before cleaning)
print ('Length of text: {} characters'.format(len(text)))

# Remove extraneous characters (punctuation, the digit 1, and the invisible
# U+200F / U+FEFF marks). A single join builds the filtered string in one
# pass instead of re-creating it once per character.
execluded = '!()*-.1:=[]«»;؛−,،~?؟#\u200f\ufeff'
out = "".join(char for char in text if char not in execluded)
text = out
# Normalise whitespace: collapse triple tabs, unify line endings to "\n",
# and drop a tab that sits directly before a newline.
text = text.replace("\t\t\t", "\t")
text = text.replace("\r\r\n", "\n")
text = text.replace("\r\n","\n")
text = text.replace("\t\n", "\n")
# The vocabulary is the sorted set of distinct characters left after cleaning.
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

Length of text: 1155335 characters
46 unique characters


In [None]:
# Take a look at the first 200 characters in text
print(text[:200])

عَبَّاسُ يا خَيْرَ المُلُوكِ عَدَالَةً
وَأَجَلَّ مَنْ نَطَقَ امْرُؤٌ بِثَنَائِهِ
أَوْلَيْتَنِي مِنْكَ الرِّضَا وجَلَوْتَ لِي
وَجْهاً قَرَأْتُ البِشْرَ في أَثْنَائِهِ
فاسْلَمْ لِمُلْكٍ أَنْتَ بَدْرُ سَ


In [None]:
# Two lookup tables over the vocabulary: character -> integer id (dict) and
# integer id -> character (NumPy array, so it can be indexed with id arrays).
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)
# The whole corpus encoded as one array of integer ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [None]:
# Pretty-print the character -> id table, capped at the first 47 entries.
print('{')
for char in list(char2idx)[:47]:
    entry = '  {:4s}: {:3d},'.format(repr(char), char2idx[char])
    print(entry)
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  'ء' :   2,
  'آ' :   3,
  'أ' :   4,
  'ؤ' :   5,
  'إ' :   6,
  'ئ' :   7,
  'ا' :   8,
  'ب' :   9,
  'ة' :  10,
  'ت' :  11,
  'ث' :  12,
  'ج' :  13,
  'ح' :  14,
  'خ' :  15,
  'د' :  16,
  'ذ' :  17,
  'ر' :  18,
  'ز' :  19,
  'س' :  20,
  'ش' :  21,
  'ص' :  22,
  'ض' :  23,
  'ط' :  24,
  'ظ' :  25,
  'ع' :  26,
  'غ' :  27,
  'ف' :  28,
  'ق' :  29,
  'ك' :  30,
  'ل' :  31,
  'م' :  32,
  'ن' :  33,
  'ه' :  34,
  'و' :  35,
  'ى' :  36,
  'ي' :  37,
  'ً' :  38,
  'ٌ' :  39,
  'ٍ' :  40,
  'َ' :  41,
  'ُ' :  42,
  'ِ' :  43,
  'ّ' :  44,
  'ْ' :  45,
  ...
}


In [None]:
# Show how the first 28 characters from the text are mapped to integers
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:28]), text_as_int[:28]))

'عَبَّاسُ يا خَيْرَ المُلُوكِ' ---- characters mapped to int ---- > [26 41  9 44 41  8 20 42  1 37  8  1 15 41 37 45 18 41  1  8 31 32 42 31
 42 35 30 43]


In [None]:
# The maximum length, in characters, of a single input sequence
seq_length = 200
# Number of seq_length-character windows that fit in the cleaned text
examples_per_epoch = len(text)//seq_length

# Create training examples / targets: a dataset that yields one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode the first five ids back to characters as a quick check
for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

ع
َ
ب
ّ
َ


In [None]:
# Show the encoded corpus (one integer id per character)
print(text_as_int)

[26 41  9 ... 18 37  8]


In [None]:
# Group the id stream into chunks of seq_length+1 ids; the extra id lets each
# chunk be split into an input and a target shifted by one position.
# drop_remainder=True discards a final chunk that is shorter than that.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode the first five chunks back to text to inspect them
for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))

'عَبَّاسُ يا خَيْرَ المُلُوكِ عَدَالَةً\nوَأَجَلَّ مَنْ نَطَقَ امْرُؤٌ بِثَنَائِهِ\nأَوْلَيْتَنِي مِنْكَ الرِّضَا وجَلَوْتَ لِي\nوَجْهاً قَرَأْتُ البِشْرَ في أَثْنَائِهِ\nفاسْلَمْ لِمُلْكٍ أَنْتَ بَدْرُ سَر'
'ِيرِهِ\nوَعِمَادُ قُوَّتِهِ ونَصْرُ لِوائِهِ\nيأَيُّها الصَّادِي إِلى نَيْلِ الْمُنَى\nرِدْ بَحْرَ سُدَّتِهِ تَفُزْ بِوَلائِهِ\nهُوَ ذَلِكَ الْمَلِكُ الَّذِي وَرِث الْعُلا\nعَنْ نَفْسِهِ شَرَفا وعَن آبائِهِ'
'\nالْعَدْلُ مِنْ أَخْلاقِهِ والْعِلْمُ مِنْ\nأَوْصافِهِ والْحِلْمُ مِنْ أَسْمَائِهِ\nلا غَرْوَ أَنْ جَمَعَ المَحَامِدَ يافِعاً\nوَسَمَا بِهِمَّتِهِ عَلَى نُظَرائِهِ\nفالْعَينُ وَهْيَ صَغِيرَةٌ في حَجْمِها\nت'
'َسَعُ الفَضَاءَ بِأَرْضِهِ وسَمائِهِ\n\nوإني حين تشتجر العوالي\nأعيد الرمح في أثر الجراح\nشديد البأس ليس بذي عياء\nولكني أبوء إلى الفلاح\nسألبس ثوبها وأذب عنها\nبأطراف العوالي والصفاحف\nما يبقى لعترته ذليل\nفتم'
'نعه من القدر المتاح\nوأجمل من حياة الذل موت\nوبعض العار لا يمحوه ماح\n\nوَلَو بَرَزَ الزَمانُ إِلَيَّ شَخصاً\nلَخَضَّبَ شَعرَ مَفرِقِهِ حُسامي\nوَما بَلَغَت مَ

In [None]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[k] is the element that
    follows input[k] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair
dataset = sequences.map(split_input_target)

In [None]:
# Print the first example's input and target, decoded back to text:
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'عَبَّاسُ يا خَيْرَ المُلُوكِ عَدَالَةً\nوَأَجَلَّ مَنْ نَطَقَ امْرُؤٌ بِثَنَائِهِ\nأَوْلَيْتَنِي مِنْكَ الرِّضَا وجَلَوْتَ لِي\nوَجْهاً قَرَأْتُ البِشْرَ في أَثْنَائِهِ\nفاسْلَمْ لِمُلْكٍ أَنْتَ بَدْرُ سَ'
Target data: 'َبَّاسُ يا خَيْرَ المُلُوكِ عَدَالَةً\nوَأَجَلَّ مَنْ نَطَقَ امْرُؤٌ بِثَنَائِهِ\nأَوْلَيْتَنِي مِنْكَ الرِّضَا وجَلَوْتَ لِي\nوَجْهاً قَرَأْتُ البِشْرَ في أَثْنَائِهِ\nفاسْلَمْ لِمُلْكٍ أَنْتَ بَدْرُ سَر'


In [None]:
# Walk through the first 8 timesteps of the example: at each step the model
# receives one character id and is expected to output the id that follows it.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:8], target_example[:8])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 26 ('ع')
  expected output: 41 ('َ')
Step    1
  input: 41 ('َ')
  expected output: 9 ('ب')
Step    2
  input: 9 ('ب')
  expected output: 44 ('ّ')
Step    3
  input: 44 ('ّ')
  expected output: 41 ('َ')
Step    4
  input: 41 ('َ')
  expected output: 8 ('ا')
Step    5
  input: 8 ('ا')
  expected output: 20 ('س')
Step    6
  input: 20 ('س')
  expected output: 42 ('ُ')
Step    7
  input: 42 ('ُ')
  expected output: 1 (' ')


In [None]:
# Create training batches
# Batch size
BATCH_SIZE = 128 
# NOTE(review): examples_per_epoch was computed with seq_length, while every
# sequence consumes seq_length+1 characters, so this may be marginally larger
# than the number of full batches in one pass over the data - confirm this is intended.
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000 

# Shuffle the (input, target) pairs, then group them into fixed-size batches;
# drop_remainder=True discards a final batch smaller than BATCH_SIZE.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset shapes: ((128, 200), (128, 200)), types: (tf.int64, tf.int64)>

In [None]:
# Build the model: hyperparameters
# Length of the vocabulary in chars (also the size of the model's output layer)
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
#function to build the model.
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level model: Embedding -> stateful LSTM -> Dense.

  Args:
    vocab_size: number of distinct characters; used as the embedding input
      size and as the width of the final Dense layer.
    embedding_dim: size of each character embedding vector.
    rnn_units: number of units in the LSTM layer.
    batch_size: fixed batch size written into the input shape (the sequence
      length is left as None).

  Returns:
    A tf.keras.Sequential model whose Dense layer has no activation.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      recurrent_initializer='glorot_uniform',
      stateful=True)
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [None]:
# Instantiate the training model; BATCH_SIZE becomes part of its fixed input shape.
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [None]:
# Run one batch through the not-yet-trained model and check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(128, 200, 46) # (batch_size, sequence_length, vocab_size)


In [None]:
# Layer-by-layer overview of the training model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 256)          11776     
_________________________________________________________________
lstm (LSTM)                  (128, None, 1024)         5246976   
_________________________________________________________________
dense (Dense)                (128, None, 46)           47150     
Total params: 5,305,902
Trainable params: 5,305,902
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Sample one character id per timestep from the output for the first sequence
# in the batch (a random draw from the distribution, not the argmax).
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [None]:
# This gives us, at each timestep, a sampled prediction of the next character id:
print(len(sampled_indices))
sampled_indices

200


array([16, 16, 20, 34, 31, 34, 19, 17, 28,  2, 45, 18, 20, 19,  8, 37, 41,
       10, 45,  7, 30, 31, 28, 33, 26,  1, 30, 19, 43, 19, 34, 17, 14, 21,
       18, 15, 10,  2, 38, 14, 10,  7, 12, 34, 16, 44,  4, 10, 15,  0,  3,
        3, 27, 42,  0,  0, 41, 21, 31, 39, 45, 36, 39, 21, 19, 11,  9, 39,
        9,  7, 22,  2, 29,  2, 30, 32, 39,  4, 22, 40, 45, 41,  2, 13, 27,
       10, 36, 30, 35, 41, 13, 21, 36, 28, 44, 37, 12, 35,  9, 17, 24, 44,
        7,  4,  8, 38,  3, 23, 21, 29,  0, 28, 27, 13,  5, 34, 39, 17, 15,
        9,  0,  7, 24, 42,  6, 10, 37, 31, 40, 40, 34, 40, 40, 39, 24, 27,
        2, 25, 31,  2, 42, 19,  6, 45,  0, 44, 23, 21, 12, 20, 43,  5, 36,
        7, 21, 40, 40, 11,  5, 27,  1, 42, 43, 12,  0, 24, 34,  9, 42, 10,
       42, 44, 41, 29, 26,  3, 11, 26, 42, 19, 11, 28, 14,  7, 41,  8, 14,
       39, 38, 35,  5, 35, 45, 39,  9,  5, 36, 35, 43, 39])

In [None]:
# Decode the ids to see the text predicted by this untrained model:
print("Input: \n", repr("".join(idx2char[input_example_batch[0].numpy()])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 ' وأبقاكَ آخرَ الدهر عصرا\nفي سرورٍ يُريك شهرك يوماً\nوحبورٍ يريك عامك شهرا\nقلت لما بدا الهلالُ ضئيلاً\nقد كستْه سُرى ثلاثين ضُمرا\nعجباً للهلال كيف استهلّو\nهُ هلالاً هلَّا استهلوه بدرا\nكان لما بدا وأنت أم'

Next Char Predictions: 
 'ددسهلهزذفءْرسزايَةْئكلفنع كزِزهذحشرخةءًحةئثهدّأةخ\nآآغُ\n\nَشلٌْىٌشزتبٌبئصءقءكمٌأصٍَْءجغةىكوَجشىفّيثوبذطّئأاًآضشق\nفغجؤهٌذخب\nئطُإةيلٍٍهٌٍٍطغءظلءُزإْ\nّضشثسِؤىئشٍٍتؤغ ُِث\nطهبُةَُّقعآتعُزتفحئَاحًٌوؤوٌْبؤىوٌِ'


Attach an optimizer and a loss function
The standard tf.keras.losses.sparse_categorical_crossentropy loss function works in this case because it is applied across the last dimension of the predictions.
Because our model returns logits, we need to set the from_logits flag.

In [None]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  from_logits=True tells Keras that `logits` are unnormalised scores rather
  than probabilities.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)

# Evaluate the loss on the example batch; the individual loss values are
# averaged into a single number for display.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (128, 200, 46)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       3.8289192


In [None]:
# Configure training: Adam optimizer (selected by name) with the custom `loss` function.
model.compile(optimizer='adam', loss=loss)

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; the {epoch} placeholder puts the epoch number
# in the file name (e.g. ckpt_300)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that writes the model weights only (not the full model) during training
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
# Number of training epochs passed to model.fit
EPOCHS=300

In [None]:
# dataset.repeat() makes the input endless, so steps_per_epoch decides where
# each epoch ends; checkpoint_callback saves weights as training progresses.
history = model.fit(dataset.repeat(), epochs=EPOCHS, steps_per_epoch=steps_per_epoch,
                    callbacks=[checkpoint_callback])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

In [None]:
# Path prefix of the most recent checkpoint found in checkpoint_dir
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_300'

In [None]:
# Rebuild the same architecture with batch_size=1 so text can be generated
# one sequence at a time; this rebinds `model` to the new instance.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights from the most recent training checkpoint.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Fix the input shape to (1, any sequence length).
model.build(tf.TensorShape([1, None]))

In [None]:
# Layer-by-layer overview of the batch-size-1 generation model
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            11776     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 46)             47150     
Total params: 5,305,902
Trainable params: 5,305,902
Non-trainable params: 0
_________________________________________________________________


In [None]:

def generate_text(model, start_string, num_generate=400, temperature=1.0):
  """Generate text one character at a time, continuing `start_string`.

  Args:
    model: callable as ``model(ids)`` returning logits for each position; its
      recurrent state is cleared with ``model.reset_states()`` before
      sampling. NOTE(review): the indexing below assumes a model built with
      batch size 1 - confirm against the caller.
    start_string: seed text. Every character must be a key of ``char2idx``.
    num_generate: number of characters to sample after the seed.
    temperature: divisor applied to the logits before sampling. Low values
      give more predictable text, high values give more surprising text.

  Returns:
    ``start_string`` followed by the generated characters.

  Raises:
    ValueError: if ``start_string`` is empty or ``temperature`` is not positive.
    KeyError: if ``start_string`` contains a character missing from ``char2idx``.

  Side effects:
    Stores the returned text in the module-level ``candidates_text``, which
    the BLEU evaluation cell reads.
  """
  global candidates_text

  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive, got {!r}'.format(temperature))
  unknown_chars = sorted(set(start_string) - set(char2idx))
  if unknown_chars:
    raise KeyError('start_string contains characters missing from the '
                   'vocabulary: {!r}'.format(unknown_chars))

  # Converting our start string to numbers (vectorizing) and adding a batch axis
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated characters, collected in order
  text_generated = []

  # Start from a clean recurrent state so earlier calls do not leak into this one
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Scale the logits, then sample from the resulting categorical
    # distribution; only the sample for the last position is kept.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # The sampled character is the next input; the model keeps its own
    # recurrent state between calls.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  candidates_text = start_string + ''.join(text_generated)
  return candidates_text

In [None]:
# Continue a seed verse with the generation model and print the result
print(generate_text(model, start_string="كم تَطْلُبُونَ لَنَا عَيْباً فيُعجِزُكمْ"))

كم تَطْلُبُونَ لَنَا عَيْباً فيُعجِزُكمْ وذوْلُوا
إِلى ذا الحُسامُ عَلى الرِسالِ بِهِ
مِن كاسِ شَرَّ الواحُ مِن طَيرِ أَجرَدِ
رَعى مَردىً لَم يَخلُ مِنهُ ماكَ مُبدِعِ
أَسالَت سَلاماً وَلا بَلَغتُ شِعالبي أُمَّ القُرون
 الرومُ ماذِلُ عِلمِ قُروبٍ دارِ

وَمِن عَجيبٍ لِغَيرِ الهلم مُشتَسيا طالِباً
شَغَفَ الإِصليقُ نَقَّكَ واِعجَدى
مُزهَر مَدَّت حصالَ للمجدِ بُرْعَه
يُبْدَى نُجومُ العاذَنْ إذا ما عجا
فَتَرى لَهُم فيه الطَّرَا الْعَزْمُ مِنّ


In [None]:
# Continue a second seed verse
print(generate_text(model, start_string="وَأَحسَنُ مِنكَ لَم تَرَ قَطُّ عَيني"))

وَأَحسَنُ مِنكَ لَم تَرَ قَطُّ عَيني وَأَعظُمُ غُرَّةٍ
يُعازِلُ في حَقٍّ اِتَّخَدتَ بِها
وَفي العُراقِ إِذا أَضَّلتَ حَبّاً
هُوَ المَصرُ لِلدُنيا ضَليلِ الصُبحِ
مَعفونَفَينِ بِهِ
مَكانَ عَلى أَيّامِن الجَنزِلبِسِ وَهوَ سُوري
بِالوَصلُ مِن حُسنِ وَصنِ ذي الإِلَهِ

يا أَيُّها النَفسُ ابَةُ الغَيَّ يَقْفى الْمُصَفَّرَه
وَلدَّلِي أُوُوسٍ
عَلَيْهِ ابنهُ مالُهُ في حُجَّةٍ مِنَ النَدى
ذَريعٌ كَما غَدَّ الغَمامُ عَلى الدُجى
سَمى وَجفَكرُها 


In [None]:
# Continue a third seed verse
print(generate_text(model, start_string="تَصغُرُ في عَينِ العَظيمِ العَظائِمُ"))

تَصغُرُ في عَينِ العَظيمِ العَظائِمُ
وَما بَينَ مُنتَمِعٍ عِندَ نَعماكَ
 إِذ يَرجَعُ سُلطانَ حاجِبِ
وَقَد جاءَ عَن كُلِّ الهَوى في إِجلالِهِ
وَكَأَنَّ عادَ الخَليفَةِ ن كُنتَ قَولاً
لَم تَضرِ يَدنو الَّذي تَلهَب مَنازِلَها
بِالعيدِ وَالدُنيا إِلى اللَهوِ الُ
ناجٍ منهمُ وتعدرُ الشي
رَ ولا من تَحْدهُدَ مَشْرى الهُداما
وَمَن كُنتُ كَالمُقَفصيد كَأَنَّها أَحواهُ
بُقُيَةُ اللَّبِذينَ تَمَسُّغُه
دُ ظُلَّةٌ تُمَدّى بِها
وَذَناءُ الحَشاوِلِ


In [None]:
# Continue a fourth seed verse
print(generate_text(model, start_string="الخيلُ والليلُ والبَيداءُ تعرِفُني"))

الخيلُ والليلُ والبَيداءُ تعرِفُني
مُتَنَجَّةٌ بَينَ ذَيّالمولَها
إِذا اِستَحسَنَت بِالباعِ با رامَ الأَمرَ كالخَخرِ
 بِها يَعدو شَفاعَيها يَجومُ
العُلا السَّبقْ
خَفِيعتَ من أسقيرِهِمْ وخُزْلاً
تَمُنَّ الأرْضُ من نهجِزامَ إذا الحق اندثر
لا الذي يحرقُ امجاداً لهُ
أسعدَتهُ مثلَ حُلمٍ يُحتقر
أتى اغتزامُ
عَهدُ اللَهِ لَم تُفقِن إِلى وَجهِ جَمعِ المَوكِبِ
حَولى وَحَقِّقُهُ أَضنى مَقامُ
مزَى عَنْهُ أَثْوَابَ الْفَنَاءِ وَرَفْرَفَتْ
إِلَ


In [None]:
# Continue a seed verse written without diacritics. generate_text also stores
# its result in the global candidates_text, so when the cells run in order
# this is the text the BLEU evaluation below scores.
print(generate_text(model, start_string="تجاوزت مقدار الشجاعة والنهى"))

تجاوزت مقدار الشجاعة والنهى
تفّاء التي تُحظي بوان
هذي عبدٌ للشرس موساً
يضيعا أرواحُ كأني مُذْعِبٌ
لين ضؤوف كف شَاءَ بَعدَ تَفَرُّقٍ
ما لِلِقاءِ وَلِلفُراقِ دَوامُ
سَيَشُدُّ أَزرَكَ وَالشَدائِدُ جُمعَكالٌ يَحتَسي
لِغَيرِكَ إِن أَطيقٌ فَيُتلَقَظُ
أَنا لِلأَغَرِّ مُخَلِّدينَ بِها
جِنّي بِلادٌ تُنتَ فَاِلفَقِدَت
بِهِ أَو قالونِ مَأْتُهُمْ
فَلَمَّا أَتَيْنَا كَريمِهِ وَالرَشاها غَيرَ مُرتائِهِ
وَكِلا بِقَرآٍ مِثلَ خَلقِ اللَولِ 


In [None]:
!pip install -q nltk


In [None]:
import io
# Re-read the raw corpus as UTF-8 text to build a cleaned reference file for BLEU.
with io.open('/content/sample_data/praise-poems_dataset.txt', 'r', encoding='utf8') as f:
   text = f.read()
# Remove extraneous characters (punctuation, '/', the digit 1, and the
# invisible U+200F / U+FEFF marks). A single join builds the filtered string
# in one pass instead of re-creating it once per character.
execluded = '!()/*-.1:=[]«»;؛−,،~?؟#\u200f\ufeff'
out = "".join(char for char in text if char not in execluded)

# Normalise whitespace, collapse doubled newlines (single pass) and strip quotes.
out = out.replace("\t\t\t", "\t")
out = out.replace("\r\r\n", "\n")
out = out.replace("\r\n","\n")
out = out.replace("\t\n", "\n")
out = out.replace("\n\n", "\n")
out = out.replace('"', "")
out = out.replace("'", "")

# Write the cleaned corpus as UTF-8 for the evaluation cells.
with io.open('/content/sample_data/Cleandatabase.txt', 'w', encoding='utf8') as f:
    f.write(out)

In [None]:
# Load the cleaned corpus; the context manager closes the file once it is read.
with open('/content/sample_data/Cleandatabase.txt', 'rb') as reference_file:
    reference_text = reference_file.read().decode(encoding='utf-8')

# Every non-empty verse line becomes one tokenised reference (a list of words).
verse_Lines = reference_text.splitlines()
references = [tokens for tokens in (line.split() for line in verse_Lines) if tokens]

print(len(references))

#----------------------------------------------------------------------
# The candidate is the text most recently produced by generate_text, which
# stores it in the module-level candidates_text; at least one generation cell
# must have run before this one. All of its words are flattened into a single
# token list.
new_verse_Lines = candidates_text.splitlines()
candidates = [token for line in new_verse_Lines for token in line.split()]
    


34462


In [None]:
# n-gram individual BLEU
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Without smoothing, an n-gram order with zero overlapping n-grams yields an
# unreliable score (NLTK warns about this and recommends SmoothingFunction).
# method1 only replaces zero match counts with a small epsilon, so orders
# that do have overlaps keep their plain precision.
smoothing = SmoothingFunction().method1

# One score per n-gram order: all of the weight is placed on that single order.
for order in range(1, 5):
    weights = tuple(1 if n == order else 0 for n in range(1, 5))
    score = sentence_bleu(references, candidates, weights=weights,
                          smoothing_function=smoothing)
    print('BLEU scores-Individual %d-gram: %f' % (order, score))

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU scores-Individual 1-gram: 0.600000
BLEU scores-Individual 2-gram: 0.109375
BLEU scores-Individual 3-gram: 0.031746
BLEU scores-Individual 4-gram: 1.000000
