In [1]:
!git clone https://github.com/Gheith-Abandah/classify-arabic-poetry.git

Cloning into 'classify-arabic-poetry'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 39 (delta 17), reused 0 (delta 0), pack-reused 9[K
Unpacking objects: 100% (39/39), done.


In [2]:
%cd classify-arabic-poetry/

/content/classify-arabic-poetry


In [3]:
!unzip APCD_plus_porse_all.zip

Archive:  APCD_plus_porse_all.zip
  inflating: APCD_plus_porse_all.csv  


In [4]:
from __future__ import print_function
import tensorflow as tf
print(tf.__version__)

from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Embedding, GRU
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import re

print('Experiment: Classify poems, batch size 64')

batch_size = 64 # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 64  # Latent dimensionality of the encoding space.
print('Bach size', batch_size)
print('Epochs', epochs)
print('Latent dim', latent_dim)

data_path = 'APCD_plus_porse_all.csv'

2.8.0
Experiment: Classify poems, batch size 64
Bach size 64
Epochs 100
Latent dim 64


In [6]:
# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()

with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

for line in lines[: len(lines) - 1]:
    input_text, target_text = re.split(',', line)
    for shatter in input_text.split('¤'):
        input_texts.append(shatter.strip())
        target_texts.append(target_text)
        for char in shatter:
            if char not in input_characters:
                input_characters.add(char)

print('Number of samples:', len(input_texts))
max_seq_length = max([len(txt) for txt in input_texts])
print('Max sequence length:', max_seq_length)

input_characters = sorted(list(input_characters))
num_tokens = len(input_characters)
print('Number of tokens:', num_tokens)

Number of samples: 1657003
Max sequence length: 128
Number of tokens: 45


In [7]:
input_token_index = dict( [(char, i) for i, char in enumerate(input_characters)])

input_data = np.zeros((len(input_texts), max_seq_length), dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        input_data[i, t] = input_token_index[char] + 1.

encoder=OneHotEncoder(sparse=False)
out= np.array(target_texts).reshape(-1, 1)
output_data=encoder.fit_transform(out)
classes = output_data.shape[1]
print('Number of classes:', classes)

Number of classes: 17


In [19]:
target_texts_np, input_texts_np = np.array(target_texts), np.array(input_texts)
target_texts_np[target_texts_np=='المضارع']
input_texts_np[target_texts_np=='المضارع']

array(['ففي السَّلْم طودُ حلْمٍ لزانِئيهِ عِثارُ',
       'على الجوِّ من نَداهُ ومَسْعاتِه عِطارُ',
       'أنيسٌ إِلى المعالي وعن عارِها نَوارُ',
       'جمالُ الورى المُشارُ اذا عُدِّدَ الفَخارُ',
       'فيا مُرتْضى الخِلا فةِ والمُرْتضى اختيارُ',
       'مُطاعاً لكَ السَّعادةُ في قُطْبِها شِعارُ',
       'فَتَرْوى من الكُماةِ ومن كومِهِ الشِّفارُ',
       'هَنيئاً لكَ المواسمُ كَرّارَةً تُدارُ',
       'وبالقصر أرْيحيٌّ به يمنعُ الذِّمارُ',
       'وفي الجودِ وهو جَمٌّ إِلى السائل اعتِذارُ',
       'مُجيرُ الأنام يَحمي اذا اُسْلِم الجِوارُ',
       'وفي الحربِ ليثُ غابٍ جَرِيٌّ به سُعارُ',
       'حَسوداهُ في عُلاهُ ظُبى البيض والقُطار',
       'سَنا البِشْرِ في دُجاهُ لطُرَّاقِهِ نهارُ',
       'وعُمِّرْتَ ألفَ عيدٍ لما تأمُرُ ائتِمارُ',
       'عليها بك اغتباطٌ وفيها بكَ افتِخارُ',
       'اذا جادَ فهو غيْثٌ واِن صالَ فهو نارُ',
       'رَضِيُّ الاِمامِ ذُو الف ضْلِ والصاحب المُشار',
       'تخافُ الكماةُ منهُ كما خافتِ العِشارُ',
       'رَعابيبُ مِن نُمَيرٍ جَلابيبُها تَضوعُ',


In [8]:
model = Sequential()
model.add(Embedding(num_tokens+1, 32, input_length=max_seq_length, mask_zero=True))
model.add(Bidirectional(GRU(latent_dim, return_sequences=True,)))
model.add(Bidirectional(GRU(latent_dim, return_sequences=True,)))
model.add(Bidirectional(GRU(latent_dim, return_sequences=True,)))
model.add(Bidirectional(GRU(latent_dim)))
# model.add(Bidirectional(GRU(latent_dim, return_sequences=True,
#             dropout=0.1, recurrent_dropout=0.3),
#             merge_mode='concat'))
# model.add(Bidirectional(GRU(latent_dim, return_sequences=True,
#             dropout=0.1, recurrent_dropout=0.3),
#             merge_mode='concat'))
# model.add(Bidirectional(GRU(latent_dim,
#             dropout=0.1, recurrent_dropout=0.3),
#             merge_mode='concat'))
model.add(Dense(output_data.shape[1], activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 32)           1472      
                                                                 
 bidirectional (Bidirectiona  (None, 128, 128)         37632     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128, 128)         74496     
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 128, 128)         74496     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              74496     
 nal)                                                   

In [9]:
input_data.shape

(1657003, 128)

In [10]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

checkpoint_path = "training/cp.ckpt"
callbacks_list = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=5),
    tf.keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        save_weights_only=True,
        save_best_only=True,
        monitor='val_accuracy'),
]

test_samples = 163917

print('Train samples', len(input_texts)-test_samples)
print('Test samples', test_samples)

history = model.fit(
    input_data[test_samples:],
    output_data[test_samples:],
    batch_size=512,
    epochs=epochs,
    validation_split=0.15,
    verbose=1,
    callbacks=callbacks_list
  )

model.load_weights(checkpoint_path)

scores = model.evaluate(input_data[0:test_samples], output_data[0:test_samples])
print('Loss on the test set', scores[0])
print('Accuracy on the test set', scores[1])


Train samples 1493086
Test samples 163917
Epoch 1/100
 350/2479 [===>..........................] - ETA: 3:00 - loss: 1.9520 - accuracy: 0.3440

KeyboardInterrupt: ignored

In [None]:
labels

In [11]:
!python Classify_Poems_web.py

2.8.0
Experiment: Classify poems, batch size 64
Bach size 64
Epochs 100
Latent dim 64
Number of samples: 1657003
Max sequence length: 128
Number of tokens: 45
Number of classes: 17
2022-05-07 19:48:01.142690: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 32)           1472      
                                                                 
 bidirectional (Bidirectiona  (None, 128, 128)         49664     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128, 128)         98816     
 nal)                                                