# Importing Libs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from numpy import array
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import to_categorical

import gensim

# Loading Dataset

In [2]:
dataset = pd.read_csv('dataset.tsv', sep='\t',header=0 ,encoding="utf-8",
                      names=['rating', 'review_id', 'user_id', 'book_id', 'review'])

In [3]:
dataset.head()

Unnamed: 0,rating,review_id,user_id,book_id,review
0,4,39428407,1775679,3554772,من أمتع ما قرأت من روايات بلا شك. وحول الشك ت...
1,4,32159373,1304410,3554772,رواية تتخذ من التاريخ ،جوًا لها اختار المؤلف ...
2,1,442326656,11333112,3554772,إني أقدّر هذه الرواية كثيرا، لسبب مختلف عن أس...
3,5,46492258,580165,3554772,الكاهن الذي أطلق على نفسه اسم هيبا تيمنا بالع...
4,5,25550893,1252226,3554772,"""عزازيل"" هو اسم رواية يوسف زيدان الثانية و ال..."


In [4]:
dataset = dataset.loc[:, ['review', 'rating']]

# Some preprocessing 

In [5]:
def normalizeArabic(text):
    text = text.strip()
    text = re.sub(r"[إأٱآا]", "ا", text)
    text = re.sub(r"ى", "ي", text)
    text = re.sub(r"ة","ه", text)
    text = re.sub(r"[0-9]|[!؟،؛,-_]", " ", text)
    text = re.sub(r'"', " ", text)
    text = re.sub(r'[\(\)]', " ", text)
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    
    text = re.sub(noise, '', text)
    text = text.strip()
    return ''.join(text)

## Applying preprocessing

In [6]:
dataset['review'] = dataset['review'].apply(normalizeArabic)

In [65]:
dataset['review'][0]

'من امتع ما قرات من روايات بلا شك  وحول الشك تدندن  عزازيل  بلا هواده  احمد الديب'

# Making ratings from 0 to 4 instead 1 to 5

In [8]:
dataset['rating'] = dataset['rating'] - 1

In [9]:
y = to_categorical(dataset['rating'], 5)
y

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.]], dtype=float32)

# Squeezing reviews

In [10]:
reviews = [review for review in dataset['review']]

## one hot encoding for each sentence

In [11]:
t = Tokenizer()
t.fit_on_texts(reviews)

In [12]:
vocab_size = len(t.word_index)+1
print(vocab_size)
# integer encode the documents
encoded_docs = t.texts_to_sequences(reviews)


34465
3112


In [61]:
t.texts_to_sequences(inpt.split())

[[2],
 [4133],
 [7],
 [53],
 [2],
 [444],
 [293],
 [648],
 [4134],
 [474],
 [8939],
 [27],
 [293],
 [6542],
 [1384],
 [3497]]

In [63]:
print(encoded_docs[0])

[2, 4125, 7, 53, 2, 442, 288, 641, 4126, 469, 8933, 27, 288, 6536, 1374, 3488]


## This code is to know the maximum review in length ,so that I know the padding nomber

In [13]:
max([len(sublist.split()) for sublist in reviews]) 

3085

In [14]:
max_length = 3085   
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# Loading aravec

In [15]:
aravec = gensim.models.Word2Vec.load('full_grams_cbow_100_wiki/full_grams_cbow_100_wiki.mdl')

In [19]:
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    try:
        embedding_vector = aravec.wv.get_vector(word)
        embedding_matrix[i] = embedding_vector
        
    except KeyError:
        continue


In [20]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 2.15736341,  3.08792496,  1.72121239, ...,  0.17832963,
        -0.75964814,  2.01433587],
       [ 1.71386635,  2.76462722, -0.12731455, ..., -0.10195694,
        -0.44580469,  4.97124434],
       ...,
       [ 1.52720261,  0.99132931,  0.06829713, ..., -0.78637141,
         2.80833316, -1.39495456],
       [-0.08024251, -0.13364261, -0.04714713, ...,  0.54665828,
         0.28098759,  0.15865459],
       [-0.20780919, -0.25758916,  0.38040265, ..., -0.17234182,
        -1.1358428 , -0.08691123]])

# Splitting
## Note shuffling the data is not correct here

In [21]:
x_train , y_train = padded_docs[:2500] , y[:2500]
x_test , y_test = padded_docs[2500:] , y[2500:]

In [22]:
x_train

array([[    2,  4125,     7, ...,     0,     0,     0],
       [   14, 14138,     2, ...,     0,     0,     0],
       [   59,  1981,    16, ...,     0,     0,     0],
       ...,
       [    8,   568,   143, ...,     0,     0,     0],
       [  132,    40,  3988, ...,     0,     0,     0],
       [ 1935,     3,  4345, ...,     0,     0,     0]], dtype=int32)

# Feature extraction Using Embedding Layer and building the model

In [24]:
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)
# Before embedding each sentence was vector of 3086 dimension, after it each sentence was vector of 100 dim
model.add(Flatten())
model.add(Dense(5, activation='softmax'))
model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])

In [25]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3085, 100)         3446500   
_________________________________________________________________
flatten (Flatten)            (None, 308500)            0         
_________________________________________________________________
dense (Dense)                (None, 5)                 1542505   
Total params: 4,989,005
Trainable params: 1,542,505
Non-trainable params: 3,446,500
_________________________________________________________________


# Training the model

In [27]:
model.fit(x_train, y_train, epochs=15, validation_data=(x_test, y_test) ,verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f9eb8e01110>

# The Generalization of the model on the test data is bad because of unbalancing of the data ,and trying SMOTE here is meaningless because the representation of the sentence is random because Tokenizer function give each word in each sentence a random number.

In [381]:
model.save('Arabic Book Review with random-initialized Embedding layer.h5')

# Test Case

In [78]:
# Another test case nt found in the reviews
inpt = 'زفت'
inpt = normalizeArabic(inpt)
encoded_inpt = [one_hot(inpt, vocab_size)]
padded_inpt = pad_sequences(encoded_inpt, maxlen=max_length, padding='post')
model.predict(padded_inpt)

array([[0.17439269, 0.14762722, 0.20788811, 0.18030788, 0.2897841 ]],
      dtype=float32)

# As I said previously the Generalization of the model on the test data is bad because of the unbalancing in the data .