### Importing The Needed Modules

In [1]:
import gc
import re
import pickle
import pandas

import numpy as np
import pyarabic.araby as araby
from keras.preprocessing.text import Tokenizer
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder

import tensorflow
from tensorflow.python.keras import backend
from tensorflow.python.keras.layers import Activation, Dense, Dropout
from tensorflow.python.keras.models import  Sequential
from tensorflow.python.keras.utils.np_utils import to_categorical


### Load Dataset

ar_reviews_100k.csv

In [2]:
dataset = pandas.read_csv("ar_reviews_100k.tsv", sep='\t', header=0)

### Cleaning The Reviews

In [4]:
labels = dataset['label']

#### Removing Emotions, Links, Mentions, Hashtag

In [None]:
dataset['text'] = dataset['text'].map(lambda text: re.sub(r'[^\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD]+', ' ', text).strip())

#### Removing Tashkeel

In [None]:
dataset['text'] = dataset['text'].map(lambda text: araby.strip_diacritics(text))

### Spliting The Dataset Randomly with Ratio 70%, 30% into Train Data and Test Data 

In [None]:
train_data = dataset.sample(frac=.7)
test_data = dataset.drop(labels=train_data.index)
#print(f"Train Data:{len(train_data)}")
#print(f"Test Data:{len(test_data)}")
train_reviews = train_data['text']
test_reviews = test_data['text']

### Tokenizing the Dataset

In [6]:
tokenizer = Tokenizer(num_words=None,lower=False)
tokenizer.fit_on_texts(dataset['text'])
train_reviews1 = train_reviews[:875]
train_reviews2 = train_reviews[875:1750]
train_reviews3 = train_reviews[1750:2625]
train_reviews4 = train_reviews[2625:3500]
train_reviews5 = train_reviews[3500:4375]
train_reviews6 = train_reviews[4375:5250]
train_reviews7 = train_reviews[5250:6125]
train_reviews8 = train_reviews[6125:6999]
del train_reviews
gc.collect()
tokenized_train1 = tokenizer.texts_to_matrix(train_reviews1, mode='tfidf')
tokenized_train2 = tokenizer.texts_to_matrix(train_reviews2, mode='tfidf')
tokenized_train3 = tokenizer.texts_to_matrix(train_reviews3, mode='tfidf')
tokenized_train4 = tokenizer.texts_to_matrix(train_reviews4, mode='tfidf')
tokenized_train5 = tokenizer.texts_to_matrix(train_reviews5, mode='tfidf')
tokenized_train6 = tokenizer.texts_to_matrix(train_reviews6, mode='tfidf')
tokenized_train7 = tokenizer.texts_to_matrix(train_reviews7, mode='tfidf')
tokenized_train8 = tokenizer.texts_to_matrix(train_reviews8, mode='tfidf')
test_reviews1 = test_reviews[:750]
test_reviews2 = test_reviews[750:1500]
test_reviews3 = test_reviews[1500:2250]
test_reviews4 = test_reviews[2250:3000]
del test_reviews
gc.collect()
tokenized_test1 = tokenizer.texts_to_matrix(test_reviews1, mode='tfidf')
tokenized_test2 = tokenizer.texts_to_matrix(test_reviews2, mode='tfidf')
tokenized_test3 = tokenizer.texts_to_matrix(test_reviews3, mode='tfidf')
tokenized_test4 = tokenizer.texts_to_matrix(test_reviews4, mode='tfidf')


#### Collecting The Unnecessary Variables

In [7]:
tokenized_train = np.concatenate((tokenized_train1, tokenized_train2, tokenized_train3, tokenized_train4, tokenized_train5, tokenized_train6, tokenized_train7, tokenized_train8), axis=0)
tokenized_test = np.concatenate((tokenized_test1, tokenized_test2, tokenized_test3, tokenized_test4))
del tokenized_train1, tokenized_train2, tokenized_train3, tokenized_train4, tokenized_train5, tokenized_train6, tokenized_train7, tokenized_train8
del tokenized_test1, tokenized_test2, tokenized_test3, tokenized_test4
gc.collect()

0

### Encoding The Labels

In [8]:
encoder = LabelEncoder()
encoder.fit(labels)
labelst=encoder.fit_transform(labels)
del labels
num_classes = int((len(set(labelst))))
del labelst
gc.collect()
encoded_train = encoder.fit_transform(train_data['label'])
encoded_test = encoder.fit_transform(test_data['label'])
encoded_train= to_categorical(encoded_train,num_classes)
#encoded_test = to_categorical(encoded_test, num_classes)
max_words = int(len(tokenizer.word_index) + 1)

### Defining The Confusion Matrix

In [10]:

def confusion_matrix(true, pred):
    true_positives = backend.sum(backend.round(backend.clip(true * pred, 0, 1)))
    possible_positives = backend.sum(backend.round(backend.clip(true, 0, 1)))
    predicted_positives = backend.sum(backend.round(backend.clip(pred, 0, 1)))
    precision = true_positives / (predicted_positives + backend.epsilon())
    recall = true_positives / (possible_positives + backend.epsilon())
    return 2*(precision*recall)/(precision+recall+backend.epsilon())

### Building The Model

In [11]:
model = Sequential()
model.add(Dense(1024, input_shape=(max_words)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

#### Compiling The Model

In [12]:
model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['categorical_accuracy','Recall','Precision', confusion_matrix,'TruePositives','TrueNegatives','FalsePositives','FalseNegatives'])


### Training The Model

#### Pretraining

In [13]:
tokenized_train = np.stack(tokenized_train, axis=0)
encoded_train = np.stack(encoded_train, axis=0)

#### Training

In [14]:
history = model.fit(tokenized_train,
                    encoded_train,
                    batch_size=100,
                    epochs=2,
                    verbose="auto",
                    validation_split=0.1)

Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: my_model.h1\assets


### Saving The Model

In [None]:
model.save('my_model.h1')
#del model

#### Saving The Tokenizer

In [15]:
#with open('tokenizer.pickle', 'wb') as handle:
#    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

#with open('tokenizer.pickle', 'rb') as handle:
#    tokenizer = pickle.load(handle)

##### Evaluating The Model


fe error hnaa

ValueError: Data cardinality is ambiguous: x sizes: 3000 y sizes: 30000 Make sure all arrays contain the same number of samples.

In [16]:
#model = load_model('my_model.h1')
#Evaluation_valus = model.evaluate(tokenized_test,encoded_test,verbose=0)
#print("Loss" , 'categorical_accuracy','Recall','Precision','confusion_matrix','TruePositives','TrueNegatives','FalsePositives','FalseNegatives')

#print(Evaluation_valus)


## Showcasing The Model

In [18]:
for review in test_data["text"].sample(n=10):

    tokenized_review = tokenizer.texts_to_matrix([review], mode='tfidf')

    prediction = model.predict(np.array(tokenized_review))
    predicted_class = model.predict_classes(tokenized_review)
    predicted_label = encoder.inverse_transform(predicted_class)

    print(prediction,"= \t",predicted_class,"\t",predicted_label)

[[0.6998499 0.4088995 0.4931733]] = 	 [0] 	 ['Mixed']
[[0.71525127 0.3223182  0.57689726]] = 	 [0] 	 ['Mixed']
[[0.86055386 0.21786037 0.75085783]] = 	 [0] 	 ['Mixed']
[[0.47947097 0.99509275 0.00829816]] = 	 [1] 	 ['Negative']
[[0.436921   0.31989893 0.91008186]] = 	 [2] 	 ['Positive']
[[0.33936447 0.83103275 0.49918267]] = 	 [1] 	 ['Negative']
[[0.0725103  0.61435807 0.924398  ]] = 	 [2] 	 ['Positive']
[[0.9773632  0.93890184 0.22163373]] = 	 [0] 	 ['Mixed']
[[0.33662146 0.7703366  0.31385532]] = 	 [1] 	 ['Negative']
[[0.47588736 0.49595106 0.48975328]] = 	 [1] 	 ['Negative']
