### Preparating Logger

In [1]:
import logging
logging.basicConfig(
            format='%(name)s - %(levelname)s: %(message)s', level=logging.DEBUG)
logger = logging.getLogger("nlp_practical_exam")

### Importing The Needed Modules

In [2]:
import re
import gc
import pickle

try:
    import pandas
except (ImportError, ModuleNotFoundError) as ex:
    logger.error("Module pandas not found")
    raise ex("Module pandas not found") from ex

try:
    import numpy as np
except (ImportError, ModuleNotFoundError) as ex:
    logger.error("Module numpy not found")
    raise ex("Module numpy not found") from ex

try:
    import pyarabic.araby as araby
except (ImportError, ModuleNotFoundError) as ex:
    logger.error("Module pyarabic not found")
    raise ex("Module pyarabic not found") from ex

try:
    from sklearn.preprocessing import LabelEncoder
except (ImportError, ModuleNotFoundError) as ex:
    logger.error("Module sklearn not found")
    raise ex("Module sklearn not found") from ex

try:
    from keras.preprocessing.text import Tokenizer
    from keras.models import load_model
except (ImportError, ModuleNotFoundError) as ex:
    logger.error("Module keras not found")
    raise ex("Module keras not found") from ex

try:
    import tensorflow
    from tensorflow.python.keras import backend
    from tensorflow.python.keras.layers import Activation, Dense, Dropout
    from tensorflow.python.keras.models import Sequential
    from tensorflow.python.keras.utils.np_utils import to_categorical
except (ImportError, ModuleNotFoundError) as ex:
    logger.error("Module tensorflow not found")
    raise ex("Module tensorflow not found") from ex


tensorflow - DEBUG: Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
h5py._conv - DEBUG: Creating converter from 7 to 5
h5py._conv - DEBUG: Creating converter from 5 to 7
h5py._conv - DEBUG: Creating converter from 7 to 5
h5py._conv - DEBUG: Creating converter from 5 to 7


### Load Dataset

ar_reviews_100k.csv

In [3]:
logger.info("Loading dataset")
dataset = pandas.read_csv("ar_reviews_100k.tsv", sep='\t', header=0)

nlp_practical_exam - INFO: Loading dataset


### Cleaning The Reviews

In [4]:
logger.info("Preprocessing dataset")
labels = dataset['label']

nlp_practical_exam - INFO: Preprocessing dataset


#### Removing Emojis, Links, Mentions, Hashtag

In [5]:
logger.info("Removing emojis, links, mentions and hashtags")
dataset['text'] = dataset['text'].map(lambda text: re.sub(r'[^\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD]+', ' ', text).strip())

nlp_practical_exam - INFO: Removing emojis, links, mentions and hashtags


#### Removing Tashkeel

In [6]:
logger.info("Removing tashkel")
dataset['text'] = dataset['text'].map(lambda text: araby.strip_diacritics(text))

nlp_practical_exam - INFO: Removing tashkel


### Spliting The Dataset Randomly with Ratio 70%, 30% into Train Data and Test Data 

In [7]:
logger.info("Splitting dataset")
train_data = dataset.sample(frac=.7)
test_data = dataset.drop(labels=train_data.index)
#print(f"Train Data:{len(train_data)}")
#print(f"Test Data:{len(test_data)}")
logger.info("Extracting reviews")
train_reviews = train_data['text']
test_reviews = test_data['text']

nlp_practical_exam - INFO: Splitting dataset
nlp_practical_exam - INFO: Extracting reviews


### Tokenizing the Dataset

In [8]:
logger.info("Tokenizing reviews")
tokenizer = Tokenizer(num_words=None,lower=False)
tokenizer.fit_on_texts(dataset['text'])
train_reviews1 = train_reviews[:875]
train_reviews2 = train_reviews[875:1750]
train_reviews3 = train_reviews[1750:2625]
train_reviews4 = train_reviews[2625:3500]
train_reviews5 = train_reviews[3500:4375]
train_reviews6 = train_reviews[4375:5250]
train_reviews7 = train_reviews[5250:6125]
train_reviews8 = train_reviews[6125:6999]
logger.info("Collecting garbage")
del train_reviews
gc.collect()
tokenized_train1 = tokenizer.texts_to_matrix(train_reviews1, mode='tfidf')
tokenized_train2 = tokenizer.texts_to_matrix(train_reviews2, mode='tfidf')
tokenized_train3 = tokenizer.texts_to_matrix(train_reviews3, mode='tfidf')
tokenized_train4 = tokenizer.texts_to_matrix(train_reviews4, mode='tfidf')
tokenized_train5 = tokenizer.texts_to_matrix(train_reviews5, mode='tfidf')
tokenized_train6 = tokenizer.texts_to_matrix(train_reviews6, mode='tfidf')
tokenized_train7 = tokenizer.texts_to_matrix(train_reviews7, mode='tfidf')
tokenized_train8 = tokenizer.texts_to_matrix(train_reviews8, mode='tfidf')
test_reviews1 = test_reviews[:750]
test_reviews2 = test_reviews[750:1500]
test_reviews3 = test_reviews[1500:2250]
test_reviews4 = test_reviews[2250:3000]
logger.info("Collecting garbage")
del test_reviews
gc.collect()
tokenized_test1 = tokenizer.texts_to_matrix(test_reviews1, mode='tfidf')
tokenized_test2 = tokenizer.texts_to_matrix(test_reviews2, mode='tfidf')
tokenized_test3 = tokenizer.texts_to_matrix(test_reviews3, mode='tfidf')
tokenized_test4 = tokenizer.texts_to_matrix(test_reviews4, mode='tfidf')


nlp_practical_exam - INFO: Tokenizing reviews
nlp_practical_exam - INFO: Collecting garbage
nlp_practical_exam - INFO: Collecting garbage


#### Collecting The Unnecessary Variables

In [9]:
logger.info("Collecting tokenized reviews")
tokenized_train = np.concatenate((tokenized_train1, tokenized_train2, tokenized_train3, tokenized_train4, tokenized_train5, tokenized_train6, tokenized_train7, tokenized_train8), axis=0)
tokenized_test = np.concatenate((tokenized_test1, tokenized_test2, tokenized_test3, tokenized_test4))
logger.info("Collecting garbage")
del tokenized_train1, tokenized_train2, tokenized_train3, tokenized_train4, tokenized_train5, tokenized_train6, tokenized_train7, tokenized_train8
del tokenized_test1, tokenized_test2, tokenized_test3, tokenized_test4
gc.collect()

nlp_practical_exam - INFO: Collecting tokenized reviews
nlp_practical_exam - INFO: Collecting garbage


0

In [10]:
logger.info("Encoding labels")
encoder = LabelEncoder()
encoder.fit(labels)
labelst=encoder.fit_transform(labels)
logger.info("Collecting garbage")
del labels
num_classes = int((len(set(labelst))))
logger.info("Collecting garbage")
del labelst
gc.collect()
encoded_train = encoder.fit_transform(train_data['label'])
encoded_test = encoder.fit_transform(test_data['label'])
encoded_train= to_categorical(encoded_train,num_classes)
#encoded_test = to_categorical(encoded_test, num_classes)
num_labels = int(len(encoded_train.shape))
max_words = len(tokenizer.word_index) + 1
logger.info(f"Max words: {max_words}")

nlp_practical_exam - INFO: Encoding labels
nlp_practical_exam - INFO: Collecting garbage
nlp_practical_exam - INFO: Collecting garbage
nlp_practical_exam - INFO: Max words: 341256


### Encoding The Labels

### Defining The Confusion Matrix

In [11]:
def confusion_matrix(true, pred):
    """Compute confusion matrix to evaluate the accuracy of a classification."""
    logger.info("Computing confusion matrix")
    true_positives = backend.sum(backend.round(backend.clip(true * pred, 0, 1)))
    possible_positives = backend.sum(backend.round(backend.clip(true, 0, 1)))
    predicted_positives = backend.sum(backend.round(backend.clip(pred, 0, 1)))
    precision = true_positives / (predicted_positives + backend.epsilon())
    recall = true_positives / (possible_positives + backend.epsilon())
    return 2*(precision*recall)/(precision+recall+backend.epsilon())

### Building The Model

In [12]:
logger.info("Building model")
model = Sequential()
model.add(Dense(1024, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

nlp_practical_exam - INFO: Building model


#### Compiling The Model

In [13]:
model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['categorical_accuracy','Recall','Precision', confusion_matrix,'TruePositives','TrueNegatives','FalsePositives','FalseNegatives'])


### Training The Model

#### Pretraining

In [14]:
#logger.info("Stacking tokenized reviews")
#tokenized_train = np.stack(tokenized_train, axis=0)
#encoded_train = np.stack(encoded_train, axis=0)

#### Training

In [15]:
logger.info("Training model")
history = model.fit(tokenized_train,
                    encoded_train,
                    batch_size=100,
                    epochs=1,
                    verbose="auto",
                    validation_split=0.1)
logger.info("Done training")

nlp_practical_exam - INFO: Training model
nlp_practical_exam - INFO: Computing confusion matrix
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


nlp_practical_exam - INFO: Computing confusion matrix




nlp_practical_exam - INFO: Computing confusion matrix




nlp_practical_exam - INFO: Done training


### Saving The Model

In [16]:
logger.info("Saving model")
model.save('my_model.h1')
#del model

nlp_practical_exam - INFO: Saving model
tensorflow - INFO: Assets written to: my_model.h1\assets


#### Saving The Tokenizer

In [17]:
#with open('tokenizer.pickle', 'wb') as handle:
#    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

#with open('tokenizer.pickle', 'rb') as handle:
#    tokenizer = pickle.load(handle)

##### Evaluating The Model


fe error hnaa

ValueError: Data cardinality is ambiguous: x sizes: 3000 y sizes: 30000 Make sure all arrays contain the same number of samples.

In [18]:
# logger.info("Loading model")
# model = load_model('my_model.h1')
# logger.info("Evaluating model")
#Evaluation_valus = model.evaluate(tokenized_test,encoded_test,verbose=0)
#print("Loss" , 'categorical_accuracy','Recall','Precision','confusion_matrix','TruePositives','TrueNegatives','FalsePositives','FalseNegatives')

#print(Evaluation_valus)


## Showcasing The Model

In [19]:
logger.info("Predicting random samples")
for review in test_data["text"].sample(n=10):

    tokenized_review = tokenizer.texts_to_matrix([review], mode='tfidf')

    prediction = model.predict(np.array(tokenized_review))
    predicted_class = model.predict_classes(tokenized_review)
    predicted_label = encoder.inverse_transform(predicted_class)

    print(prediction,"= \t",predicted_class,"\t",predicted_label)

nlp_practical_exam - INFO: Predicting random samples


[[0.5315839  0.9630507  0.05457997]] = 	 [1] 	 ['Negative']
[[0.6127333  0.33929932 0.66454184]] = 	 [2] 	 ['Positive']
[[0.76555943 0.09309551 0.94558156]] = 	 [2] 	 ['Positive']
[[0.93135476 0.284856   0.36489373]] = 	 [0] 	 ['Mixed']
[[0.8375083  0.2819291  0.47313377]] = 	 [0] 	 ['Mixed']
[[0.44984105 0.26952165 0.8521984 ]] = 	 [2] 	 ['Positive']
[[0.57763714 0.5156293  0.46252084]] = 	 [0] 	 ['Mixed']
[[0.65992033 0.29209715 0.72803265]] = 	 [2] 	 ['Positive']
[[0.83567715 0.39372626 0.6170634 ]] = 	 [0] 	 ['Mixed']
[[0.4468559 0.9111206 0.1715954]] = 	 [1] 	 ['Negative']
