<a href="https://colab.research.google.com/github/Mohamed2bdelaziz/NLP/blob/main/TF_RNN_TEXT_GENERATION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install wikipedia

## 1. Imports

In [188]:
import wikipedia as wiki
import re
from tqdm import tqdm
import numpy as np

from nltk import RegexpTokenizer
import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation, Embedding
from tensorflow.keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping

from sklearn.metrics.pairwise import cosine_similarity




In [3]:
# download the pretrained glove models embeddings
!wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
!unzip -q glove.6B.zip

--2024-05-10 13:14:44--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2024-05-10 13:17:24 (5.18 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



## 2. Creating text data from Wikipedia api

In [4]:
def get_these_topics(
    topics : list,
    language : str = 'en'
    ) -> list:
  wiki.set_lang(language)
  texts = list()
  for topic in topics:
    try:
      topic_page = wiki.page(topic)
      texts.append(topic_page.content)
    except Exception as e:
      print(f"No page of '{topic}' topic was found")
  return texts

In [5]:
topics = [
    "Egypt",
    "History of modern Egypt",
    "Mamluk Sultanate",
    "Fatimid Caliphate",
    "Arab conquest of Egypt",
    "Ancient Egypt"
]

docs = get_these_topics(topics)

In [6]:
print("Our DOCs words lenght are:\n")
{topic: len(doc.split()) for topic, doc in zip(topics, docs)}

Our DOCs words lenght are:



{'Egypt': 14089,
 'History of modern Egypt': 5005,
 'Mamluk Sultanate': 16697,
 'Fatimid Caliphate': 12972,
 'Arab conquest of Egypt': 5289,
 'Ancient Egypt': 12314}

## 3. Preprocessing

In [7]:
def preprocess(
    document : str
    ) -> str:
  document = re.sub(r'\s+', ' ', document, flags=re.I) # Remove extra white space from text
  document = re.sub(r'\W', ' ', str(document)) # Remove all the special characters from text
  # document = re.sub(r'\d', ' ', str(document)) # Remove all the digits from text
  document = re.sub(r'\s+[A-z]\s+', ' ', document) # Remove all single characters from text
  document = re.sub(r'\s+', ' ', document, flags=re.I) # Remove extra white space from text
  document = document.lower() # Converting to Lowercase
  return document

In [8]:
preprocess("أحمد ضرب عمرو 1 2 3 4 5Ahmed hit Omar ...s   $##@    r")

'أحمد ضرب عمرو 1 2 3 4 5ahmed hit omar r'

In [9]:
processed_texts = [preprocess(doc) for doc in docs]
processed_text = str(sum([processed_texts], [])[0])
processed_text[:999]

'egypt arabic مصر miṣr mesˁr egyptian arabic pronunciation mɑsˤr officially the arab republic of egypt is transcontinental country spanning the northeast corner of africa and the sinai peninsula in the southwest corner of asia it is bordered by the mediterranean sea to the north the gaza strip of palestine and israel to the northeast the red sea to the east sudan to the south and libya to the west the gulf of aqaba in the northeast separates egypt from jordan and saudi arabia cairo is the capital and largest city of egypt while alexandria the second largest city is an important industrial and tourist hub at the mediterranean coast at approximately 100 million inhabitants egypt is the 14th most populated country in the world and the third most populated in africa egypt has one of the longest histories of any country tracing its heritage along the nile delta back to the 6th 4th millennia bce considered cradle of civilisation ancient egypt saw some of the earliest developments of writing 

## 4. Tokenization & Vectorization

In [10]:
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(processed_text.lower())
len(tokens)

14038

In [11]:
vectorizer = keras.layers.TextVectorization(max_tokens=10000, output_sequence_length=200)
vectorizer.adapt(np.array(processed_texts))


In [12]:
len(vectorizer.get_vocabulary()), *vectorizer.get_vocabulary()[30:40]

(9713,
 'this',
 'also',
 'mamluks',
 'new',
 'who',
 'after',
 'it',
 'its',
 'they',
 'cairo')

In [13]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

# word_index

In [14]:
word_index['ali'], word_index['egypt'], word_index['islam']

(236, 8, 152)

## 5. Creating Data Sequences

In [92]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 10
n_words = len(tokens)

dataX = []
dataY = []

for i in tqdm(range(0, n_words - seq_length, 1)):
    seq_in = tokens[i:i + seq_length]
    seq_out = tokens[i + seq_length]
    dataX.append(seq_in)
    dataY.append([seq_out])

n_seqs = len(dataX)
print("\nTotal Sequences: ", n_seqs)

100%|██████████| 14028/14028 [00:00<00:00, 247924.53it/s]


Total Sequences:  14028





In [93]:
# X = np.zeros((len(dataX), n_words, len(voc)), dtype=bool)  # for each sample, n input words and then a boolean for each possible next word
# y = np.zeros((len(dataY), len(voc)), dtype=bool)  # for each sample a boolean for each possible next word

In [94]:
# for i, words in enumerate(dataX):
#     for j, word in enumerate(words):
#         X[i, j, word_index[word]] = 1
#     y[i, word_index[dataY[i][0]]] = 1

In [95]:
path_to_glove_file = "glove.6B.300d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    print("loading glove embeddings..")
    for line in tqdm(f.readlines()):
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

loading glove embeddings..


100%|██████████| 400000/400000 [00:24<00:00, 16351.40it/s]


Found 400000 word vectors.


In [96]:
num_tokens = len(voc) + 2
embedding_dim = 300
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 8913 words (800 misses)


In [97]:
X = np.zeros((len(dataX), seq_length))
Y = np.zeros((len(dataY), 300))

for i, seq in enumerate(dataX):
  for j, word in enumerate(seq):
    X[i, j] = word_index[word]
  Y[i] = embedding_matrix[word_index[dataY[i][0]]]

## 6. RNN Model Building

In [242]:
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    trainable=False,
)
embedding_layer.build((1,))
embedding_layer.set_weights([embedding_matrix])

model = Sequential()
model.add(embedding_layer)
model.add(LSTM(128, input_shape=(n_words, len(voc)), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(100, activation='relu'))
model.add(Dense(300))
# model.add(Activation("softmax"))

model.summary()

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_25 (Embedding)    (None, None, 300)         2914500   
                                                                 
 lstm_49 (LSTM)              (None, None, 128)         219648    
                                                                 
 lstm_50 (LSTM)              (None, 128)               131584    
                                                                 
 dense_50 (Dense)            (None, 100)               12900     
                                                                 
 dense_51 (Dense)            (None, 300)               30300     
                                                                 
Total params: 3308932 (12.62 MB)
Trainable params: 394432 (1.50 MB)
Non-trainable params: 2914500 (11.12 MB)
_________________________________________________________________


## 7. Model compiling & training

In [243]:
optimizer = Adam(learning_rate=0.01)
model.compile(loss="cosine_similarity", optimizer=optimizer, metrics=["accuracy"])


In [244]:
# early_stopping = EarlyStopping(monitor='val_loss', patience=5)

history = model.fit(
    X, Y,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
    # callbacks=[early_stopping],
    shuffle=True
).history

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [245]:
def get_nearest_word_vec(
    vec,
    embedding_matrix = embedding_matrix,
    voc = voc
    ):
  sims = cosine_similarity(embedding_matrix, vec)
  nearest_word_idx = np.argmax(sims)
  return nearest_word_idx, voc[nearest_word_idx]


In [246]:
def get_next_word(
    sent,
    model = model
):
  sent = preprocess(sent)
  tokened_sent = tokenizer.tokenize(sent)
  vec_text = np.array([word_index[word] for word in tokened_sent]).reshape(1, -1)
  if vec_text.shape[1] == seq_length:
    diff = seq_length - vec_text.shape[1]
    vec_text = np.concatenate([[0]*diff, vec_text.reshape(-1)]).reshape(1, -1)
  predicted_next_word = model.predict(vec_text, verbose=0)
  return get_nearest_word_vec(predicted_next_word)

In [247]:
def generate(
    sent,
    max_tokens = 100
):
  acc_text = sent
  for x in range(max_tokens):
    _, next_word = get_next_word(" ".join(acc_text.split()[-30:]))
    acc_text += " "+next_word
  return acc_text

In [254]:
text = "egypt has one of the longest histories of any country tracing its heritage along the nile delta back to the 6th 4th millennia bce considered cradle of civilisation ancient egypt"

generate(text)

'egypt has one of the longest histories of any country tracing its heritage along the nile delta back to the 6th 4th millennia bce considered cradle of civilisation ancient egypt when of the earliest country in east early century egypt since the century egypt in the east area the century dynasty area the century egypt was but to the but of the century ottoman empire in average but to to but have since the in predominantly muslims in egypt and the but since egypt as part this its country in the one years which as country which established which established its development after increased agriculture to well and gas these especially areas and sector result because was increase in increase in increase in increase in 2007 month month according that'

In [253]:
# egypt has one of the longest histories of any country tracing its heritage along the nile delta back to the 6th 4th millennia bce
# considered cradle of civilisation ancient egypt when of the earliest country in east early century egypt since the century egypt in
# the east area the century dynasty area the century egypt was but to the but of the century ottoman empire in average but to to but
# have since the in predominantly muslims in egypt and the but since egypt as part this its country in the one years which as country
# which established which established its development after increased agriculture to well and gas these especially areas and sector
# result because was increase in increase in increase in increase in 2007 month month according that

In [249]:
model.save("RNN_Model.h5")

  saving_api.save_model(
