# Import Software Libraries

In [1]:
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
import random



# Load the Quarn Dataset

In [2]:
corpus = open('./Corpus/A/A/aa1.txt', encoding='utf-8').read()

# Tokenize the Words

In [3]:
tokens = word_tokenize(corpus)

# Generate N-gram models

In [4]:
n = 6
n_grams = []
for i in range(2, n+1):
    n_grams.extend(ngrams(tokens, i))

# Count the frequency of words

In [5]:
# Count the frequency of each N-gram
ngrams_freq = Counter(n_grams)

# Write the most 10 frequent words on a text file (without removing stop words)

In [6]:
# Get the 20 most frequent tokens
top_10_freq_with_sw = ngrams_freq.most_common(10)

with open('top_10_freq_with_sw.txt', 'w', encoding='utf-8') as f:
    for word, freq in top_10_freq_with_sw:
        f.write(' '.join(word) + '\t' + str(freq) + '\n')
        
f = open('top_10_freq_with_sw.txt', encoding='utf-8').read()
print(f)

إن الله	205
الذين آمنوا	184
في الأرض	176
يا أيها	142
الذين كفروا	134
السماوات والأرض	133
الرحمن الرحيم	118
الله الرحمن	114
بسم الله	114
الله الرحمن الرحيم	114



# The method of removing the stop words

In [7]:
stop_words = set(nltk.corpus.stopwords.words('arabic'))

new_tokens = []

for token in tokens:
    if token not in stop_words:
        new_tokens.append(token)

# Reprocess the same process with removing stop words

In [8]:
n_grams = []
for i in range(1, n):
    n_grams.extend(ngrams(new_tokens, i))

ngrams_freq_wo_sw = Counter(n_grams)

top_10_freq_without_sw = ngrams_freq_wo_sw.most_common(10)

with open('top_10_freq_without_sw.txt', 'w', encoding='utf-8') as f:
    for word, freq in top_10_freq_without_sw:
        f.write(' '.join(word) + '\t' + str(freq) + '\n')

# The most 10 frequent words with removing the stop words

In [9]:
f = open('top_10_freq_without_sw.txt', encoding='utf-8').read()
print(f)

الله	2265
قال	416
قل	294
الأرض	287
آمنوا	263
قالوا	250
والله	240
كانوا	229
ربك	220
يوم	217



In [10]:
# Generate text using N-gram models
num_tokens = int(input('Enter the number of tokens: '))
start_word = input('Enter the start word: ')

Enter the number of tokens: 10
Enter the start word: الحمد


In [11]:
prediction = [start_word]

while len(prediction) < num_tokens:
    
    c = prediction[-(n-1):]
    
    if tuple(c) in ngrams_freq:
        
        while True:
            try :
                next_token = random.choices(
                    list(ngrams_freq_wo_sw[tuple(c)].keys()),
                    weights=ngrams_freq_wo_sw[tuple(c)].values()
                )[0]
                break
            except :
                pass
        
        prediction.append(next_token)
        
    else:
        next_token = random.choice(tokens)
        prediction.append(next_token)

s = ' '.join(prediction)
print(s)

الحمد موسى والله بيني المنافقون قال يكسبون وزنوا سم الرياح


# Now, let's use the same methods with different inputs to evaluate and test the outputs.

In [12]:
lst_s_words = [
    'الحمد',
    'محمد',
    'الله',
    'موسى',
    'الم',
    'الجنة',
    'المسلمون',
    'عيسى'
]

table = {
    'Sample_Test' : [],
    'Num_of_Words' : [],
    'Start_Word' : [],
    'Predicted_Text' : []
}

n = 6
sample_test = 1

for num in range(6, 10+1):
    for s_word in lst_s_words:
        
        num_tokens = num
        start_word = s_word
        prediction = [start_word]

        while len(prediction) < num_tokens:
            
            c = prediction[-(n-1):]
            
            if tuple(c) in ngrams_freq:
                while True:
                    try :
                        new_token = random.choices(
                            list(ngrams_freq_wo_sw[tuple(c)].keys()),
                            weights=ngrams_freq_wo_sw[tuple(c)].values()
                        )[0]
                        break
                    except :
                        new_token = random.choice(tokens)
                        break
                    
                prediction.append(new_token)
                
            else:
                next_token = random.choice(tokens)
                prediction.append(next_token)

        table['Sample_Test'].append(sample_test)
        sample_test += 1
        table['Num_of_Words'].append(num)
        table['Start_Word'].append(start_word)
        text = ' '.join(w for w in prediction)
        table['Predicted_Text'].append(text)
        
        print(f'Sample {num_tokens} and {start_word} done.')

Sample 6 and الحمد done.
Sample 6 and محمد done.
Sample 6 and الله done.
Sample 6 and موسى done.
Sample 6 and الم done.
Sample 6 and الجنة done.
Sample 6 and المسلمون done.
Sample 6 and عيسى done.
Sample 7 and الحمد done.
Sample 7 and محمد done.
Sample 7 and الله done.
Sample 7 and موسى done.
Sample 7 and الم done.
Sample 7 and الجنة done.
Sample 7 and المسلمون done.
Sample 7 and عيسى done.
Sample 8 and الحمد done.
Sample 8 and محمد done.
Sample 8 and الله done.
Sample 8 and موسى done.
Sample 8 and الم done.
Sample 8 and الجنة done.
Sample 8 and المسلمون done.
Sample 8 and عيسى done.
Sample 9 and الحمد done.
Sample 9 and محمد done.
Sample 9 and الله done.
Sample 9 and موسى done.
Sample 9 and الم done.
Sample 9 and الجنة done.
Sample 9 and المسلمون done.
Sample 9 and عيسى done.
Sample 10 and الحمد done.
Sample 10 and محمد done.
Sample 10 and الله done.
Sample 10 and موسى done.
Sample 10 and الم done.
Sample 10 and الجنة done.
Sample 10 and المسلمون done.
Sample 10 and عيسى done.


In [13]:
import pandas as pd

table = pd.DataFrame(table)
table

Unnamed: 0,Sample_Test,Num_of_Words,Start_Word,Predicted_Text
0,1,6,الحمد,الحمد ومنكم كبيرا ومن خالدين قوم
1,2,6,محمد,محمد شهود يعصينك صلاتهم يطعمني ربي
2,3,6,الله,الله جاءتهم تستعجلون ذلك وكن لنذهبن
3,4,6,موسى,موسى وأرسلنا فمن آيتين ما به
4,5,6,الم,الم بشرى وعلمك وإذا وقيل ربه
5,6,6,الجنة,الجنة يشاء لعلكم بما به عاما
6,7,6,المسلمون,المسلمون لي لهم بيوتكن كتب ما
7,8,6,عيسى,عيسى كذبوا بسم وقومه استكبروا من
8,9,7,الحمد,الحمد أكل ومن لنا من لسان رأوها
9,10,7,محمد,محمد اليمين واذكروا فإن ويوم إنهم الرعب


# Unnecessary code, just for saving the table.

In [20]:
table.to_csv('sample_examples_table.csv')