# Boolean

In [1]:
# %% import section
from boolean_retrieval.ir_system import IRSystem
from preprocess_quran_text import verse_complete_dict_nrmlz, verse_lemma_dict_nrmlz, \
    verse_root_dict_nrmlz, verse_complete_dict
import pandas as pd

# %% initialize IR system
# `docs` holds the values of `verse_complete_dict`; it is what gets displayed
# for a hit. The three `*_nrmlz` lists are what the IR systems index.
# NOTE(review): assumes all four dicts iterate in the same verse order so that
# a position returned by any index can be looked up in `docs` - confirm.
docs = list(verse_complete_dict.values())
docs_complete = list(verse_complete_dict_nrmlz.values())
docs_lemma = list(verse_lemma_dict_nrmlz.values())
docs_root = list(verse_root_dict_nrmlz.values())

# One boolean index per representation of the text (full form, lemma, root).
boolean_ir_complete = IRSystem(docs_complete)
boolean_ir_lemma = IRSystem(docs_lemma)
boolean_ir_root = IRSystem(docs_root)

# %%
k = 10  # maximum number of verses reported per query

# The queries are Arabic text: pin the encoding so the file is read the same
# way regardless of the platform's default locale encoding.
with open('./queries_boolean.txt', encoding='utf-8') as f:
    # One query per line, each a whitespace-separated list of terms.
    # Blank lines are skipped so they do not turn into empty queries.
    queries = [line.split() for line in f if line.strip()]

rows = []
for query_no, query in enumerate(queries, start=1):
    # Copy the first hit list instead of extending it in place: the list
    # returned by process_query may be owned by the IR system.
    ranked = list(boolean_ir_complete.process_query(query, "complete"))
    # Hits are used as positions in `docs`, so they are hashable and a set
    # gives constant-time duplicate checks while `ranked` keeps the order.
    seen = set(ranked)
    # Exact-form hits come first, then lemma hits, then root hits.
    for fallback_ir, mode in ((boolean_ir_lemma, "lemma"), (boolean_ir_root, "root")):
        for doc_id in fallback_ir.process_query(query, mode):
            if doc_id not in seen:
                seen.add(doc_id)
                ranked.append(doc_id)

    rows.append('q{} = "{}"'.format(query_no, ' '.join(query)))
    # Only the first k hits are displayed, so only those are looked up.
    rows.extend(docs[doc_id] for doc_id in ranked[:k])

results = pd.DataFrame(rows)
results

100%|██████████| 6236/6236 [00:00<00:00, 8459.64it/s]
100%|██████████| 6236/6236 [00:00<00:00, 12380.14it/s]
100%|██████████| 6236/6236 [00:00<00:00, 8873.18it/s]


Unnamed: 0,0
0,"q1 = ""سبیل"""
1,وَ لَا تَقُولُوا لِمَنْ يُقْتَلُ فِي سَبِيلِ ا...
2,وَ قَاتِلُوا فِي سَبِيلِ اللَّهِ الَّذِينَ يُق...
3,وَ أَنْفِقُوا فِي سَبِيلِ اللَّهِ وَ لَا تُلْق...
4,يَسْأَلُونَكَ عَنِ الشَّهْرِ الْحَرَامِ قِتَال...
...,...
86,وَ مَا يَأْتِيهِمْ مِنْ نَبِيٍّ إِلَّا كَانُوا...
87,وَ إِذْ قُلْتُمْ يَا مُوسَىٰ لَنْ نَصْبِرَ عَل...
88,وَ إِذَا قِيلَ لَهُمْ آمِنُوا بِمَا أَنْزَلَ ا...
89,قُولُوا آمَنَّا بِاللَّهِ وَ مَا أُنْزِلَ إِلَ...


# TF-IDF

In [2]:
from preprocess_quran_text import quran_series, quran_normalizer
from tfidf_vectorizer import get_most_similars
import pandas as pd

# %%
# The queries are Arabic text: pin the encoding so the file is read the same
# way regardless of the platform's default locale encoding.
with open('./queries.txt', encoding='utf-8') as f:
    # One query per line; blank lines are skipped.
    queries = [line.strip() for line in f if line.strip()]

rows = []
for query_no, query in enumerate(queries, start=1):
    # A header row carrying only the query text, followed by its top-10 hits.
    rows.append({'Query': 'q{} = "{}"'.format(query_no, query)})
    rows.extend(get_most_similars(quran_series, quran_normalizer(query), 10).to_dict('records'))

results = pd.DataFrame(rows)
# 'شباهت' is the similarity-score column of the hit records. It is absent when
# the query file is empty, so guard the rounding instead of raising KeyError.
if 'شباهت' in results.columns:
    results['شباهت'] = results['شباهت'].round(3)
results



Unnamed: 0,Query,آیه,شباهت
0,"q1 = ""الحمد لله""",,
1,,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,0.775
2,,وَ الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,0.775
3,,الْحَمْدُ لِلَّهِ الَّذِي لَهُ مَا فِي السَّمَ...,0.566
4,,فَقُطِعَ دَابِرُ الْقَوْمِ الَّذِينَ ظَلَمُوا ...,0.459
...,...,...,...
292,,وَ عَلَّمَ آدَمَ الْأَسْمَاءَ كُلَّهَا ثُمَّ ع...,0.234
293,,وَ أَمَّا مَنْ آمَنَ وَ عَمِلَ صَالِحًا فَلَهُ...,0.225
294,,لِلَّذِينَ أَحْسَنُوا الْحُسْنَىٰ وَ زِيَادَةٌ...,0.192
295,,وَ يَجْعَلُونَ لِلَّهِ مَا يَكْرَهُونَ وَ تَصِ...,0.185


# fastText

In [4]:
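# One-time setup. Each `!` line runs in its own subshell, so a bare `! cd`
# does not affect the following lines; chain `cd` with the command instead.
# ! git clone https://github.com/facebookresearch/fastText.git
# ! cd fastText && make
# %pip install ./fastText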

# %%
import pandas as pd
from preprocess_quran_text import quran_normalizer, merged_quran_vec_df_nrmlz, quran_series
from tools import get_most_similars
from fasttext_vectorizer import sent_to_vec, merged_corpus_embeddings

# %%
# The queries are Arabic text: pin the encoding so the file is read the same
# way regardless of the platform's default locale encoding.
with open('./queries.txt', encoding='utf-8') as f:
    # One query per line; blank lines are skipped.
    queries = [line.strip() for line in f if line.strip()]

rows = []
for query_no, query in enumerate(queries, start=1):
    # Embed the normalized query, then rank the precomputed corpus embeddings
    # against it.
    query_vec = sent_to_vec(quran_normalizer(query))
    # A header row carrying only the query text, followed by its top-10 hits.
    rows.append({'Query': 'q{} = "{}"'.format(query_no, query)})
    rows.extend(get_most_similars(quran_series, merged_corpus_embeddings, query_vec, 10).to_dict('records'))

results = pd.DataFrame(rows)
# 'شباهت' is the similarity-score column of the hit records. It is absent when
# the query file is empty, so guard the rounding instead of raising KeyError.
if 'شباهت' in results.columns:
    results['شباهت'] = results['شباهت'].round(3)
results

Unnamed: 0,Query,آیه,شباهت
0,"q1 = ""الحمد لله""",,
1,,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,0.869
2,,وَ الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,0.865
3,,الْحَمْدُ لِلَّهِ الَّذِي لَهُ مَا فِي السَّمَ...,0.803
4,,فَلِلَّهِ الْحَمْدُ رَبِّ السَّمَاوَاتِ وَ رَب...,0.730
...,...,...,...
292,,وَ أَمَّا مَنْ آمَنَ وَ عَمِلَ صَالِحًا فَلَهُ...,0.627
293,,وَ يَجْعَلُونَ لِلَّهِ مَا يَكْرَهُونَ وَ تَصِ...,0.624
294,,وَ الذَّارِيَاتِ ذَرْوًا,0.618
295,,يُسَبِّحُ لِلَّهِ مَا فِي السَّمَاوَاتِ وَ مَا...,0.608


# AraBERT

In [5]:
# %%
# !pip install transformers
# !git clone https://github.com/aub-mind/arabert
# !pip install -r arabert/requirements.txt
# %%
from transformers import AutoTokenizer, AutoModel
from arabert.preprocess import ArabertPreprocessor
from preprocess_quran_text import quran_series, quran_normalizer, merged_quran_vec_df_nrmlz
from tools import get_most_similars
from tfidf_vectorizer import get_word_idf
import numpy as np
import pandas as pd

# %%
# Hugging Face checkpoint used for the preprocessor, the model and the
# tokenizer, so all three stay consistent with each other.
model_name = "aubmindlab/bert-base-arabertv2"

# Length of the zero vector that sent_to_vec returns for inputs it cannot embed.
# NOTE(review): presumably must equal the model's hidden size - confirm.
EMBEDDING_LEN = 768

arabert_prep = ArabertPreprocessor(model_name=model_name)
# eval() switches the module to inference mode and returns the module itself.
model = AutoModel.from_pretrained(model_name).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# %%
import torch  # only needed for torch.no_grad() around the forward pass

count = 0  # number of sentences sent through the model so far (progress only)


def sent_to_vec(sent):
    """Embed a sentence as a unit-length, IDF-weighted average of its token vectors.

    The sentence is preprocessed with ``arabert_prep``, tokenized, and passed
    through ``model``. The hidden states of the last layer, without the first
    and last positions (the special tokens added by the tokenizer), are
    averaged with one weight per token: ``get_word_idf(quran_normalizer(token))``,
    or 0 for tokens containing ``'+'`` (presumably the affix markers produced
    by the AraBERT pre-segmentation - confirm).

    Args:
        sent: the (already normalized) sentence text.

    Returns:
        A 1-D numpy array. It is the normalized weighted average, or a zero
        vector of length ``EMBEDDING_LEN`` when ``sent`` is empty, when no
        token carries a non-zero weight, or when the average has zero norm.

    Side effects:
        Increments the module-level ``count`` and prints it every 1000 calls.
    """
    global count
    if sent == '':
        return np.zeros(EMBEDDING_LEN)

    text_preprocessed = arabert_prep.preprocess(sent)
    # Truncate over-long inputs instead of letting the model fail on them.
    # NOTE(review): 512 is the usual position limit of BERT-base checkpoints -
    # confirm for this checkpoint.
    arabert_input = tokenizer.encode_plus(
        text_preprocessed, return_tensors='pt', truncation=True, max_length=512)
    tokens = tokenizer.convert_ids_to_tokens(arabert_input['input_ids'][0])[1:-1]

    # Inference only: skip building the autograd graph, which otherwise costs
    # memory and time on every one of the thousands of calls.
    with torch.no_grad():
        outputs = model(**arabert_input)
    embeddings_text_only = outputs['last_hidden_state'][0][1:-1].cpu().numpy()

    count += 1
    if count % 1000 == 0:
        print(count)

    weights = np.asarray(
        [0 if '+' in token else get_word_idf(quran_normalizer(token)) for token in tokens])
    # np.average raises ZeroDivisionError when the weights sum to zero (and on
    # an empty token list), so that case has to be caught before averaging.
    if weights.size == 0 or weights.sum() == 0:
        return np.zeros(EMBEDDING_LEN)

    avg_vec = np.average(a=embeddings_text_only, weights=weights, axis=0)
    norm = np.linalg.norm(avg_vec)
    if norm == 0:
        return np.zeros(EMBEDDING_LEN)
    return avg_vec / norm


# %%
# merged_quran_df or merged_quran_vec_df_nrmlz
# Embed every cell of the corpus frame. This runs the model once per non-empty
# cell and is by far the slowest step of the notebook.
# DataFrame.applymap is deprecated in recent pandas in favour of DataFrame.map;
# use the new name when this pandas version provides it.
if hasattr(pd.DataFrame, 'map'):
    merged_corpus_embeddings = merged_quran_vec_df_nrmlz.map(sent_to_vec)
else:
    merged_corpus_embeddings = merged_quran_vec_df_nrmlz.applymap(sent_to_vec)

# The queries are Arabic text: pin the encoding so the file is read the same
# way regardless of the platform's default locale encoding.
with open('./queries.txt', encoding='utf-8') as f:
    # One query per line; blank lines are skipped.
    queries = [line.strip() for line in f if line.strip()]

rows = []
for query_no, query in enumerate(queries, start=1):
    query_vec = sent_to_vec(quran_normalizer(query))
    # A header row carrying only the query text, followed by its top-10 hits.
    rows.append({'Query': 'q{} = "{}"'.format(query_no, query)})
    rows.extend(
        get_most_similars(quran_series, merged_corpus_embeddings, query_vec, 10, check_moghattaeh=True).to_dict(
            'records'))

results = pd.DataFrame(rows)
# 'شباهت' is the similarity-score column of the hit records. It is absent when
# the query file is empty, so guard the rounding instead of raising KeyError.
if 'شباهت' in results.columns:
    results['شباهت'] = results['شباهت'].round(3)
results

# %%
# query = 'وَلِلّهِ الأَسْمَاء الْحُسْنَى'
#
# query_vec = sent_to_vec(quran_normalizer(query))
# r = get_most_similars(quran_series, merged_corpus_embeddings, query_vec, 10, check_moghattaeh=True)
# print(r)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1000



KeyboardInterrupt

