# Import libraries

In [69]:
import re
import os
import pandas as pd
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DeLL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DeLL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DeLL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Loading dataset

In [70]:
import os
for dirname, _, filenames in os.walk('All Hadith Books'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

All Hadith Books\Maliks Muwatta Without_Tashkel.txt
All Hadith Books\Maliks Muwatta.txt
All Hadith Books\Musnad Ahmad ibn Hanbal Without_Tashkel.txt
All Hadith Books\Musnad Ahmad ibn Hanbal.txt
All Hadith Books\Sahih Bukhari Without_Tashkel.txt
All Hadith Books\Sahih Bukhari.txt
All Hadith Books\Sahih Muslim.txt
All Hadith Books\Sahih Muslime Without_Tashkel.txt
All Hadith Books\Sunan Abu Dawud Without_Tashkel.txt
All Hadith Books\Sunan Abu Dawud.txt
All Hadith Books\Sunan al Darami Without_Tashkel.txt
All Hadith Books\Sunan al Darami.txt
All Hadith Books\Sunan al Tirmidhi Without_Tashkel.txt
All Hadith Books\Sunan al Tirmidhi.txt
All Hadith Books\Sunan al-Nasai Without_Tashkel.txt
All Hadith Books\Sunan al-Nasai.txt
All Hadith Books\Sunan Ibn Maja Without_Tashkel.txt
All Hadith Books\Sunan Ibn Maja.txt


In [71]:
book1 = pd.read_csv(dirname + '/Sahih Bukhari Without_Tashkel.txt')

# Preprocssing Dataset

In [72]:
def preprocess_doc_lemma(doc):
    #normalization
    doc = re.sub("[إأآا]", "ا", doc)
    doc = re.sub("ى", "ي", doc)
    doc = re.sub("ؤ", "ء", doc)
    doc = re.sub("ئ", "ء", doc)
    doc = re.sub("ة", "ه", doc)
    doc = re.sub("گ", "ك", doc)
    doc = re.sub(r'[^\w\s]', '', doc)
    # Tokenization
    tokens = word_tokenize(doc)
    # Stop-word removal
    stop_words = set(nltk.corpus.stopwords.words("arabic"))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    filtered_sentences=" ".join(lemmatized_tokens)
    return filtered_sentences

In [73]:
all_hadiths_1 = []
for hadith in book1['Sahih Bukhari Without_Tashkel']:
    all_hadiths_1.append(hadith)
cleared_Hadith_1 = []
for hadith in all_hadiths_1:
    cleared_Hadith_1.append(preprocess_doc_lemma(hadith))

In [74]:
# make it as a DataFram
df = pd.DataFrame(cleared_Hadith_1, columns=['Sahih Bukhari'])
df['Hadith_No'] = df.index

## New cleaned data

In [75]:
df.to_csv('sahih_bukhari_clean',index=False)

# Query Preprocessing

In [76]:
def normalize_arabic(text):
    if not isinstance(text, (str, bytes)):
        # If input is not a string or bytes-like object, convert it to string
        text = str(text)
    if text.startswith("ال"):
        text = text[2:]
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("[ًٌٍَُِّْ]", "", text)
    return text

def preprocess_arabic_Remove_stop(words):
    stop_words = set(stopwords.words('arabic'))
    preprocessed_text=[]
    words = [word for word in words if word not in stop_words]
    preprocessed_text = words
    return preprocessed_text

def remove_punctuation(words):
    cleaned_words = [word for word in words if word not in string.punctuation]
    return cleaned_words

def lemma(words):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words

def preprocess_text(text):
    words = word_tokenize(text)
    out = remove_punctuation(words)
    out = preprocess_arabic_Remove_stop(out)
    out = lemma(out)
    return out

# Inverted index

In [77]:
import string
def bulid_inverted_index(documents):
    inverted_index = dict()
    for doc_id, doc in enumerate(documents):
        doc = normalize_arabic(doc)
        terms = preprocess_text(doc)
        for term in terms:
            if term not in inverted_index:
                inverted_index[term] = []
            if doc_id not in inverted_index[term]:
                inverted_index[term].append(doc_id)
    return inverted_index
inverted_index = bulid_inverted_index(df['Sahih Bukhari'])

# Search

In [78]:
def search(query, inverted_index, documents):
    query = normalize_arabic(query)
    query = preprocess_text (query)
    relevant_doc_ids = list(range(len(documents)))
    #print(query)
    for term in query:
        relevant_docs = []
        if term in inverted_index:
            p1 = 0
            p2 = 0
            while p1 < len(relevant_doc_ids) and p2 < len(inverted_index[term]):
                if relevant_doc_ids[p1] < inverted_index[term][p2]:
                    p1 += 1
                elif relevant_doc_ids[p1] > inverted_index[term][p2]:
                    p2 += 1
                else:
                    relevant_docs.append(relevant_doc_ids[p1])
                    p1 += 1
                    p2 += 1
            relevant_doc_ids = relevant_docs
            result = df.iloc[relevant_doc_ids]

    return result.iloc[:,0]

# Ranking

In [79]:
def Ranking(query):
    vectorizer = CountVectorizer()

    # Fit and transform the text data
    X = vectorizer.fit_transform(df['Sahih Bukhari'])

    # Tokenize and vectorize the query
    query_vector = vectorizer.transform([query])

    query_vector_dense = query_vector.toarray()

    # Calculate cosine similarity between query and documents
    cosine_similarities = cosine_similarity(query_vector_dense, X)

    # Add cosine similarities to DataFrame
    df['cosine_similarity'] = cosine_similarities[0]

    # Sort DataFrame by cosine similarity
    df_sorted = df.sort_values(by='cosine_similarity', ascending=False)

    # Filter out documents with cosine similarity greater than 0
    similar_documents = df_sorted[df_sorted['cosine_similarity'] > 0.3]

    #similar_document_ids = similar_documents.iloc[:, 0]

    return similar_documents[['Sahih Bukhari','cosine_similarity']]

# Example

In [80]:
Query="هشام بن عروة"

In [81]:
search(Query,inverted_index,df['Sahih Bukhari'])

1       حدثنا عبد الله بن يوسف قال اخبرنا مالك هشام بن...
18      حدثنا محمد بن سلام قال اخبرنا عبده هشام ابيه ع...
40      حدثنا محمد بن المثني حدثنا يحيي هشام قال اخبرن...
41      حدثنا مسلم بن ابراهيم قال حدثنا هشام قال حدثنا...
83      حدثنا موسي بن اسماعيل قال حدثنا وهيب قال حدثنا...
                              ...                        
6929    حدثنا عبيد بن اسماعيل حدثنا ابو اسامه هشام بن ...
6961    حدثنا مسلم بن ابراهيم حدثنا هشام حدثنا قتاده ا...
6971    حدثنا عبيد بن اسماعيل حدثنا ابو اسامه هشام ابي...
6994    حدثنا يحيي بن بكير حدثنا الليث عقيل ابن شهاب ح...
7005    حدثنا علي حدثنا هشام اخبرنا معمر الزهري حدثني ...
Name: Sahih Bukhari, Length: 589, dtype: object

In [82]:
cosine_similarities = Ranking(Query)
cosine_similarities

Unnamed: 0,Sahih Bukhari,cosine_similarity
3722,حدثني ابراهيم بن موسي اخبرنا هشام معمر هشام بن...,0.589768
3243,حدثنا عبد العزيز بن عبد الله حدثنا ابراهيم بن ...,0.576166
4600,حدثنا ابو اليمان حدثنا شعيب الزهري واخبرني انس...,0.560112
6426,حدثنا يحيي بن بكير حدثنا الليث خالد بن يزيد سع...,0.549762
3664,حدثني عمرو بن خالد حدثنا زهير حدثنا ابو اسحاق ...,0.539360
...,...,...
5089,حدثنا احمد بن يعقوب اخبرنا اسحاق بن سعيد بن عم...,0.301511
994,حدثنا ربيع بن يحيي قال حدثنا زاءده هشام فاطمه ...,0.301511
3118,حدثنا قيس بن حفص وموسي بن اسماعيل قالا حدثنا ع...,0.301232
324,حدثنا يحيي بن بكير قال حدثنا الليث جعفر بن ربي...,0.300965


# Evaluation

In [83]:
retrieved_documents_list = [1 if doc_id in cosine_similarities.iloc[:,0] else 0 for doc_id in range(len(df))]
print(retrieved_documents_list)

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 

In [84]:
def calculate_relevant_documents(query, inverted_index, documents):
    query = preprocess_text(query)
    relevant_doc_ids = set(range(len(documents)))

    for term in query:
        if term in inverted_index:
            relevant_doc_ids &= set(inverted_index[term])

    return relevant_doc_ids

query = 'هشام بن عروة'
relevant_documents = calculate_relevant_documents(query, inverted_index, df['Sahih Bukhari'])
print(relevant_documents)


{1, 6145, 2056, 6153, 2059, 2061, 6158, 18, 4115, 6169, 6174, 4134, 40, 41, 4143, 6196, 2102, 2105, 4157, 6208, 4161, 2120, 4168, 4173, 6224, 83, 6229, 4187, 97, 4193, 2153, 4201, 4206, 4208, 4210, 124, 126, 6274, 4228, 4231, 4233, 4234, 142, 145, 148, 6293, 4246, 2199, 6294, 2201, 6295, 6302, 4265, 2220, 177, 2226, 4276, 6330, 2240, 204, 4305, 2259, 214, 4313, 219, 220, 2269, 4316, 6368, 6370, 6374, 6381, 239, 6395, 6396, 4353, 259, 4356, 263, 2314, 6413, 2319, 272, 276, 2324, 6420, 281, 4377, 283, 6426, 285, 286, 2333, 288, 2334, 2335, 4382, 4389, 294, 295, 301, 2352, 305, 6451, 308, 6452, 311, 6456, 313, 6460, 4413, 318, 6463, 323, 4420, 2374, 2383, 2385, 4435, 340, 341, 342, 2391, 2392, 2395, 6493, 2398, 2401, 359, 6510, 371, 2423, 6519, 2426, 2427, 6524, 2430, 384, 4481, 391, 4497, 408, 2460, 415, 6561, 2466, 419, 6562, 2474, 4529, 2482, 4530, 2485, 442, 4538, 444, 2496, 4555, 2509, 4560, 6617, 2525, 2526, 489, 2537, 6633, 6638, 495, 499, 6643, 4600, 2553, 2555, 4603, 2558, 510, 4

In [85]:
ground_truth = [1 if doc_id in relevant_documents else 0 for doc_id in range(len(df))]
print(ground_truth)

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [86]:
def precision_score(ground_truth, retrieved_documents):
    # Calculate True Positives (TP) and False Positives (FP)
    TP = sum(1 for gt, retrieved in zip(ground_truth, retrieved_documents) if gt == 1 and retrieved == 1)
    FP = sum(1 for gt, retrieved in zip(ground_truth, retrieved_documents) if gt == 0 and retrieved == 1)

    # Calculate Precision
    if TP + FP == 0:
        return 0
    else:
        return TP / (TP + FP)

def recall_score(ground_truth, retrieved_documents):
    # Calculate True Positives (TP) and False Negatives (FN)
    TP = sum(1 for gt, retrieved in zip(ground_truth, retrieved_documents) if gt == 1 and retrieved == 1)
    FN = sum(1 for gt, retrieved in zip(ground_truth, retrieved_documents) if gt == 1 and retrieved == 0)

    # Calculate Recall
    if TP + FN == 0:
        return 0
    else:
        return TP / (TP + FN)

def f1_score(ground_truth, retrieved_documents):
    precision = precision_score(ground_truth, retrieved_documents)
    recall = recall_score(ground_truth, retrieved_documents)

    # Calculate F1 Score
    if precision + recall == 0:
        return 0
    else:
        return 2 * (precision * recall) / (precision + recall)


In [87]:
# Calculate Precision, Recall, and F1 Score
precision = precision_score(ground_truth, retrieved_documents_list)
recall = recall_score(ground_truth, retrieved_documents_list)
f1 = f1_score(ground_truth, retrieved_documents_list)

# Print the evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Precision: 0.22742474916387959
Recall: 0.3463497453310696
F1 Score: 0.27456258411843876


In [90]:
import customtkinter as ctk
import tkinter as tk
from PIL import Image, ImageTk
from tkinter import PhotoImage,scrolledtext,ttk
import pandas as pd
import os

class MyButton(tk.Button):
    def __init__(self, master=None,image_path=None,bac="#282828",**kwargs):
        super().__init__(master, **kwargs)
        image = Image.open(image_path)
        self.photo = ImageTk.PhotoImage(image)
        self.configure(
            font=("Times New Roman", 45),
            bg=bac,
            fg='#282828',
            border=0,
            highlightthickness=0,
            image=self.photo,
            activebackground=bac
        )
        self.bind("<Enter>", self.on_hover)
        self.bind("<Leave>", self.on_leave)
        self.bind("<Button-1>", self.on_click)
        self.bind("<ButtonRelease-1>", self.on_release)

    def on_hover(self, event):
        self.configure(cursor="hand2")

    def on_leave(self, event):
        self.configure(cursor="")
        
    def on_click(self, event):
        self.configure(bg='#282828',fg='#282828')
        
    def on_release(self, event):
        self.configure(bg='#282828',fg='#282828')

def on_entry_click(event):
    if input_text.get("1.0","end-1c") == "Search":
        input_text.delete('1.0', "end")

# Starting Gui
app = ctk.CTk(fg_color='#3c3c3c')
app.title("Info Retrieval")
app.geometry("1000x600")
app.resizable(0, 0)

# Defining Top Frame
Top_frame = ctk.CTkFrame(app, width=1000, height=80,fg_color='#282828',bg_color='#282828')
Top_frame.pack(fill=tk.X)

input_text = tk.Text(Top_frame, width=30, height=2, fg='white',bg='#0D0D0D')   
input_text.pack(padx=20, pady=20)

input_text.insert(1.0, "Search")

input_text.bind("<FocusIn>", on_entry_click)

input_text.tag_configure('tag-right', justify='right')

text_widget = scrolledtext.ScrolledText(app, width=140, height=100)
text_widget.configure(bg="#282828",fg='white')
text_widget.pack()
text_widget.tag_configure("center", justify="center")


# Create a vertical scrollbar
yscrollbar = ttk.Scrollbar(app, orient=tk.VERTICAL, command=text_widget.yview)
yscrollbar.pack(anchor='e')

# Configure the scrolled text widget to use the scrollbars
text_widget.config(yscrollcommand=yscrollbar.set)

def search_gui():
    entry = input_text.get("1.0", "end-1c")
    
    if entry == '':
        tk.messagebox.showerror("Error","Please, enter a valid query")
    else:
        df2 = pd.DataFrame(search(entry, inverted_index, df['Sahih Bukhari']))
        text_widget.delete("1.0", tk.END)
        for index, row in df2.iterrows():
            text_widget.insert(tk.END, ' '.join(row.astype(str).tolist()) + "\n\n", "center")
            
def ranked():
    entry = input_text.get("1.0", "end-1c")
    
    if entry == '':
        tk.messagebox.showerror("Error","Please, enter a valid query")
    else:
        df2 = pd.DataFrame(Ranking(entry))
        text_widget.delete("1.0", tk.END)
        for index, row in df2.iterrows():
            text_widget.insert(tk.END, ' '.join(row.astype(str).tolist()) + "\n\n", "center")    
    
search_btn = MyButton(
            master=Top_frame,
            image_path='search.png',
            bg='#282828'
        )
search_btn.place(x=830,y=20)
search_btn.configure(command=search_gui)

label = ctk.CTkLabel(Top_frame,fg_color='#282828',text='Ranking: ',font=ctk.CTkFont(size=20, weight="bold"))
label.place(x=790,y=20)

ascend = MyButton(
            master=Top_frame,
            image_path='ascend.png',
            bg='#282828'
        )
# ascend.place(x=1110,y=10)

descend = MyButton(
            master=Top_frame,
            image_path='descend.png',
            bg='#282828'
        )
descend.place(x=1110,y=25)
descend.configure(command=ranked)



app.mainloop()