### Import packages

In [56]:
import nltk
from nltk.stem.snowball import SnowballStemmer 
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

import numpy as np 
import pandas as pd 

import tkinter as tk
from tkinter import filedialog

import re

### Download the **stopwords** and **woednet**

In [57]:
nltk.download('wordnet')
nltk.download('stopwords')
stop_words = stopwords.words('english')

stemmer = SnowballStemmer(language='english')
dictionary = []

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohammad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohammad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Check if a Word is Meaningful

In [58]:
def is_meaningful(word) -> bool:
    
    if wn.synsets(word):
        return True
    else: 
        return False 

### Convert Document to List

In [59]:
def convert_to_list(doc) -> list:
    doc = doc.split()
    return doc

### Create Dictionary from Document

In [60]:
def create_dictionary(doc, dictionary) -> list:
    for word_index, word in enumerate(doc):
        
        word = word.lower()
        
        if word.isalnum() and word not in stop_words and is_meaningful(word) and len(word) > 1:
            
            stem_word = stemmer.stem(word)
            doc[word_index] = stem_word
            
            if stem_word not in dictionary:
                dictionary.append(stem_word)
    
    return doc

### Join Document

In [61]:
def join_document(doc) -> str:
    doc = ' '.join(doc)
    return doc

### Process Document

In [62]:
def process_document(document) -> str:
    
    document = convert_to_list(document)
    
    document = create_dictionary(document, dictionary)
    
    document = join_document(document)
    
    return document

### Function to create dataframe and excel file

In [63]:
def create_excel_dataframe(input , name, columns, index):
    df = pd.DataFrame(input, columns = columns, index = index) 
    df.to_excel(name) 
    return df

### Select File

In [64]:
def select_file():
    global raw_documents
    file_name = "C:/Users/Mohammad/Desktop/Project-IR/cran.all.1400"
    try:
        with open(file_name, "r") as f:
            raw_documents = f.read()
    except FileNotFoundError:
        print(f"Error: File '{file_name}' not found.")

select_file()

In [65]:

raw_dataset = raw_documents

raw_dataset = raw_dataset.split(".I") 

raw_dataset.remove(raw_dataset[0]) 


In [66]:
import os
import pandas as pd


os.makedirs("docs", exist_ok=True)
os.makedirs("./Original docs", exist_ok=True)

doc_index = 0

for doc in raw_dataset:
    doc_index += 1
    doc_path = "docs" + "/doc" + str(doc_index) + ".txt"
    doc = process_document(doc)
    
    with open(doc_path, "w") as document: 
        document.write(doc)

doc_index = 0

for doc in raw_dataset:
    doc_index += 1
    doc_path = "./Original docs" + "/docs" + str(doc_index) + ".txt"
    
    with open(doc_path, "w") as document: 
        document.write(doc)
        
df = pd.DataFrame(dictionary, columns=["Words"])
df.to_excel("dictionary.xlsx")


### Initialized the numpy arrays for TF, IDF, TF-IDF:

In [67]:
row = len(dictionary)
column = doc_index

tf_array = np.zeros((row, column))
idf_array = np.zeros((row, 1))
tf_idf_array = np.zeros((row, column))

### Methods to calculate the **TF**:

In [68]:
def calculate_term_frequency(doc, dictionary, doc_no):
    for word in dictionary:
        if word in doc:
            frequency = doc.count(word)
            row = dictionary.index(word)
            column = doc_no - 1
            tf_array[row, column] += frequency

### Methods to calculate the **DF**:

In [69]:
def calculate_document_frequency(doc, dictionary):
    for word in dictionary:
        if word in doc:
            row = dictionary.index(word)
            idf_array[row, 0] += 1

In [70]:
doc_names_list = []
for num in range(1, doc_index + 1):
    doc_path = "./docs" + "/doc" + str(num) + ".txt"
    doc_names_list.append("doc" + str(num) + ".txt")
    with open(doc_path, "r") as file:
        doc = file.read()
        calculate_term_frequency(doc, dictionary, num)
        calculate_document_frequency(doc, dictionary)

In [71]:
row, column = np.shape(tf_array)
row , column

(3421, 1400)

### Calculate the Term Frequency **(TF)** for each word in each document:

$$
TF_{(w,d)}
=
\begin{cases}
\text{ if } \;\;\; tf_{(w,d)} > 0  \;\;\;\;\;\;\;\; 1 + \log(tf_{(w,d)})\\
\text{otherwise} \;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\; 0 
\end{cases}
$$

In [72]:
for r in range(row):
    for c in range(column):
        if tf_array[r, c] > 0:
            extracted_element = tf_array[r, c]
            tf_array[r, c] = 1 + np.log10(extracted_element)
        else:
            tf_array[r, c] = 0

### Calculate Inverse Document Frequency **(IDF)** for each word:

$$
IDF_{(w)} = \log(\frac{N}{df_{w}})
$$

In [73]:
count_of_documents = column

row, column = np.shape(idf_array)
print(row , column)
for r in range(row):
    if idf_array[r, 0] > 0:
        extracted_element = idf_array[r, 0]
        idf_array[r, 0] = np.log10(count_of_documents / extracted_element)
    else:
        idf_array[r, 0] = 0

3421 1


### Calculate Inverse Document Frequency **(IDF)** for each word:

$$
IDF_{(w)} = \log(\frac{N}{df_{w}})
$$

In [74]:
row, column = tf_idf_array.shape
print(row, column)  

for r in range(row):
    for c in range(column):
        tf_idf_array[r, c] = tf_array[r, c] * idf_array[r, 0]


3421 1400


In [75]:
def create_excel_dataframe(input , name, columns, index):
    df = pd.DataFrame(input, columns = columns, index = index) 
    df.to_excel(name) 
    return df

### Export the TF, IDF, TF-IDF dataframes:

In [76]:
tf_dataframe = create_excel_dataframe(tf_array, "tf_excel.xlsx", doc_names_list, dictionary)

idf_dataframe = create_excel_dataframe(idf_array, "idf_excel.xlsx", ["IDF"], dictionary)

tf_idf_dataframe = create_excel_dataframe(tf_idf_array, "tf_idf_excel.xlsx", doc_names_list, dictionary)


In [77]:
tf_dataframe

Unnamed: 0,doc1.txt,doc2.txt,doc3.txt,doc4.txt,doc5.txt,doc6.txt,doc7.txt,doc8.txt,doc9.txt,doc10.txt,...,doc1391.txt,doc1392.txt,doc1393.txt,doc1394.txt,doc1395.txt,doc1396.txt,doc1397.txt,doc1398.txt,doc1399.txt,doc1400.txt
experiment,1.477121,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,1.000000,0.0,0.000000,0.0,1.0,1.0,0.00000,0.0,0.0
investig,1.301030,1.00000,0.0,0.00000,0.0,0.00000,1.0,1.00000,1.477121,0.0,...,0.0,0.000000,1.0,0.000000,1.0,0.0,0.0,0.00000,0.0,1.0
aerodynam,1.301030,0.00000,0.0,0.00000,1.0,0.00000,0.0,0.00000,0.000000,0.0,...,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
wing,1.602060,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
slipstream,1.778151,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
intuit,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,1.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
squir,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,1.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
standoff,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.00000,0.0,0.0
recover,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.00000,0.0,0.0


In [78]:
idf_dataframe

Unnamed: 0,IDF
experiment,0.610834
investig,0.586221
aerodynam,0.876615
wing,0.786293
slipstream,1.970037
...,...
intuit,3.146128
squir,2.367977
standoff,2.845098
recover,1.714764


In [79]:
tf_idf_dataframe

Unnamed: 0,doc1.txt,doc2.txt,doc3.txt,doc4.txt,doc5.txt,doc6.txt,doc7.txt,doc8.txt,doc9.txt,doc10.txt,...,doc1391.txt,doc1392.txt,doc1393.txt,doc1394.txt,doc1395.txt,doc1396.txt,doc1397.txt,doc1398.txt,doc1399.txt,doc1400.txt
experiment,0.902276,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.610834,0.000000,0.000000,0.000000,0.610834,0.610834,0.000000,0.0,0.000000
investig,0.762692,0.586221,0.0,0.000000,0.000000,0.000000,0.586221,0.586221,0.865920,0.0,...,0.000000,0.000000,0.586221,0.000000,0.586221,0.000000,0.000000,0.000000,0.0,0.586221
aerodynam,1.140503,0.000000,0.0,0.000000,0.876615,0.000000,0.000000,0.000000,0.000000,0.0,...,0.876615,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
wing,1.259688,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
slipstream,3.503023,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
intuit,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,3.146128,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
squir,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,2.367977,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
standoff,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,2.845098,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
recover,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,1.714764,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [80]:
def rank_cosine(export_string: str,
                cosine_docs: list,
                cosine_values: list) -> None:
    
    cosine_rank = sorted(dict(zip(cosine_docs, cosine_values)).items(), key = lambda x: x[1], reverse = True)
    
    text_box.delete("0.0", tk.END)
    
    for index, doc in enumerate(cosine_rank[:10], 1):
        text_box.insert(tk.END, str(index)+ "." + str(doc[0]) + " : " + str(doc[1]) + "\n")
    
    df = pd.DataFrame([doc[1] for doc in cosine_rank], index = [doc[0] for doc in cosine_rank], columns = ['cosine'])
    df.to_excel(export_string +".xlsx")


### Calculate the Cosine similarity:

In [81]:
def calculate_cosine(query_tfidf: np.array) -> tuple:    
    cosine_values = []
    cosine_docs = []

    for number in range(1, column + 1):
        
        doc_name = "doc"+ str(number) +".txt"
        doc_column = tf_idf_dataframe[doc_name].tolist()
        
        nominator = np.dot(query_tfidf, doc_column)
        
        doc_column_norm = np.linalg.norm(doc_column)
        query_tfidf_norm = np.linalg.norm(query_tfidf)
        
        denominator = doc_column_norm * query_tfidf_norm
        
        cosine_theta = nominator / denominator
        
        cosine_values.append(cosine_theta)
        cosine_docs.append(doc_name)
        
    return cosine_values, cosine_docs



In [82]:
def take_query_from_user():
    global query_tfidf
    global query_words_list
    
    query = query_var.get()
    query_entry.delete("0", tk.END)
    
    query_lower = query.lower()
    query_words_list = query_lower.split()
    
    temp = []
    for word in query_words_list:
        if word not in stop_words and is_meaningful(word) and len(word) > 1:
            temp.append(word) 

    
    query_words_list = temp
    temp = [stemmer.stem(word) for word in query_words_list]
    query_words = temp
    query_words_list = list(set(temp))
    query_words_str = " ".join(query_words)

    
    query_idf_lst = [idf_array[dictionary.index(word)][0] for word in query_words_list]
    
    query_tf_lst = [np.log10(query_words_str.count(word)) + 1 for word in query_words_list]
    
    query_tfidf = [idf*query_tf_lst[count] for count, idf in enumerate(query_idf_lst)]
    
    row = np.shape(tf_idf_array)[0]
    query_tfidf = np.pad(query_tfidf, (0, row - len(query_tfidf)), 'constant', constant_values = (0))
    for count, value in enumerate(query_tfidf):
        if count < len(query_words_list):
            if value > 0:
                if count != dictionary.index(query_words_list[count]):
                    dest_index = dictionary.index(query_words_list[count])
                    query_tfidf[dest_index] = value
                    query_tfidf[count] = 0

    cosine_values, cosine_docs = calculate_cosine(query_tfidf)

    
    rank_cosine("cosine_similarity_rank",
                cosine_docs,
                cosine_values)

In [83]:
def rerank():

    def get_doc_names() -> list:
        temp = []
        pattern = r'docs(\d+)\.txt'

        for path in files_path:
            temp.append(re.findall(pattern, path))
            
        return [int(value[0]) for value in temp]

    def add_tfidf_selected_docs():
        for doc in target_doc_numbers:
            for count, word in enumerate(query_words_list):
            
                word_index_in_dictionary = dictionary.index(word)
                query_tfidf[word_index_in_dictionary] += tf_idf_array[word_index_in_dictionary ,doc] 

    target_doc_numbers = get_doc_names()
    add_tfidf_selected_docs()
    
    
    cosine_values, cosine_docs = calculate_cosine(query_tfidf) 
    rank_cosine("cosine_similarity_rerank",
                cosine_docs,
                cosine_values)

In [84]:
window = tk.Tk()
window.title("IR")
window.geometry("930x300")

def choose():
    global files_path
    files_path = filedialog.askopenfilenames(
                parent = window,
                initialdir = "./Original docs",
                title = "Choose the docs",
                filetypes = [("text name","*.txt")]
                )
    
    rerank()


frame = tk.Frame(window)
frame.columnconfigure(0, weight = 1)
frame.columnconfigure(1, weight = 1)
frame.columnconfigure(2, weight = 1)

STICKY = tk.E + tk.W
PAD_VALUE = 1

query_var = tk.StringVar()

query_label = tk.Label(
                        frame,
                        text = "Query: ",
                        font = ('Arial', 13)
                        )
query_label.grid(
                row = 0,
                column = 0,
                sticky = STICKY,
                padx = PAD_VALUE,
                pady = PAD_VALUE
                )

query_entry = tk.Entry(
                        frame,
                        textvariable = query_var
                        )
query_entry.grid(
                row = 0,
                column = 1,
                sticky = STICKY,
                padx = PAD_VALUE,
                pady = PAD_VALUE,
                )

query_btn = tk.Button(frame,text = 'Enter Query',command = take_query_from_user,
                   font = ('Arial', 11) , bg="yellow"
                   )
query_btn.grid(
                row = 0,
                column = 2,
                sticky = STICKY,
                padx = PAD_VALUE,
                pady = PAD_VALUE
                )

rerank_btn = tk.Button(
                frame,
                   text = 'Rerank',
                   command = choose,
                   font = ('Arial', 13) ,
                   bg="red"
                   )
rerank_btn.grid(
                row = 1,
                column = 2,
                sticky = STICKY,
                padx = PAD_VALUE,
                pady = PAD_VALUE
                )

text_box = tk.Text(
            frame,
                    height = 10 ,
                    bg="black" ,
                    fg="white"
                    )
text_box.grid(
            row = 1,
            column = 1,
            sticky = STICKY,
            padx = PAD_VALUE,
            pady = PAD_VALUE
            )

frame.pack()

window.mainloop()