In [20]:
import nltk
import os
from nltk.tokenize import word_tokenize
from natsort import natsorted  
from nltk.stem import PorterStemmer
import math
import pandas as pd

nltk.data.path.append("/path/to/nltk_data")
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diesel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
document_folder = "Document_collection"

Read files

In [22]:
data = []
for filename in natsorted(os.listdir(document_folder)):
    file_path = os.path.join(document_folder, filename)
    
    with open(file_path, 'r', encoding='utf-8') as file:
        document_content = file.read()
        data.append(document_content)

data

['antony brutus caeser cleopatra mercy worser',
 'antony brutus caeser calpurnia ',
 'mercy worser',
 'brutus caeser mercy worser',
 'caeser mercy worser',
 'antony caeser mercy ',
 'angels fools fear in rush to tread where',
 'angels fools fear in rush to tread where',
 'angels fools in rush to tread where',
 'fools fear in rush to tread where']

Tokenization

In [23]:
def Tokenization(data):
    tokenized_documents = []
    for doc in data:
        tokens = word_tokenize(doc)
        tokenized_documents.append(tokens)
    return tokenized_documents

tokenized_documents = Tokenization(data)

for doc_index,tokens in enumerate(tokenized_documents):
    print(f"Document {doc_index+1}: {tokens}")

Document 1: ['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser']
Document 2: ['antony', 'brutus', 'caeser', 'calpurnia']
Document 3: ['mercy', 'worser']
Document 4: ['brutus', 'caeser', 'mercy', 'worser']
Document 5: ['caeser', 'mercy', 'worser']
Document 6: ['antony', 'caeser', 'mercy']
Document 7: ['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where']
Document 8: ['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where']
Document 9: ['angels', 'fools', 'in', 'rush', 'to', 'tread', 'where']
Document 10: ['fools', 'fear', 'in', 'rush', 'to', 'tread', 'where']


Stemming

In [24]:
def Stemming(tokenized_documents):
    stemmed_documents = []
    stemmer = PorterStemmer()

    for document_tokens in tokenized_documents:
        stemmed_tokens = [stemmer.stem(token) for token in document_tokens]
        stemmed_documents.append(stemmed_tokens)
    return stemmed_documents

stemmed_documents = Stemming(tokenized_documents)

for document_index, document_tokens in enumerate(stemmed_documents):
    print(f"Document {document_index+1}: {document_tokens}")

Document 1: ['antoni', 'brutu', 'caeser', 'cleopatra', 'merci', 'worser']
Document 2: ['antoni', 'brutu', 'caeser', 'calpurnia']
Document 3: ['merci', 'worser']
Document 4: ['brutu', 'caeser', 'merci', 'worser']
Document 5: ['caeser', 'merci', 'worser']
Document 6: ['antoni', 'caeser', 'merci']
Document 7: ['angel', 'fool', 'fear', 'in', 'rush', 'to', 'tread', 'where']
Document 8: ['angel', 'fool', 'fear', 'in', 'rush', 'to', 'tread', 'where']
Document 9: ['angel', 'fool', 'in', 'rush', 'to', 'tread', 'where']
Document 10: ['fool', 'fear', 'in', 'rush', 'to', 'tread', 'where']


Constructing Auxiliary structure(s) (Positional index)

In [33]:
def create_positional_index(stemmed_documents):
    positional_index = {}

    for doc_index, doc_tokens in enumerate(stemmed_documents, start=1):
        for position, token in enumerate(doc_tokens, start=1):
            if token not in positional_index:
                positional_index[token] = [0, {}]
            if doc_index not in positional_index[token][1]:
                positional_index[token][1][doc_index] = [position]
                positional_index[token][0] += 1
            else:
                positional_index[token][1][doc_index].append(position)

    return positional_index

positional_index = create_positional_index(stemmed_documents)
for key, value in positional_index.items():
    print(f"term: {key}, frequency: {value[0]}, posing_list: {value[1]}")

term: antoni, frequency: 3, posing_list: {1: [1], 2: [1], 6: [1]}
term: brutu, frequency: 3, posing_list: {1: [2], 2: [2], 4: [1]}
term: caeser, frequency: 5, posing_list: {1: [3], 2: [3], 4: [2], 5: [1], 6: [2]}
term: cleopatra, frequency: 1, posing_list: {1: [4]}
term: merci, frequency: 5, posing_list: {1: [5], 3: [1], 4: [3], 5: [2], 6: [3]}
term: worser, frequency: 4, posing_list: {1: [6], 3: [2], 4: [4], 5: [3]}
term: calpurnia, frequency: 1, posing_list: {2: [4]}
term: angel, frequency: 3, posing_list: {7: [1], 8: [1], 9: [1]}
term: fool, frequency: 4, posing_list: {7: [2], 8: [2], 9: [2], 10: [1]}
term: fear, frequency: 3, posing_list: {7: [3], 8: [3], 10: [2]}
term: in, frequency: 4, posing_list: {7: [4], 8: [4], 9: [3], 10: [3]}
term: rush, frequency: 4, posing_list: {7: [5], 8: [5], 9: [4], 10: [4]}
term: to, frequency: 4, posing_list: {7: [6], 8: [6], 9: [5], 10: [5]}
term: tread, frequency: 4, posing_list: {7: [7], 8: [7], 9: [6], 10: [6]}
term: where, frequency: 4, posing_

Phrase query



In [26]:
def preprocess_phrase_query(phrase_query):
    phrase_query = Tokenization([phrase_query])
    phrase_query = Stemming(phrase_query)
    return phrase_query

In [27]:
def apply_logical_operator(operator, positions_list, new_positions):
    if operator == "and":
        return list(set(positions_list) & set(new_positions))
    elif operator == "or":
        return list(set(positions_list) | set(new_positions))
    elif operator == "not":
        return list(set(positions_list) - set(new_positions))
    else:
        return new_positions

In [31]:
def get_related_documents(phrase_query):
    documents = [[] for _ in range(10)]
    related_documents = []

    logical_operator = None
    current_positions = None

    for word in phrase_query[0]:
        if word in ["and", "or", "not"]:
            logical_operator = word
        elif word in positional_index.keys():
            new_positions = positional_index[word][1][1]  # Assume positions for the first document

            if logical_operator:
                if current_positions is not None:
                    current_positions = apply_logical_operator(logical_operator, current_positions, new_positions)
                else:
                    current_positions = new_positions
            else:
                current_positions = new_positions

    # Check if the last logical operator was "not"
    if logical_operator == "not":
        current_positions = list(set(range(10)) - set(current_positions))

    for position, positions_list in enumerate(documents, start=1):
        if current_positions and set(current_positions).issubset(set(positions_list)):
            related_documents.append([position, positions_list])

    return related_documents

# Example usage:
phrase_query = input("Enter a Phrase Query:")
phrase_query = preprocess_phrase_query(phrase_query)
phrase_query_related_documents = get_related_documents(phrase_query)

for doc, positions_list in phrase_query_related_documents:
    print(f"Document {doc}: in positions: {positions_list}")

Term frequency

In [9]:
terms = list(set([word for doc in stemmed_documents for word in doc]))
term_frequency = pd.DataFrame(index=terms, columns=['doc' + str(i) for i in range(1, 11)])
term_frequency = term_frequency.fillna(0)

for i, doc in enumerate(stemmed_documents, start=1):
    for word in doc:
        term_frequency.at[word, 'doc' + str(i)] += 1

term_frequency

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
merci,1,0,1,1,1,1,0,0,0,0
tread,0,0,0,0,0,0,1,1,1,1
brutu,1,1,0,1,0,0,0,0,0,0
angel,0,0,0,0,0,0,1,1,1,0
worser,1,0,1,1,1,0,0,0,0,0
caeser,1,1,0,1,1,1,0,0,0,0
cleopatra,1,0,0,0,0,0,0,0,0,0
rush,0,0,0,0,0,0,1,1,1,1
antoni,1,1,0,0,0,1,0,0,0,0
fool,0,0,0,0,0,0,1,1,1,1


Weighted Term frequency

In [10]:
Weighted_term_frequency = term_frequency.applymap(lambda x: math.log10(x) + 1 if x > 0 else 0)
Weighted_term_frequency

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
merci,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
tread,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
brutu,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
angel,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
worser,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
caeser,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rush,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
antoni,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
fool,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0


idf



In [11]:
total_documents = 10  
idf = []

for term, document_positions in positional_index.items():
    idf.append([document_positions[0],math.log10(total_documents / document_positions[0])])

idf = pd.DataFrame(idf, columns=['df', 'idf'], index=positional_index.keys())
idf

Unnamed: 0,df,idf
antoni,3,0.522879
brutu,3,0.522879
caeser,5,0.30103
cleopatra,1,1.0
merci,5,0.30103
worser,4,0.39794
calpurnia,1,1.0
angel,3,0.522879
fool,4,0.39794
fear,3,0.522879


TF.idf matrix 

In [12]:
tf_idf = pd.DataFrame(index=terms, columns=['doc' + str(i) for i in range(1, 11)])

for key in idf.index:
   tf_idf.loc[key] =  Weighted_term_frequency.loc[key].multiply(idf.loc[key]['idf'])

tf_idf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
merci,0.30103,0.0,0.30103,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
tread,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
brutu,0.522879,0.522879,0.0,0.522879,0.0,0.0,0.0,0.0,0.0,0.0
angel,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.522879,0.0
worser,0.39794,0.0,0.39794,0.39794,0.39794,0.0,0.0,0.0,0.0,0.0
caeser,0.30103,0.30103,0.0,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
antoni,0.522879,0.522879,0.0,0.0,0.0,0.522879,0.0,0.0,0.0,0.0
fool,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794


In [13]:
doc_length = tf_idf.applymap(lambda x: x ** 2).sum()**0.5
doc_length

doc1     1.373462
doc2     1.279618
doc3     0.498974
doc4     0.782941
doc5     0.582747
doc6     0.674270
doc7     1.223496
doc8     1.223496
doc9     1.106137
doc10    1.106137
dtype: float64

In [15]:
weighted_tf_idf = pd.DataFrame(index=terms, columns=['doc' + str(i) for i in range(1, 11)])

doc_index = 1
for key in tf_idf.columns:
   weighted_tf_idf[key] =  tf_idf[key].div(doc_length['doc'+str(doc_index)])
   doc_index+=1

weighted_tf_idf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
merci,0.219176,0.0,0.603298,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
tread,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
brutu,0.380701,0.408621,0.0,0.667839,0.0,0.0,0.0,0.0,0.0,0.0
angel,0.0,0.0,0.0,0.0,0.0,0.0,0.427365,0.427365,0.472707,0.0
worser,0.289735,0.0,0.797516,0.508263,0.682869,0.0,0.0,0.0,0.0,0.0
caeser,0.219176,0.23525,0.0,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
cleopatra,0.728087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
antoni,0.380701,0.408621,0.0,0.0,0.0,0.775474,0.0,0.0,0.0,0.0
fool,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756


In [16]:
query = input("Enter a Query: ")
query = preprocess_phrase_query(query)

related_documents = get_related_documents(query)
similarity = []
for doc in related_documents:
    query_df = pd.DataFrame(index=query[0], columns=['tf-raw','tf-weighted','idf','tf-idf','weighted_tf-idf','doc_product'])
    doc_data = []

    for word in query[0]:
        word_data = []
        word_data.append(term_frequency.loc[word]['doc'+str(doc[0])])
        word_data.append(Weighted_term_frequency.loc[word]['doc'+str(doc[0])])   
        word_data.append(idf.loc[word][1])
        word_data.append(tf_idf.loc[word]['doc'+str(doc[0])])  
        doc_data.append(word_data)

    query_length = 0
    for word in range(len(query[0])):
        query_length += (doc_data[word][-1]**2)
    query_length = query_length**0.5


    word_index = 0
    for word in query[0]:
        doc_data[word_index].append(doc_data[word_index][-1]/query_length)
        doc_data[word_index].append(doc_data[word_index][-1]*weighted_tf_idf.loc[word]['doc'+str(doc[0])])
        query_df.iloc[word_index] = doc_data[word_index]
        word_index+=1
    similarity.append([doc[0],query_df['doc_product'].sum()])
    print('Document:',doc[0],'\n',query_df,'\nQuery Length',query_length,'\tSimilarity:',query_df['doc_product'].sum(),"\n\n")

sorted_similarity = sorted(similarity, key=lambda x: x[1], reverse=True)

print("returned_docs: ", end='')
for doc in sorted_similarity:
    print('document',doc[0],' , ', end='')

Document: 1 
        tf-raw tf-weighted       idf    tf-idf weighted_tf-idf doc_product
antoni      1         1.0  0.522879  0.522879        0.707107    0.269196
brutu       1         1.0  0.522879  0.522879        0.707107    0.269196 
Query Length 0.7394622130520805 	Similarity: 0.5383927937463102 


Document: 2 
        tf-raw tf-weighted       idf    tf-idf weighted_tf-idf doc_product
antoni      1         1.0  0.522879  0.522879        0.707107    0.288939
brutu       1         1.0  0.522879  0.522879        0.707107    0.288939 
Query Length 0.7394622130520805 	Similarity: 0.5778771030041435 


returned_docs: document 2  , document 1  , 