Libraries :

In [27]:
import os
import re
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from natsort import natsorted
from bs4 import BeautifulSoup  # for HTML parsing
from nltk.stem import WordNetLemmatizer, PorterStemmer
from collections import defaultdict
from collections import Counter
import math
import numpy as np


Preprocessing :

In [28]:
def preprocessing(doc):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    stop_words.remove('in')
    stop_words.remove('to')
    stop_words.remove('where')
    
    # Remove HTML tags or markup
    doc = BeautifulSoup(doc, 'html.parser').get_text()

    # Convert to lowercase
    doc = doc.lower()

    # Remove special characters, keep only alphanumeric characters and spaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc)

    tokenized_doc = word_tokenize(doc)

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokenized_doc]

    # Stemming
    stemmed_tokens = [stemmer.stem(word) for word in lemmatized_tokens]

    terms = [word for word in stemmed_tokens if word not in stop_words]
    return terms

files_name = natsorted(os.listdir('files'))
document_of_terms = []

for files in files_name:
    with open(f'files/{files}', 'r') as f:
        document = f.read()
        document_terms = preprocessing(document)
        document_of_terms.append(document_terms)

# Print each document on an independent line
for document_terms in document_of_terms:
    print(document_terms)


['antoni', 'brutu', 'caeser', 'cleopatra', 'merci', 'worser']
['antoni', 'brutu', 'caeser', 'calpurnia']
['merci', 'worser']
['brutu', 'caeser', 'merci', 'worser']
['caeser', 'merci', 'worser']
['antoni', 'caeser', 'merci']
['angel', 'fool', 'fear', 'in', 'rush', 'to', 'tread', 'where']
['angel', 'fool', 'fear', 'in', 'rush', 'to', 'tread', 'where']
['angel', 'fool', 'in', 'rush', 'to', 'tread', 'where']
['fool', 'fear', 'in', 'rush', 'to', 'tread', 'where']


Positional index model:


First : Positional index

In [29]:
from collections import defaultdict

# Construct a positional index using defaultdict
positional_index = defaultdict(lambda: [0, {}])

for doc_id, terms in enumerate(document_of_terms, start=1):
    term_frequency = {}  # To store the term frequency for each document
    for position, term in enumerate(terms):
        positional_index[term][1].setdefault(doc_id, []).append(position)
        term_frequency[term] = term_frequency.get(term, 0) + 1

    # Update document frequency for the current term
    for term, freq in term_frequency.items():
        positional_index[term][0] += 1

# Print the positional index
print("Positional Index:")
for term, postings in positional_index.items():
    print(f"{term}: {postings}")


Positional Index:
antoni: [3, {1: [0], 2: [0], 6: [0]}]
brutu: [3, {1: [1], 2: [1], 4: [0]}]
caeser: [5, {1: [2], 2: [2], 4: [1], 5: [0], 6: [1]}]
cleopatra: [1, {1: [3]}]
merci: [5, {1: [4], 3: [0], 4: [2], 5: [1], 6: [2]}]
worser: [4, {1: [5], 3: [1], 4: [3], 5: [2]}]
calpurnia: [1, {2: [3]}]
angel: [3, {7: [0], 8: [0], 9: [0]}]
fool: [4, {7: [1], 8: [1], 9: [1], 10: [0]}]
fear: [3, {7: [2], 8: [2], 10: [1]}]
in: [4, {7: [3], 8: [3], 9: [2], 10: [2]}]
rush: [4, {7: [4], 8: [4], 9: [3], 10: [3]}]
to: [4, {7: [5], 8: [5], 9: [4], 10: [4]}]
tread: [4, {7: [6], 8: [6], 9: [5], 10: [5]}]
where: [4, {7: [7], 8: [7], 9: [6], 10: [6]}]


Second : Phrase query

In [30]:
def phrase_query(query, positional_index, document_of_terms):
    stemmer = PorterStemmer()

    stop_words = set(stopwords.words('english'))

    query_terms = [stemmer.stem(term.lower()) for term in word_tokenize(query) if term.lower() not in stop_words]

    if len(query_terms) == 1:  # Single-word query
        return [f"document {doc_id}" for doc_id in sorted(positional_index.get(query_terms[0], [0, {}])[1].keys())]

    result_docs = set(positional_index.get(query_terms[0], [0, {}])[1].keys())

    for term in query_terms[1:]:
        result_docs &= set(positional_index.get(term, [0, {}])[1].keys())

    matching_documents = [
        f"document {doc_id}"
        for doc_id in sorted(result_docs)
        if all(pos + 1 == next_pos for pos, next_pos in zip(*[positional_index[term][1].get(doc_id, []) for term in query_terms]))
    ]

    return matching_documents
# Example phrase query
example_query = "fools the fear"
result = phrase_query(example_query, positional_index, document_of_terms)
print(result)


['document 7', 'document 8', 'document 10']


Vector space model :

First : Term frequency

In [31]:
# Assuming 'all_terms' is a list of all unique terms across all documents
all_terms = list(set(term for document_terms in document_of_terms for term in document_terms))

# Create a DataFrame for term frequency (TF)
tf_data = {}
for i, document_terms in enumerate(document_of_terms, start=1):
    term_counts = Counter(document_terms)
    tf_data[f'doc{i}'] = [term_counts[term] for term in all_terms]

# Create a DataFrame from the TF data
tf_df = pd.DataFrame(tf_data, index=all_terms)

# Display the transposed TF DataFrame
print(tf_df)


           doc1  doc2  doc3  doc4  doc5  doc6  doc7  doc8  doc9  doc10
angel         0     0     0     0     0     0     1     1     1      0
brutu         1     1     0     1     0     0     0     0     0      0
merci         1     0     1     1     1     1     0     0     0      0
cleopatra     1     0     0     0     0     0     0     0     0      0
fool          0     0     0     0     0     0     1     1     1      1
calpurnia     0     1     0     0     0     0     0     0     0      0
in            0     0     0     0     0     0     1     1     1      1
rush          0     0     0     0     0     0     1     1     1      1
caeser        1     1     0     1     1     1     0     0     0      0
to            0     0     0     0     0     0     1     1     1      1
where         0     0     0     0     0     0     1     1     1      1
worser        1     0     1     1     1     0     0     0     0      0
tread         0     0     0     0     0     0     1     1     1      1
fear  

Weighted Tf (1+ log tf)

In [32]:
def weighted_tf(x):
    if x > 0:
        return math.log(x) + 1
    return 0

# Apply the weighted_tf function to the entire DataFrame
w_tf_df = tf_df.map(weighted_tf)

# Display the transposed weighted TF DataFrame
print(w_tf_df)

           doc1  doc2  doc3  doc4  doc5  doc6  doc7  doc8  doc9  doc10
angel       0.0   0.0   0.0   0.0   0.0   0.0   1.0   1.0   1.0    0.0
brutu       1.0   1.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0    0.0
merci       1.0   0.0   1.0   1.0   1.0   1.0   0.0   0.0   0.0    0.0
cleopatra   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0
fool        0.0   0.0   0.0   0.0   0.0   0.0   1.0   1.0   1.0    1.0
calpurnia   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0
in          0.0   0.0   0.0   0.0   0.0   0.0   1.0   1.0   1.0    1.0
rush        0.0   0.0   0.0   0.0   0.0   0.0   1.0   1.0   1.0    1.0
caeser      1.0   1.0   0.0   1.0   1.0   1.0   0.0   0.0   0.0    0.0
to          0.0   0.0   0.0   0.0   0.0   0.0   1.0   1.0   1.0    1.0
where       0.0   0.0   0.0   0.0   0.0   0.0   1.0   1.0   1.0    1.0
worser      1.0   0.0   1.0   1.0   1.0   0.0   0.0   0.0   0.0    0.0
tread       0.0   0.0   0.0   0.0   0.0   0.0   1.0   1.0   1.0    1.0
fear  

Second : IDF

In [33]:
# Calculate document frequency (DF) and Inverse Document Frequency (IDF) for each term
df_data = Counter(term for document_terms in document_of_terms for term in set(document_terms))
idf_data = {term: round(math.log10(len(document_of_terms) / df), 6) for term, df in df_data.items()}

# Create a DataFrame from the IDF data
idf_df = pd.DataFrame(list(idf_data.items()), columns=['Term', 'IDF'])

# Print headers
print(f"{'Term':<15}\t{'DF'}\t{'IDF':>8}")

# Print the DF and IDF values without the default index
for term, df in df_data.items():
    idf = idf_data[term]
    print(f"{term:<10}\t{df:0}\t{idf:10}")

Term           	DF	     IDF
brutu     	3	  0.522879
merci     	5	   0.30103
cleopatra 	1	       1.0
caeser    	5	   0.30103
worser    	4	   0.39794
antoni    	3	  0.522879
calpurnia 	1	       1.0
angel     	3	  0.522879
fool      	4	   0.39794
in        	4	   0.39794
rush      	4	   0.39794
where     	4	   0.39794
tread     	4	   0.39794
fear      	3	  0.522879
to        	4	   0.39794


Third : TF.IDF

In [34]:
tf_idf = w_tf_df.multiply(idf_data, axis=0)
tf_idf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
angel,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.522879,0.0
brutu,0.522879,0.522879,0.0,0.522879,0.0,0.0,0.0,0.0,0.0,0.0
merci,0.30103,0.0,0.30103,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fool,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
calpurnia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
in,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
caeser,0.30103,0.30103,0.0,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
to,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794


Document Length

In [35]:
doc_len = pd.DataFrame({'length': np.sqrt((tf_idf**2).sum())}).transpose()

doc_len

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
length,1.373463,1.279619,0.498974,0.782941,0.582747,0.67427,1.223496,1.223496,1.106137,1.106137


Normalized tf.idf

In [36]:
norm_tf_idf = tf_idf.div(doc_len.iloc[0])

norm_tf_idf

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
angel,0.0,0.0,0.0,0.0,0.0,0.0,0.427365,0.427365,0.472707,0.0
brutu,0.380701,0.408621,0.0,0.667839,0.0,0.0,0.0,0.0,0.0,0.0
merci,0.219176,0.0,0.603298,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
cleopatra,0.728087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fool,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
calpurnia,0.0,0.781483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
in,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
caeser,0.219176,0.23525,0.0,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
to,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756


Similarity between query and each document

In [37]:
def get_w_tf(x):
    if x > 0:
        return math.log10(x) + 1
    return 0

def insert_query(q):
    new_q = preprocessing(q)
    
    query = pd.DataFrame(index=norm_tf_idf.index)
    query['tf'] = [1 if x in new_q else 0 for x in list(norm_tf_idf.index)]
    query['w_tf'] = query['tf'].apply(lambda x : get_w_tf(x))
    product = norm_tf_idf.multiply(query['w_tf'], axis=0)
    query['idf'] = idf_df.set_index('Term').loc[query.index, 'IDF'].values * query['w_tf']
    query['tf_idf'] = query['w_tf'] * query['idf']
    query['normalized'] = 0

    for i in range(len(query)):
            query['normalized'] = query['normalized'].astype('float64')
            query.loc[new_q, 'normalized'] = query['idf'] / math.sqrt(sum(query['idf'].values**2))
    print('Query Details')
    print(query.loc[new_q])
    
    product2 = product.multiply(query['normalized'], axis=0)
    scores = {}
    column_mapping = {'document ' + str(i): 'doc' + str(i) for i in range(1, 11)}  # Adjust the range as needed

    for col in phrase_query(q, positional_index, document_of_terms):
       mapped_col = column_mapping.get(col)
       if mapped_col:
          scores[mapped_col] = product2[mapped_col].sum()

    product_result = product2[list(scores.keys())].loc[new_q]

    
    print('\nProduct (query*matched doc)')
    print(product_result)
    
    print('\nProduct sum')
    print(product_result.sum())
    
    print('\nQuery Length')
    q_len = math.sqrt((query['idf'].loc[new_q] ** 2).sum())
    print(q_len)
    
    print('\nCosine Similarity')
    print(product_result.sum())
    
    print('\nReturned docs')
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for typle in sorted_scores:
        print(typle[0], end=" ")
    
    
# Example usage
query = "antony brutus"
insert_query(query)


Query Details
        tf  w_tf       idf    tf_idf  normalized
antoni   1   1.0  0.522879  0.522879    0.707107
brutu    1   1.0  0.522879  0.522879    0.707107

Product (query*matched doc)
            doc1      doc2
antoni  0.269196  0.288939
brutu   0.269196  0.288939

Product sum
doc1    0.538393
doc2    0.577877
dtype: float64

Query Length
0.7394625732800816

Cosine Similarity
doc1    0.538393
doc2    0.577877
dtype: float64

Returned docs
doc2 doc1 