In [41]:
import os
import math
import numpy as np
import pandas as pd
from collections import defaultdict

In [None]:
import nltk
nltk.download('all')

In [43]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Task-1 Read the Dataset and Perform Text Preprocessing

In [44]:
folder_path = '/content/drive/MyDrive/CSE 419/Lab Assign-4/1000_documents'


In [45]:
documents = []
for filename in os.listdir(folder_path):
  with open(os.path.join(folder_path,filename),'r') as file:
    documents.append(file.read())

In [46]:
len(documents)

1000

In [47]:
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return tokens


In [48]:
documents = [preprocess_text(doc) for doc in documents]

### Task-2 Calculate TF, IDF, and Create the TF-IDF Matrix

In [49]:
def calculate_tf(documents):
  tf_list = []
  for doc in documents:
    tf = defaultdict(int)
    for token in doc:
      tf[token] +=1

    total_terms = len(doc)
    tf = {term: count / total_terms for term, count in tf.items()}
    tf_list.append(tf)
  return tf_list

In [50]:
def calculate_idf(documents, tf_list):
    N = len(documents)
    df = defaultdict(int)
    for tf in tf_list:
        for term in tf.keys():
            df[term] += 1

    idf = {term: math.log(N / df[term]) if df[term] > 0 else 0 for term in df}
    return idf


In [51]:
def create_tf_idf_matrix(tf_list,idf):
  tf_idf_matrix = []
  for tf in tf_list:
    tf_idf = {term: (tf[term] * idf[term]) for term in tf.keys()}
    tf_idf_matrix.append(tf_idf)
  return tf_idf_matrix

In [52]:
tf_list = calculate_tf(documents)
idf = calculate_idf(documents, tf_list)
tf_idf_matrix = create_tf_idf_matrix(tf_list, idf)

In [53]:
tf_idf_matrix

[{'saab': 0.14579725744098984,
  'build': 0.0276823034893423,
  'cadillacs': 0.040514693718370305,
  'sweden': 0.06485115292073515,
  'general': 0.0059161470696785425,
  'motors': 0.014550865483656373,
  'world': 0.0037965606234430112,
  'largest': 0.009748493667231751,
  'car': 0.10324987404355811,
  'maker': 0.023897606656457012,
  'confirmed': 0.009513764319215514,
  'new': 0.009135997353504658,
  'cadillac': 0.06814243976907951,
  'bls': 0.020257346859185153,
  'factory': 0.07501460131604226,
  'unveiled': 0.022385412467001885,
  'geneva': 0.014550865483656373,
  'motor': 0.013813873025354598,
  'show': 0.007160842112397516,
  'intended': 0.010283160989208158,
  'compete': 0.009513764319215514,
  'luxury': 0.0776879379259243,
  'market': 0.034826331170511454,
  'sold': 0.010003658995647088,
  'us': 0.04088934113833299,
  'said': 0.009057951764227544,
  'gm': 0.12432485722819138,
  'europe': 0.02128257104912021,
  'president': 0.008154313471956146,
  'forster': 0.017035609942269877,

### Task-3 Implementation of Okapi BM25 Ranking Function

In [54]:
def bm25_score(query, tf_idf_matrix, documents, k1=1.5, b=0.75):
    avgdl = sum(len(doc) for doc in documents) / len(documents)
    scores = []

    for doc, tf_idf in zip(documents, tf_idf_matrix):
        score = 0.0
        doc_length = len(doc)

        for term in query:
            if term in tf_idf:
                f_q_d = tf_idf[term]  # Get TF-IDF value for term
                idf_q = idf.get(term, 0)
                score += idf_q * (f_q_d * (k1 + 1)) / (f_q_d + k1 * (1 - b + b * (doc_length / avgdl)))

        scores.append(score)

    return scores


In [55]:
query_input  = input("Enter a query: ")
query = [keyword.strip() for keyword in query_input.split(',')]

Enter a query: health, exercise


In [56]:
scores = bm25_score(query, tf_idf_matrix, documents)

In [57]:
top_indices = np.argsort(scores)[-10:][::-1]

In [58]:
print(f"\nTop documents for query {query}:")
for index in top_indices:
    print(f"Document {index}: Score {scores[index]:.4f}")


Top documents for query ['health', 'exercise']:
Document 580: Score 1.2741
Document 525: Score 1.0903
Document 591: Score 0.9484
Document 581: Score 0.9080
Document 511: Score 0.7806
Document 518: Score 0.6956
Document 555: Score 0.6543
Document 535: Score 0.5406
Document 926: Score 0.5309
Document 237: Score 0.4331


In [59]:
queries = []

while True:
    query_input = input("Enter a query (separate keywords with commas) or type 'exit' to finish: ")
    if query_input.lower() == 'exit':
        break
    queries.append([keyword.strip() for keyword in query_input.split(',')])

Enter a query (separate keywords with commas) or type 'exit' to finish: machine learning, python
Enter a query (separate keywords with commas) or type 'exit' to finish: startup, profit, business
Enter a query (separate keywords with commas) or type 'exit' to finish: food, restriction
Enter a query (separate keywords with commas) or type 'exit' to finish: games, online
Enter a query (separate keywords with commas) or type 'exit' to finish: sports, Cricket 
Enter a query (separate keywords with commas) or type 'exit' to finish: exit


In [60]:
for query in queries:
    scores = bm25_score(query, tf_idf_matrix, documents)
    top_indices = np.argsort(scores)[-10:][::-1]  # Get indices of top 10 documents
    print(f"\nTop documents for query {query}:")
    for index in top_indices:
        print(f"Document {index}: Score {scores[index]:.4f}")


Top documents for query ['machine learning', 'python']:
Document 143: Score 1.0655
Document 999: Score 0.0000
Document 329: Score 0.0000
Document 341: Score 0.0000
Document 340: Score 0.0000
Document 339: Score 0.0000
Document 338: Score 0.0000
Document 337: Score 0.0000
Document 336: Score 0.0000
Document 335: Score 0.0000

Top documents for query ['startup', 'profit', 'business']:
Document 951: Score 4.3865
Document 9: Score 1.3438
Document 367: Score 0.5718
Document 327: Score 0.4992
Document 20: Score 0.4821
Document 1: Score 0.4579
Document 54: Score 0.4306
Document 76: Score 0.3731
Document 3: Score 0.3556
Document 65: Score 0.3212

Top documents for query ['food', 'restriction']:
Document 281: Score 1.6681
Document 811: Score 1.3239
Document 237: Score 1.1420
Document 754: Score 1.1306
Document 286: Score 0.8605
Document 256: Score 0.8549
Document 275: Score 0.7196
Document 282: Score 0.6264
Document 788: Score 0.5880
Document 270: Score 0.5880

Top documents for query ['games'