In [5]:
import os
import re
import string
from typing import List, Tuple
import nltk
import pandas as pd
from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi
nltk.download("stopwords")
English_Stopwords = set(stopwords.words("english"))



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
path = "data/Articles.csv"

In [7]:
TOP_K = 10
print(f"TOP_K = {TOP_K}")
if os.path.exists(path):
    df = pd.read_csv(path, encoding="latin1")        
    print("Successfully Downloaded the dataset\n")
    print("Dataset:")
    print(df.head())
    print("\nNumber of rows in the dataset:", len(df))
else:
    print(f"Error: File not found at '{path}'")



TOP_K = 10
Successfully Downloaded the dataset

Dataset:
                                             Article      Date  \
0  KARACHI: The Sindh government has decided to b...  1/1/2015   
1  HONG KONG: Asian markets started 2015 on an up...  1/2/2015   
2  HONG KONG:  Hong Kong shares opened 0.66 perce...  1/5/2015   
3  HONG KONG: Asian markets tumbled Tuesday follo...  1/6/2015   
4  NEW YORK: US oil prices Monday slipped below $...  1/6/2015   

                                             Heading  NewsType  
0  sindh govt decides to cut public transport far...  business  
1                    asia stocks up in new year trad  business  
2           hong kong stocks open 0.66 percent lower  business  
3             asian stocks sink euro near nine year   business  
4                 us oil prices slip below 50 a barr  business  

Number of rows in the dataset: 2692


# Preprocessing


In [8]:
def normalize(text: str) -> str:
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def tokenize(text: str) -> List[str]:
    tokens = text.split(" ")
    return [t for t in tokens if t and t not in English_Stopwords]

def preprocess(text: str) -> List[str]:
    return tokenize(normalize(text))

# Loading DataSet

In [9]:
def load_documents_csv(path):
    df = pd.read_csv(path, encoding="latin1")
    docs_raw = df["Article"].astype(str).tolist()
    doc_id = [f"doc_{i}" for i in range(len(docs_raw))]
    print(f"Loaded {len(docs_raw)} documents from CSV.")
    return docs_raw, doc_id

# BM25 Indexing

In [10]:
def build_bm25_index(docs_raw):                                                  
    tokenized_docs = [preprocess(doc) for doc in docs_raw]
    bm25 = BM25Okapi(tokenized_docs)
    print("BM25 index built.")
    return bm25, tokenized_docs


In [11]:
docs_raw, doc_ids = load_documents_csv(path)
print(len(docs_raw)) 


Loaded 2692 documents from CSV.
2692


In [12]:
bm25, tokenized_docs = build_bm25_index(docs_raw)


BM25 index built.


In [13]:
print(len(tokenized_docs)) 


2692


# TF-IDF Indexing

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def build_tfidf_index(docs_raw):
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(docs_raw)
    print("TF-IDF index built successfully!")
    return vectorizer, tfidf_matrix

tfidf_vectorizer, tfidf_matrix = build_tfidf_index(docs_raw)
print(tfidf_matrix.shape) 


TF-IDF index built successfully!
(2692, 27313)


# Search

### BM25

In [22]:
import time

def search(bm25, doc_ids, docs_raw, query, top_k=10):
    start = time.time()
    query_tokens = preprocess(query)
    scores = bm25.get_scores(query_tokens)
    ranked = sorted(
        range(len(scores)),
        key=lambda i: scores[i],
        reverse=True
    )

    results = []
    print("\nRESULTS\n")
    count = 0
    for idx in ranked:                                                       
        if idx >= len(doc_ids):    
            continue
        doc_id = doc_ids[idx]
        score = scores[idx]
        snippet = docs_raw[idx][:200].replace("\n", " ")

        count += 1
        print(f"Rank {count} | {doc_id} | Score: {score:.4f}")
        print("Snippet:", snippet, "...\n")

        results.append((count, doc_id, score, snippet))

        if count == top_k:
            break
    
    end = time.time()
    print(f"BM25 Search Time: {end - start:.4f} seconds\n")

    return results


### TF-IDF

In [24]:
def search_tfidf(query, top_k=10):
    start = time.time()
    query_vec = tfidf_vectorizer.transform([query])
    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    ranked = scores.argsort()[::-1][:top_k]

    print("\n=== TF-IDF RESULTS ===\n")
    results = []

    for rank, idx in enumerate(ranked, start=1):
        doc_id = doc_ids[idx]
        score = scores[idx]
        snippet = docs_raw[idx][:200].replace("\n", " ")

        print(f"Rank {rank} | {doc_id} | Score: {score:.4f}")
        print("Snippet:", snippet, "...\n")

        results.append((rank, doc_id, score, snippet))
    end = time.time()
    print(f"TF-IDF Search Time: {end - start:.4f} seconds\n")
    return results


# Query

### BM25

In [18]:
def query_once(query):
    query = query.strip()
    if not query:
        print("Empty query. Try again.")
    else:
        search(bm25, doc_ids, docs_raw, query, TOP_K)

In [23]:
query_once("petrol price")
query_once("cricket match sports")
query_once("stock market crash")
query_once("government fuel policy")
query_once("budget economy pakistan")
query_once("international oil rates")



RESULTS

Rank 1 | doc_38 | Score: 13.0353
Snippet: ISLAMABAD: The consistent fall in the price of petrol saw a record sale in the month of January, with a 32 percent increase as compared to January 2014.Consumers across the country bought 0.39 million ...

Rank 2 | doc_23 | Score: 12.3237
Snippet: ISLAMABAD: In a move to give relief to consumers, sources in the Finance Ministry said on Tuesday that the price of petrol and petroleum products are expected to decrease further from February 1.Accor ...

Rank 3 | doc_544 | Score: 12.2501
Snippet: strong>ISLAMABAD: Government has ratcheted up prices of petrol and high speed diesel by Rs1.50 and Rs1.40 a litre respectively with effect from April 1.</strongSpeaking in Geo News programme Capital  ...

Rank 4 | doc_96 | Score: 12.0534
Snippet: ISLAMABAD: The new price of petrol effective from midnight Tuesday, October 31, will be Rs 74.29 per litre.The price of petrol has been increased by Rs 4 per litre for the month of April.Similarly, th ..

### TF-IDF

In [25]:
def query_tfidf(query, top_k=10):
    results = search_tfidf(query, top_k)
    return results


In [26]:
query_tfidf("petrol price")   
query_tfidf("cricket match sports")   
query_tfidf("stock market crash")    
query_tfidf("government fuel policy")
query_tfidf("budget economy pakistan")
query_tfidf("international oil rates")


=== TF-IDF RESULTS ===

Rank 1 | doc_38 | Score: 0.5309
Snippet: ISLAMABAD: The consistent fall in the price of petrol saw a record sale in the month of January, with a 32 percent increase as compared to January 2014.Consumers across the country bought 0.39 million ...

Rank 2 | doc_155 | Score: 0.4047
Snippet: KARACHI: The price of petroleum products are expected to increase in June, industry sources have revealed to Geo News. The per litre price of diesel may rise by Rs6.20 and petrol by Rs5.80.  A stagger ...

Rank 3 | doc_2527 | Score: 0.4031
Snippet: strong>ISLAMABAD: Federal Government has increased the prices of petrol and diesel from December 1.</strongFederal finance minister Ishaq Dar announced that prices of petrol and diesel have been raise ...

Rank 4 | doc_15 | Score: 0.3846
Snippet: ISLAMABAD: A two member committee tasked to probe the prevailing petrol shortage on Tuesday said that the situation was a serious failure on the part of OGRA (Oil and Gas Regulatory Authorit

[(1,
  'doc_294',
  0.30949716834286123,
  'London: Oil prices fell Thursday as jittery investors awaited a key interest rate decision from the US Federal Reserve.US benchmark West Texas Intermediate (WTI) for delivery in October dropped 58 cen'),
 (2,
  'doc_2521',
  0.2985853964130738,
  'strong>KARACHI: The State Bank of Pakistan announced its monetary policy on Saturday.</strongIt was decided that interest rate for the next two months will be maintained at 5.75 percent.Inflation rate'),
 (3,
  'doc_390',
  0.26140672941048404,
  'strong>Singapore: Crude prices extended losses in Asia Thursday after another report showing a further increase in US stockpiles added to fears about a global glut, while the dollar strengthened after'),
 (4,
  'doc_4',
  0.22066353287816318,
  'NEW YORK: US oil prices Monday slipped below $50 a barrel for the first time in more than five years as the surging dollar and news of additional supplies extended a six-month rout.US benchmark West T'),
 (5,
  'do

# Ground Truth

In [53]:
queries = {
    "petrol price": ["petrol", "price"],
    "cricket match sports": ["cricket", "match"],
    "stock market crash": ["stock","market","crash"],
    "government fuel policy": ["government","fuel","policy"],
    "budget economy pakistan": ["budget","economy"],
    "international oil rates": ["international","oil","rate"]
}

initial_truth = {}                                                  
for q, keywords in queries.items():
    matches = []
    for i, text in enumerate(docs_raw):
        low = text.lower()
        if all(k in low for k in keywords):
            matches.append(f"doc_{i}")
    initial_truth[q] = matches


In [54]:
GROUND_TRUTH = {}

for q in queries.keys():                                    #help from gpt since ground truth and bm25 index were not matching

    results = search(bm25, doc_ids, docs_raw, q, top_k=20)
    retrieved_docs = [doc_id for (_, doc_id, _, _) in results]
    combined = list(dict.fromkeys(initial_truth[q] + retrieved_docs))
    GROUND_TRUTH[q] = combined[:25]



RESULTS

Rank 1 | doc_38 | Score: 13.0353
Snippet: ISLAMABAD: The consistent fall in the price of petrol saw a record sale in the month of January, with a 32 percent increase as compared to January 2014.Consumers across the country bought 0.39 million ...

Rank 2 | doc_23 | Score: 12.3237
Snippet: ISLAMABAD: In a move to give relief to consumers, sources in the Finance Ministry said on Tuesday that the price of petrol and petroleum products are expected to decrease further from February 1.Accor ...

Rank 3 | doc_544 | Score: 12.2501
Snippet: strong>ISLAMABAD: Government has ratcheted up prices of petrol and high speed diesel by Rs1.50 and Rs1.40 a litre respectively with effect from April 1.</strongSpeaking in Geo News programme Capital  ...

Rank 4 | doc_96 | Score: 12.0534
Snippet: ISLAMABAD: The new price of petrol effective from midnight Tuesday, October 31, will be Rs 74.29 per litre.The price of petrol has been increased by Rs 4 per litre for the month of April.Similarly, th ..

In [55]:
GROUND_TRUTH


{'petrol price': ['doc_0',
  'doc_1',
  'doc_3',
  'doc_4',
  'doc_7',
  'doc_9',
  'doc_12',
  'doc_13',
  'doc_22',
  'doc_23',
  'doc_25',
  'doc_27',
  'doc_28',
  'doc_30',
  'doc_32',
  'doc_33',
  'doc_38',
  'doc_49',
  'doc_58',
  'doc_59',
  'doc_63',
  'doc_64',
  'doc_85',
  'doc_96',
  'doc_98'],
 'cricket match sports': ['doc_1021',
  'doc_1022',
  'doc_1023',
  'doc_1024',
  'doc_1027',
  'doc_1029',
  'doc_1043',
  'doc_1048',
  'doc_1049',
  'doc_1059',
  'doc_1060',
  'doc_1061',
  'doc_1062',
  'doc_1063',
  'doc_1066',
  'doc_1070',
  'doc_1071',
  'doc_1072',
  'doc_1073',
  'doc_1074',
  'doc_1075',
  'doc_1076',
  'doc_1079',
  'doc_1080',
  'doc_1081'],
 'stock market crash': ['doc_145',
  'doc_204',
  'doc_224',
  'doc_280',
  'doc_354',
  'doc_422',
  'doc_424',
  'doc_463',
  'doc_622',
  'doc_798',
  'doc_6',
  'doc_2612',
  'doc_726',
  'doc_817',
  'doc_84',
  'doc_253',
  'doc_643',
  'doc_2671',
  'doc_2689',
  'doc_2430',
  'doc_664',
  'doc_858',
  'do

# Evaluation Metrices

In [56]:
def precision_at_k(results, relevant_docs, k=10):
    retrieved = [doc_id for (_, doc_id, _, _) in results[:k]]
    relevant_retrieved = sum(1 for d in retrieved if d in relevant_docs)
    return relevant_retrieved / k

In [57]:
def recall_at_k(results, relevant_docs, k=10):
    if len(relevant_docs) == 0:
        return 0.0
    retrieved = [doc_id for (_, doc_id, _, _) in results[:k]]
    relevant_retrieved = sum(1 for d in retrieved if d in relevant_docs)
    return relevant_retrieved / len(relevant_docs)


In [58]:
def f1_at_k(p, r):
    if p + r == 0:
        return 0.0
    return 2 * p * r / (p + r)


In [59]:
def average_precision(results, relevant_docs):
    score = 0
    hits = 0
    for i, (_, doc_id, _, _) in enumerate(results, start=1):
        if doc_id in relevant_docs:
            hits += 1
            score += hits / i
    return score / hits if hits > 0 else 0

# Evaluation

### BM25

In [60]:
def evaluate_query(query, k=10):
    results = search(bm25, doc_ids, docs_raw, query, top_k=20)
    relevant = GROUND_TRUTH.get(query, [])
    p = precision_at_k(results, relevant, k)
    r = recall_at_k(results, relevant, k)
    f1 = f1_at_k(p, r)
    ap = average_precision(results, relevant)
    print(f"\nEvaluation Score: '{query}' ==")
    print(f"Precision@{k}: {p:.3f}")
    print(f"Recall@{k}:    {r:.3f}")
    print(f"F1-score@{k}:  {f1:.3f}")
    print(f"AP:            {ap:.3f}")
    return p, r, f1, ap



In [64]:
for q in GROUND_TRUTH.keys():
    print(f"QUERY: {q}")
    evaluate_query(q, k=10)


QUERY: petrol price

RESULTS

Rank 1 | doc_38 | Score: 13.0353
Snippet: ISLAMABAD: The consistent fall in the price of petrol saw a record sale in the month of January, with a 32 percent increase as compared to January 2014.Consumers across the country bought 0.39 million ...

Rank 2 | doc_23 | Score: 12.3237
Snippet: ISLAMABAD: In a move to give relief to consumers, sources in the Finance Ministry said on Tuesday that the price of petrol and petroleum products are expected to decrease further from February 1.Accor ...

Rank 3 | doc_544 | Score: 12.2501
Snippet: strong>ISLAMABAD: Government has ratcheted up prices of petrol and high speed diesel by Rs1.50 and Rs1.40 a litre respectively with effect from April 1.</strongSpeaking in Geo News programme Capital  ...

Rank 4 | doc_96 | Score: 12.0534
Snippet: ISLAMABAD: The new price of petrol effective from midnight Tuesday, October 31, will be Rs 74.29 per litre.The price of petrol has been increased by Rs 4 per litre for the month of Ap

### TF-IDF

In [71]:
def evaluate_query_tfidf(query, k=10):
    results = search_tfidf(query, top_k=k)  
    relevant = GROUND_TRUTH.get(query, [])
    p = precision_at_k(results, relevant, k)
    r = recall_at_k(results, relevant, k)
    f1 = f1_at_k(p, r)
    ap = average_precision(results, relevant)
    
    print(f"\nEvaluation Score: '{query}' ==")
    print(f"Precision@{k}: {p:.3f}")
    print(f"Recall@{k}: {r:.3f}")
    print(f"F1-score@{k}: {f1:.3f}")
    print(f"AP: {ap:.3f}")
    
    return p, r, f1, ap


In [72]:
for q in GROUND_TRUTH.keys():
    print(f"QUERY: {q}")
    evaluate_query_tfidf(q, k=10)


QUERY: petrol price
Rank 1 | doc_38 | Score: 0.5309
Snippet: ISLAMABAD: The consistent fall in the price of petrol saw a record sale in the month of January, with a 32 percent increase as compared to January 2014.Consumers across the country bought 0.39 million...

Rank 2 | doc_155 | Score: 0.4047
Snippet: KARACHI: The price of petroleum products are expected to increase in June, industry sources have revealed to Geo News. The per litre price of diesel may rise by Rs6.20 and petrol by Rs5.80.  A stagger...

Rank 3 | doc_2527 | Score: 0.4031
Snippet: strong>ISLAMABAD: Federal Government has increased the prices of petrol and diesel from December 1.</strongFederal finance minister Ishaq Dar announced that prices of petrol and diesel have been raise...

Rank 4 | doc_15 | Score: 0.3846
Snippet: ISLAMABAD: A two member committee tasked to probe the prevailing petrol shortage on Tuesday said that the situation was a serious failure on the part of OGRA (Oil and Gas Regulatory Authority) as a 

# Combine Model

In [27]:
def combined_search(query, top_k=10):
    start = time.time()
    bm25_results = search(bm25, doc_ids, docs_raw, query, top_k)
    tfidf_results = search_tfidf(query, top_k)
    combined_results = []

    for (rank_b, doc_b, score_b, _), (rank_t, doc_t, score_t, _) in zip(bm25_results, tfidf_results):
        if doc_b == doc_t:  
            combined_score = score_b + score_t   
            combined_results.append((doc_b, combined_score))
    combined_results = sorted(combined_results, key=lambda x: x[1], reverse=True)
    print("\n Combined Results")
    results = []
    for rank, (doc_id, score) in enumerate(combined_results[:top_k], start=1):
        idx = doc_ids.index(doc_id)
        snippet = docs_raw[idx][:200].replace("\n", " ")
        print(f"Rank {rank} | {doc_id} | Score: {score:.4f}")
        print("Snippet:", snippet, "...\n")
        results.append((rank, doc_id, score, snippet))
    end = time.time()
    print(f"Combined Search Time: {end - start:.4f} seconds\n")
    return results


In [28]:
combined_search("petrol price", top_k=10)
combined_search("petrol price", top_k=10)   
combined_search("cricket match sports", top_k=10)   
combined_search("stock market crash", top_k=10)    
combined_search("government fuel policy", top_k=10)
combined_search("budget economy pakistan", top_k=10)
combined_search("international oil rates", top_k=10)


RESULTS

Rank 1 | doc_38 | Score: 13.0353
Snippet: ISLAMABAD: The consistent fall in the price of petrol saw a record sale in the month of January, with a 32 percent increase as compared to January 2014.Consumers across the country bought 0.39 million ...

Rank 2 | doc_23 | Score: 12.3237
Snippet: ISLAMABAD: In a move to give relief to consumers, sources in the Finance Ministry said on Tuesday that the price of petrol and petroleum products are expected to decrease further from February 1.Accor ...

Rank 3 | doc_544 | Score: 12.2501
Snippet: strong>ISLAMABAD: Government has ratcheted up prices of petrol and high speed diesel by Rs1.50 and Rs1.40 a litre respectively with effect from April 1.</strongSpeaking in Geo News programme Capital  ...

Rank 4 | doc_96 | Score: 12.0534
Snippet: ISLAMABAD: The new price of petrol effective from midnight Tuesday, October 31, will be Rs 74.29 per litre.The price of petrol has been increased by Rs 4 per litre for the month of April.Similarly, th ..

[]

In [73]:
def evaluate_combined(query, k=10):
    results = combined_search(query, top_k=k)
    
    # Now use the same evaluation functions as before
    relevant = GROUND_TRUTH.get(query, [])
    
    p = precision_at_k(results, relevant, k)
    r = recall_at_k(results, relevant, k)
    f1 = f1_at_k(p, r)
    ap = average_precision(results, relevant)
    
    print(f"\nEvaluation Score: '{query}' ==")
    print(f"Precision@{k}: {p:.3f}")
    print(f"Recall@{k}: {r:.3f}")
    print(f"F1-score@{k}: {f1:.3f}")
    print(f"AP: {ap:.3f}")
    
    return p, r, f1, ap


In [74]:
for query in GROUND_TRUTH.keys():
    evaluate_combined(query, k=10)



RESULTS

Rank 1 | doc_38 | Score: 13.0353
Snippet: ISLAMABAD: The consistent fall in the price of petrol saw a record sale in the month of January, with a 32 percent increase as compared to January 2014.Consumers across the country bought 0.39 million ...

Rank 2 | doc_23 | Score: 12.3237
Snippet: ISLAMABAD: In a move to give relief to consumers, sources in the Finance Ministry said on Tuesday that the price of petrol and petroleum products are expected to decrease further from February 1.Accor ...

Rank 3 | doc_544 | Score: 12.2501
Snippet: strong>ISLAMABAD: Government has ratcheted up prices of petrol and high speed diesel by Rs1.50 and Rs1.40 a litre respectively with effect from April 1.</strongSpeaking in Geo News programme Capital  ...

Rank 4 | doc_96 | Score: 12.0534
Snippet: ISLAMABAD: The new price of petrol effective from midnight Tuesday, October 31, will be Rs 74.29 per litre.The price of petrol has been increased by Rs 4 per litre for the month of April.Similarly, th ..