In [1]:
# =========================
# Step 0: Import required libraries
# =========================
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from collections import Counter

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Kusum
[nltk_data]     Kunwar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Kusum
[nltk_data]     Kunwar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
# =========================
# Step 1: Load dataset
# =========================
file_path = r"C:\Users\Kusum Kunwar\Desktop\IR_Dataset\Reviews.csv"
df = pd.read_csv(file_path)
df.head(10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [24]:
print(f"\n{df.shape[0]} rows × {df.shape[1]} columns")


568454 rows × 10 columns


In [26]:
# Sample for faster processing
df = df.sample(10000, random_state=42).reset_index(drop=True)
print("Total sampled reviews:", df.shape[0])


Total sampled reviews: 10000


In [5]:
# =========================
# Step 2: Preprocessing
# =========================
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Lowercase, remove punctuation/numbers, tokenize, remove stopwords, lemmatize."""
    text = str(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join([c for c in text if not c.isdigit()])
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens

# Apply preprocessing
docs_tokenized = df['Text'].apply(preprocess).tolist()
print("Preprocessing complete!")

Preprocessing complete!


In [6]:
# =========================
# Step 3: BM25 Indexing
# =========================
bm25 = BM25Okapi(docs_tokenized)
print("BM25 Indexing complete!")

BM25 Indexing complete!


In [27]:
# =========================
# Step 4: RM3-style Query Expansion
# =========================
def expand_query_rm3(query, top_docs=5, expansion_terms=5):
    """
    Expand query using RM3 pseudo-relevance feedback:
    - Get top_docs retrieved documents for the query
    - Count term frequencies (excluding stopwords)
    - Add top expansion_terms to query
    """
    original_tokens = preprocess(query)
    scores = bm25.get_scores(original_tokens)
    top_indices = np.argsort(scores)[::-1][:top_docs]

    candidate_terms = Counter()
    for idx in top_indices:
        for term in docs_tokenized[idx]:
            if term not in stop_words and term not in original_tokens:
                candidate_terms[term] += 1

    expansion_tokens = [term for term, _ in candidate_terms.most_common(expansion_terms)]
    expanded_query = original_tokens + expansion_tokens
    return list(dict.fromkeys(expanded_query))


In [8]:
# =========================
# Step 5: Load queries from file
# =========================
queries_path = r"C:\Users\Kusum Kunwar\Desktop\IR_Dataset\Queries.txt"
with open(queries_path, 'r', encoding='utf-8') as f:
    queries = [line.strip() for line in f if line.strip() != ""]

print(f"Loaded {len(queries)} queries from file.")

Loaded 13 queries from file.


In [29]:
# =========================
# Step 6: Search function
# =========================
def search_bm25(query, top_n=5, top_docs=5, expansion_terms=5):
    """
    Search BM25 with RM3 query expansion
    """
    expanded_query = expand_query_rm3(query, top_docs=top_docs, expansion_terms=expansion_terms)
    
    print("Original query:", query)
    print("Expanded query terms:", expanded_query)

    scores = bm25.get_scores(expanded_query)
    top_indices = np.argsort(scores)[::-1][:top_n]

    results = []
    for rank, idx in enumerate(top_indices, start=1):
        text_preview = df.loc[idx, 'Text']
        # Truncate text for display
        if len(text_preview) > 100:
            text_preview = text_preview[:100] + "..."
        results.append({
            "Rank": rank,
            "Doc ID": idx,
            "BM25 Score": round(scores[idx], 4),
            "Text": text_preview
        })

    results_df = pd.DataFrame(results)
    display(results_df)


In [14]:
# =========================
# Run retrieval for all queries
# =========================
for i, q in enumerate(queries, start=1):
    print("\n==============================")
    print(f"Query {i}")
    search_bm25(q, top_n=4)



Query 1
Original query: organic chocolate
Expanded query terms: ['organic', 'chocolate', 'dark', 'bar', 'taste', 'br', 'soy']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,172,21.4069,I admit that I am not a big dark chocolate fan...
1,2,8106,19.3196,My #1 favorite bar is the Macrobar peanut butt...
2,3,3729,15.7929,I love Next Organic Dark Chocolate Nutes! They...
3,4,5553,15.4857,Santanders 53% semi-sweet dark chocolate bar i...



Query 2
Original query: gluten free bread
Expanded query terms: ['gluten', 'free', 'bread', 'substitute', 'milk', 'powder', 'also', 'love']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,3036,38.5055,This is the secret to gluten free bread making...
1,2,8731,25.3716,I love Pamela's Products wheat free bread. It ...
2,3,3663,16.6427,All of The Pure Pantry mixes are fabulous. Gl...
3,4,8524,15.6816,It is really hard to find a good gluten free b...



Query 3
Original query: crunchy cookies
Expanded query terms: ['crunchy', 'cooky', 'like', 'chip', 'usually', 'great', 'would']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,1979,22.4108,"These are all natural, raw, and crunchy! I usu..."
1,2,2530,17.8344,These cookies are so crunchy and full of mini ...
2,3,3311,17.6587,I feel like I have tried all of the gluten-fre...
3,4,3027,15.9241,Great cookies. These are very crunchy. Not a...



Query 4
Original query: sugar free beverage
Expanded query terms: ['sugar', 'free', 'beverage', 'taste', 'good', 'make', 'drink', 'soda']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,4226,22.5233,"Good, but not a replacement for what I usually..."
1,2,9005,18.5479,I like this 'Orange Tangerine' more than the o...
2,3,7103,18.0723,I don't eat sugar so this soda was a godsend f...
3,4,5117,17.9991,Didnt taste flat as I expected it to. Good fla...



Query 5
Original query: spicy snacks
Expanded query terms: ['spicy', 'snack', 'flavor', 'salty', 'right', 'recommend', 'satisfies']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,1182,25.5365,I love the Snapea Crisp snacks. The Caesar fla...
1,2,1007,25.2843,Great snack when your trying to eat healthy bu...
2,3,1589,18.9462,My new favorite! Lots of spicy crunch that fu...
3,4,6864,15.8128,Fantastic bold citrusy flavor. Not very spicy...



Query 6
Original query: dark chocolate bar
Expanded query terms: ['dark', 'chocolate', 'bar', 'taste', 'br', 'best', 'gram', 'like']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,3964,23.3701,Kellogg's Special K Dark Chocolate granola bar...
1,2,172,19.1562,I admit that I am not a big dark chocolate fan...
2,3,5553,19.0189,Santanders 53% semi-sweet dark chocolate bar i...
3,4,2258,15.7995,"Being lactose intolerant, I was happy to find ..."



Query 7
Original query: healthy cereal
Expanded query terms: ['healthy', 'cereal', 'sugar', 'breakfast', 'love', 'fiber', 'like']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,5358,22.5172,I and my kids all enjoyed this cereal. It's n...
1,2,3972,19.6845,This is my new favorite cereal. I had given up...
2,3,5174,19.4122,"Another great, nutritious breakfast cereal pro..."
3,4,2100,19.2757,"I love hot cereal for breakfast, and I've alwa..."



Query 8
Original query: low fat yogurt
Expanded query terms: ['low', 'fat', 'yogurt', 'good', 'free', 'pudding', 'excellent', 'snack']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,2321,34.0385,The Jello Fat Free puddings are and excellent ...
1,2,1211,18.5008,This granola is delicious and has no added fat...
2,3,4660,17.6475,"These are great. Low fat, low calorie, low in ..."
3,4,2197,17.1133,This is a healthy low fat snack for dogs and a...



Query 9
Original query: protein bars
Expanded query terms: ['protein', 'bar', 'taste', 'g', 'snack', 'meal', 'better']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,711,26.9938,"I was looking for a healthy, great tasting sna..."
1,2,8459,24.6439,I purchased these bars as a meal replacement a...
2,3,5211,21.7966,I love this protein snack bar. Tastes more lik...
3,4,7590,19.8366,Do not buy these if you were hooked on these p...



Query 10
Original query: sour candy
Expanded query terms: ['sour', 'candy', 'like', 'good', 'br', 'sweet', 'pretty']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,5132,23.4291,"I love sweet and sour candies, so I decided to..."
1,2,1571,20.4417,These are my all time favorite sour candy. The...
2,3,1015,16.7062,"These are great! I feel like Im eating candy,..."
3,4,612,16.563,I don't generally crave candy or sweets that o...



Query 11
Original query: nut free snacks
Expanded query terms: ['nut', 'free', 'snack', 'one', 'love', 'great', 'flavor', 'cracker']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,2876,21.8699,This is a great snack/animal cracker for young...
1,2,9068,19.2598,"Especially for people wanting to avoid wheat, ..."
2,3,5391,17.3456,If you are looking for a great gluten free sna...
3,4,2907,17.1608,My two crunchy snacks of choice for some time ...



Query 12
Original query: dairy free milk
Expanded query terms: ['dairy', 'free', 'milk', 'gluten', 'bread', 'taste', 'good', 'powder']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,3036,37.5861,This is the secret to gluten free bread making...
1,2,9392,20.7193,these are good but they contain dairy! Amazon ...
2,3,1656,20.5057,These are the best gluten and dairy free cooki...
3,4,1534,19.2123,My son is Gluten free and Casein Dairy free an...



Query 13
Original query: coffee beans
Expanded query terms: ['coffee', 'bean', 'direct', 'time', 'bag', 'lavazza', 'ordering']


Unnamed: 0,Rank,Doc ID,BM25 Score,Text
0,1,3532,28.5775,We have been ordering this coffee for over a y...
1,2,1548,24.8599,This coffee is an excellent buy. I only use i...
2,3,2274,22.8332,Lavazza Italian Coffee is the best coffee I ha...
3,4,3334,16.7704,I have bough Larry's coffee in the past and it...
