In [2]:
!pip install numpy
!pip install rank-bm25
!pip install nltk
!pip install ipywidgets


Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.19.1


In [3]:
# BM25 Model with UI and Evaluation Technique
# Ensure these third-party packages are installed: nltk, numpy, rank-bm25, ipywidgets
# (json and zipfile are part of the Python standard library and need no installation)
import json
import zipfile
import numpy as np
from rank_bm25 import BM25Okapi
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import ipywidgets as widgets
from IPython.display import display

# Download necessary NLTK data.
# Recent NLTK releases (3.9 and later) make word_tokenize load the 'punkt_tab'
# resource instead of the pickled 'punkt' models, so both are requested here:
# 'punkt' keeps older NLTK installations working, 'punkt_tab' covers newer ones.
# nltk.download() reports a resource it cannot fetch instead of raising.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# Shared (lemmatizer, stemmer, stop_words) triple, built on first use so that
# tokenizing a whole corpus does not re-read the stop-word list for every document.
_TOKENIZER_RESOURCES = None


def _get_tokenizer_resources():
    """Build the lemmatizer, stemmer and English stop-word set once and reuse them."""
    global _TOKENIZER_RESOURCES
    if _TOKENIZER_RESOURCES is None:
        _TOKENIZER_RESOURCES = (
            WordNetLemmatizer(),
            PorterStemmer(),
            frozenset(stopwords.words('english')),
        )
    return _TOKENIZER_RESOURCES


# Custom tokenizer function with stop words handling
def custom_tokenizer(text):
    """Turn raw text into a list of normalized tokens.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphabetic or that are English stop words are dropped; each
    remaining token is lemmatized and then stemmed.

    Args:
        text: The string to tokenize.

    Returns:
        A list of processed token strings, in their order of appearance.
    """
    lemmatizer, stemmer, stop_words = _get_tokenizer_resources()
    return [
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

# Function to read article texts and titles from a zipped JSON-lines file
def preprocess_documents(file_path):
    """Load article texts and titles from the first file inside a zip archive.

    The file is read line by line; every non-blank line is parsed as a JSON
    object from which the 'text' and 'title' entries are taken.

    Args:
        file_path: Path of the zip archive to read.

    Returns:
        A ``(documents, articles_titles)`` tuple of two parallel lists: the
        article texts and the article titles, in file order.

    Raises:
        IndexError: If the archive contains no file entries.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no 'text' or 'title' entry.
    """
    documents = []
    articles_titles = []
    with zipfile.ZipFile(file_path, 'r') as archive:
        # Directory entries end with '/' and hold no data; skip them so the
        # first real file is opened even when a folder entry precedes it.
        member_names = [name for name in archive.namelist() if not name.endswith('/')]
        with archive.open(member_names[0]) as article_lines:
            for raw_line in article_lines:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # are not JSON records and would make json.loads fail.
                if not raw_line.strip():
                    continue
                article = json.loads(raw_line)
                documents.append(article['text'])
                articles_titles.append(article['title'])
    return documents, articles_titles

# Read the article texts and titles out of the archive
file_path = 'wiki-articles.zip'
documents, articles_titles = preprocess_documents(file_path)

# Convert every article text into its list of processed tokens
tokenized_docs = list(map(custom_tokenizer, documents))

# Build the BM25 index over the tokenized corpus
bm25_model = BM25Okapi(tokenized_docs)

# Define the BM25 search function
def search_bm25(query, top_n=5):
    """Return the titles of the articles ranked highest by BM25 for a query.

    Args:
        query: Free-text query. It is tokenized with ``custom_tokenizer``, the
            same function used to tokenize the indexed documents.
        top_n: Maximum number of titles to return.

    Returns:
        A list of titles taken from ``articles_titles``, best score first.
        The list is empty when ``top_n`` is not positive or when no token of
        the query survives tokenization (for example a query made only of
        stop words or punctuation).
    """
    # A negative top_n would otherwise slice from the end of the ranking
    # (e.g. [:-1] returns everything but the last entry).
    if top_n <= 0:
        return []
    query_tokens = custom_tokenizer(query)
    # Without any query token there is nothing to score, so the ordering of
    # the documents would be arbitrary; report "no results" instead.
    if not query_tokens:
        return []
    doc_scores = bm25_model.get_scores(query_tokens)
    # argsort is ascending, so reverse it to put the best scores first.
    top_doc_indices = np.argsort(doc_scores)[::-1][:top_n]
    return [articles_titles[idx] for idx in top_doc_indices]

# --- Controls of the BM25 search UI ---

# Free-text box holding the search query
bm25_query_input = widgets.Text(
    description='BM25 Query:',
    value='Python Programming',
    disabled=False,
)

# Slider selecting how many top titles to return (1 to 20, initially 5)
bm25_top_n_slider = widgets.IntSlider(
    description='BM25 Top N:',
    value=5, min=1, max=20, step=1,
    orientation='horizontal',
    readout=True, readout_format='d',
    continuous_update=False,
    disabled=False,
)

# Button that triggers the BM25 search
bm25_search_button = widgets.Button(
    description='BM25 Search',
    tooltip='Click to search',
    icon='search',  # FontAwesome name, written without the `fa-` prefix
    button_style='',  # one of 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
)

# Output area in which the BM25 search results are printed
bm25_output = widgets.Output()

def on_bm25_search_button_clicked(b):
    """Click handler: run a BM25 search and print the resulting titles.

    The query text and the number of results are read from the input widgets
    at click time. Everything printed goes into ``bm25_output``.

    Args:
        b: The button instance passed by ``on_click``; not used.
    """
    with bm25_output:
        # Drop whatever the previous search printed
        bm25_output.clear_output()
        found_titles = search_bm25(
            bm25_query_input.value,
            top_n=bm25_top_n_slider.value,
        )
        for found_title in found_titles:
            print(found_title)

# Register the click handler on the search button
bm25_search_button.on_click(on_bm25_search_button_clicked)

# Stack the BM25 controls vertically and render them
bm25_widgets = widgets.VBox(
    children=[bm25_query_input, bm25_top_n_slider, bm25_search_button, bm25_output]
)
display(bm25_widgets)





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


VBox(children=(Text(value='Python Programming', description='BM25 Query:'), IntSlider(value=5, continuous_upda…

In [4]:
# Evaluation Technique (Mean Average Precision)
def compute_precision_at_k(relevance, k):
    """Average precision (AP) over the first ``k`` entries of a ranked list.

    Despite its name, this does not return a single precision@k value: for
    every position in the top ``k`` whose label equals 1, the precision at
    that position is accumulated, and the total is divided by the number of
    such positions.

    Args:
        relevance: Ranked sequence of relevance labels; an entry counts as
            relevant when it equals 1.
        k: Number of leading entries to consider.

    Returns:
        The average precision, or 0 when none of the first ``k`` entries is
        relevant.
    """
    hits = 0
    running_total = 0
    for rank, label in enumerate(relevance[:k], start=1):
        if label != 1:
            continue
        hits += 1
        # Precision at this rank: relevant entries seen so far / entries seen
        running_total += hits / rank
    if not hits:
        return 0
    return running_total / hits

def compute_map(queries, relevance_judgments, top_n=5):
    """Mean Average Precision (MAP) over a set of queries.

    For each query its ranked relevance labels are looked up in
    ``relevance_judgments`` and scored with ``compute_precision_at_k`` on the
    first ``top_n`` entries; the per-query scores are then averaged.

    Args:
        queries: Iterable of query keys.
        relevance_judgments: Mapping from query key to its ranked list of
            relevance labels. A query missing from the mapping is scored
            against an empty list.
        top_n: Number of leading labels considered per query.

    Returns:
        The mean of the per-query scores, or 0.0 when ``queries`` is empty.
    """
    ap_scores = [
        compute_precision_at_k(relevance_judgments.get(query, []), top_n)
        for query in queries
    ]
    # np.mean([]) returns nan and emits a RuntimeWarning; with no queries to
    # evaluate, report a score of 0.0 instead.
    if not ap_scores:
        return 0.0
    return np.mean(ap_scores)

# Example relevance labels per query (1 = relevant, 0 = not relevant).
# The values are hard-coded in this cell; nothing here computes them from search_bm25.
bm25_relevance_judgments = {
    "python programming": [1, 1, 0, 0, 1],
    "world war":          [1, 0, 0, 1, 1],
    "planet":             [1, 1, 1, 1, 1],
}

# Score every query that has judgments, looking at the first five labels of each
bm25_queries = list(bm25_relevance_judgments)
bm25_map = compute_map(bm25_queries, bm25_relevance_judgments, top_n=5)

print("BM25 MAP:", bm25_map)

BM25 MAP: 0.8555555555555556
