## Importing Packages

In [88]:
!pip install wikipedia rank_bm25



In [89]:
import wikipedia
import pandas as pd
import json
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from collections import defaultdict
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score
from rank_bm25 import BM25Okapi


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dimit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dimit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dimit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Collection and save of articles
    Collecting and storing wikipedia articles about AI, Computer Science and more

In [91]:
def collect_wikipedia_articles(topics, num_articles=10):
   
    articles = []

    for topic in topics:
        try:
            # Searching of articles based on the topics
            search_results = wikipedia.search(topic, results=num_articles)

            for title in search_results:
                try:
                    page = wikipedia.page(title)
                    articles.append({
                        'title': page.title,
                        'content': page.content,
                        'url': page.url
                    })
                except:
                    continue

        except:
            continue

    return articles
topics = ['Artificial Intelligence', 'Machine Learning', 'Neural Networks', 'Applications of Artificial Intelligence', 'AI in Healthcare', 'Data Engineering', 'Data Science']
collected_articles = collect_wikipedia_articles(topics)

# Json formatting
with open('wikipedia_articles.json', 'w', encoding='utf-8') as f:
    json.dump(collected_articles, f, ensure_ascii=False, indent=4)

print(f"Gathered {len(collected_articles)} articles in total")

Gathered 65 articles in total


## NLP Preprocessing Articles
    Preparing Articles for NLP Analysis Using Tokenization, Stopword Removal, and Stemming

In [93]:
def preprocess_text(text):
    
    # Convert text to lower case
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Stemming
    tokens = [token for token in tokens if token not in string.punctuation and not token.isdigit()]

    # Stop word removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    return tokens


processed_articles = []
for article in collected_articles:
    processed_tokens = preprocess_text(article['content'])
    processed_articles.append({
        'title': article['title'],
        'processed_content': processed_tokens,
        'original_content': article['content']
    })

print(f"Preprocessing of {len(processed_articles)} articles")

Preprocessing of 65 articles


## Inverse Index Creation
    Constructing an Inverted Index from Preprocessed Articles

In [95]:
def create_inverted_index(processed_articles):
    
    inverted_index = defaultdict(list)

    for doc_id, article in enumerate(processed_articles):
        for position, token in enumerate(article['processed_content']):
            inverted_index[token].append((doc_id, position))

    return dict(inverted_index)
inverted_index = create_inverted_index(processed_articles)

print(f"The index contains {len(inverted_index)} unique terms")

The index contains 10916 unique terms


## Building and Querying the Search Engine with Boolean, TF-IDF, and BM25
    Querying using a user-friendly menu

In [97]:
class SearchEngine:
    def __init__(self, processed_articles, inverted_index):
        self.processed_articles = processed_articles
        self.inverted_index = inverted_index
        self.vectorizer = TfidfVectorizer()

        # TF-IDF matric creation
        documents = [' '.join(article['processed_content']) for article in processed_articles]
        self.tfidf_matrix = self.vectorizer.fit_transform(documents)

        # BM model
        self.bm25 = BM25Okapi([article['processed_content'] for article in processed_articles])
        

    def boolean_search(self, query, operator):
        """
        Boolean search with AND, OR, NOT
        """
        query_tokens = preprocess_text(query)

        if operator == 'AND':
            
            result_docs = set(range(len(self.processed_articles)))
            for token in query_tokens:
                if token in self.inverted_index:
                    docs = set(doc_id for doc_id, _ in self.inverted_index[token])
                    result_docs = result_docs.intersection(docs)
                else:
                    return []

        elif operator == 'OR':
            
            result_docs = set()
            for token in query_tokens:
                if token in self.inverted_index:
                    docs = set(doc_id for doc_id, _ in self.inverted_index[token])
                    result_docs = result_docs.union(docs)

        elif operator == 'NOT':
            if len(query_tokens) < 2:
                return [] 
    
            first_token = query_tokens[0]
            second_token = query_tokens[1]

            docs_with_first = set(doc_id for doc_id, _ in self.inverted_index.get(first_token, []))
    
            docs_with_second = set(doc_id for doc_id, _ in self.inverted_index.get(second_token, []))

            result_docs = docs_with_first.difference(docs_with_second)

        
        # Convert the set of doc_ids to a list of (doc_id, score) pairs
        # For boolean, 1.0 is the score for all matching documents
        return [(doc_id, 1.0) for doc_id in result_docs]




    def tfidf_search(self, query):
        """
        Search using TF-IDF
        """
        query_vector = self.vectorizer.transform([' '.join(preprocess_text(query))])
        similarities = (self.tfidf_matrix @ query_vector.T).toarray().flatten()
        return sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)

    def bm25_search(self, query):
        """
        Search using BM25
        """
        query_tokens = preprocess_text(query)
        scores = self.bm25.get_scores(query_tokens)
        return sorted(enumerate(scores), key=lambda x: x[1], reverse=True)

    def search(self, query, method, operator, top_k=5):
        """
        Search options
        """
        if method == 'boolean':
            results = self.boolean_search(query, operator)
        elif method == 'tfidf':
            results = self.tfidf_search(query)
        elif method == 'bm25':
            results = self.bm25_search(query)
        else:
            raise ValueError("You must select either boolean,tfidf or bm25")

        # Top -k results
        return [(self.processed_articles[doc_id]['title'], score)
                for doc_id, score in results[:top_k]]

    def detect_operator(self, query):

        query = query.lower()
        
        if 'and' in query:
            return 'AND'
        elif 'or' in query:
            return 'OR'
        elif 'not' in query:
            return 'NOT'
        else:
            return 'AND'

search_engine = SearchEngine(processed_articles, inverted_index)

def main_menu():
    print("\nWelcome to my Search Engine!")
    print("\nInstructions:")
    print("- Enter your query to search.")
    print("- Choose a method: Boolean, TF-IDF, or BM25.")
    print("- Type 'exit' to quit the program.")
    print("\n- Example query: 'Find documents about AI'")

def choose_method():
    methods = {"1": "boolean", "2": "tfidf", "3": "bm25"}
    print("\nChoose a search method:")
    for key, value in methods.items():
        print(f"{key}. {value.title()}")

    while True:
        choice = input("Enter the number corresponding to your choice: ").strip()
        if choice in methods:
            return methods[choice]
        else:
            print("Invalid choice. Please select 1, 2, or 3.")

def display_results(results):
    print("\nResults:")
    if not results:
        print("- No results found.")
    else:
        for i, (title, score) in enumerate(results, start=1):
            print(f"{i}. {title} (Score: {score:.4f})")
    print("-" * 40)

def main():
    main_menu()
    while True:
        query = input("\nEnter your query (or type 'exit' to quit): ").strip()
        if query.lower() == 'exit':
            confirm = input("Are you sure you want to exit? (y/n): ").strip().lower()
            if confirm == 'y':
                print("Goodbye!")
                break
            else:
                continue

        operator = search_engine.detect_operator(query)
        method = choose_method()

        try:
            results = search_engine.search(query, method, operator)
            display_results(results)
        except ValueError as e:
            print(f"Error: {e}. Please try again.")

if __name__ == "__main__":
    main()



Welcome to my Search Engine!

Instructions:
- Enter your query to search.
- Choose a method: Boolean, TF-IDF, or BM25.
- Type 'exit' to quit the program.

- Example query: 'Find documents about AI'



Enter your query (or type 'exit' to quit):  data science



Choose a search method:
1. Boolean
2. Tfidf
3. Bm25


Enter the number corresponding to your choice:  1



Results:
1. Artificial intelligence (Score: 1.0000)
2. Artificial general intelligence (Score: 1.0000)
3. Ethics of artificial intelligence (Score: 1.0000)
4. Applications of artificial intelligence (Score: 1.0000)
5. Artificial intelligence in healthcare (Score: 1.0000)
----------------------------------------



Enter your query (or type 'exit' to quit):  data science



Choose a search method:
1. Boolean
2. Tfidf
3. Bm25


Enter the number corresponding to your choice:  2



Results:
1. Data science (Score: 0.7601)
2. Data science (Score: 0.7601)
3. Social data science (Score: 0.6156)
4. Data engineering (Score: 0.4678)
5. Big data (Score: 0.4642)
----------------------------------------



Enter your query (or type 'exit' to quit):  data science



Choose a search method:
1. Boolean
2. Tfidf
3. Bm25


Enter the number corresponding to your choice:  3



Results:
1. Social data science (Score: 3.5993)
2. Data science (Score: 3.5918)
3. Data science (Score: 3.5918)
4. Biomedical data science (Score: 3.5721)
5. Computer science (Score: 3.4964)
----------------------------------------



Enter your query (or type 'exit' to quit):  psychology and health



Choose a search method:
1. Boolean
2. Tfidf
3. Bm25


Enter the number corresponding to your choice:  1



Results:
1. Artificial intelligence (Score: 1.0000)
2. Hallucination (artificial intelligence) (Score: 1.0000)
3. Artificial intelligence (Score: 1.0000)
4. OpenAI (Score: 1.0000)
5. Deep learning (Score: 1.0000)
----------------------------------------



Enter your query (or type 'exit' to quit):  psychology and health



Choose a search method:
1. Boolean
2. Tfidf
3. Bm25


Enter the number corresponding to your choice:  2



Results:
1. Artificial intelligence in mental health (Score: 0.2510)
2. Artificial intelligence in healthcare (Score: 0.1142)
3. Artificial intelligence in healthcare (Score: 0.1142)
4. Artificial intelligence in healthcare (Score: 0.1142)
5. Biomedical data science (Score: 0.0495)
----------------------------------------



Enter your query (or type 'exit' to quit):  psychology and health



Choose a search method:
1. Boolean
2. Tfidf
3. Bm25


Enter the number corresponding to your choice:  3



Results:
1. Big data (Score: 2.4992)
2. Hallucination (artificial intelligence) (Score: 2.1687)
3. Social data science (Score: 2.1420)
4. Artificial intelligence in mental health (Score: 2.1183)
5. List of engineering branches (Score: 2.1133)
----------------------------------------



Enter your query (or type 'exit' to quit):  exit
Are you sure you want to exit? (y/n):  y


Goodbye!


## Evaluating Search Engine Performance
    Performance Evaluation of Search Engine with Test Queries and showing results such as: Precision, Recall, F1-Score, and MAP

In [99]:
def evaluate_search_engine_robust(search_engine, test_queries, relevant_docs):
   
    results = {
        'precision': [],
        'recall': [],
        'f1': [],
        'average_precision': []
    }

    for query, relevant in zip(test_queries, relevant_docs):
        print(f"\nEvaluating query: '{query}'")

        for method in ['tfidf', 'bm25', 'boolean']:
            # Get search results
            search_results = search_engine.search(query, method, operator="AND")
            retrieved_docs = [doc[0].lower() for doc in search_results]  # Convert to lowercase
            relevant_lower = [r.lower() for r in relevant]  # Convert to lowercase

            # Print debugging info
            print(f"\nMethod: {method}")
            print("Retrieved documents:")
            for doc, score in search_results:
                print(f"- {doc} (score: {score:.4f})")

            # Calculate metrics with case-insensitive matching
            relevant_retrieved = set(retrieved_docs).intersection(set(relevant_lower))

            # Calculate and store metrics
            if retrieved_docs:
                precision = len(relevant_retrieved) / len(retrieved_docs)
            else:
                precision = 0

            if relevant:
                recall = len(relevant_retrieved) / len(relevant)
            else:
                recall = 0

            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            ap = 0
            hits = 0
            
            for i, doc in enumerate(retrieved_docs):
                if doc in relevant_lower:
                    hits += 1
                    ap += hits / (i + 1)  # Precision at rank i+1
            ap = ap / len(relevant) if relevant else 0
            

            # Store and print results
            results['precision'].append(precision)
            results['recall'].append(recall)
            results['f1'].append(f1)
            results['average_precision'].append(ap)

            print(f"\nMetrics for {method}:")
            print(f"Precision: {precision:.4f}")
            print(f"Recall: {recall:.4f}")
            print(f"F1-score: {f1:.4f}")
            print(f"Average Precision (AP): {ap:.4f}")

    map_score = sum(results['average_precision']) / len(test_queries) if test_queries else 0
    print(f"\nMean Average Precision (MAP): {map_score:.4f}")

    return results

# Update relevant_docs based on actual articles in collection
# First, get all actual titles
actual_titles = [article['title'] for article in processed_articles]
print("\nAvailable titles in collection:")
for title in actual_titles:
    print(f"- {title}")


test_queries = [
    "What is artificial intelligence?",
    "What is machine learning?",
    "How neural networks connects with artificial intelligence?",
    "Applications of artificial intelligence",
    "How can AI help in healthcare",
    "How Data engineering affecting the world",
    "How Data science developing"
]

relevant_docs = [
    [title for title in actual_titles if any(topic.lower() in title.lower() for topic in topics)],
    [title for title in actual_titles if any(topic.lower() in title.lower() for topic in topics)],
    [title for title in actual_titles if any(topic.lower() in title.lower() for topic in topics)],
    [title for title in actual_titles if any(topic.lower() in title.lower() for topic in topics)],
    [title for title in actual_titles if any(topic.lower() in title.lower() for topic in topics)],
    [title for title in actual_titles if any(topic.lower() in title.lower() for topic in topics)],
    [title for title in actual_titles if any(topic.lower() in title.lower() for topic in topics)]
]

evaluation_results = evaluate_search_engine_robust(search_engine, test_queries, relevant_docs)


Available titles in collection:
- Artificial intelligence
- Artificial general intelligence
- A.I. Artificial Intelligence
- Ethics of artificial intelligence
- Applications of artificial intelligence
- Artificial intelligence in healthcare
- History of artificial intelligence
- Hallucination (artificial intelligence)
- Regulation of artificial intelligence
- Quantum machine learning
- Neural network (machine learning)
- Timeline of machine learning
- Transformer (deep learning architecture)
- Attention (machine learning)
- Boosting (machine learning)
- Adversarial machine learning
- Active learning (machine learning)
- Feature (machine learning)
- Neural network
- Neural network (machine learning)
- Convolutional neural network
- Recurrent neural network
- Feedforward neural network
- Deep learning
- Rectifier (neural networks)
- Neural network (biology)
- Residual neural network
- Physics-informed neural networks
- Applications of artificial intelligence
- Artificial intelligence in