## Boolean Retrieval

In [1]:
# Toy corpus. IDs are assigned positionally ("D1" for the first sentence,
# "D2" for the second, ...), so the order of the sentences must stay fixed.
documents = {
    f"D{position}": sentence
    for position, sentence in enumerate(
        (
            "Machine learning improves search engines.",
            "Information retrieval techniques are evolving.",
            "Search engines use advanced algorithms.",
            "Deep learning and neural networks are popular.",
            "Boolean retrieval uses logical operators.",
            "Query processing is essential in search engines.",
            "Text mining and NLP are related to information retrieval.",
            "Search algorithms improve information discovery.",
            "Data science leverages machine learning.",
            "Ranking methods optimize search engine results.",
        ),
        start=1,
    )
}

In [2]:
# Sample queries in "<term> <OPERATOR> <term>" form: one AND, one OR, one NOT.
query1, query2, query3 = (
    "Search AND Engine",
    "Information OR Retrieval",
    "Machine NOT Learning",
)


In [None]:
# Corrected Ground Truth
# Hand-labelled relevant document IDs, keyed by the exact query string.
# NOTE(review): D1 ("Machine learning improves search engines.") contains
# "search engines" just like D3 and D6, yet it is excluded below -- confirm
# that this label is intentional.
ground_truth = {
    "Search AND Engine": ["D3", "D6", "D10"],  # Corrected: D1 removed
    "Information OR Retrieval": ["D2", "D5", "D7", "D8"],
    "Machine NOT Learning": []  # no document is expected to match
}

### Simple Boolean Retrieval

In [3]:

def simple_boolean_retrieval(query, documents):
    """Return the IDs of documents that satisfy a two-term Boolean query.

    Matching is a case-insensitive *substring* test on the raw document
    text, so the term "engine" also matches "engines".

    Args:
        query: String of the form "<term1> <OPERATOR> <term2>", where the
            operator is AND, OR or NOT (any letter case). "A NOT B" means
            "contains A and does not contain B". Tokens after the third
            are ignored.
        documents: Mapping of document ID to document text.

    Returns:
        List of matching document IDs in the iteration order of
        `documents`. An empty list is returned (after printing a message)
        when the query has fewer than three tokens or the operator is not
        supported.
    """
    query_parts = query.split()
    if len(query_parts) < 3:
        # Follow the report-and-return-empty convention of the operator
        # check instead of failing with an IndexError.
        print("Invalid query")
        return []

    term1 = query_parts[0].lower()  # First term
    operator = query_parts[1].upper()  # AND, OR, NOT
    term2 = query_parts[2].lower()  # Second term

    # Validate once, up front: the operator does not depend on the document,
    # and an empty collection must not hide an unsupported operator.
    if operator not in ("AND", "OR", "NOT"):
        print("Invalid operator")
        return []

    results = []
    for doc_id, doc_text in documents.items():
        doc_text_lower = doc_text.lower()
        term1_present = term1 in doc_text_lower
        term2_present = term2 in doc_text_lower

        if operator == "AND":
            matched = term1_present and term2_present
        elif operator == "OR":
            matched = term1_present or term2_present
        else:  # "NOT"
            matched = term1_present and not term2_present

        if matched:
            results.append(doc_id)

    return results



In [4]:
# Run the three sample queries through the plain substring matcher.
results1, results2, results3 = (
    simple_boolean_retrieval(sample_query, documents)
    for sample_query in (query1, query2, query3)
)

# One report line per query, in the order the queries were defined.
print(
    "\n".join(
        f"Results for '{sample_query}': {hits}"
        for sample_query, hits in (
            (query1, results1),
            (query2, results2),
            (query3, results3),
        )
    )
)

Results for 'Search AND Engine': ['D1', 'D3', 'D6', 'D10']
Results for 'Information OR Retrieval': ['D2', 'D5', 'D7', 'D8']
Results for 'Machine NOT Learning': []


### Stemming Boolean Retrieval

In [5]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Download required resource for tokenization (if you haven't already)
# Recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab' tables
# instead of the pickled 'punkt' models, so fetch both to keep the notebook
# runnable across NLTK versions.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to C:\Users\Hansen
[nltk_data]     Dafa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:

def stem_text(text):
    """Return `text` with every token replaced by its Porter stem.

    The text is split with `word_tokenize`, each token is passed through a
    freshly created `PorterStemmer`, and the stems are re-joined with
    single spaces.
    """
    porter = PorterStemmer()
    tokens = word_tokenize(text)
    return " ".join(map(porter.stem, tokens))


def stem_boolean_retrieval(query, documents):
    """Return the IDs of documents matching a two-term Boolean query on stems.

    Both query terms and every document are reduced with `stem_text`
    before matching, so inflected forms that share a stem match each
    other. Matching is a substring test on the stemmed text.

    Args:
        query: String of the form "<term1> <OPERATOR> <term2>", where the
            operator is AND, OR or NOT (any letter case). "A NOT B" means
            "contains A and does not contain B". Tokens after the third
            are ignored.
        documents: Mapping of document ID to document text.

    Returns:
        List of matching document IDs in the iteration order of
        `documents`. An empty list is returned (after printing a message)
        when the query has fewer than three tokens or the operator is not
        supported.
    """
    query_parts = query.split()
    if len(query_parts) < 3:
        # Follow the report-and-return-empty convention of the operator
        # check instead of failing with an IndexError.
        print("Invalid query")
        return []

    # Validate once, up front: the operator does not depend on the document,
    # and an empty collection must not hide an unsupported operator.
    operator = query_parts[1].upper()  # AND, OR, NOT
    if operator not in ("AND", "OR", "NOT"):
        print("Invalid operator")
        return []

    term1 = stem_text(query_parts[0]).lower()  # Stem and lowercase term1
    term2 = stem_text(query_parts[2]).lower()  # Stem and lowercase term2

    results = []
    for doc_id, doc_text in documents.items():
        stemmed_doc_text = stem_text(doc_text).lower()
        term1_present = term1 in stemmed_doc_text
        term2_present = term2 in stemmed_doc_text

        if operator == "AND":
            matched = term1_present and term2_present
        elif operator == "OR":
            matched = term1_present or term2_present
        else:  # "NOT"
            matched = term1_present and not term2_present

        if matched:
            results.append(doc_id)

    return results

In [7]:
# Run the three sample queries through the stemming matcher.
results1, results2, results3 = (
    stem_boolean_retrieval(sample_query, documents)
    for sample_query in (query1, query2, query3)
)

# One report line per query, in the order the queries were defined.
print(
    "\n".join(
        f"Results for '{sample_query}': {hits}"
        for sample_query, hits in (
            (query1, results1),
            (query2, results2),
            (query3, results3),
        )
    )
)

Results for 'Search AND Engine': ['D1', 'D3', 'D6', 'D10']
Results for 'Information OR Retrieval': ['D2', 'D5', 'D7', 'D8']
Results for 'Machine NOT Learning': []


### Lemmatization Boolean Retrieval

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('wordnet')  # Download required resource for lemmatization (if you haven't already)

[nltk_data] Downloading package punkt to C:\Users\Hansen
[nltk_data]     Dafa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Hansen
[nltk_data]     Dafa\AppData\Roaming\nltk_data...


True

In [11]:
def lemmatize_text(text):
    """Return `text` with every token replaced by its WordNet lemma.

    The text is split with `word_tokenize`, each token is passed through a
    freshly created `WordNetLemmatizer` (default part of speech), and the
    lemmas are re-joined with single spaces.
    """
    wordnet = WordNetLemmatizer()
    tokens = word_tokenize(text)
    return " ".join(map(wordnet.lemmatize, tokens))


def lemmatization_boolean_retrieval(query, documents):
    """Return the IDs of documents matching a two-term Boolean query on lemmas.

    Both query terms and every document are lower-cased and then reduced
    with `lemmatize_text` before matching. Matching is a substring test on
    the lemmatized text.

    Args:
        query: String of the form "<term1> <OPERATOR> <term2>", where the
            operator is AND, OR or NOT (any letter case). "A NOT B" means
            "contains A and does not contain B". Tokens after the third
            are ignored.
        documents: Mapping of document ID to document text.

    Returns:
        List of matching document IDs in the iteration order of
        `documents`. An empty list is returned (after printing a message)
        when the query has fewer than three tokens or the operator is not
        supported.
    """
    query_parts = query.split()
    if len(query_parts) < 3:
        # Follow the report-and-return-empty convention of the operator
        # check instead of failing with an IndexError.
        print("Invalid query")
        return []

    # Validate once, up front: the operator does not depend on the document,
    # and an empty collection must not hide an unsupported operator.
    operator = query_parts[1].upper()  # AND, OR, NOT
    if operator not in ("AND", "OR", "NOT"):
        print("Invalid operator")
        return []

    # Lower-case BEFORE lemmatizing: the WordNet lookup is case-sensitive, so
    # a capitalised plural such as "Engines" would otherwise pass through
    # unchanged and never be reduced to "engine".
    term1 = lemmatize_text(query_parts[0].lower())
    term2 = lemmatize_text(query_parts[2].lower())

    results = []
    for doc_id, doc_text in documents.items():
        lemmatized_doc_text = lemmatize_text(doc_text.lower())
        term1_present = term1 in lemmatized_doc_text
        term2_present = term2 in lemmatized_doc_text

        if operator == "AND":
            matched = term1_present and term2_present
        elif operator == "OR":
            matched = term1_present or term2_present
        else:  # "NOT"
            matched = term1_present and not term2_present

        if matched:
            results.append(doc_id)

    return results



In [12]:
# Run the three sample queries through the lemmatizing matcher.
results1, results2, results3 = (
    lemmatization_boolean_retrieval(sample_query, documents)
    for sample_query in (query1, query2, query3)
)

# One report line per query, in the order the queries were defined.
print(
    "\n".join(
        f"Results for '{sample_query}': {hits}"
        for sample_query, hits in (
            (query1, results1),
            (query2, results2),
            (query3, results3),
        )
    )
)


Results for 'Search AND Engine': ['D1', 'D3', 'D6', 'D10']
Results for 'Information OR Retrieval': ['D2', 'D5', 'D7', 'D8']
Results for 'Machine NOT Learning': []


### Stop Words Boolean Retrieval

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords') # Download stop words

[nltk_data] Downloading package stopwords to C:\Users\Hansen
[nltk_data]     Dafa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [14]:

def remove_stopwords(text):
    """Return `text` with its English stop words dropped.

    Tokens come from `word_tokenize`; a token is discarded when its
    lower-cased form appears in NLTK's English stop-word list. Surviving
    tokens keep their original case and are re-joined with single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stops:
            continue  # stop word: leave it out
        kept_tokens.append(token)
    return " ".join(kept_tokens)


def stop_words_boolean_retrieval(query, documents):
    """Return the IDs of documents matching a two-term Boolean query
    after stop-word removal.

    Query terms and documents are passed through `remove_stopwords` and
    lower-cased before matching. Matching is a substring test on the
    filtered text. A query term that is itself a stop word is treated as
    present in no document, since stop words are removed from every
    document.

    Args:
        query: String of the form "<term1> <OPERATOR> <term2>", where the
            operator is AND, OR or NOT (any letter case). "A NOT B" means
            "contains A and does not contain B". Tokens after the third
            are ignored.
        documents: Mapping of document ID to document text.

    Returns:
        List of matching document IDs in the iteration order of
        `documents`. An empty list is returned (after printing a message)
        when the query has fewer than three tokens or the operator is not
        supported.
    """
    query_parts = query.split()
    if len(query_parts) < 3:
        # Follow the report-and-return-empty convention of the operator
        # check instead of failing with an IndexError.
        print("Invalid query")
        return []

    # Validate once, up front: the operator does not depend on the document,
    # and an empty collection must not hide an unsupported operator.
    operator = query_parts[1].upper()  # AND, OR, NOT
    if operator not in ("AND", "OR", "NOT"):
        print("Invalid operator")
        return []

    term1 = remove_stopwords(query_parts[0]).lower()  # Remove stopwords, lowercase term1
    term2 = remove_stopwords(query_parts[2]).lower()  # Remove stopwords, lowercase term2

    results = []
    for doc_id, doc_text in documents.items():
        processed_doc_text = remove_stopwords(doc_text).lower()
        # A stop-word term collapses to "", and `"" in text` is always True;
        # guard on the term being non-empty so it cannot match everything.
        term1_present = bool(term1) and term1 in processed_doc_text
        term2_present = bool(term2) and term2 in processed_doc_text

        if operator == "AND":
            matched = term1_present and term2_present
        elif operator == "OR":
            matched = term1_present or term2_present
        else:  # "NOT"
            matched = term1_present and not term2_present

        if matched:
            results.append(doc_id)

    return results


In [15]:
# Run the three sample queries through the stop-word-filtering matcher.
results1, results2, results3 = (
    stop_words_boolean_retrieval(sample_query, documents)
    for sample_query in (query1, query2, query3)
)

# One report line per query, in the order the queries were defined.
print(
    "\n".join(
        f"Results for '{sample_query}': {hits}"
        for sample_query, hits in (
            (query1, results1),
            (query2, results2),
            (query3, results3),
        )
    )
)

Results for 'Search AND Engine': ['D1', 'D3', 'D6', 'D10']
Results for 'Information OR Retrieval': ['D2', 'D5', 'D7', 'D8']
Results for 'Machine NOT Learning': []


## Ultra Model

In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import pandas as pd  # Import pandas

# Fetch the NLTK resources used by the tokenizer, stop-word list and
# lemmatizer in this cell.
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab' tables
# instead of the pickled 'punkt' models, so fetch both to keep the notebook
# runnable across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Toy corpus. IDs are assigned positionally ("D1" for the first sentence,
# "D2" for the second, ...), so the order of the sentences must stay fixed.
documents = {
    f"D{position}": sentence
    for position, sentence in enumerate(
        (
            "Machine learning improves search engines.",
            "Information retrieval techniques are evolving.",
            "Search engines use advanced algorithms.",
            "Deep learning and neural networks are popular.",
            "Boolean retrieval uses logical operators.",
            "Query processing is essential in search engines.",
            "Text mining and NLP are related to information retrieval.",
            "Search algorithms improve information discovery.",
            "Data science leverages machine learning.",
            "Ranking methods optimize search engine results.",
        ),
        start=1,
    )
}

# Hand-labelled relevant document IDs, keyed by the exact query string.
# NOTE(review): D1 also contains "search engines" (like D3 and D6) yet is
# left out of the first entry -- confirm that this label is intentional.
ground_truth = {
    "Search AND Engine": ["D3", "D6", "D10"],
    "Information OR Retrieval": ["D2", "D5", "D7", "D8"],
    "Machine NOT Learning": [],
}

# Every document ID, in corpus order.
all_doc_ids = [*documents]


def remove_stopwords(text):
    """Return `text` with its English stop words dropped.

    A token from `word_tokenize` is discarded when its lower-cased form is
    in NLTK's English stop-word list; the rest are re-joined with spaces.
    """
    english_stops = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return " ".join(
        token for token in tokens if token.lower() not in english_stops
    )


def simple_boolean_retrieval(query, documents):
    """Return the IDs of documents that satisfy a two-term Boolean query.

    Matching is a case-insensitive *substring* test on the raw document
    text, so the term "engine" also matches "engines".

    Args:
        query: String of the form "<term1> <OPERATOR> <term2>", where the
            operator is AND, OR or NOT (any letter case). "A NOT B" means
            "contains A and does not contain B". Tokens after the third
            are ignored.
        documents: Mapping of document ID to document text.

    Returns:
        List of matching document IDs in the iteration order of
        `documents`. An empty list is returned (after printing a message)
        when the query has fewer than three tokens or the operator is not
        supported.
    """
    query_parts = query.split()
    if len(query_parts) < 3:
        # Follow the report-and-return-empty convention of the operator
        # check instead of failing with an IndexError.
        print("Invalid query")
        return []

    term1 = query_parts[0].lower()
    operator = query_parts[1].upper()
    term2 = query_parts[2].lower()

    # Validate once, up front: the operator does not depend on the document,
    # and an empty collection must not hide an unsupported operator.
    if operator not in ("AND", "OR", "NOT"):
        print("Invalid operator")
        return []

    results = []
    for doc_id, doc_text in documents.items():
        doc_text_lower = doc_text.lower()
        term1_present = term1 in doc_text_lower
        term2_present = term2 in doc_text_lower

        if operator == "AND":
            matched = term1_present and term2_present
        elif operator == "OR":
            matched = term1_present or term2_present
        else:  # "NOT"
            matched = term1_present and not term2_present

        if matched:
            results.append(doc_id)

    return results


def stem_text(text):
    """Return `text` with every token replaced by its Porter stem.

    Tokens come from `word_tokenize`; the stems are re-joined with spaces.
    """
    porter = PorterStemmer()
    tokens = word_tokenize(text)
    return " ".join(map(porter.stem, tokens))


def stemming_boolean_retrieval(query, documents):
    """Return the IDs of documents matching a two-term Boolean query on stems.

    Both query terms and every document are reduced with `stem_text`
    before matching, so inflected forms that share a stem match each
    other. Matching is a substring test on the stemmed text.

    Args:
        query: String of the form "<term1> <OPERATOR> <term2>", where the
            operator is AND, OR or NOT (any letter case). "A NOT B" means
            "contains A and does not contain B". Tokens after the third
            are ignored.
        documents: Mapping of document ID to document text.

    Returns:
        List of matching document IDs in the iteration order of
        `documents`. An empty list is returned (after printing a message)
        when the query has fewer than three tokens or the operator is not
        supported.
    """
    query_parts = query.split()
    if len(query_parts) < 3:
        # Follow the report-and-return-empty convention of the operator
        # check instead of failing with an IndexError.
        print("Invalid query")
        return []

    # Validate once, up front: the operator does not depend on the document,
    # and an empty collection must not hide an unsupported operator.
    operator = query_parts[1].upper()
    if operator not in ("AND", "OR", "NOT"):
        print("Invalid operator")
        return []

    term1 = stem_text(query_parts[0]).lower()
    term2 = stem_text(query_parts[2]).lower()

    results = []
    for doc_id, doc_text in documents.items():
        stemmed_doc_text = stem_text(doc_text).lower()
        term1_present = term1 in stemmed_doc_text
        term2_present = term2 in stemmed_doc_text

        if operator == "AND":
            matched = term1_present and term2_present
        elif operator == "OR":
            matched = term1_present or term2_present
        else:  # "NOT"
            matched = term1_present and not term2_present

        if matched:
            results.append(doc_id)

    return results


def lemmatize_text(text):
    """Return `text` with every token replaced by its WordNet lemma.

    Tokens come from `word_tokenize`; the lemmas (default part of speech)
    are re-joined with spaces.
    """
    wordnet = WordNetLemmatizer()
    tokens = word_tokenize(text)
    return " ".join(map(wordnet.lemmatize, tokens))


def lemmatization_boolean_retrieval(query, documents):
    """Return the IDs of documents matching a two-term Boolean query on lemmas.

    Both query terms and every document are lower-cased and then reduced
    with `lemmatize_text` before matching. Matching is a substring test on
    the lemmatized text.

    Args:
        query: String of the form "<term1> <OPERATOR> <term2>", where the
            operator is AND, OR or NOT (any letter case). "A NOT B" means
            "contains A and does not contain B". Tokens after the third
            are ignored.
        documents: Mapping of document ID to document text.

    Returns:
        List of matching document IDs in the iteration order of
        `documents`. An empty list is returned (after printing a message)
        when the query has fewer than three tokens or the operator is not
        supported.
    """
    query_parts = query.split()
    if len(query_parts) < 3:
        # Follow the report-and-return-empty convention of the operator
        # check instead of failing with an IndexError.
        print("Invalid query")
        return []

    # Validate once, up front: the operator does not depend on the document,
    # and an empty collection must not hide an unsupported operator.
    operator = query_parts[1].upper()
    if operator not in ("AND", "OR", "NOT"):
        print("Invalid operator")
        return []

    # Lower-case BEFORE lemmatizing: the WordNet lookup is case-sensitive, so
    # a capitalised plural such as "Engines" would otherwise pass through
    # unchanged and never be reduced to "engine".
    term1 = lemmatize_text(query_parts[0].lower())
    term2 = lemmatize_text(query_parts[2].lower())

    results = []
    for doc_id, doc_text in documents.items():
        lemmatized_doc_text = lemmatize_text(doc_text.lower())
        term1_present = term1 in lemmatized_doc_text
        term2_present = term2 in lemmatized_doc_text

        if operator == "AND":
            matched = term1_present and term2_present
        elif operator == "OR":
            matched = term1_present or term2_present
        else:  # "NOT"
            matched = term1_present and not term2_present

        if matched:
            results.append(doc_id)

    return results


def stop_words_boolean_retrieval(query, documents):
    """Return the IDs of documents matching a two-term Boolean query
    after stop-word removal.

    Query terms and documents are passed through `remove_stopwords` and
    lower-cased before matching. Matching is a substring test on the
    filtered text. A query term that is itself a stop word is treated as
    present in no document, since stop words are removed from every
    document.

    Args:
        query: String of the form "<term1> <OPERATOR> <term2>", where the
            operator is AND, OR or NOT (any letter case). "A NOT B" means
            "contains A and does not contain B". Tokens after the third
            are ignored.
        documents: Mapping of document ID to document text.

    Returns:
        List of matching document IDs in the iteration order of
        `documents`. An empty list is returned (after printing a message)
        when the query has fewer than three tokens or the operator is not
        supported.
    """
    query_parts = query.split()
    if len(query_parts) < 3:
        # Follow the report-and-return-empty convention of the operator
        # check instead of failing with an IndexError.
        print("Invalid query")
        return []

    # Validate once, up front: the operator does not depend on the document,
    # and an empty collection must not hide an unsupported operator.
    operator = query_parts[1].upper()
    if operator not in ("AND", "OR", "NOT"):
        print("Invalid operator")
        return []

    term1 = remove_stopwords(query_parts[0]).lower()
    term2 = remove_stopwords(query_parts[2]).lower()

    results = []
    for doc_id, doc_text in documents.items():
        processed_doc_text = remove_stopwords(doc_text).lower()
        # A stop-word term collapses to "", and `"" in text` is always True;
        # guard on the term being non-empty so it cannot match everything.
        term1_present = bool(term1) and term1 in processed_doc_text
        term2_present = bool(term2) and term2 in processed_doc_text

        if operator == "AND":
            matched = term1_present and term2_present
        elif operator == "OR":
            matched = term1_present or term2_present
        else:  # "NOT"
            matched = term1_present and not term2_present

        if matched:
            results.append(doc_id)

    return results


def evaluate(results, ground_truth, all_doc_ids, zero_division=0.0):
    """Score one query's retrieved IDs against its relevant IDs.

    All three inputs are treated as sets, so a document ID that appears
    more than once is counted a single time.

    Args:
        results: Iterable of retrieved document IDs.
        ground_truth: Iterable of relevant document IDs for the query.
        all_doc_ids: Iterable of every document ID in the collection; the
            universe used for true negatives and accuracy.
        zero_division: Value reported for a ratio whose denominator is
            zero (precision when nothing was retrieved, recall when
            nothing is relevant, accuracy for an empty collection).
            Defaults to 0.0, so a query with no relevant and no retrieved
            documents scores precision = recall = F1 = 0.0; pass 1.0 to
            score that case as perfect.

    Returns:
        Dict with the keys "precision", "recall", "accuracy" and
        "f1_score".
    """
    retrieved = set(results)
    relevant = set(ground_truth)
    universe = set(all_doc_ids)

    true_positives = len(retrieved & relevant)
    # Documents that were neither retrieved nor relevant.
    true_negatives = len(universe - retrieved - relevant)

    precision = true_positives / len(retrieved) if retrieved else zero_division
    recall = true_positives / len(relevant) if relevant else zero_division

    if universe:
        accuracy = (true_positives + true_negatives) / len(universe)
    else:
        accuracy = zero_division

    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return {
        "precision": precision,
        "recall": recall,
        "accuracy": accuracy,
        "f1_score": f1_score
    }


# Queries to score, and the retrieval strategies to compare keyed by the
# label used in the result column names.
queries = ["Search AND Engine", "Information OR Retrieval", "Machine NOT Learning"]
methods = {
    "Simple": simple_boolean_retrieval,
    "Stemming": stemming_boolean_retrieval,
    "Lemmatization": lemmatization_boolean_retrieval,
    "Stop Words": stop_words_boolean_retrieval,
}

# Build one wide record per query: four metric columns per method.
data = []
for query in queries:
    row = {"Query": query}
    for name, method in methods.items():
        results = method(query, documents)
        metrics = evaluate(results, ground_truth[query], all_doc_ids)
        row.update(
            (f"{name} ({abbreviation})", metrics[metric_key])
            for abbreviation, metric_key in (
                ("P", "precision"),
                ("R", "recall"),
                ("A", "accuracy"),
                ("F1", "f1_score"),
            )
        )
    data.append(row)

# Tabulate the records and show them as plain text.
df = pd.DataFrame(data)
print(df)

[nltk_data] Downloading package punkt to C:\Users\Hansen
[nltk_data]     Dafa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Hansen
[nltk_data]     Dafa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Hansen
[nltk_data]     Dafa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                      Query  Simple (P)  Simple (R)  Simple (A)  Simple (F1)  \
0         Search AND Engine        0.75         1.0         0.9     0.857143   
1  Information OR Retrieval        1.00         1.0         1.0     1.000000   
2      Machine NOT Learning        0.00         0.0         1.0     0.000000   

   Stemming (P)  Stemming (R)  Stemming (A)  Stemming (F1)  Lemmatization (P)  \
0          0.75           1.0           0.9       0.857143               0.75   
1          1.00           1.0           1.0       1.000000               1.00   
2          0.00           0.0           1.0       0.000000               0.00   

   Lemmatization (R)  Lemmatization (A)  Lemmatization (F1)  Stop Words (P)  \
0                1.0                0.9            0.857143            0.75   
1                1.0                1.0            1.000000            1.00   
2                0.0                1.0            0.000000            0.00   

   Stop Words (R)  Stop Words (A)  St