In [21]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.12.1-cp310-cp310-macosx_11_0_arm64.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.12.1
[0m

In [30]:
import json
import os
from datetime import datetime, timedelta
from getpass import getpass

import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Connect to Elasticsearch.
# The password is taken from the ES_PASSWORD environment variable, falling back
# to an interactive prompt, so the secret is never stored in the notebook.
# NOTE(review): the password that used to be hard-coded here should be rotated.
es_password = os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: ")
es = Elasticsearch(
    "http://localhost:9200",
    basic_auth=(os.environ.get("ES_USERNAME", "elastic"), es_password),
    verify_certs=False,
    ssl_show_warn=False,
    request_timeout=30
)

# Names of the indices and input files used throughout the notebook
index_name = "books"
log_index_name = "search_logs"
json_file_path = "books.json"
csv_file_path = "synonyms.csv"  # Make sure this file exists

# Function to load JSON data
def load_books_from_json(json_file_path):
    """Read the book records stored in a UTF-8 encoded JSON file.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The decoded top-level JSON value, which is guaranteed to be a list.

    Raises:
        FileNotFoundError: If ``json_file_path`` does not exist.
        ValueError: If the top-level JSON value is not a list.
    """
    if os.path.exists(json_file_path):
        with open(json_file_path, "r", encoding="utf-8") as handle:
            records = json.load(handle)

        if isinstance(records, list):
            return records
        raise ValueError("JSON file must contain a list of book objects.")

    raise FileNotFoundError(f"JSON file not found: {json_file_path}")

# Function to load synonyms from CSV
def load_synonyms_from_csv(csv_file_path):
    """Build explicit synonym rules from a CSV with lemma and synonyms columns.

    Column names are matched case-insensitively and ignoring surrounding
    whitespace. Every usable row becomes one rule of the form
    ``"<lemma> => <synonyms>"``. Rows where either value is missing or blank
    are skipped, so the literal text "nan" never ends up inside a rule.

    Args:
        csv_file_path: Path of the CSV file to read.

    Returns:
        A list of rule strings, in file order.

    Raises:
        FileNotFoundError: If ``csv_file_path`` does not exist.
        ValueError: If the file lacks a 'lemma' or 'synonyms' column.
    """
    if not os.path.exists(csv_file_path):
        raise FileNotFoundError(f"CSV file not found: {csv_file_path}")

    df = pd.read_csv(csv_file_path)
    df.columns = df.columns.str.strip().str.lower()  # Normalize column names

    if "lemma" not in df.columns or "synonyms" not in df.columns:
        raise ValueError("CSV file must have 'lemma' and 'synonyms' columns.")

    # Drop rows with a missing cell first: pandas would otherwise render NaN as "nan".
    complete_rows = df[["lemma", "synonyms"]].dropna()

    # NOTE(review): the synonyms cell is copied verbatim into the rule; this
    # presumably expects a comma-separated list - confirm against the CSV.
    rules = []
    for lemma, synonyms in zip(complete_rows["lemma"], complete_rows["synonyms"]):
        lemma_text = str(lemma).strip()
        synonyms_text = str(synonyms).strip()
        if lemma_text and synonyms_text:
            rules.append(f"{lemma_text} => {synonyms_text}")
    return rules

# Function to update synonym filter in Elasticsearch
def update_synonym_filter(csv_file_path):
    """Replace the synonym rules of the books index with those from a CSV file.

    The rules returned by ``load_synonyms_from_csv`` are written into the
    ``synonym_filter`` used by ``synonym_analyzer``. The index is closed while
    the analysis settings are changed and is always reopened afterwards, even
    when the settings update fails.

    Args:
        csv_file_path: Path of the CSV file holding the synonym rules.

    Raises:
        FileNotFoundError, ValueError: Propagated from ``load_synonyms_from_csv``.
    """
    synonyms = load_synonyms_from_csv(csv_file_path)

    if not es.indices.exists(index=index_name):
        print(f"Index {index_name} does not exist.")
        return

    analysis_settings = {
        "analysis": {
            "analyzer": {
                "synonym_analyzer": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "synonym_filter"]
                }
            },
            "filter": {
                "synonym_filter": {
                    "type": "synonym",
                    # Use the rules loaded from the CSV instead of a fixed list.
                    "synonyms": synonyms
                }
            }
        }
    }

    es.indices.close(index=index_name)
    try:
        es.indices.put_settings(
            index=index_name,
            body={"settings": analysis_settings}
        )
    finally:
        # Reopen unconditionally so a rejected update does not leave the
        # index closed and unusable for the rest of the notebook.
        es.indices.open(index=index_name)
    print("Synonym filter updated successfully.")

def _recreate_index(name, body):
    """Drop the index ``name`` if it exists, then create it from ``body``."""
    if es.indices.exists(index=name):
        es.indices.delete(index=name)
        print(f"Deleted existing index: {name}")
    es.indices.create(index=name, body=body)
    print(f"Created index: {name}")


# Start from a clean books index: any existing one (and its documents) is removed.
# The synonym filter starts empty; the analyzer is not attached to any field in
# the mapping, it is only available by name.
# NOTE(review): the mapping declares "Link" while the search helpers read a
# "Book" field from the documents - confirm which field the data really has.
_recreate_index(index_name, {
    "mappings": {
        "properties": {
            "Title": {"type": "text"},
            "Author": {"type": "text"},
            "Description": {"type": "text"},
            "Language": {"type": "keyword"},
            "Category": {"type": "text"},
            "Link": {"type": "keyword"},
        }
    },
    "settings": {
        "analysis": {
            "filter": {
                "synonym_filter": {
                    "type": "synonym",
                    "synonyms": []
                }
            },
            "analyzer": {
                "synonym_analyzer": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "synonym_filter"]
                }
            }
        }
    }
})

# Start from a clean search_logs index as well: previously logged queries are
# discarded every time this cell runs.
_recreate_index(log_index_name, {
    "mappings": {
        "properties": {
            "query": {"type": "keyword"},
            "timestamp": {"type": "date"}
        }
    }
})

# Function to Log Search Queries
def log_search(query):
    """Store ``query`` in the search-log index, stamped with the current UTC time.

    The timestamp is the ISO-8601 text of a naive ``datetime.utcnow()`` value.
    """
    es.index(
        index=log_index_name,
        document={
            "query": query,
            "timestamp": datetime.utcnow().isoformat(),
        },
    )

# Read the book records that are going to be indexed
books_data = load_books_from_json(json_file_path)

# Wrap every record in a bulk action that targets the books index
bulk_data = [dict(_index=index_name, _source=book) for book in books_data]

# Push the synonym rules into the index settings before any document is indexed
update_synonym_filter(csv_file_path)

# Bulk-index the records; raise_on_error=False means per-document failures are
# collected and returned rather than raised
success, failed = bulk(es, bulk_data, raise_on_error=False)
print(
    f"Successfully indexed {success} documents, {len(failed)} failed."
)

print("Setup completed successfully!")

Deleted existing index: books
Created index: books
Deleted existing index: search_logs
Created index: search_logs
Synonym filter updated successfully.
Successfully indexed 759 documents, 0 failed.
Setup completed successfully!


In [48]:
#Useful
def search_books_by_language(criteria):
    """Return the books whose ``Language`` field matches ``criteria``.

    The query is not written to the search log and no relevance score is
    included in the results. No ``size`` is set on the request.

    Args:
        criteria: Language value to look for.

    Returns:
        A list of dicts with the keys Title, Author, Description, Language,
        Category and Book (``None`` where a document lacks the field).
    """
    language_query = {
        "query": {"multi_match": {"query": criteria, "fields": ["Language"]}}
    }
    hits = es.search(index=index_name, body=language_query)["hits"]["hits"]

    books = []
    for hit in hits:
        source = hit["_source"]
        books.append({
            "Title": source.get("Title"),
            "Author": source.get("Author"),
            "Description": source.get("Description"),
            "Language": source.get("Language"),
            "Category": source.get("Category"),
            "Book": source.get("Book"),
        })
    return books
            
# Query Functions
def search_books(criteria, fields=None):
    """Log ``criteria`` and run a ``multi_match`` search over the books index.

    Args:
        criteria: Text to search for.
        fields: Fields to search; when ``None`` the search covers Title,
            Author, Description and Category.

    Returns:
        A list of dicts with the keys Title, Author, Description, Book and
        ``_score``, one per returned hit.
    """
    log_search(criteria)

    target_fields = (
        ["Title", "Author", "Description", "Category"] if fields is None else fields
    )
    request_body = {
        "query": {"multi_match": {"query": criteria, "fields": target_fields}}
    }
    hits = es.search(index=index_name, body=request_body)["hits"]["hits"]

    matches = []
    for hit in hits:
        source = hit["_source"]
        matches.append({
            "Title": source.get("Title"),
            "Author": source.get("Author"),
            "Description": source.get("Description"),
            "Book": source.get("Book"),
            "_score": hit["_score"],
        })
    return matches

def fuzzy_search(term, language="standard"):
    """Log ``term`` and run a typo-tolerant ``multi_match`` search.

    The allowed edit distance depends on the length of the whole ``term``:
    "1" when it is shorter than five characters, otherwise "2".

    Args:
        term: Text to search for.
        language: Name of the analyzer applied to ``term`` at query time.

    Returns:
        A list of dicts with the keys Title, Author, Description, Book and
        ``_score``, one per returned hit.
    """
    log_search(term)

    # Short inputs get the smaller edit distance
    if len(term) < 5:
        max_edits = "1"
    else:
        max_edits = "2"

    multi_match = {
        "query": term,
        "fields": ["Title", "Author", "Description", "Book"],
        "fuzziness": max_edits,
        "analyzer": language,
    }
    result = es.search(index=index_name, body={"query": {"multi_match": multi_match}})

    def as_row(hit):
        source = hit["_source"]
        return {
            "Title": source.get("Title"),
            "Author": source.get("Author"),
            "Description": source.get("Description"),
            "Book": source.get("Book"),
            "_score": hit["_score"],
        }

    return [as_row(hit) for hit in result["hits"]["hits"]]

def synonym_search(term):
    """Log ``term`` and search the books index using ``synonym_analyzer``.

    The analyzer is applied to the query text of a ``multi_match`` over
    Title, Author, Description and Book.

    Args:
        term: Text to search for.

    Returns:
        A list of dicts with the keys Title, Author, Description, Book and
        ``_score``, one per returned hit.
    """
    log_search(term)

    request_body = {
        "query": {
            "multi_match": {
                "query": term,
                "fields": ["Title", "Author", "Description", "Book"],
                "analyzer": "synonym_analyzer",
            }
        }
    }
    found = es.search(index=index_name, body=request_body)["hits"]["hits"]

    rows = []
    for hit in found:
        document = hit["_source"]
        row = {key: document.get(key) for key in ("Title", "Author", "Description", "Book")}
        row["_score"] = hit["_score"]
        rows.append(row)
    return rows

def natural_language_search(query_text):
    """Log ``query_text`` and run a ``dis_max`` search over two field groups.

    One clause requires every term to match within Title/Author (operator
    "and"); the other accepts any term in Description/Book (operator "or").
    The clauses are combined with a tie breaker of 0.3.

    Args:
        query_text: Free-text query.

    Returns:
        A list of dicts with the keys Title, Author, Description, Book and
        ``_score``, one per returned hit.
    """
    log_search(query_text)

    strict_clause = {
        "multi_match": {"query": query_text, "fields": ["Title", "Author"], "operator": "and"}
    }
    loose_clause = {
        "multi_match": {"query": query_text, "fields": ["Description", "Book"], "operator": "or"}
    }
    request_body = {
        "query": {
            "dis_max": {
                "queries": [strict_clause, loose_clause],
                "tie_breaker": 0.3,
            }
        }
    }
    hits = es.search(index=index_name, body=request_body)["hits"]["hits"]

    results = []
    for hit in hits:
        source = hit["_source"]
        results.append({
            "Title": source.get("Title"),
            "Author": source.get("Author"),
            "Description": source.get("Description"),
            "Book": source.get("Book"),
            "_score": hit["_score"],
        })
    return results
    
def user_recommendations(user_history):
    """Find books similar to ``user_history`` with a ``more_like_this`` query.

    ``user_history`` is passed as the ``like`` input and compared against
    Title, Author, Description and Book (``min_term_freq`` 1,
    ``max_query_terms`` 12). The call is not written to the search log.

    Args:
        user_history: Input for the ``like`` clause.

    Returns:
        A list of dicts with the keys Title, Author, Description, Book and
        ``_score``, one per returned hit.
    """
    more_like_this = {
        "fields": ["Title", "Author", "Description", "Book"],
        "like": user_history,
        "min_term_freq": 1,
        "max_query_terms": 12,
    }
    result = es.search(
        index=index_name,
        body={"query": {"more_like_this": more_like_this}},
    )

    recommendations = []
    for hit in result["hits"]["hits"]:
        source = hit["_source"]
        recommendations.append({
            "Title": source.get("Title"),
            "Author": source.get("Author"),
            "Description": source.get("Description"),
            "Book": source.get("Book"),
            "_score": hit["_score"],
        })
    return recommendations

def popular_searches(hours_interval=None):
    """Return up to ten of the most frequently logged search queries.

    Args:
        hours_interval: When given, only queries logged within the last
            ``hours_interval`` hours (relative to the current UTC time) are
            considered; when ``None`` the whole log is used.

    Returns:
        A list of query strings taken from a terms aggregation on the
        ``query`` field. If the aggregation yields no buckets, up to ten raw
        logged queries from the same time window are returned instead.
    """
    if hours_interval is not None:
        start_time = (datetime.utcnow() - timedelta(hours=hours_interval)).isoformat()
        time_filter = {"range": {"timestamp": {"gte": start_time}}}
    else:
        time_filter = {"match_all": {}}

    query = {
        "size": 0,  # Only the aggregation is read, so skip fetching hits
        "query": time_filter,
        "aggs": {
            "popular_terms": {
                "terms": {
                    "field": "query",  # Use the `keyword` field
                    "size": 10  # Number of terms to return
                }
            }
        }
    }
    response = es.search(index=log_index_name, body=query)

    # Check if aggregation returned results
    if "aggregations" in response and "popular_terms" in response["aggregations"]:
        buckets = response["aggregations"]["popular_terms"]["buckets"]
        if buckets:
            return [bucket["key"] for bucket in buckets]

    # Fallback to raw logged queries. The same time filter is applied so that a
    # call with hours_interval never reports queries from outside the window.
    fallback_response = es.search(
        index=log_index_name,
        body={"query": time_filter, "size": 10}  # Limit the number of raw queries fetched
    )
    return [log["_source"]["query"] for log in fallback_response["hits"]["hits"]]

def advanced_search(query, operator="AND"):
    """Log ``query`` and run a field-boosted ``multi_match`` search.

    Title matches are boosted by 3 and Author matches by 2; Description and
    Book are searched without a boost.

    Args:
        query: Text to search for.
        operator: Operator for the ``multi_match`` clause; it is lower-cased
            before being sent.

    Returns:
        A list of dicts with the keys Title, Author, Description, Book and
        ``_score``, one per returned hit.
    """
    log_search(query)

    boosted_fields = [
        "Title^3",      # Title matches weigh most
        "Author^2",     # Author matches weigh a little more than the rest
        "Description",
        "Book",
    ]
    match_clause = {
        "multi_match": {
            "query": query,
            "fields": boosted_fields,
            "operator": operator.lower(),
        }
    }
    request_body = {"query": {"bool": {"must": [match_clause]}}}
    hits = es.search(index=index_name, body=request_body)["hits"]["hits"]

    results = []
    for hit in hits:
        source = hit["_source"]
        results.append({
            "Title": source.get("Title"),
            "Author": source.get("Author"),
            "Description": source.get("Description"),
            "Book": source.get("Book"),
            "_score": hit["_score"],
        })
    return results
    
    
     
def expanded_search(term):
    """Log ``term`` and search for it together with two suffixed variants.

    The variants are ``term``, ``term`` + "s" and ``term`` + "ing". Each one
    is matched against Title and Description, and at least one of those
    clauses has to match.

    Args:
        term: Text to search for.

    Returns:
        A list of dicts with the keys Title, Author, Description, Book and
        ``_score``, one per returned hit.
    """
    log_search(term)

    variants = (term, f"{term}s", f"{term}ing")  # Basic stemming

    # All Title clauses first, then all Description clauses
    should_clauses = []
    for field in ("Title", "Description"):
        for variant in variants:
            should_clauses.append({"match": {field: variant}})

    request_body = {
        "query": {
            "bool": {
                "should": should_clauses,
                "minimum_should_match": 1,
            }
        }
    }
    hits = es.search(index=index_name, body=request_body)["hits"]["hits"]

    results = []
    for hit in hits:
        source = hit["_source"]
        results.append({
            "Title": source.get("Title"),
            "Author": source.get("Author"),
            "Description": source.get("Description"),
            "Book": source.get("Book"),
            "_score": hit["_score"],
        })
    return results
     


In [40]:
# Smoke test of every search helper. The statement order matters: the helpers
# that call log_search write to the search log, and popular_searches further
# down reads that log.

# Run the text "Book" through synonym_analyzer to inspect the tokens it emits.
analyze_query = {
    "analyzer": "synonym_analyzer",
    "text": "Book"
}
response = es.indices.analyze(index=index_name, body=analyze_query)
print("Analyze Response:", response)

# Filter on the Language field.
# NOTE(review): `search_lanugage_based` looks like a typo for
# `search_language_based`; the name is left as is in case other cells use it.
search_lanugage_based = search_books_by_language("English")
print("Search Results for English Language:", search_lanugage_based)
print("\n")

# Default multi-field search (Title, Author, Description, Category).
search_result = search_books("Public School Education")
print("Search Results:", search_result)
print("\n")

# Same helper, restricted to the Author field.
field_specific_result = search_books("Michael", fields=["Author"])
print("Field Specific Search Results:", field_specific_result)
print("\n")

# Misspelled query; the second argument is the analyzer name handed to fuzzy_search.
fuzzy_result = fuzzy_search("Publc Skool", "english")
print("Fuzzy Search Results:", fuzzy_result)
print("\n")

# Same check with a Bulgarian query and the "bulgarian" analyzer.
fuzzy_result_BG = fuzzy_search("образувание", "bulgarian")
print("Fuzzy search Results for BG:", fuzzy_result_BG)
print("\n")

# Query analyzed with synonym_analyzer.
synonym_result = synonym_search("debater")
print("Synonym Search Results:", synonym_result)
print("\n")

# dis_max over Title/Author (all terms) and Description/Book (any term).
nl_search_result = natural_language_search("book on education")
print("Natural Language Search Results:", nl_search_result)
print("\n")

# more_like_this using the given text as the "like" input.
recommendation_result = user_recommendations("Education")
print("User Recommendations:", recommendation_result)
print("\n")

# Most frequent logged queries, limited to the last hour.
popular_search_result = popular_searches(hours_interval=1)
print("Popular Searches:", popular_search_result)
print("\n")

# NOTE(review): the query goes into a multi_match clause, so the word "OR" in
# the text is presumably treated as an ordinary term; the operator actually
# comes from the second argument - confirm this is intended.
advanced_search_result = advanced_search("educational OR practices", "OR")
print("Advanced Search Results:", advanced_search_result)
print("\n")


Analyze Response: {'tokens': [{'token': 'book', 'start_offset': 0, 'end_offset': 4, 'type': '<ALPHANUM>', 'position': 0}, {'token': 'novel', 'start_offset': 0, 'end_offset': 4, 'type': 'SYNONYM', 'position': 0}, {'token': 'publication', 'start_offset': 0, 'end_offset': 4, 'type': 'SYNONYM', 'position': 0}]}


Search Results: [{'Title': 'Public School Education', 'Author': 'Müller, Michael, 1825-1899', 'Description': '"Public School Education" by Michael Müller is a critical examination of the American public school system written in the late 19th century. The work presents a strong argument against the system, examining its implications on society and morality, particularly from a Christian perspective. The author expresses deep concern about the moral decay he perceives in society, which he attributes to the secular nature of public education and its failure to incorporate religious teachings.  The opening of the work sets a dramatic tone, as Müller passionately articulates his love f

In [50]:
import time
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Search strategies under evaluation, keyed by the label used in the report
search_functions = {
    "Exact Search": search_books,
    "Fuzzy Search": fuzzy_search,
    "Synonym Search": synonym_search,
    "Natural Language Search": natural_language_search,
    "Advanced Search": advanced_search,
    "Expanded Search": expanded_search,
    "Search by Language": search_books_by_language
}

# Generate queries from Titles, Authors, and Categories
# NOTE(review): `df` is not defined in any visible cell of this notebook, so this
# cell relies on kernel state - define it explicitly before this point.
title_queries = df['Title'].dropna().unique().tolist()
author_queries = df['Author'].dropna().unique().tolist()
category_queries = df['Category'].explode().dropna().unique().tolist()
all_queries = title_queries + author_queries + category_queries
if not all_queries:
    raise ValueError("No evaluation queries could be built from df.")

# The expected relevance depends only on the query and on df, not on the search
# function, so it is computed once instead of once per function and query.
# NOTE(review): every query is taken from df itself, so it is (almost) always
# found in df; precision is then trivially 1 and recall is effectively the share
# of queries that returned at least one hit.
corpus_values = [
    str(value).lower()
    for value in df[['Title', 'Author', 'Category']].values.flatten()
]
ground_truth = [
    1 if any(query.lower() in value for value in corpus_values) else 0
    for query in all_queries
]

# Metrics storage
results_summary = {}

# Test each search function
for name, search_func in search_functions.items():
    print(f"Testing: {name}")

    response_times = []
    true_relevance = list(ground_truth)
    predicted_relevance = []

    for query in all_queries:
        # perf_counter is monotonic and meant for measuring short intervals.
        # The timed call includes the log_search write done inside most helpers.
        start_time = time.perf_counter()

        # Only the advanced search takes an extra argument
        if name == "Advanced Search":
            results = search_func(query, operator="AND")
        else:
            results = search_func(query)

        response_times.append(time.perf_counter() - start_time)

        # A query counts as a predicted positive when it returned any hit
        predicted_relevance.append(1 if results else 0)

    # Calculate metrics; zero_division=0 keeps the score at 0 without a warning
    # when a function returns no hits for any query
    precision = precision_score(true_relevance, predicted_relevance, zero_division=0)
    recall = recall_score(true_relevance, predicted_relevance, zero_division=0)
    f1 = f1_score(true_relevance, predicted_relevance, zero_division=0)
    avg_response_time = np.mean(response_times)
    zero_result_rate = predicted_relevance.count(0) / len(all_queries)

    # Store results
    results_summary[name] = {
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "Average Response Time (s)": avg_response_time,
        "Zero-Result Rate (%)": zero_result_rate * 100
    }

# Display results
for method, metrics in results_summary.items():
    print(f"\n=== {method} ===")
    print(f"Precision: {metrics['Precision']:.2f}")
    print(f"Recall: {metrics['Recall']:.2f}")
    print(f"F1-Score: {metrics['F1-Score']:.2f}")
    print(f"Average Response Time: {metrics['Average Response Time (s)']:.4f} seconds")
    print(f"Zero-Result Rate: {metrics['Zero-Result Rate (%)']:.2f}%")

Testing: Exact Search
Testing: Fuzzy Search
Testing: Synonym Search
Testing: Natural Language Search
Testing: Advanced Search
Testing: Expanded Search
Testing: Search by Language

=== Exact Search ===
Precision: 1.00
Recall: 0.93
F1-Score: 0.96
Average Response Time: 0.0072 seconds
Zero-Result Rate: 6.98%

=== Fuzzy Search ===
Precision: 1.00
Recall: 0.99
F1-Score: 0.99
Average Response Time: 0.0194 seconds
Zero-Result Rate: 1.16%

=== Synonym Search ===
Precision: 1.00
Recall: 0.92
F1-Score: 0.96
Average Response Time: 0.0058 seconds
Zero-Result Rate: 8.14%

=== Natural Language Search ===
Precision: 1.00
Recall: 0.92
F1-Score: 0.96
Average Response Time: 0.0057 seconds
Zero-Result Rate: 8.14%

=== Advanced Search ===
Precision: 1.00
Recall: 0.31
F1-Score: 0.48
Average Response Time: 0.0048 seconds
Zero-Result Rate: 68.60%

=== Expanded Search ===
Precision: 1.00
Recall: 0.93
F1-Score: 0.96
Average Response Time: 0.0051 seconds
Zero-Result Rate: 6.98%

=== Search by Language ===
Preci

  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
import time
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# NOTE(review): this cell repeats the evaluation cell that precedes it in the
# notebook; consider keeping only one of the two.

# Search strategies under evaluation, keyed by the label used in the report
search_functions = {
    "Exact Search": search_books,
    "Fuzzy Search": fuzzy_search,
    "Synonym Search": synonym_search,
    "Natural Language Search": natural_language_search,
    "Advanced Search": advanced_search,
    "Expanded Search": expanded_search,
    "Search by Language": search_books_by_language
}

# Generate queries from Titles, Authors, and Categories
# NOTE(review): `df` is not defined in any visible cell of this notebook, so this
# cell relies on kernel state - define it explicitly before this point.
title_queries = df['Title'].dropna().unique().tolist()
author_queries = df['Author'].dropna().unique().tolist()
category_queries = df['Category'].explode().dropna().unique().tolist()
all_queries = title_queries + author_queries + category_queries
if not all_queries:
    raise ValueError("No evaluation queries could be built from df.")

# The expected relevance depends only on the query and on df, not on the search
# function, so it is computed once instead of once per function and query.
# NOTE(review): every query is taken from df itself, so it is (almost) always
# found in df; precision is then trivially 1 and recall is effectively the share
# of queries that returned at least one hit.
corpus_values = [
    str(value).lower()
    for value in df[['Title', 'Author', 'Category']].values.flatten()
]
ground_truth = [
    1 if any(query.lower() in value for value in corpus_values) else 0
    for query in all_queries
]

# Metrics storage
results_summary = {}

# Test each search function
for name, search_func in search_functions.items():
    print(f"Testing: {name}")

    response_times = []
    true_relevance = list(ground_truth)
    predicted_relevance = []

    for query in all_queries:
        # perf_counter is monotonic and meant for measuring short intervals.
        # The timed call includes the log_search write done inside most helpers.
        start_time = time.perf_counter()

        # Only the advanced search takes an extra argument
        if name == "Advanced Search":
            results = search_func(query, operator="AND")
        else:
            results = search_func(query)

        response_times.append(time.perf_counter() - start_time)

        # A query counts as a predicted positive when it returned any hit
        predicted_relevance.append(1 if results else 0)

    # Calculate metrics; zero_division=0 keeps the score at 0 without a warning
    # when a function returns no hits for any query
    precision = precision_score(true_relevance, predicted_relevance, zero_division=0)
    recall = recall_score(true_relevance, predicted_relevance, zero_division=0)
    f1 = f1_score(true_relevance, predicted_relevance, zero_division=0)
    avg_response_time = np.mean(response_times)
    zero_result_rate = predicted_relevance.count(0) / len(all_queries)

    # Store results
    results_summary[name] = {
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "Average Response Time (s)": avg_response_time,
        "Zero-Result Rate (%)": zero_result_rate * 100
    }

# Display results
for method, metrics in results_summary.items():
    print(f"\n=== {method} ===")
    print(f"Precision: {metrics['Precision']:.2f}")
    print(f"Recall: {metrics['Recall']:.2f}")
    print(f"F1-Score: {metrics['F1-Score']:.2f}")
    print(f"Average Response Time: {metrics['Average Response Time (s)']:.4f} seconds")
    print(f"Zero-Result Rate: {metrics['Zero-Result Rate (%)']:.2f}%")

Testing: Exact Search
Testing: Fuzzy Search
Testing: Synonym Search
Testing: Natural Language Search
Testing: Advanced Search
Testing: Expanded Search
Testing: Search by Language

=== Exact Search ===
Precision: 1.00
Recall: 0.93
F1-Score: 0.96
Average Response Time: 0.0064 seconds
Zero-Result Rate: 6.98%

=== Fuzzy Search ===
Precision: 1.00
Recall: 0.99
F1-Score: 0.99
Average Response Time: 0.0182 seconds
Zero-Result Rate: 1.16%

=== Synonym Search ===
Precision: 1.00
Recall: 0.92
F1-Score: 0.96
Average Response Time: 0.0057 seconds
Zero-Result Rate: 8.14%

=== Natural Language Search ===
Precision: 1.00
Recall: 0.92
F1-Score: 0.96
Average Response Time: 0.0060 seconds
Zero-Result Rate: 8.14%

=== Advanced Search ===
Precision: 1.00
Recall: 0.31
F1-Score: 0.48
Average Response Time: 0.0047 seconds
Zero-Result Rate: 68.60%

=== Expanded Search ===
Precision: 1.00
Recall: 0.93
F1-Score: 0.96
Average Response Time: 0.0061 seconds
Zero-Result Rate: 6.98%

=== Search by Language ===
Preci

  _warn_prf(average, modifier, msg_start, len(result))
