<a href="https://colab.research.google.com/github/Method-for-Software-System-Development/Cloud_Computing/blob/develop/logic/search_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Base URL of the Firebase Realtime Database instance this notebook connects to.
DBLink = "https://optiline-kakado-default-rtdb.europe-west1.firebasedatabase.app/"
# NOTE(review): not referenced by the search code visible in this cell;
# presumably the site that was crawled to build the index -- confirm before removing.
url = "https://mqtt.org/"

import html
import re
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from firebase import firebase
from nltk.stem import PorterStemmer


# --- Firebase connection for the stored site index ---

# Create a module-level Firebase client for the database at DBLink
# (None is passed as the second argument).
# NOTE(review): read_index_from_firebase() below builds its own client, so this
# handle is not used by the search code visible in this cell.
FBconn = firebase.FirebaseApplication(DBLink, None)

from difflib import get_close_matches

def read_index_from_firebase():
    """Fetch the stored site index from the Firebase database.

    A new Firebase client is created for DBLink on every call.

    Returns:
        The value stored under ``site_index`` at the database root, or an
        empty dict when that value is missing or otherwise falsy.
    """
    connection = firebase.FirebaseApplication(DBLink, None)
    return connection.get('/', 'site_index') or {}

def _render_postings(heading_term, posting):
    """Render one index entry as an HTML heading followed by a list of links.

    Args:
        heading_term: Term shown in the "Results for ..." heading.
        posting: Index entry holding two parallel sequences, ``'DocIDs'``
            (the URLs) and ``'DocCounts'`` (the count for each URL).

    Returns:
        An HTML fragment: ``<h2>`` heading, ``<ul>`` of links ordered by
        count (highest first), and a trailing ``<br>``.
    """
    # zip pairs each URL with its count and simply stops at the shorter list,
    # so a malformed entry cannot raise IndexError the way indexing by
    # position would.
    ranked = sorted(
        zip(posting['DocIDs'], posting['DocCounts']),
        key=lambda pair: pair[1],
        reverse=True,
    )

    parts = [f"<h2>Results for '{html.escape(heading_term)}':</h2>", "<ul>"]
    for link, count in ranked:
        # The URL comes from stored index data and is placed inside a
        # single-quoted attribute, so quotes and markup characters must be
        # escaped to keep the generated HTML well-formed.
        safe_link = html.escape(str(link), quote=True)
        parts.append(
            f"<li><a href='{safe_link}' target='_blank'>{safe_link}</a>"
            f" (Count: {count})</li>"
        )
    parts.append("</ul>")
    parts.append("<br>")  # visual gap after each word's results
    return "".join(parts)


def search_words(query):
    """Search the site index for every word in *query* and return HTML.

    Each word of the query is lower-cased and stemmed with the Porter
    stemmer, then looked up in the index read from Firebase. When the stem
    is not an index key, the closest index key (similarity cutoff 0.8) is
    suggested and its results are shown instead. Suggestions are therefore
    stemmed index keys, not dictionary words.

    Args:
        query: Free-text search string.

    Returns:
        An HTML string. For every query word, in order: a results section
        with clickable links sorted by count (highest first), or a
        "Did you mean" hint followed by the suggested term's results, or a
        "no results" paragraph. If the index is empty or unavailable an
        error paragraph is returned; if the query contains no words a
        generic "no results" paragraph is returned.
    """
    site_index = read_index_from_firebase()
    if not site_index:
        # Return a simple error message formatted as HTML
        return "<p>Error: Could not read index from Firebase.</p>"

    stemmer = PorterStemmer()
    # Candidate list for typo correction; built once instead of per missed word.
    vocabulary = list(site_index)
    sections = []

    for word in re.findall(r'\w+', query.lower()):
        stemmed_word = stemmer.stem(word)

        if stemmed_word in site_index:
            sections.append(_render_postings(word, site_index[stemmed_word]))
            continue

        # Try to fix a typo by finding the closest *stemmed* key in the index.
        # TODO: map the suggestion back to an unstemmed, human-readable word.
        matches = get_close_matches(stemmed_word, vocabulary, n=1, cutoff=0.8)
        if not matches:
            sections.append(
                f"<p>No results found for '{word}' and no close matches found.</p><br>"
            )
            continue

        suggestion = matches[0]
        # The suggestion is an index key (stored data), so escape it before
        # embedding it in the markup.
        sections.append(f"<p>Did you mean '{html.escape(suggestion)}'?</p>")
        sections.append(_render_postings(suggestion, site_index[suggestion]))

    if not sections:  # the query contained no searchable words
        return "<p>No results found for your query.</p>"

    return "".join(sections)