<a href="https://colab.research.google.com/github/Method-for-Software-System-Development/Cloud_Computing/blob/develop/logic/search_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Base URL of the Firebase database; passed to firebase.FirebaseApplication
# wherever the site index is written or read.
DBLink = "https://cloud-project-5adfc-default-rtdb.europe-west1.firebasedatabase.app/"
# Start page handed to get_sub_urls() when the site is crawled.
url = 'https://mqtt.org/'

import html
import re
from collections import deque
from urllib.parse import urldefrag, urljoin

import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from firebase import firebase
from nltk.stem import PorterStemmer

# Get the data of the selected url
def fetch_page(url, timeout=10):
    """
    Download a page and parse it with BeautifulSoup.

    Args:
        url (str): Address of the page to download.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        BeautifulSoup | None: The parsed page, or None when the request
            fails or the server answers with a status other than 200.
    """
    try:
        # Without a timeout requests may block forever on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection problems are reported the same way as a bad status:
        # the caller only has to handle None.
        return None

    if response.status_code != 200:
        return None

    # Pass the raw bytes (as get_sub_urls does) so BeautifulSoup detects the
    # encoding itself instead of relying on the HTTP header guess behind
    # response.text.
    return BeautifulSoup(response.content, 'html.parser')

# Create index of the loaded page
def index_words(soup):
    """
    Count how often each word occurs in the text of a parsed page.

    Tokens are runs of word characters that do not start with a digit and
    are lower-cased before counting.

    Args:
        soup: Parsed page object exposing ``get_text()``.

    Returns:
        dict: Maps each lower-cased word to its number of occurrences.
    """
    counts = {}
    for token in re.findall(r'(?!\d)\w+', soup.get_text()):
        key = token.lower()
        counts[key] = counts.get(key, 0) + 1
    return counts

# Remove the stop words from the index
def remove_stop_words(index):
    """
    Drop common English stop words from a word index.

    The index is modified in place and the same object is returned.

    Args:
        index (dict): Maps words to their counts.

    Returns:
        dict: The given index without the stop-word entries.
    """
    stop_words = {
        'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't",
        'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can',
        'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't",
        'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't",
        'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how',
        'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me',
        'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor',
        'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over',
        'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't",
        'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
        'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very',
        'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where', 'which', 'while', 'who',
        'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll", "you're",
        "you've", 'your', 'yours', 'yourself', 'yourselves',
    }
    # The intersection is a fresh set, so deleting from the index while
    # looping over it is safe.
    for term in stop_words.intersection(index):
        index.pop(term)
    return index

# Apply stemming to the index
def apply_stemming(index):
    """
    Collapse the words of an index onto their Porter stems.

    Words that share a stem are merged and their counts are summed.

    Args:
        index (dict): Maps words to their counts.

    Returns:
        dict: A new dict mapping each stem to the combined count.
    """
    porter = PorterStemmer()
    merged = {}
    for term, freq in index.items():
        root = porter.stem(term)
        merged[root] = merged.get(root, 0) + freq
    return merged


def remove_infrequent_words(index, threshold):
    """
    Drop every word whose count is below ``threshold``.

    The index is modified in place and the same object is returned.

    Args:
        index (dict): Maps words to their counts.
        threshold: Smallest count that is kept.

    Returns:
        dict: The given index without the entries counted below threshold.
    """
    # Collect the keys first so the dict is not resized while iterating.
    rare_terms = [term for term, freq in index.items() if freq < threshold]
    for term in rare_terms:
        del index[term]
    return index

# Run the index creation of selected url
def create_index(url, min_count=2):
    """
    Build the word index for a single page.

    The page is fetched, its words are counted, stop words are removed, the
    remaining words are stemmed, and stems that occur fewer than
    ``min_count`` times are dropped.

    Args:
        url (str): Address of the page to index.
        min_count (int): Smallest count a stem needs to stay in the index.
            Defaults to 2.

    Returns:
        dict | None: Maps each stem to its count on the page, or None when
            fetch_page returned no page.
    """
    soup = fetch_page(url)
    if soup is None:
        return None

    index = index_words(soup)
    index = remove_stop_words(index)
    index = apply_stemming(index)
    return remove_infrequent_words(index, min_count)

def get_sub_urls(start_url, timeout=10):
    """
    Crawl a site breadth-first and collect the pages located under start_url.

    Args:
        start_url (str): The base URL to crawl. Only links whose absolute
            form starts with this prefix are followed.
        timeout (float): Seconds to wait for each HTTP request.

    Returns:
        list: start_url followed by every sub-URL that could be fetched, in
            the order the pages were discovered. URL fragments ("#...") are
            stripped, so each page is listed once.

    Raises:
        requests.RequestException: If start_url itself cannot be fetched.
    """
    pending = deque([start_url])   # pages still to visit, FIFO
    seen = {start_url}             # every URL ever queued, for O(1) lookups
    sub_urls = []

    while pending:
        url = pending.popleft()

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an exception for bad responses
        except requests.RequestException:
            if url == start_url:
                # Nothing can be crawled without the entry page.
                raise
            # A single broken link must not abort the whole crawl.
            continue

        sub_urls.append(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        for link in soup.find_all('a', href=True):
            # Make the URL absolute and drop "#fragment" so in-page anchors
            # are not treated as separate pages.
            absolute_url, _fragment = urldefrag(urljoin(url, link['href']))

            # Compare against the site root, not the page being parsed;
            # otherwise links between sibling sections would be ignored.
            if absolute_url in seen or not absolute_url.startswith(start_url):
                continue

            seen.add(absolute_url)
            pending.append(absolute_url)

    return sub_urls

# This section runs on every page of the site and builds the combined index

# Get the list of pages that belong to the site
site_urls = get_sub_urls(url)

# Maps each stemmed word to a list of (page URL, count on that page) pairs
site_index = {}
for sub_url in site_urls:
  idx = create_index(sub_url)
  if idx is None:
    # create_index returns None when the page could not be fetched; skip it
    # instead of failing with a TypeError in the merge loop below.
    continue

  # Merge idx into site_index
  for word in idx:
    site_index.setdefault(word, []).append((sub_url, idx[word]))

# Save the index to Firebase

# Initialize Firebase connection
FBconn = firebase.FirebaseApplication(DBLink, None)

# Upload the site_index to Firebase
FBconn.put('/', 'site_index', site_index)

from difflib import get_close_matches

def read_index_from_firebase():
    """Fetch 'site_index' from Firebase; an empty dict replaces a falsy result."""
    connection = firebase.FirebaseApplication(DBLink, None)
    stored_index = connection.get('/', 'site_index')
    return stored_index or {}

def _render_results(heading_word, postings):
    """Return the HTML block listing the pages for one word, highest count first."""
    ranked = sorted(postings, key=lambda item: item[1], reverse=True)
    parts = [f"<h2>Results for '{html.escape(str(heading_word))}':</h2>", "<ul>"]
    for url, count in ranked:
        # URLs come from crawled pages; escape them (quotes included) so they
        # cannot break out of the single-quoted href attribute.
        safe_url = html.escape(str(url))
        safe_count = html.escape(str(count))
        parts.append(f"<li><a href='{safe_url}' target='_blank'>{safe_url}</a> (Count: {safe_count})</li>")
    parts.append("</ul>")
    parts.append("<br>")  # Add a break after each word's results
    return "".join(parts)


def search_words(query):
    """
    Search the stored index for every word of the query.

    Each query word is lower-cased and stemmed. If the stem is in the index,
    its pages are listed ordered by count. Otherwise the closest index entry
    (similarity cutoff 0.8) is suggested and its pages are listed instead.

    Args:
        query (str): Free-text search query.

    Returns:
        str: An HTML fragment with clickable result links, or a short HTML
            message when the index cannot be read or nothing was searched.
    """
    site_index = read_index_from_firebase()
    if not site_index:
        # Return a simple error message formatted as HTML
        return "<p>Error: Could not read index from Firebase.</p>"

    stemmer = PorterStemmer()
    query_words = re.findall(r'\w+', query.lower())
    # Candidate list for typo correction; built once, not per query word.
    all_words = list(site_index)
    parts = []

    for word in query_words:
        stemmed_word = stemmer.stem(word)

        if stemmed_word in site_index:
            parts.append(_render_results(word, site_index[stemmed_word]))
            continue

        # Try to fix a typo by looking for the closest stemmed word.
        matches = get_close_matches(stemmed_word, all_words, n=1, cutoff=0.8)
        if matches:
            corrected_stemmed_word = matches[0]
            parts.append(f"<p>Did you mean '{html.escape(str(corrected_stemmed_word))}'?</p>")
            parts.append(_render_results(corrected_stemmed_word, site_index[corrected_stemmed_word]))
        else:
            parts.append(f"<p>No results found for '{html.escape(word)}' and no close matches found.</p><br>")

    if not parts:  # The query contained no searchable words
        return "<p>No results found for your query.</p>"

    return "".join(parts)