In [None]:
!pip install firebase

Collecting firebase
  Downloading firebase-4.0.1-py3-none-any.whl (12 kB)
Installing collected packages: firebase
Successfully installed firebase-4.0.1


In [None]:
from firebase import firebase
from IPython.display import HTML, display
from google.colab import output
from nltk.stem import PorterStemmer
import re
import requests
from bs4 import BeautifulSoup
import nltk
import json
from collections import deque
from urllib.parse import urljoin
from notebook.services.config import ConfigManager
import logging
import chardet
import time
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore



# Realtime Database endpoint of the project, named so it is easy to find and swap.
FIREBASE_DB_URL = "https://cloudproject-1bbde-default-rtdb.firebaseio.com/"

# Connect to Firebase. The second positional argument stays None, as before
# (presumably "no authentication" -- confirm against the firebase package).
FBconn = firebase.FirebaseApplication(FIREBASE_DB_URL, None)

In [None]:
def fetch_page(url, timeout=10):
  """Download a page and collect the href of every anchor tag on it.

  :param url: Address of the page to download.
  :param timeout: Seconds passed to ``requests.get`` as its timeout. Optional;
      without it a stalled server would block the notebook indefinitely.
  :return: ``(soup, links)`` when the server answers with HTTP 200, where
      ``soup`` is the parsed BeautifulSoup document and ``links`` holds the
      non-empty href values exactly as written in the page (relative links
      are not resolved). ``(None, [])`` for any other status code, so the
      result can always be unpacked into two names.
  :raises requests.exceptions.RequestException: connection errors and
      timeouts are not caught here and propagate to the caller.
  """
  # Make sure the 'punkt' tokenizer data is present, but only hit the
  # download path when it is actually missing instead of on every fetch.
  try:
    nltk.data.find('tokenizers/punkt')
  except LookupError:
    nltk.download('punkt')

  response = requests.get(url, timeout=timeout)
  if response.status_code != 200:
    # Two-element result keeps `soup, links = fetch_page(url)` from raising.
    return None, []

  soup = BeautifulSoup(response.text, 'html.parser')
  hrefs = (anchorTag.get('href') for anchorTag in soup.find_all('a'))
  links = [href for href in hrefs if href]
  return soup, links



def extract_data_from_url(soup):
  """Split the text of a parsed page into whitespace-normalised sentences.

  :param soup: Parsed document exposing ``get_text()`` (a BeautifulSoup object).
  :return: List of sentence strings as produced by ``nltk.sent_tokenize``,
      each with runs of whitespace collapsed to a single space. Sentences
      that contain nothing but whitespace are dropped.
  """
  # Extract text content from the HTML
  text_content = soup.get_text()

  # Tokenize text into sentences using NLTK
  sentences = nltk.sent_tokenize(text_content)

  # Replace every whitespace run (newlines included) with one space.
  # Deleting the newline outright glued the words on either side of it
  # together ("Cloud\nstorage" -> "Cloudstorage").
  normalised = (' '.join(sentence.split()) for sentence in sentences)
  return [sentence for sentence in normalised if sentence]


def create_json_object(json_object, url, data):
  """Store ``data`` under the key ``url`` in ``json_object``, in place.

  An existing entry for ``url`` is overwritten. Nothing is returned.
  """
  json_object.update({url: data})


def create_index(url, jsonData):
  """Add one entry per link found on ``url`` to ``jsonData``.

  :param url: Page to download via ``fetch_page``.
  :param jsonData: Mapping that is updated in place through
      ``create_json_object``.
  :return: The same ``jsonData`` object. It is returned unchanged when the
      page could not be fetched.
  """
  fetched = fetch_page(url)
  # A failed download shows up either as a bare None or as a None soup.
  # Unpacking a bare None would raise TypeError, so check before unpacking.
  if fetched is None:
    return jsonData
  soup, links = fetched
  if soup is None:
    return jsonData

  # The sentences depend only on the page, not on the link, so tokenise once
  # instead of once per link.
  data = extract_data_from_url(soup)

  for link in links:
    # NOTE(review): every link is mapped to the sentences of the page it was
    # FOUND on, not of the page it points to -- confirm this is intended.
    # Each entry gets its own list so entries never alias one another.
    create_json_object(jsonData, link, list(data))

  return jsonData


def remove_stop_words(query):
  """Drop stop words from ``query`` and return the remaining words.

  The query is split on whitespace; a word is discarded when its lowercase
  form is in the stop-word list. Surviving words keep their original case
  and order and are joined with single spaces.
  """
  stop_words = frozenset((
      'a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by', 'for', 'if',
      'in', 'into', 'is', 'it', 'no', 'not', 'of', 'on', 'or', 'such', 'that',
      'the', 'their', 'then', 'there', 'these', 'they', 'this', 'to', 'was',
      'will', 'with',
  ))

  kept = []
  for token in query.split():
    if token.lower() in stop_words:
      continue
    kept.append(token)

  return ' '.join(kept)


def clean_special_characters(text):
    """Return ``text`` with every run of non-ASCII characters deleted."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)



In [None]:
pip install --upgrade certifi



In [None]:
# Value written to the 'iopub_data_rate_limit' key of the 'NotebookApp' section.
IOPUB_DATA_RATE_LIMIT = 1000000000000

# Fetch the NLTK 'punkt' package.
nltk.download('punkt')

# Store the rate limit through the notebook ConfigManager.
cm = ConfigManager()
cm.update('NotebookApp', {'iopub_data_rate_limit': IOPUB_DATA_RATE_LIMIT})

# Configure logging
logging.basicConfig(level=logging.INFO)

def is_valid_url(url):
    """Tell whether ``url`` is an absolute http(s) address.

    :param url: Candidate value, typically an ``href`` attribute.
    :return: ``True`` only for strings starting with ``http://`` or
        ``https://``. Anything else -- relative links, ``javascript:`` or
        ``mailto:`` targets, empty strings, ``None`` or other non-strings --
        yields ``False`` instead of raising.
    """
    # Missing href attributes arrive as None; reject them without raising.
    if not isinstance(url, str):
        return False
    # The http(s) prefix test already excludes 'javascript:' links, so no
    # separate check is needed.
    return url.startswith(('http://', 'https://'))


def fetch_page_iterative(url, depth, visited=None, timeout=10):
    """Collect the absolute http(s) links reachable from ``url`` within ``depth`` hops.

    The crawl is breadth-first over an explicit queue. Every page is therefore
    first reached at its shallowest level and crawled with the largest depth
    budget it can get. The earlier depth-first recursion could mark a page as
    visited when first reaching it with no budget left, and then skip it when
    the same page turned up again closer to the start.

    :param url: Start page. Must pass ``is_valid_url``.
    :param depth: Number of link levels to fetch. ``1`` fetches only ``url``
        and returns the links on it; ``0`` or a negative value fetches nothing.
    :param visited: Optional set of URLs to treat as already crawled. It is
        updated in place with every URL queued during this call.
    :param timeout: Seconds passed to ``requests.get`` as its timeout.
    :return: De-duplicated list of links in the order they were first seen.
        Pages that fail to download or do not answer with HTTP 200 contribute
        no links; the failure is printed and the crawl continues.
    """
    if visited is None:
        visited = set()

    # Early return if the URL has already been visited or is not crawlable
    if url in visited or not is_valid_url(url):
        return []

    visited.add(url)  # Mark the start URL as visited

    found = {}  # dict used as an insertion-ordered set of discovered links
    queue = deque([(url, depth)])

    while queue:
        page_url, remaining = queue.popleft()
        if remaining <= 0:
            # Depth budget exhausted: the page stays marked as visited but is
            # not downloaded.
            continue

        try:
            response = requests.get(page_url, timeout=timeout)
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            for anchorTag in soup.find_all('a'):
                link = anchorTag.get('href', '')
                if not is_valid_url(link):
                    continue
                found[link] = None
                if link not in visited:
                    visited.add(link)
                    queue.append((link, remaining - 1))
        except Exception as e:
            # Best effort: one broken page must not abort the whole crawl.
            print(f"Error fetching {page_url}: {e}")

    return list(found)

def save_to_json_file(data, file_path):
    """Serialise ``data`` as JSON into ``file_path``.

    The file is opened for writing as UTF-8 and non-ASCII characters are
    written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        handle.write(json.dumps(data, ensure_ascii=False))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

# Initialize Firebase Admin SDK
cred = credentials.Certificate('cloudproject-1bbde-firebase-adminsdk-849hb-47666b58c8.json') # Update the path to your downloaded JSON file

# initialize_app() raises ValueError when the default app already exists, which
# happens as soon as this cell is run a second time in the same kernel.
# Only initialise when no default app is registered yet.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

# Get a Firestore client
db = firestore.client()


def build_index(urls):
    """
    Builds an inverted index from a list of URLs, treating uppercase and lowercase words as the same and excluding non-word characters.

    Each URL is fetched once, even if it is listed several times. Sentences are
    stripped of non-ASCII characters, lowercased and tokenised first; stop words
    are removed from the resulting tokens, so a stop word that sits next to
    punctuation ("the," / "is.") is filtered as well. Purely numeric tokens are
    excluded.

    :param urls: A list of URLs to index.
    :return: An inverted index where keys are terms (in lowercase) and values are lists of URLs where the term appears, excluding punctuation.
             Terms appear in first-seen order; each URL is listed at most once per term.
             URLs that cannot be fetched or processed are reported with print() and skipped.
    """
    inverted_index = {}

    # dict.fromkeys drops repeated URLs while keeping their first-seen order.
    for url in dict.fromkeys(urls):
        try:
            fetched = fetch_page(url)
            # A failed download shows up as a bare None or as a None soup.
            soup = fetched[0] if fetched is not None else None
            if soup is None:
                print(f"Error processing {url}: page could not be fetched")
                continue

            # Distinct terms of this page, in first-seen order. Collecting them
            # per page avoids scanning the URL list of every term for duplicates.
            page_terms = {}
            for sentence in extract_data_from_url(soup):
                sentence = clean_special_characters(sentence).lower()
                # Use regular expressions to find all words in the sentence, excluding numbers
                tokens = re.findall(r'\b(?![0-9]+\b)\w+\b', sentence)
                # Filter stop words on the clean tokens, not on the raw text.
                for term in remove_stop_words(' '.join(tokens)).split():
                    page_terms[term] = None

            for term in page_terms:
                inverted_index.setdefault(term, []).append(url)
        except Exception as e:
            # Best effort: one broken page must not abort the whole build.
            print(f"Error processing {url}: {e}")

    return inverted_index

def upload_index_to_firestore(inverted_index, batch_size=500):
    """
    Uploads the inverted index to Firestore using batched writes.

    Each term becomes a document in the 'inverted_index' collection whose
    'urls' field holds the URLs where the term appears; an existing document
    for the same term is overwritten. Writes are grouped into batches so a
    large index needs one round trip per batch instead of one per term.

    :param inverted_index: The inverted index to upload (term -> list of URLs).
    :param batch_size: Maximum number of writes committed together. The default
        of 500 matches the documented per-batch write limit of Firestore.
    :raises ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Reference a collection. If it doesn't exist, it will be created.
    index_collection = db.collection('inverted_index')

    batch = db.batch()
    pending = 0
    for term, urls in inverted_index.items():
        batch.set(index_collection.document(term), {'urls': urls})
        pending += 1
        if pending >= batch_size:
            batch.commit()
            # Start a fresh batch for the next group of terms.
            batch = db.batch()
            pending = 0

    # Flush the final, partially filled batch.
    if pending:
        batch.commit()

    print("Inverted index uploaded to Firestore.")

# Driver: crawl from base_url, build the inverted index, then upload it.


base_url = 'https://www.ibm.com/cloud'
depth = 1  # Levels to fetch: with 1 only base_url itself is downloaded and its links are returned

# Fetch links up to the specified depth
urls = fetch_page_iterative(base_url, depth)

# Build the index from the fetched URLs (each URL is downloaded again here)
inverted_index = build_index(urls)


# Upload the index to Firestore
upload_index_to_firestore(inverted_index)


Fetched 14 URLs at depth 1
['https://www.ibm.com/cloud/case-studies', 'https://www.ibm.com/blog/ubotica-partners-with-ibm-for-one-click-deployment-of-space-ai-applications/', 'https://www.ibm.com/products/cloud-object-storage', 'https://www.ibm.com/products/security-and-compliance-center', 'https://www.ibm.com/downloads/cas/MZA4QK5M', 'https://www.ibm.com/downloads/cas/RGKYOOKB', 'https://newsroom.ibm.com/How-IBM-is-Helping-Clients-Deploy-Foundation-Models-and-AI-Workloads-with-New-GPU-Offering-on-IBM-Cloud', 'https://www.businesswire.com/news/home/20221117005261/en/IDC-Introduces-the-Trust-Perception-Index-Measuring-Trust-Perceptions-for-the-Top-Six-Cloud-Providers', 'https://www.ibm.com/cloud/data-centers', 'https://c212.net/c/link/?t=0&l=en&o=3120992-1&h=3670330057&u=https%3A%2F%2Fnewsroom.ibm.com%2F2020-07-22-IBM-and-Bank-of-America-Advance-IBM-Cloud-for-Financial-Services-BNP-Paribas-Joins-as-Anchor-Client-in-Europe&a=BNP+Paribas', 'https://www.ibm.com/blog/ibm-cloud-delivers-ente

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[

{'cloud': ['https://www.ibm.com/cloud/case-studies', 'https://www.ibm.com/blog/ubotica-partners-with-ibm-for-one-click-deployment-of-space-ai-applications/', 'https://www.ibm.com/products/cloud-object-storage', 'https://www.ibm.com/products/security-and-compliance-center', 'https://www.ibm.com/downloads/cas/RGKYOOKB', 'https://newsroom.ibm.com/How-IBM-is-Helping-Clients-Deploy-Foundation-Models-and-AI-Workloads-with-New-GPU-Offering-on-IBM-Cloud', 'https://www.businesswire.com/news/home/20221117005261/en/IDC-Introduces-the-Trust-Perception-Index-Measuring-Trust-Perceptions-for-the-Top-Six-Cloud-Providers', 'https://www.ibm.com/cloud/data-centers', 'https://c212.net/c/link/?t=0&l=en&o=3120992-1&h=3670330057&u=https%3A%2F%2Fnewsroom.ibm.com%2F2020-07-22-IBM-and-Bank-of-America-Advance-IBM-Cloud-for-Financial-Services-BNP-Paribas-Joins-as-Anchor-Client-in-Europe&a=BNP+Paribas', 'https://www.ibm.com/blog/ibm-cloud-delivers-enterprise-sovereign-cloud-capabilities/', 'https://www.ibm.com/blo

KeyboardInterrupt: 