<a href="https://colab.research.google.com/github/Method-for-Software-System-Development/Cloud_Computing/blob/tirgul-Omer-Matan/tirgul6_Omer_Matan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
# =========================================================
# 1) Install required libraries  (run once per Colab session)
# =========================================================
!pip install -q firebase requests beautifulsoup4 pandas altair ipywidgets


In [46]:
# =========================================================
# 2) Imports and global constants
# =========================================================
import collections, re, requests, pandas as pd, altair as alt
from bs4 import BeautifulSoup
from firebase import firebase

# --- target page -------------------------------------------------------------
# Address of the page that is downloaded (via fetch_page) and word-counted.
URL = "https://en.wikipedia.org/wiki/One_Piece"

# --- Firebase Realtime‑DB URL (given in the assignment) ----------------------
# Passed to firebase.FirebaseApplication when the client is created.
DB_URL = "https://tirgul6-d9703-default-rtdb.europe-west1.firebasedatabase.app/"


In [48]:
# =========================================================
# 3) Helper functions
# =========================================================
def fetch_page(url: str) -> BeautifulSoup:
    """Download a web page and return it parsed as a BeautifulSoup tree.

    The raw response bytes are handed to BeautifulSoup rather than the
    pre-decoded ``response.text``.  When a text response carries no charset
    in its ``Content-Type`` header, requests decodes ``.text`` as ISO-8859-1,
    which garbles UTF-8 pages.  Parsing the bytes lets BeautifulSoup pick the
    encoding from the document itself (BOM / ``<meta charset>``); a charset
    that the server *did* declare is still passed along as the first guess.

    Args:
        url: Address of the page to download.

    Returns:
        The document parsed with Python's built-in ``html.parser``.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other request failure, including the
            10-second timeout being exceeded.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # Only trust response.encoding when the header really named a charset;
    # otherwise it is just requests' ISO-8859-1 fallback for text/* bodies.
    content_type = response.headers.get("Content-Type", "")
    declared = response.encoding if "charset=" in content_type.lower() else None

    return BeautifulSoup(response.content, "html.parser", from_encoding=declared)

def build_index(soup: BeautifulSoup, stop_words: set[str]) -> dict[str,int]:
    """Count how often each word occurs in the text of a parsed page.

    A word is a run of two or more ASCII letters.  Every word is lower-cased
    before it is compared against ``stop_words`` and counted, so
    ``stop_words`` is expected to contain lower-case entries.

    Args:
        soup: Parsed document; only its ``get_text()`` method is used.
        stop_words: Lower-case words to leave out of the index.

    Returns:
        A ``collections.Counter`` (a ``dict`` subclass) mapping each remaining
        word to its number of occurrences, in order of first appearance.
        Looking up a word that never occurred yields 0 and, unlike a
        ``defaultdict``, does not add that word to the index.
    """
    tokens = (t.lower() for t in re.findall(r"[A-Za-z]{2,}", soup.get_text()))
    return collections.Counter(t for t in tokens if t not in stop_words)


In [49]:
# =========================================================
# 4) Step‑1 : scrape page and compute top‑10 words
# =========================================================
soup = fetch_page(URL)

# Frequent English function words to leave out of the index (alphabetical).
basic_stop = {
    "a", "also", "an", "and", "are",
    "as", "at", "be", "by", "for",
    "from", "in", "is", "it", "its",
    "of", "on", "or", "that", "the",
    "this", "to", "was", "which", "with",
}

index_full = build_index(soup, basic_stop)

# Ten highest counts, largest first; words with equal counts keep the order
# in which they were first seen.
top10 = dict(collections.Counter(index_full).most_common(10))

print("Top‑10 word index:\n", top10)


Top‑10 word index:
 {'one': 315, 'piece': 298, 'original': 242, 'archived': 233, 'retrieved': 230, 'manga': 194, 'anime': 134, 'november': 134, 'december': 121, 'series': 114}


In [51]:
# =========================================================
# 5) Step‑2 : write to Firebase Realtime‑DB
# =========================================================
# Client for the database at DB_URL.  The second argument is None —
# presumably "no authentication"; confirm against the firebase package docs.
fb = firebase.FirebaseApplication(DB_URL, None)

# store under /one_piece/index
# (first argument is the parent path, second the child name, third the value)
fb.put("/one_piece", "index", top10)

# NOTE(review): printed as soon as put() returns; the return value is not
# inspected here.
print("✓  Index stored successfully.")


✓  Index stored successfully.


In [55]:
# =========================================================
# 6) Step‑3 : read index back into a pandas DataFrame
# =========================================================
# Fetch the stored node; it is used below as a word -> count mapping.
data = fb.get("/one_piece/index", None)

# One column per field, rows in the order the mapping yields its entries.
df = pd.DataFrame({"word": list(data.keys()), "count": list(data.values())})


df


Unnamed: 0,word,count
0,anime,134
1,archived,233
2,december,121
3,manga,194
4,november,134
5,one,315
6,original,242
7,piece,298
8,retrieved,230
9,series,114


In [53]:
# =========================================================
# 7) Step‑4 : visualise with Altair bar chart
# =========================================================
# Words on the x axis (sorted with "-y"), counts on the y axis.
word_axis = alt.X("word:N", sort="-y", title="Word")
count_axis = alt.Y("count:Q", title="Frequency")

chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(x=word_axis, y=count_axis, tooltip=["word","count"])
    .properties(
        title="Top‑10 Word Frequency — One Piece (Wikipedia)",
        width=500,
        height=300,
    )
)

# Last expression of the cell, so the notebook renders the chart.
chart
