In [1]:

import time
from collections import Counter 
import requests 

In [2]:
# Misra-Gries Algorithm for Word Count
def misra_gries_word_count(words, k):
    """
    Find frequent-word candidates with Misra-Gries, then count them exactly.

    The first pass maintains at most ``k - 1`` counters. A word that is
    already tracked is incremented, an untracked word takes a free counter
    if one exists, and otherwise every counter is decremented (counters
    reaching zero are dropped and the incoming word is discarded). Any word
    that occurs more than ``n / k`` times in a stream of ``n`` words is
    guaranteed to still hold a counter at the end of this pass.

    The second pass re-reads the stream and tallies the true number of
    occurrences of each surviving candidate, so the returned values are
    exact counts, not the approximate first-pass counters.

    :param words: Iterable of hashable tokens. One-shot iterators and
        generators are accepted; they are materialised into a list because
        the data has to be traversed twice.
    :param k: Frequency parameter; at most ``k - 1`` candidates are tracked.
        With ``k < 2`` no counter can ever be allocated and the result is
        an empty dictionary.
    :return: Dictionary mapping each surviving candidate to its exact count.
    """
    # An iterator returns itself from iter(); such an input would be consumed
    # by the first pass and leave nothing for the second. Re-iterable
    # containers (lists, tuples, ...) are used as they are, without copying.
    if iter(words) is words:
        words = list(words)

    counters = {}

    # First pass: maintain the bounded candidate summary.
    for word in words:
        if word in counters:
            counters[word] += 1
        elif len(counters) < k - 1:
            counters[word] = 1
        else:
            # No free counter: decrement everything and drop the candidates
            # that hit zero. Rebuilding the dict keeps the insertion order
            # of the survivors.
            counters = {
                key: count - 1 for key, count in counters.items() if count > 1
            }

    # Second pass: replace the lossy counters with exact tallies.
    refined_counts = dict.fromkeys(counters, 0)
    for word in words:
        if word in refined_counts:
            refined_counts[word] += 1

    return refined_counts


In [None]:
def load_text_from_url(url, timeout=30):
    """
    Download a text resource over HTTP(S) and return its decoded body.

    :param url: Address of the text file to fetch.
    :param timeout: Seconds to wait for the server before giving up. Without
        a timeout ``requests`` may block indefinitely on a stalled
        connection, which would hang the notebook cell.
    :return: The response body as a string (``response.text``).
    :raises requests.RequestException: On connection problems, timeouts, or
        a non-success HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Turn 4xx/5xx responses into an exception
    return response.text

In [3]:
if __name__ == "__main__":
    # Plain-text books to analyse; each one is fetched and summarised in turn.
    book_urls = [
        "https://raw.githubusercontent.com/MonaTlili/GIK2Q3_Assignment1/main/Around%20the%20World%20in%20Eighty%20Days.txt",
        "https://raw.githubusercontent.com/MonaTlili/GIK2Q3_Assignment1/main/A%20Journey%20to%20the%20Centre%20of%20the%20Earth.txt",
        "https://raw.githubusercontent.com/MonaTlili/GIK2Q3_Assignment1/main/Twenty%20Thousand%20Leagues%20under%20the%20Sea.txt"
    ]

    try:
        k = 100  # Counter parameter handed to misra_gries_word_count
        combined_counts = Counter()

        # NOTE: the timed region below covers the downloads as well as the
        # counting, so the reported duration includes network time.
        start_time = time.perf_counter()

        for url in book_urls:
            book_text = load_text_from_url(url)
            words = book_text.split()  # Whitespace tokenisation only
            word_counts = misra_gries_word_count(words, k)

            # Add this book's tallies to the running totals. NOTE: a word
            # contributes only from the books in which it was returned by
            # misra_gries_word_count, so totals are per-book sums.
            combined_counts.update(word_counts)

        end_time = time.perf_counter()
        duration = end_time - start_time

        # Report the 20 highest totals, largest first (ties keep the order
        # in which the words were first added).
        print("Approximate Word Counts (Top 20):")
        sorted_word_counts = combined_counts.most_common()
        for word, count in sorted_word_counts[:20]:
            print(f"{word}: {count}")

        # Report how long the serial run took
        print(f"\nSerial Duration: {duration:.6f} seconds")

    except requests.RequestException as e:
        print(f"Error fetching a book: {e}")


Approximate Word Counts (Top 20):
the: 13862
of: 6642
and: 5997
a: 4549
to: 4531
in: 3908
that: 2691
his: 2428
with: 1695
as: 1600
this: 1171
by: 1124
from: 1072
not: 1050
you: 789
or: 759
are: 589
which: 559
we: 394
any: 339
