In [1]:
import requests 

In [2]:
# Misra-Gries Algorithm for Word Count
def misra_gries_word_count(words, k):
    """
    Find frequent-word candidates with Misra-Gries, then count them exactly.

    The first pass keeps at most ``k - 1`` counters. A word already tracked is
    incremented; a new word takes a free counter if one exists; otherwise every
    counter is decremented and counters that reach zero are dropped. With
    ``k - 1`` counters, any word occurring more than ``len(words) / k`` times
    is still tracked when the pass ends.

    The second pass re-scans the input and replaces the (under-estimated)
    first-pass counters with the true number of occurrences of each surviving
    candidate.

    :param words: Iterable of hashable items (typically a list of words).
        One-shot iterables such as generators are accepted; they are copied
        into a list because the input has to be scanned twice.
    :param k: Parameter determining the number of counters (``k - 1``).
        For ``k <= 1`` no counter can be allocated and the result is empty.
    :return: Dictionary mapping each surviving candidate to its exact count
        in ``words``.
    """
    # Two passes are made over the data. A generator or other one-shot
    # iterator would be exhausted by the first pass and every candidate would
    # come back with a count of 0, so materialise anything that is not already
    # a re-iterable list/tuple.
    if not isinstance(words, (list, tuple)):
        words = list(words)

    counters = {}

    # First pass: maintain at most k-1 candidate counters.
    for word in words:
        if word in counters:
            counters[word] += 1
        elif len(counters) < k - 1:
            counters[word] = 1
        else:
            # No free counter: decrement every candidate and evict the ones
            # that drop to zero. The incoming word itself is not stored.
            # Iterate over a snapshot of the keys because entries are deleted
            # during the loop.
            for key in list(counters):
                counters[key] -= 1
                if counters[key] == 0:
                    del counters[key]

    # Second pass: exact counts, restricted to the surviving candidates.
    # dict.fromkeys keeps the candidates in the same order as `counters`.
    refined_counts = dict.fromkeys(counters, 0)
    for word in words:
        if word in refined_counts:
            refined_counts[word] += 1

    return refined_counts


In [3]:
if __name__ == "__main__":
    # Plain-text copy of the book fetched over HTTP from GitHub.
    file_url = "https://raw.githubusercontent.com/MonaTlili/Assignment-MAS/main/Moby_dick/pg2701.txt"

    try:
        # requests has no default timeout; without one a stalled connection
        # would block this cell forever. Timeouts raise a subclass of
        # requests.RequestException, so they are reported by the handler below.
        response = requests.get(file_url, timeout=30)
        # Turn HTTP 4xx/5xx responses into an exception instead of
        # processing an error page as if it were the book.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching the book: {e}")
    else:
        # Only the network call is guarded; the analysis runs once the
        # download is known to have succeeded.
        moby_dick_text = response.text

        # Whitespace tokenisation only: case and attached punctuation are
        # kept, so e.g. "The" and "the" are counted as different words.
        words = moby_dick_text.split()
        # k = 100 means the summary tracks at most 99 candidate words.
        k = 100
        word_counts = misra_gries_word_count(words, k)

        print("Approximate Word Counts (Top 20):")
        # Sort by descending count; the sort is stable, so ties keep the
        # order in which the words appear in the returned dictionary.
        sorted_word_counts = sorted(word_counts.items(), key=lambda x: -x[1])
        for word, count in sorted_word_counts[:20]:
            print(f"{word}: {count}")

Approximate Word Counts (Top 20):
the: 13862
of: 6642
and: 5997
a: 4549
to: 4531
in: 3908
that: 2691
his: 2428
with: 1695
as: 1600
this: 1171
by: 1124
from: 1072
not: 1050
you: 789
or: 759
are: 589
which: 559
we: 394
any: 339
