In [8]:
# Importing required libraries

import time
import re
from multiprocessing import Pool, cpu_count   
import numpy as np

In [9]:
# Get all the books
def load_text_file(file_path):
    """
    Read a UTF-8 encoded text file and return its entire content.

    :param file_path: Path of the file to open for reading
    :return: The full file content as a single string
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        return source.read()

# Read the full text of each book from the working directory
A_Journey = load_text_file("A Journey to the Centre of the Earth.txt")
Around_The_World = load_text_file("Around the World in Eighty Days.txt")
Twenty_Thousand = load_text_file("Twenty Thousand Leagues under the Sea.txt")

# Collect the raw texts so they can be processed uniformly
books = [A_Journey, Around_The_World, Twenty_Thousand]

# Cut every book into sentence-sized pieces; '.', '!' and '?' all act as
# separators (and are dropped), giving one list of pieces per book
cleaned_books = [re.split(r'[.!?]', book_text) for book_text in books]

# Flatten the per-book lists into a single list holding the pieces of all books
cleaned_chunks = []
for book_pieces in cleaned_books:
    cleaned_chunks.extend(book_pieces)
 

In [10]:
# Misra-Gries Algorithm for Word Count
def misra_gries_word_count(words, k):
    """
    Find frequent-word candidates with Misra-Gries, then count them exactly.

    The first pass keeps at most k - 1 counters to select candidate words.
    The second pass walks over the words again and tallies the true number
    of occurrences of every surviving candidate, so `words` is iterated twice.

    :param words: List of words to process
    :param k: Parameter determining the number of counters (at most k - 1 are kept)
    :return: Dictionary mapping each surviving candidate word to its count in `words`
    """
    capacity = k - 1
    candidates = {}

    # Pass 1: select candidates using a bounded set of counters
    for token in words:
        if token in candidates:
            candidates[token] += 1
            continue
        if len(candidates) < capacity:
            candidates[token] = 1
            continue
        # No free counter for a new word: lower every counter by one and
        # drop the ones that would reach zero
        candidates = {
            kept: tally - 1
            for kept, tally in candidates.items()
            if tally > 1
        }

    # Pass 2: recount the surviving candidates from scratch
    tallies = dict.fromkeys(candidates, 0)
    for token in words:
        if token in tallies:
            tallies[token] += 1

    return tallies

In [11]:
# Function for splitting each chunk into words in the parallel_wordcount function
def split_chunk(chunk):
    """
    Break a piece of text into words on runs of whitespace.

    :param chunk: String to split
    :return: List of the whitespace-separated words (empty list if there are none)
    """
    return chunk.split()

In [13]:
# Function for processing the chunks for the Misra-Gries algorithm
# Needs to be defined like this so that each core can use this function; it can't be a lambda function
def process_chunk_for_misra_gries(chunk, k):
    """
    Split one text chunk into words and run Misra-Gries on them.

    :param chunk: Piece of text to split into words
    :param k: Counter parameter forwarded to misra_gries_word_count
    :return: The result of misra_gries_word_count for the chunk's words
    """
    chunk_words = split_chunk(chunk)
    return misra_gries_word_count(chunk_words, k)

In [17]:
# Parallel MapReduce implementation
def parallel_wordcount(text_chunks, k=100):
    """
    Run the Misra-Gries word count on every text chunk in parallel.

    Each chunk is handled independently by a worker process. The per-chunk
    results are returned as they are; no merging across chunks is done here.

    :param text_chunks: Iterable of text pieces (strings) to process
    :param k: Number of slots for Misra-Gries, passed unchanged to every worker
              (defaults to 100)
    :return: List with one word-count dictionary per chunk, in input order
    """
    # One (chunk, k) argument tuple per task
    tasks = [(chunk, k) for chunk in text_chunks]

    # A multiprocessing pool with one worker per CPU reported by cpu_count()
    with Pool(cpu_count()) as pool:
        # starmap unpacks each tuple into the two positional arguments of
        # process_chunk_for_misra_gries. Plain map would hand over the whole
        # tuple as a single argument and fail with a TypeError for missing `k`.
        word_counts = pool.starmap(process_chunk_for_misra_gries, tasks)

    return word_counts

In [None]:
# Time the parallel Misra-Gries word count from start to finish
start_time = time.perf_counter()
parallel_counts = parallel_wordcount(cleaned_chunks)
parallel_duration = time.perf_counter() - start_time  # elapsed seconds

# Report the counts and the elapsed time, one item per line
print(
    "Parallel Word Count (Misra-Gries Algorithm):",
    parallel_counts,
    f"Parallel Duration: {parallel_duration:.6f} seconds\n",
    sep="\n",
)