In [3]:
import os
import re
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

In [4]:

def preprocessor(text):
    """Normalize raw document text before tokenization.

    Every punctuation/symbol character (including the underscore) and every
    run of digits is replaced by a single space, then the text is lowercased.
    Existing whitespace is left as-is, so callers are expected to tokenize
    with ``str.split()``.

    Args:
        text (str): Raw text to normalize.

    Returns:
        str: The normalized, lowercased text.
    """
    # `\w` matches "_" as a word character, so `[^\w\s]` alone would let
    # underscores survive and produce terms such as "foo_bar" or "___".
    # Treat the underscore explicitly as a separator.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    # Numerals are not indexed; each run of digits collapses to one space.
    text = re.sub(r'\d+', ' ', text)
    return text.lower()

def reader(file_path):
    """Read one document file and return its ID and normalized tokens.

    Only the first line of the file is consulted. It is expected to have the
    form ``docID<TAB>content``; the content part is passed through
    ``preprocessor`` and split on whitespace.

    Args:
        file_path (str): Path of a UTF-8 encoded text file.

    Returns:
        tuple[str, list[str]]: ``(docID, words)``. When the first line is
        empty or contains no tab separator, ``('', [])`` is returned so the
        caller can skip the file instead of the whole run failing.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        first_line = f.readline().strip()

    # partition never raises: a missing tab yields an empty separator, where
    # unpacking the result of split('\t', 1) would raise ValueError.
    docID, separator, content = first_line.partition('\t')
    if not separator:
        return '', []

    return docID, preprocessor(content).split()

def mapper(file_path):
    """Map step: count how often each token occurs in a single document.

    Args:
        file_path (str): Path of the document file, forwarded to ``reader``.

    Returns:
        tuple: ``(docID, unigram_count)`` where ``unigram_count`` is a
        ``defaultdict(int)`` mapping each token to its number of occurrences
        in the document. The mapping is empty when the document has no tokens.
    """
    docID, words = reader(file_path)
    unigram_count = defaultdict(int)
    # `words or ()` keeps the loop a no-op for an empty/missing token list.
    for token in (words or ()):
        if not token:
            continue
        unigram_count[token] += 1
    return docID, unigram_count

def reducer(unigram_counts):
    """Reduce step: merge per-document counts into one inverted index.

    Args:
        unigram_counts: Iterable of ``(docID, unigram_count)`` pairs, where
            ``unigram_count`` maps a word to its count in that document.
            Pairs with a falsy ``docID`` are ignored.

    Returns:
        defaultdict: ``word -> defaultdict(int)`` mapping each docID to the
        total count of that word in the document. Counts for a docID that
        appears more than once in the input are summed.
    """
    count_dict = defaultdict(lambda: defaultdict(int))
    for docID, unigram_count in unigram_counts:
        if not docID:
            continue
        for word, count in unigram_count.items():
            postings = count_dict[word]
            postings[docID] += count
    return count_dict
    


In [5]:
# NOTE: machine-specific absolute path; point this at the corpus folder when
# running the notebook elsewhere.
input_folder = r"C:\Users\harib\Desktop\USC\Classes\IR\HW03\fulldata"

# os.listdir returns entries in arbitrary order; sort so that every run
# processes the documents in the same sequence.
files = sorted(
    os.path.join(input_folder, f)
    for f in os.listdir(input_folder)
    if f.endswith('.txt')
)

# Map phase. executor.map yields results in submission order (as_completed
# yields in completion order), so the line and posting order of the index
# file is reproducible across runs. Results without a docID are dropped.
with ThreadPoolExecutor() as executor:
    unigram_counts = [
        result
        for result in executor.map(mapper, files)
        if result and result[0]
    ]

# Reduce phase: word -> {docID: count}.
final_counts = reducer(unigram_counts)

# One line per word: "<word> <docID>: <count> <docID>: <count> ..."
output_file = "unigram_index.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    for word, docID_counts in final_counts.items():
        counts = ' '.join(f"{docID}: {count}" for docID, count in docID_counts.items())
        f.write(f"{word} {counts}\n")

print(f"Unigram written to: {output_file}")


Unigram written to: unigram_index.txt
