In [None]:
import os
import re
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

In [5]:

# The only bigrams that get indexed; every other adjacent word pair is ignored.
# Entries are lower-case because document text is lower-cased before matching.
TARGET_BIGRAMS = {
    "bruce willis",
    "computer science",
    "information retrieval",
    "los angeles",
    "power politics",
}

def preprocessor(text):
    """Normalize raw document text before tokenization.

    Each character that is neither a word character nor whitespace becomes a
    single space, each run of digits becomes a single space, and the result
    is lower-cased. Underscores count as word characters and are kept.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, lower-cased string (whitespace is not collapsed).
    """
    without_punctuation = re.sub(r'[^\w\s]', ' ', text)
    without_digits = re.sub(r'\d+', ' ', without_punctuation)
    return without_digits.lower()

def reader(file_path):
    """Read one document file and return its ID and normalized tokens.

    Only the first line of the file is read. It is expected to have the form
    ``<docID><TAB><text>``; the text part is cleaned with ``preprocessor``
    and split on whitespace.

    Malformed input no longer raises: an empty file yields ``("", [])`` and a
    line without a tab yields the whole (stripped) line as ``docID`` with an
    empty word list. Neither case can produce bigrams downstream.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(docID, words)`` tuple where ``words`` is a list of tokens.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        first_line = f.readline().strip()

    # partition() always returns three parts, so a missing tab (empty file,
    # or a document whose text is empty and whose trailing tab was stripped)
    # gives content == '' instead of a ValueError on unpacking.
    docID, _, content = first_line.partition('\t')
    words = preprocessor(content).split()
    return docID, words

def mapper(file_path):
    """Count occurrences of the target bigrams in a single document file.

    The document is loaded with ``reader``; every pair of adjacent tokens is
    joined with one space and counted if it is a member of ``TARGET_BIGRAMS``.

    Args:
        file_path: Path of the document file to process.

    Returns:
        A ``(docID, bigram_count)`` tuple, where ``bigram_count`` is a
        ``defaultdict(int)`` mapping each matched bigram to its count.
    """
    docID, words = reader(file_path)
    hits = defaultdict(int)

    # Nothing to pair up when the document produced no tokens.
    if not words:
        return docID, hits

    # zip(words, words[1:]) walks every adjacent (left, right) token pair.
    for left, right in zip(words, words[1:]):
        pair = f"{left} {right}"
        if pair in TARGET_BIGRAMS:
            hits[pair] += 1

    return docID, hits

def reducer(bigram_counts):
    """Merge per-document bigram counts into a bigram -> docID -> count index.

    Args:
        bigram_counts: Iterable of ``(docID, counts)`` pairs, where ``counts``
            maps a bigram to its number of occurrences in that document.
            Pairs whose ``docID`` is falsy are skipped.

    Returns:
        A nested ``defaultdict``: ``index[bigram][docID]`` is the summed count.
        Counts for a docID that appears more than once are added together.
    """
    index = defaultdict(lambda: defaultdict(int))
    for doc_id, per_doc in bigram_counts:
        if not doc_id:
            continue
        for term, occurrences in per_doc.items():
            index[term][doc_id] += occurrences
    return index


In [6]:
# NOTE(review): machine-specific absolute path; point this at the local copy
# of the data directory before running on another machine.
input_folder = r"C:\Users\harib\Desktop\USC\Classes\IR\HW03\devdata"
files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith('.txt')]
bigram_counts = []

# Map phase: one mapper call per file, run on a thread pool.
with ThreadPoolExecutor() as executor:
    future_to_file = {executor.submit(mapper, file): file for file in files}
    for future in as_completed(future_to_file):
        # result() re-raises any exception the mapper hit for this file.
        result = future.result()
        if result and result[0]:
            bigram_counts.append(result)

# Reduce phase: merge the per-document counts into bigram -> docID -> count.
final_counts = reducer(bigram_counts)

output_file = "selected_bigram_index.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    # Results arrive in thread-completion order, which differs between runs.
    # Sorting bigrams and docIDs makes the written index reproducible.
    for bigram in sorted(final_counts):
        docID_counts = final_counts[bigram]
        counts = ' '.join(f"{docID}: {count}" for docID, count in sorted(docID_counts.items()))
        f.write(f"{bigram} {counts}\n")

print(f"Bigram written to {output_file}")

Bigram written to selected_bigram_index.txt
