In [4]:
from collections import deque

In [3]:
from pathlib import Path

# Read words_alpha.txt into its own `words_alpha` list, so the small example
# `words` list defined in a later cell is left untouched.
wordlist_path = Path("words_alpha.txt")
if not wordlist_path.exists():
    raise FileNotFoundError(f"{wordlist_path!s} not found. Place the file in the notebook working directory.")

with wordlist_path.open("r", encoding="utf-8") as f:
    lines = f.read().splitlines()

# Strip each line once, keep purely alphabetic entries, lowercase them,
# then deduplicate via a set and sort the result.
# (An empty string is falsy, so blank lines are rejected before isalpha().)
words_alpha = sorted(
    {token.lower() for token in map(str.strip, lines) if token and token.isalpha()}
)
original_length = len(words_alpha)

# Keep only words of 4 or more characters; order is preserved, so the list stays sorted.
words_alpha = [entry for entry in words_alpha if len(entry) > 3]
filtered_length = len(words_alpha)

# Report how many entries the length filter removed, then show a short sample.
print(f"Filtered out {original_length - filtered_length} short words. Remaining: {filtered_length}")
print(f"Loaded {len(words_alpha)} words. Sample: {words_alpha[:10]}")

Filtered out 2583 short words. Remaining: 367522
Loaded 367522 words. Sample: ['aahed', 'aahing', 'aahs', 'aalii', 'aaliis', 'aals', 'aani', 'aardvark', 'aardvarks', 'aardwolf']


In [1]:
# Small hand-written vocabulary used as a stand-in until a real word list is plugged in.
words = [
    "graph",
    "net",
    "network",
    "work",
    "data",
    "science",
    "model",
    "learn",
    "learning",
    "deep",
    "bio",
    "chem",
    "quantum",
]

# Shared normalisation: trim whitespace, drop blanks, lowercase, deduplicate, sort.
words = sorted({token.lower() for token in map(str.strip, words) if token})

# Cell output: total count plus the first ten entries.
(len(words), words[:10])

(13,
 ['bio',
  'chem',
  'data',
  'deep',
  'graph',
  'learn',
  'learning',
  'model',
  'net',
  'network'])

In [None]:
class AhoCorasick:
    def __init__(self):
        self.next = [dict()]
        self.fail = [0]
        self.out = [[]]
        self.words = []
    
    def add_word(self, word, index):
        word_id = len(self.words)
        self.words.append(word)
        
        state = 0
        for ch in word:
            nxt = self.next[state].get(ch)
            if nxt is None:
                nxt = len(self.next)
                self.next[state][ch] = nxt
                self.next.append({})
                self.fail.append(0)
                self.out.append([])
            state=nxt
        self.out[state].append(word_id)