In [1]:
# adapted from Alex Martelli's example in "Re-learning Python"
# http://www.aleax.it/Python/accu04_Relearn_Python_alex.pdf
# (slide 41) Ex: lines-by-word file index

# BEGIN INDEX_DEFAULT
"""Build an index mapping word -> list of occurrences"""

'Build an index mapping word -> list of occurrences'

In [2]:
import sys
import re
import collections

In [3]:
# A "word" is a maximal run of `\w` characters (alphanumerics and underscore).
WORD_RE = re.compile(
    r'\w+',
)

In [4]:
# Map each word to the list of (line_no, column_no) positions where it occurs.
# A defaultdict creates the empty list on first access, so no membership test
# is needed before appending.
index = collections.defaultdict(list)     # <1>
# NOTE(review): the recorded output of this notebook (keys such as
# `control_port`, `hb_port`, `signature_scheme`) suggests that under Jupyter
# `sys.argv[2]` is the kernel connection file rather than a user-chosen
# text file -- confirm this is the intended input.
with open(sys.argv[2], encoding='utf-8') as fp:
    # Lines are numbered from 1.
    for line_no, line in enumerate(fp, start=1):
        for match in WORD_RE.finditer(line):
            # match.start() is 0-based; columns are reported 1-based.
            index[match.group()].append((line_no, match.start() + 1))  # <2>

In [5]:
# Print every word followed by its list of occurrences, in alphabetical
# order; sorting on the uppercased word makes the ordering case-insensitive.
for word, occurrences in sorted(index.items(), key=lambda entry: entry[0].upper()):
    print(word, occurrences)
# END INDEX_DEFAULT

0 [(7, 14), (7, 16)]
1 [(7, 18)]
127 [(7, 10)]
64818 [(2, 17)]
64819 [(3, 17)]
64820 [(4, 17)]
64821 [(6, 14)]
64822 [(5, 19)]
6daf6b657e8acfa601b47674 [(8, 20)]
7c8e761e [(8, 11)]
control_port [(5, 4)]
hb_port [(6, 4)]
hmac [(10, 24)]
iopub_port [(3, 4)]
ip [(7, 4)]
kernel_name [(11, 4)]
key [(8, 4)]
sha256 [(10, 29)]
shell_port [(2, 4)]
signature_scheme [(10, 4)]
stdin_port [(4, 4)]
tcp [(9, 17)]
transport [(9, 4)]
