In [1]:
# adapted from Alex Martelli's example in "Re-learning Python"
# http://www.aleax.it/Python/accu04_Relearn_Python_alex.pdf
# (slide 41) Ex: lines-by-word file index

# BEGIN INDEX0
# One-line summary of what the following cells do. Because this string is
# the last expression in its notebook cell, it is evaluated and echoed back
# as the cell's output rather than being silently discarded.
"""Build an index mapping word -> list of occurrences"""

'Build an index mapping word -> list of occurrences'

In [2]:
import sys
import re

In [3]:
# Matches one maximal run of "word" characters (\w: letters, digits and
# underscore), so identifiers like `shell_port` and numbers like `64904`
# each count as a single word. Compiled once and reused for every line.
WORD_RE = re.compile(r'\w+')

In [4]:
# Maps each word to the list of places it occurs, in reading order.
# Every place is a (line_no, column_no) tuple; both numbers are 1-based.
index = {}
# NOTE(review): sys.argv[2] is the second command-line argument. When this
# cell runs inside a Jupyter kernel it presumably resolves to the kernel's
# connection file (the recorded output lists words such as `shell_port`,
# `signature_scheme` and `key`) -- confirm, and be aware that printing the
# index then exposes fragments of that file's contents.
with open(sys.argv[2], encoding='utf-8') as fp:
    # enumerate(fp, 1) numbers the lines starting at 1 instead of 0
    for line_no, line in enumerate(fp, 1):
        for match in WORD_RE.finditer(line):
            word = match.group()
            # match.start() is a 0-based offset; +1 turns it into a column
            column_no = match.start()+1
            location = (line_no, column_no)
            # this is ugly; coded like this to make a point
            # <1> fetch the word's list, or a brand-new empty list if the
            #     word has not been seen yet (the new list is NOT stored)
            # <2> record this location in that list
            # <3> put the list (back) in the dict; only strictly needed when
            #     <1> created a new list, but done unconditionally here
            occurrences = index.get(word, [])  # <1>
            occurrences.append(location)       # <2>
            index[word] = occurrences          # <3>

In [5]:
# print in alphabetical order
# key=str.upper makes sorted() compare the upper-cased form of each word, so
# the order ignores case while the original spelling is what gets printed.
# Words are compared as text, so digit strings sort character by character
# (e.g. '127' comes before '64904') and ahead of letters.
for word in sorted(index, key=str.upper):  # <4>
    print(word, index[word])
# END INDEX0

0 [(7, 14), (7, 16)]
1 [(7, 18)]
127 [(7, 10)]
64904 [(2, 17)]
64905 [(3, 17)]
64906 [(4, 17)]
64907 [(6, 14)]
64908 [(5, 19)]
ae3e661a [(8, 11)]
b83c71b8e951565cc9ce5b87 [(8, 20)]
control_port [(5, 4)]
hb_port [(6, 4)]
hmac [(10, 24)]
iopub_port [(3, 4)]
ip [(7, 4)]
kernel_name [(11, 4)]
key [(8, 4)]
sha256 [(10, 29)]
shell_port [(2, 4)]
signature_scheme [(10, 4)]
stdin_port [(4, 4)]
tcp [(9, 17)]
transport [(9, 4)]
