Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100755 112 lines (97 sloc) 3.453 kb
748a8f6 @andrejbauer Saving before major change
andrejbauer authored
1 #!/usr/bin/env python
2
3 import sys
4 import re
5 import frequency
6
# Maximum number of excerpts printed for any single word.
max_occurrences = 1000

# Regular expressions given on the command line select which words are
# reported; with no arguments every word is reported.
filter_re = sys.argv[1:] if len(sys.argv) > 1 else ['.*']

# Source files that are scanned, in this order.
files = [
#    "macros.tex",
    "front.tex",
    "preface.tex",
    "introduction.tex",
    "preliminaries.tex",
    "basics.tex",
    "logic.tex",
    "equivalences.tex",
    "induction.tex",
    "hits.tex",
    "hlevels.tex",
    "homotopy.tex",
    "categories.tex",
    "setmath.tex",
    "reals.tex",
    "formal.tex"
]


# Lower-cased word -> list of (excerpt, file name, position) occurrences.
words = {}
# Macros found in scanned files that are not listed in `antifiles`.
macros = set()
# Macros found in scanned files that are listed in `antifiles`.
antimacros = set()
# Files whose macros are collected into `antimacros` instead of `macros`.
antifiles = ['symbols.tex',
             'macros.tex',
             'opt-letter.tex',
             'opt-ustrade.tex',
             'opt-color.tex',
             'hott-ustrade.tex',
             'hott-letter.tex',
             'hott-online.tex']
42
e92a432 @mikeshulman improvements to index-helper:
mikeshulman authored
43 def matchtospaces(m):
44 return ' ' * len(m.group(0))
45
748a8f6 @andrejbauer Saving before major change
andrejbauer authored
# Scan every source file: collect the macros it uses and record each word
# occurrence together with a short excerpt and its character offset.
for fn in files:
    with open(fn, "r") as f:
        text = f.read()
    # Remove environment names
    #text = re.sub(r'\\(begin|end){[^}]+}', ' ', text)
    # Remove all labels and refs
    #text = re.sub(r'(\\(label|cref|autoref|eqref|ref){[^}]+})', ' ', text)
    # Remove hyphenation hints
    #text = re.sub(r'\\-', '', text)
    # Remove quotes
    #text = re.sub(r"['`]", ' ', text)
    # Blank out --- with spaces of the same length, so that the positions
    # recorded below still refer to offsets in the file as it was read.
    text = re.sub(r'---', matchtospaces, text)
    # Replace punctuation with space
    #text = re.sub(r'[,.;:?!]', ' ', text)
    # Replace newlines with spaces
    #text = re.sub(r'\n', ' ', text)
    # Find macros; the destination set depends only on the file, so it is
    # chosen once rather than once per macro.
    found_macros = re.findall(r"\\[a-zA-Z]+\b", text)
    if fn in antifiles:
        antimacros.update(found_macros)
    else:
        macros.update(found_macros)
    # Delete macros
    #text = re.sub(r'\\[a-zA-Z]+\b', ' ', text)
    # Blank out cross-references, labels, citations, urls, math terms,
    # environments and index entries (again preserving offsets).
    text = re.sub(r'\\(autoref|cref|cite|label|ref|eqref|mathsf|href|url|begin|end|index|indexdef|indexfoot|indexsee){[0-9a-zA-Z-_:,!@$* \\]*}', matchtospaces, text)
    # Find words, try to include things like "$(n-2)$-connected".
    # Group 1 / group 5 are the 20 characters before / after the match and
    # group 3 is the word itself; a word with fewer than 20 characters of
    # context on either side is therefore not matched.
    for m in re.finditer(r"(?<=(.{20}))[^\\]\b(\$[^$]*\$-)?([a-zA-Z]([a-zA-Z-']|\\-)*[a-zA-Z-])\b(?=(.{20}))", text, re.DOTALL):
        key = m.group(3).lower()
        key = key.replace('\\-', '')  # remove hyphenation hints
        pos = m.start(3)
        excerpt = m.group(1) + m.group(0) + m.group(5)
        excerpt = excerpt.replace('\n', ' ')  # replace newlines with spaces
        words.setdefault(key, []).append((excerpt, fn, pos))
748a8f6 @andrejbauer Saving before major change
andrejbauer authored
84
# Keep only the macros which appear somewhere but are not in the symbols
# index, macros.tex, or configuration files.
macros -= antimacros

# Uncomment to see the macros.
#for macro in sorted(macros - antimacros):
#    print (macro)
92
def sortkey(word):
    """Return the sort key ``(frequency, word)`` for *word*.

    The first component is whatever ``frequency.get_frequency(word)``
    returns; the word itself orders entries whose frequencies are equal.
    """
    return (frequency.get_frequency(word), word)
95
def filter_word(w, fs):
    """Return True if *w* matches at least one regular expression in *fs*.

    Each pattern is tried with ``re.search``, so it may match anywhere in the
    word, and case is ignored.  An empty *fs* matches nothing.
    """
    return any(re.search(r, w, flags=re.IGNORECASE) for r in fs)
100
# Print the selected words in sortkey order, each followed by its excerpts.
for key in sorted(words, key = sortkey):
    if not filter_word(key, filter_re):
        continue
    freq = frequency.get_frequency(key)
    # Words whose frequency exceeds this threshold are not reported.
    if freq > 1100000:
        continue
    occurrences = words[key]
    print("\n\n======== %s [%d]\n\n" % (key, freq))
    # Show at most max_occurrences excerpts and summarize the remainder.
    for (excerpt, fn, pos) in occurrences[:max_occurrences]:
        print (" ...%s... [%s @ %d]" % (excerpt, fn, pos))
    omitted = len(occurrences) - max_occurrences
    if omitted > 0:
        print ("\n [[%d omitted occurrences]]" % omitted)
111
Something went wrong with that request. Please try again.