Add a new linter to catch some malformed i18n strings.

Summary: This adds a new `make lint` command for exercises that checks whether an unexpected element is contained within an extracted string.

I had wanted to move this out into a separate program, but I found that I would be duplicating a lot of logic from the original extractor, so it made more sense to have the linting happen in there.

I want to have this run as part of the official Khan Academy `make lint` as well. (Or maybe this could run before the exercise packing runs? I'm open to suggestions.)

The output from a failing `make lint` run will look something like this:

```
$ make lint
python build/extract_strings.py --lint exercises/*html
Lint error in file: exercises/logical_arguments_deductive_reasoning.html
Contains invalid node:
<div>
                        Identify the <span class="hint_blue">first hypothesis</span>, the <span class="hint_green">first conclusion</span>, the <span class="hint_red">second hypothesis</span>, and the <span class="hint_purple">second conclusion</span>.
                        <div class="graphie">
                            $( "#hyp_a" ).addClass( "hint_blue" );
                            $( "#conc_a" ).addClass( "hint_green" );
                            $( "#hyp_b" ).addClass( "hint_red" );
                            $( "#conc_b" ).addClass( "hint_purple" );
                        </div>
                    </div>
Invalid node:
<div class="graphie">
                            $( "#hyp_a" ).addClass( "hint_blue" );
                            $( "#conc_a" ).addClass( "hint_green" );
                            $( "#hyp_b" ).addClass( "hint_red" );
                            $( "#conc_b" ).addClass( "hint_purple" );
                        </div>
1 error detected.
make: *** [lint] Error 1
```
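
For context, the check that produces this output boils down to a single question: does an extracted node have any descendant matching one of the rejected XPath patterns? Here is a rough, standalone sketch of that idea (the markup snippet and the single graphie pattern are simplified stand-ins, not taken from the real exercise file; the actual extractor uses the html5lib-based parser and the full `_REJECT_NODES` list):

```
# Simplified, hypothetical illustration of the reject-node check; the real
# extractor parses whole exercise files and joins every _REJECT_NODES entry
# with "|" into one XPath expression.
import lxml.html

# A trimmed-down stand-in for the offending markup shown above.
snippet = """<div>
    Identify the <span class="hint_blue">first hypothesis</span>.
    <div class="graphie">
        $( "#hyp_a" ).addClass( "hint_blue" );
    </div>
</div>"""

node = lxml.html.fromstring(snippet)

# Only the graphie pattern is checked here.
lint_expr = './/*[contains(@class,"graphie")]'

bad_children = node.xpath(lint_expr)
if bad_children:
    child = bad_children[0]
    print("Contains invalid node: <%s class=%s>" % (child.tag, child.get("class")))
```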

Test Plan: Run `make lint` and verify that there are no errors. Then modify one of the exercises. For example, I modified `exercises/logical_arguments_deductive_reasoning.html` and removed the `<p>...</p>` immediately preceding a graphie block (causing it to be included in the extraction). This generated the above error and made the linter fail.
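
To make that failure mode concrete, here is a hypothetical sketch of the scenario using only the node-finding expression copied from `build/extract_strings.py` (the markup is invented, and the real run additionally filters out the ignore/reject nodes themselves via `search_expr`):

```
# Hypothetical demonstration of the Test Plan scenario: once the wrapping
# <p> is removed, the outer <div> (graphie block and all) becomes the node
# the extractor picks up, which is exactly what the linter then rejects.
import lxml.html

# Copied from build/extract_strings.py.
_HAS_TEXT = '*[./text()[normalize-space(.)!=""]]'
_XPATH_FIND_NODES = '//%s[not(ancestor::%s)]' % (_HAS_TEXT, _HAS_TEXT)

# Invented markup, loosely modeled on the exercise mentioned above.
good = ('<div><p>Identify the hypothesis.</p>'
        '<div class="graphie">$( "#hyp_a" ).addClass( "hint_blue" );</div></div>')
bad = ('<div>Identify the hypothesis.'
       '<div class="graphie">$( "#hyp_a" ).addClass( "hint_blue" );</div></div>')

for label, markup in (("with <p>", good), ("without <p>", bad)):
    tree = lxml.html.fromstring(markup)
    nodes = tree.xpath(_XPATH_FIND_NODES)
    # "with <p>" extracts the <p> (and the graphie <div>, which the real
    # search_expr then excludes); "without <p>" extracts the outer <div>,
    # graphie block included.
    print("%s -> %s" % (label, [(n.tag, n.get("class")) for n in nodes]))
```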

Reviewers: csilvers

CC:

Maniphest Tasks: T1002
commit 6fac2e69a2e3e66f38783043b3789d8ff361bfbf (1 parent: 4469124)
Authored by @jeresig
Showing with 65 additions and 16 deletions.
  1. +3 −0  Makefile
  2. +62 −16 build/extract_strings.py
Makefile
@@ -7,3 +7,6 @@ utils/calculator.js: build/calculator/calculator.jison build/calculator/calculat
# in exercises-packed is newer.
pack packed:
	cd exercises && find * -name '*.html' | while read infile; do outfile="../exercises-packed/$$infile"; [ "$$outfile" -nt "$$infile" ] && continue; echo "$$infile"; mkdir -p "`dirname $$outfile`" && ruby ../build/pack.rb < "$$infile" > "$$outfile" || { rm "$$outfile"; exit 1; } done
+
+lint:
+	python build/extract_strings.py --lint exercises/*html
build/extract_strings.py
@@ -28,18 +28,22 @@
_HAS_TEXT = '*[./text()[normalize-space(.)!=""]]'
_XPATH_FIND_NODES = '//%s[not(ancestor::%s)]' % (_HAS_TEXT, _HAS_TEXT)
-# All the tags that we want to ignore and not extract strings from
-_IGNORE_NODES = [
+# All the tags that we want to make sure that strings don't contain
+_REJECT_NODES = [
    'style',
    'script',
-    'var',
-    'code',
    'div[@class="validator-function"]',
    '*[contains(@data-type,"regex")]',
    '*[contains(@class,"graphie")]',
    '*[contains(@class,"guess")]'
]
+# All the tags that we want to ignore and not extract strings from
+_IGNORE_NODES = _REJECT_NODES + [
+    'var',
+    'code'
+]
+
# Make an HTML 5 Parser that will be used to turn the HTML documents
# into a usable DOM. Make sure that we ignore the implied HTML namespace.
_PARSER = lxml.html.html5parser.HTMLParser(namespaceHTMLElements=False)
@@ -62,9 +66,16 @@ def main():
                            help='The format of the output. (default: %(default)s)')
    arg_parser.add_argument('--quiet', action='store_true',
                            help='Do not emit status to stderr on successful runs.')
+    arg_parser.add_argument('--lint', action='store_true',
+                            help='Run a linter against the input files.')
    args = arg_parser.parse_args()
+    if args.lint:
+        matches = lint(args.html_files, not args.quiet)
+        num_errors = len(matches)
+        sys.exit(min(num_errors, 127))
+
    if args.format == 'po':
        # Output a PO file by default
        results = make_potfile(args.html_files, not args.quiet)
@@ -82,6 +93,15 @@ def main():
        # Otherwise just write the output to STDOUT
        print results
+def lint(files, verbose):
+    matches = extract_files(files, verbose, lint=True)
+    num_matches = len(matches)
+
+    if verbose and num_matches:
+        print >>sys.stderr, ('%s error%s detected.'
+                             % (num_matches, "" if num_matches == 1 else "s"))
+
+    return matches
def make_potfile(files, verbose):
    """Generate a PO file from a collection of HTML files.
@@ -107,7 +127,7 @@ def make_potfile(files, verbose):
    return unicode(output_pot).encode('utf-8')
-def extract_files(files, verbose):
+def extract_files(files, verbose, lint=False):
    """Extract a collection of translatable strings from a set of HTML files.
    Returns:
@@ -126,12 +146,12 @@ def extract_files(files, verbose):
    # Go through all the exercise files.
    if files:
        for filename in files:
-            if verbose:
+            if verbose and not lint:
                print >>sys.stderr, 'Extracting strings from: %s' % filename
-            extract_file(filename, matches)
+            extract_file(filename, matches, verbose, lint)
-    num_matches = len(matches)
-    if verbose:
+    if verbose and not lint:
+        num_matches = len(matches)
        print >>sys.stderr, ('%s string%s extracted.'
                             % (num_matches, "" if num_matches == 1 else "s"))
@@ -146,7 +166,7 @@ def extract_files(files, verbose):
    return retval
-def extract_file(filename, matches):
+def extract_file(filename, matches, verbose=False, lint=False):
    """Extract a collection of translatable strings from an HTML file.
    This function modifies matches in place with new content that it
@@ -169,12 +189,30 @@ def extract_file(filename, matches):
    for name in _IGNORE_NODES:
        search_expr += "[not(ancestor-or-self::%s)]" % name
+    if lint:
+        # Construct an XPath expression for finding rejected nodes
+        lint_expr = "|".join([".//%s" % name for name in _REJECT_NODES])
+
    # Search for the matching nodes
    nodes = html_tree.xpath(search_expr)
    for node in nodes:
+        # If we're linting the file and the string doesn't contain any
+        # rejected nodes then we just ignore it
+        if lint:
+            lint_matches = node.xpath(lint_expr)
+            if not lint_matches:
+                continue
+            elif verbose:
+                print >>sys.stderr, "Lint error in file: %s" % filename
+                for lint_node in lint_matches:
+                    print >>sys.stderr, "Contains invalid node:"
+                    print >>sys.stderr, _get_outerhtml(node)
+                    print >>sys.stderr, "Invalid node:"
+                    print >>sys.stderr, _get_outerhtml(lint_node)
+
        # Get a string version of the contents of the node
-        contents = _get_innerhtml(lxml.html.tostring(node))
+        contents = _get_innerhtml(node)
        # Bail if we're dealing with an empty element
        if not contents:
@@ -208,17 +246,25 @@ def default(self, obj):
        return json.JSONEncoder.default(self, obj)
-def _get_innerhtml(html_string):
-    """Strip the leading and trailing tag from an lxml-generated HTML string.
-
-    Also cleanup endlines and extraneous spaces.
+def _get_outerhtml(html_node):
+    """Get a string representation of an HTML node.
    (lxml doesn't provide an easy way to get the 'innerHTML'.)
    Note: lxml also includes the trailing text for a node when you
    call tostring on it, we need to snip that off too.
    """
+    html_string = lxml.html.tostring(html_node)
+    return re.sub(r'[^>]*$', '', html_string, count=1)
+
+
+def _get_innerhtml(html_node):
+    """Strip the leading and trailing tag from an lxml-generated HTML string.
+
+    Also cleanup endlines and extraneous spaces.
+    """
+    html_string = _get_outerhtml(html_node)
    html_string = re.sub(r'^<[^>]*>', '', html_string, count=1)
-    html_string = re.sub(r'</[^>]*>[^>]*$', '', html_string, count=1)
+    html_string = re.sub(r'</[^>]*>$', '', html_string, count=1)
    return re.sub(r'\s+', ' ', html_string).strip()
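
As a closing note on the `_get_outerhtml()`/`_get_innerhtml()` split: the docstring's point about trailing text is that `lxml.html.tostring()` also serializes a node's tail (the text that follows it inside the parent). A small hypothetical example of what those regexes do, with `encoding="unicode"` added here only so the snippet returns text rather than bytes (the patched Python 2 code does not need it):

```
# Hypothetical illustration of the tail-text issue the docstrings mention.
import re
import lxml.html

tree = lxml.html.fromstring('<div><p>Hello <b>world</b></p> trailing tail</div>')
p = tree.find('p')

raw = lxml.html.tostring(p, encoding="unicode")
print(raw)     # <p>Hello <b>world</b></p> trailing tail

# _get_outerhtml(): drop everything after the final ">" (the tail text).
outer = re.sub(r'[^>]*$', '', raw, count=1)
print(outer)   # <p>Hello <b>world</b></p>

# _get_innerhtml(): additionally strip the leading and trailing tag and
# collapse whitespace.
inner = re.sub(r'^<[^>]*>', '', outer, count=1)
inner = re.sub(r'</[^>]*>$', '', inner, count=1)
print(re.sub(r'\s+', ' ', inner).strip())   # Hello <b>world</b>
```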