Add in a new linter to catch some malformed i18n strings.
Summary: This adds a new `make lint` command for exercises that checks whether any extracted i18n string contains an unexpected element.
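
The check reuses the extractor's node search and then flags any extracted string that still contains one of the `_REJECT_NODES` tags (graphie blocks, guess elements, validator functions, and so on). A minimal sketch of that check, using lxml with illustrative sample markup rather than a real exercise file:

```
import lxml.html

# Illustrative subset of the real _REJECT_NODES list.
_REJECT_NODES = ['*[contains(@class,"graphie")]']

# A translatable string that accidentally swallowed a graphie block
# (sample markup, not from an actual exercise).
node = lxml.html.fromstring(
    '<div>Identify the hypothesis.'
    '<div class="graphie">$("#hyp").addClass("hint_blue");</div></div>')

# Build one XPath matching any rejected descendant; flag the string
# if anything matches.
lint_expr = "|".join(".//%s" % name for name in _REJECT_NODES)
if node.xpath(lint_expr):
    print("Lint error: string contains a rejected node")
```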

I had wanted to move this out into a separate program, but I found that I would be duplicating a lot of logic from the original extractor, so it made more sense to have the linting happen in there.

I want to have this run as part of the official Khan Academy `make lint` as well. (Or maybe it could run before the exercise-packing step? I'm open to suggestions.)

The output from a failing `make lint` run will look something like this:

```
$ make lint
python build/extract_strings.py --lint exercises/*html
Lint error in file: exercises/logical_arguments_deductive_reasoning.html
Contains invalid node:
<div>
                        Identify the <span class="hint_blue">first hypothesis</span>, the <span class="hint_green">first conclusion</span>, the <span class="hint_red">second hypothesis</span>, and the <span class="hint_purple">second conclusion</span>.
                        <div class="graphie">
                            $( "#hyp_a" ).addClass( "hint_blue" );
                            $( "#conc_a" ).addClass( "hint_green" );
                            $( "#hyp_b" ).addClass( "hint_red" );
                            $( "#conc_b" ).addClass( "hint_purple" );
                        </div>
                    </div>
Invalid node:
<div class="graphie">
                            $( "#hyp_a" ).addClass( "hint_blue" );
                            $( "#conc_a" ).addClass( "hint_green" );
                            $( "#hyp_b" ).addClass( "hint_red" );
                            $( "#conc_b" ).addClass( "hint_purple" );
                        </div>
1 error detected.
make: *** [lint] Error 1
```

Test Plan: Run `make lint` and verify that there are no errors. Then modify one of the exercises. For example, I modified `exercises/logical_arguments_deductive_reasoning.html` and removed the `<p>...</p>` immediately preceding a graphie block (causing the graphie block to be included in the extraction). This generated the error above and made the linter fail.
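
For context on why removing that `<p>` matters: the extractor's `_HAS_TEXT` XPath (see the diff below) selects the outermost element that directly contains non-whitespace text, so once the text sits directly in the wrapper `<div>`, the whole `<div>` (graphie block included) becomes the extracted string. A hypothetical demonstration with simplified markup:

```
import lxml.html

_HAS_TEXT = '*[./text()[normalize-space(.)!=""]]'
_XPATH_FIND_NODES = '//%s[not(ancestor::%s)]' % (_HAS_TEXT, _HAS_TEXT)
# Apply the graphie exclusion that the real extractor builds from
# _IGNORE_NODES.
search_expr = _XPATH_FIND_NODES
search_expr += '[not(ancestor-or-self::*[contains(@class,"graphie")])]'

with_p = lxml.html.fromstring(
    '<div><p>Identify the parts.</p>'
    '<div class="graphie">code();</div></div>')
without_p = lxml.html.fromstring(
    '<div>Identify the parts.'
    '<div class="graphie">code();</div></div>')

# With the <p>, only the <p> is extracted; without it, the wrapper
# <div> is extracted and drags the graphie block along with it.
print([n.tag for n in with_p.xpath(search_expr)])     # ['p']
print([n.tag for n in without_p.xpath(search_expr)])  # ['div']
```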

Reviewers: csilvers

CC:

Maniphest Tasks: T1002
jeresig committed Mar 14, 2013
1 parent 4469124 commit 6fac2e6
Showing 2 changed files with 65 additions and 16 deletions.
3 changes: 3 additions & 0 deletions Makefile
```
@@ -7,3 +7,6 @@ utils/calculator.js: build/calculator/calculator.jison build/calculator/calculat
 # in exercises-packed is newer.
 pack packed:
 	cd exercises && find * -name '*.html' | while read infile; do outfile="../exercises-packed/$$infile"; [ "$$outfile" -nt "$$infile" ] && continue; echo "$$infile"; mkdir -p "`dirname $$outfile`" && ruby ../build/pack.rb < "$$infile" > "$$outfile" || { rm "$$outfile"; exit 1; } done
+
+lint:
+	python build/extract_strings.py --lint exercises/*html
```
78 changes: 62 additions & 16 deletions build/extract_strings.py
```
@@ -28,18 +28,22 @@
 _HAS_TEXT = '*[./text()[normalize-space(.)!=""]]'
 _XPATH_FIND_NODES = '//%s[not(ancestor::%s)]' % (_HAS_TEXT, _HAS_TEXT)
 
-# All the tags that we want to ignore and not extract strings from
-_IGNORE_NODES = [
+# All the tags that we want to make sure that strings don't contain
+_REJECT_NODES = [
     'style',
     'script',
-    'var',
-    'code',
     'div[@class="validator-function"]',
     '*[contains(@data-type,"regex")]',
     '*[contains(@class,"graphie")]',
     '*[contains(@class,"guess")]'
 ]
 
+# All the tags that we want to ignore and not extract strings from
+_IGNORE_NODES = _REJECT_NODES + [
+    'var',
+    'code'
+]
+
 # Make an HTML 5 Parser that will be used to turn the HTML documents
 # into a usable DOM. Make sure that we ignore the implied HTML namespace.
 _PARSER = lxml.html.html5parser.HTMLParser(namespaceHTMLElements=False)
@@ -62,9 +66,16 @@ def main():
                             help='The format of the output. (default: %(default)s)')
     arg_parser.add_argument('--quiet', action='store_true',
                             help='Do not emit status to stderr on successful runs.')
+    arg_parser.add_argument('--lint', action='store_true',
+                            help='Run a linter against the input files.')
 
     args = arg_parser.parse_args()
 
+    if args.lint:
+        matches = lint(args.html_files, not args.quiet)
+        num_errors = len(matches)
+        sys.exit(min(num_errors, 127))
+
     if args.format == 'po':
         # Output a PO file by default
         results = make_potfile(args.html_files, not args.quiet)
@@ -82,6 +93,15 @@ def main():
     # Otherwise just write the output to STDOUT
     print results
 
+def lint(files, verbose):
+    matches = extract_files(files, verbose, lint=True)
+    num_matches = len(matches)
+
+    if verbose and num_matches:
+        print >>sys.stderr, ('%s error%s detected.'
+                             % (num_matches, "" if num_matches == 1 else "s"))
+
+    return matches
 
 def make_potfile(files, verbose):
     """Generate a PO file from a collection of HTML files.
@@ -107,7 +127,7 @@ def make_potfile(files, verbose):
     return unicode(output_pot).encode('utf-8')
 
 
-def extract_files(files, verbose):
+def extract_files(files, verbose, lint=False):
     """Extract a collection of translatable strings from a set of HTML files.
 
     Returns:
@@ -126,12 +146,12 @@ def extract_files(files, verbose):
     # Go through all the exercise files.
     if files:
         for filename in files:
-            if verbose:
+            if verbose and not lint:
                 print >>sys.stderr, 'Extracting strings from: %s' % filename
-            extract_file(filename, matches)
+            extract_file(filename, matches, verbose, lint)
 
-    num_matches = len(matches)
-    if verbose:
+    if verbose and not lint:
+        num_matches = len(matches)
         print >>sys.stderr, ('%s string%s extracted.'
                              % (num_matches, "" if num_matches == 1 else "s"))
 
@@ -146,7 +166,7 @@ def extract_files(files, verbose):
     return retval
 
 
-def extract_file(filename, matches):
+def extract_file(filename, matches, verbose=False, lint=False):
     """Extract a collection of translatable strings from an HTML file.
 
     This function modifies matches in place with new content that it
@@ -169,12 +189,30 @@ def extract_file(filename, matches):
     for name in _IGNORE_NODES:
         search_expr += "[not(ancestor-or-self::%s)]" % name
 
+    if lint:
+        # Construct an XPath expression for finding rejected nodes
+        lint_expr = "|".join([".//%s" % name for name in _REJECT_NODES])
+
     # Search for the matching nodes
     nodes = html_tree.xpath(search_expr)
 
     for node in nodes:
+        # If we're linting the file and the string doesn't contain any
+        # rejected nodes then we just ignore it
+        if lint:
+            lint_matches = node.xpath(lint_expr)
+            if not lint_matches:
+                continue
+            elif verbose:
+                print >>sys.stderr, "Lint error in file: %s" % filename
+                for lint_node in lint_matches:
+                    print >>sys.stderr, "Contains invalid node:"
+                    print >>sys.stderr, _get_outerhtml(node)
+                    print >>sys.stderr, "Invalid node:"
+                    print >>sys.stderr, _get_outerhtml(lint_node)
+
         # Get a string version of the contents of the node
-        contents = _get_innerhtml(lxml.html.tostring(node))
+        contents = _get_innerhtml(node)
 
         # Bail if we're dealing with an empty element
         if not contents:
@@ -208,17 +246,25 @@ def default(self, obj):
         return json.JSONEncoder.default(self, obj)
 
 
-def _get_innerhtml(html_string):
-    """Strip the leading and trailing tag from an lxml-generated HTML string.
-
-    Also cleanup endlines and extraneous spaces.
+def _get_outerhtml(html_node):
+    """Get a string representation of an HTML node.
 
     (lxml doesn't provide an easy way to get the 'innerHTML'.)
+
+    Note: lxml also includes the trailing text for a node when you
+    call tostring on it, we need to snip that off too.
+    """
+    html_string = lxml.html.tostring(html_node)
+    return re.sub(r'[^>]*$', '', html_string, count=1)
+
+
+def _get_innerhtml(html_node):
+    """Strip the leading and trailing tag from an lxml-generated HTML string.
+
+    Also cleanup endlines and extraneous spaces.
     """
+    html_string = _get_outerhtml(html_node)
     html_string = re.sub(r'^<[^>]*>', '', html_string, count=1)
-    html_string = re.sub(r'</[^>]*>[^>]*$', '', html_string, count=1)
+    html_string = re.sub(r'</[^>]*>$', '', html_string, count=1)
     return re.sub(r'\s+', ' ', html_string).strip()
```
