Skip to content

Commit

Permalink
Perl 6 documentation parser, take 1
Browse files Browse the repository at this point in the history
  • Loading branch information
GlitchMr committed Jul 31, 2012
1 parent 96aefde commit 3da70e6
Show file tree
Hide file tree
Showing 7 changed files with 353 additions and 0 deletions.
10 changes: 10 additions & 0 deletions perl6_doc/README.md
@@ -0,0 +1,10 @@
Perl 6 documentation fetcher and parser for DuckDuckGo

# Dependencies

* wget
* Perl v5.6
* CPAN modules:
  * `strictures`
  * `HTML::Parser`
  * `URI::Escape`
7 changes: 7 additions & 0 deletions perl6_doc/fetch.sh
@@ -0,0 +1,7 @@
#!/bin/bash
# Mirror the rendered Perl 6 documentation from doc.perl6.org into ./download.
#
# Alternatively, it could download https://github.com/perl6/doc repo, but
# generating files from it would require having Perl 6 (not yet ready for
# production usage).

# Stop if the directory cannot be created or entered; otherwise wget would
# mirror the whole site into whatever directory the script was started from.
mkdir -p download || exit 1
cd download || exit 1

# -np: never ascend to the parent directory; -nc: do not re-download files
# that already exist; -r -l 2: recurse, following links at most two levels.
wget -np -nc -r -l 2 http://doc.perl6.org/
15 changes: 15 additions & 0 deletions perl6_doc/meta.txt
@@ -0,0 +1,15 @@
# This is the name of the source as people would refer to it, e.g. Wikipedia or PerlDoc
Name: Perl 6 Documentation

# This is the base domain where the source pages are located.
Domain: doc.perl6.org

# This is what gets put in quotes next to the source
# It can be blank if it is a source with completely general info spanning many types of topics like Facebook.
Type: Perl 6 Documentation

# Whether the source is from MediaWiki (1) or not (0).
MediaWiki: 0

# Keywords used to trigger (or prefer) the source over others.
Keywords: perl 6, perl6
215 changes: 215 additions & 0 deletions perl6_doc/output.txt

Large diffs are not rendered by default.

103 changes: 103 additions & 0 deletions perl6_doc/parse.pl
@@ -0,0 +1,103 @@
use 5.006;
use strictures 1;
use autodie;
use Encode;
use HTML::Parser;
use URI::Escape;
# Encode everything printed to STDOUT as UTF-8.
binmode STDOUT, ':encoding(UTF-8)';
# Work from inside the mirrored "routine" directory so that the bare names
# returned by readdir can be used directly as file paths. The path is
# relative, so the script has to be started from the directory that holds
# the "download" tree.
chdir 'download/doc.perl6.org/routine';
# Directory handle iterated by the parsing loop below; autodie (loaded above)
# turns a failing chdir or opendir into an exception.
opendir my $dh, '.';

# Escape a string so it can be written as one field of a tab-separated
# output line.
#
# Every newline becomes the two characters '\n' and every tab becomes '\t'.
# A backslash is doubled only when it belongs to a run of backslashes that
# directly precedes a newline or tab; every other backslash passes through
# untouched.
#
# Takes the string to escape and returns the escaped copy; the caller's
# value is not modified.
sub duck_escape {
    my ($string) = @_;

    my %escape_for = (
        "\n" => '\n',
        "\t" => '\t',
        '\\' => '\\\\',
    );

    # NOTE: the exact escaping rules expected by the consumer of the output
    # are not confirmed. Doubling only the backslashes that sit right in
    # front of a newline/tab is a cautious guess that keeps the generated
    # '\n' / '\t' sequences unambiguous without rewriting other backslashes.
    $string =~ s{ ( [\n\t] | \\ (?= \\* [\n\t] ) ) }{$escape_for{$1}}gmsx;

    return $string;
}

# One hash reference per <h1> heading met while parsing, in the order they
# are found. Keys: class, method, and (when found) prototype and description.
my @fields;

# Elements that never get a closing tag. HTML::Parser reports no end event
# for them, so pushing them on the tag stack would leave it permanently out
# of step with the document.
my %is_void = map { $_ => 1 } qw(
    area base basefont br col embed frame hr img input isindex keygen link
    meta param source track wbr
);

# Only files count, magical directories like '.' shouldn't
for my $file ( grep {-f} readdir $dh ) {
    my @tags;             # stack of the element names that are currently open
    my $current_field;    # entry being filled in (the last one pushed)
    my $description;      # text gathered for the paragraph in progress
    my $p;                # true from an <h2> until the next </p>

    my $parser = HTML::Parser->new(
        api_version => 3,

        # Broken text could make parsing harder than it should be.
        unbroken_text => 1,
        utf8_mode     => 1,
        start_h       => [
            sub {
                my ($tagname) = @_;

                # "<br/>" is reported with the tag name "br/" unless
                # empty_element_tags is enabled, so strip the slash before
                # looking the name up.
                ( my $bare_name = $tagname ) =~ s{/\z}{};
                return if $is_void{$bare_name};

                push @tags, $tagname;

                # A <p> opened while in <p> mode starts the description.
                if ( $p && $tagname eq 'p' ) {
                    $description = q[];
                }
            },
            'tagname'
        ],
        text_h => [
            sub {
                my ($dtext) = @_;

                # utf8_mode hands over raw UTF-8 octets; turn them into
                # characters before storing them.
                $dtext = decode( 'UTF-8', $dtext );
                if ( @tags > 2 ) {

                    # Text whose enclosing element sits directly inside an
                    # <h1> is the name of a class: start a new entry. Its
                    # prototype and description are filled in later.
                    if ( $tags[-2] eq 'h1' ) {
                        $current_field = { class => $dtext, method => $file };
                        push @fields, $current_field;
                    }

                    # First paragraph after <h2> is description.
                    elsif ( $tags[-2] eq 'h2' ) {
                        $p = 1;
                    }

                    # <pre> stores method prototype; only the first one seen
                    # for an entry is kept.
                    elsif ( $tags[-1] eq 'pre' && $current_field->{class} ) {
                        $current_field->{prototype} ||= $dtext;
                    }

                    # In <p> mode, every text is part of description.
                    elsif ($p) {
                        $description .= $dtext;
                    }
                }
            },
            'dtext'
        ],
        end_h => [
            sub {
                # A stray closing tag would pop an empty stack, and comparing
                # the resulting undef raises a warning that strictures makes
                # fatal. Ignore such tags instead of aborting the whole run.
                my $closed = pop @tags;
                return if !defined $closed;

                # If current tag is <p> then turn off <p> mode.
                if ( $closed eq 'p' && $p ) {
                    $current_field->{description} = $description;
                    undef $description;
                    $p = 0;
                }
            }
        ],
    );

    # parse_file returns undef (with $! set) when the file cannot be opened;
    # autodie does not cover it, so check it by hand.
    defined $parser->parse_file($file)
        or die "Cannot parse $file: $!\n";
}

closedir $dh;

# Write one tab-separated line per collected entry:
#   "<class>.<method> (Perl 6)", an empty column, the documentation URL,
#   the description, the prototype, and three trailing tabs before the
#   newline.
# Missing (or false) descriptions and prototypes are written as empty
# strings.
for my $entry (@fields) {
    my $class  = $entry->{class};
    my $method = $entry->{method};

    my $title  = duck_escape($class) . '.' . duck_escape($method);
    my $anchor = uri_escape_utf8($class) . '#' . uri_escape_utf8($method);

    my $abstract  = duck_escape( $entry->{description} || q[] );
    my $prototype = duck_escape( $entry->{prototype}   || q[] );

    my $line
        = $title
        . " (Perl 6)\t\thttp://doc.perl6.org/type/"
        . $anchor . "\t"
        . $abstract . "\t"
        . $prototype . "\t\t\t\n";

    print $line;
}
1 change: 1 addition & 0 deletions perl6_doc/parse.sh
@@ -0,0 +1 @@
# Run the parser and capture its standard output as output.txt.
# parse.pl changes into download/doc.perl6.org/routine relative to the
# current directory, so run this from the directory that contains parse.pl
# and the download/ tree.
perl parse.pl > output.txt
2 changes: 2 additions & 0 deletions perl6_doc/queries.txt
@@ -0,0 +1,2 @@
perl 6 chr
perl6 Version.new

0 comments on commit 3da70e6

Please sign in to comment.