forked from duckduckgo/zeroclickinfo-fathead
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
GlitchMr
committed
Jul 31, 2012
1 parent
96aefde
commit 3da70e6
Showing
7 changed files
with
353 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
Perl 6 documentation fetcher and parser for DuckDuckGo | ||
|
||
# Dependencies | ||
|
||
* wget | ||
* Perl v5.6 | ||
* CPAN modules: | ||
* `strictures` | ||
* `HTML::Parser` | ||
* `URI::Escape` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#!/bin/bash
# Mirror the rendered Perl 6 documentation from doc.perl6.org into ./download.
#
# Alternatively, it could download the https://github.com/perl6/doc repo, but
# generating files from it would require having Perl 6 (not yet ready for
# production usage).

# Create the mirror directory if needed, and stop right away when it cannot
# be entered so that wget never writes the mirror into the wrong directory.
mkdir -p download
cd download || exit 1

# -np      never ascend to the parent directory
# -nc      do not re-download files that already exist locally
# -r -l 2  follow links recursively, at most two levels deep
wget -np -nc -r -l 2 http://doc.perl6.org/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# This is the name of the source as people would refer to it, e.g. Wikipedia or PerlDoc | ||
Name: Perl 6 Documentation | ||
|
||
# This is the base domain where the source pages are located. | ||
Domain: doc.perl6.org | ||
|
||
# This is what gets put in quotes next to the source | ||
# It can be blank if it is a source with completely general info spanning many types of topics like Facebook. | ||
Type: Perl 6 Documentation | ||
|
||
# Whether the source is from MediaWiki (1) or not (0). | ||
MediaWiki: 0 | ||
|
||
# Keywords used to trigger (or prefer) the source over others. | ||
Keywords: perl 6, perl6 |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
use 5.006; | ||
use strictures 1; | ||
use autodie; | ||
use Encode; | ||
use HTML::Parser; | ||
use URI::Escape; | ||
# Everything printed to standard output is encoded as UTF-8.
binmode( STDOUT, ':encoding(UTF-8)' );

# Work from inside the mirrored "routine" directory so that the bare names
# returned by readdir can be used directly as file paths.
my $routine_dir = 'download/doc.perl6.org/routine';
chdir $routine_dir;
opendir my $dh, q{.};
|
||
# Escape a string so it can be stored in one tab-separated output field.
#
# A raw newline becomes the two characters \n and a raw tab becomes \t.
# A backslash is doubled only when it is followed, possibly through further
# backslashes, by a raw newline or tab; all other backslashes stay as-is.
#
# Takes the string to escape and returns the escaped copy.
sub duck_escape {
    my ($string) = @_;

    # Replacement text for each character the pattern below can capture.
    my %escape_for = (
        "\n" => '\n',
        "\t" => '\t',
        '\\' => '\\\\',
    );

    # The exact escaping rules of the consumer are not known, so this is a
    # cautious approach meant to do as little damage as possible on tricky
    # data (none is expected, but it is better to be prepared).
    $string =~ s{ ( [\n\t] | \\ (?= \\* [\n\t] ) ) }{$escape_for{$1}}gmsx;

    return $string;
}
|
||
# One hash per entry found in the parsed pages, in the order encountered.
# Keys: class, method, and (when present) description and prototype.
my @fields;

# Elements that never have a closing tag. HTML::Parser reports a start event
# for them but no end event, so pushing them would leave the open-tag stack
# permanently unbalanced (e.g. a <br> inside the description paragraph would
# stop the closing </p> from being recognised).
my %is_void = map { $_ => 1 } qw(
    area base br col embed hr img input link meta param source track wbr
);

# Only files count, magical directories like '.' shouldn't. The names are
# sorted so the output order does not depend on the directory order.
for my $file ( sort { $a cmp $b } grep { -f } readdir $dh ) {
    my @tags;             # names of the currently open tags, innermost last
    my $current_field;    # entry created by the most recent <h1>
    my $description;      # text of the paragraph being collected
    my $p;                # true from an <h2> until the next </p>

    my $parser = HTML::Parser->new(
        api_version => 3,

        # Broken text could make parsing harder than it should be.
        unbroken_text => 1,
        utf8_mode     => 1,

        start_h => [
            sub {
                my ($tagname) = @_;

                # Void elements are never closed; keep them off the stack.
                return if $is_void{$tagname};

                push @tags, $tagname;

                # The paragraph following an <h2> starts a fresh description.
                if ( $p && $tagname eq 'p' ) {
                    $description = q[];
                }
            },
            'tagname'
        ],
        text_h => [
            sub {
                my ($dtext) = @_;

                # In utf8_mode the parser hands back UTF-8 encoded bytes.
                $dtext = decode 'UTF-8', $dtext;

                if ( @tags > 2 ) {

                    # Text nested one level inside <h1> is the class name;
                    # it starts a new entry whose method is the file name.
                    if ( $tags[-2] eq 'h1' ) {
                        $current_field = { class => $dtext, method => $file };
                        push @fields, $current_field;
                    }

                    # First paragraph after <h2> is description.
                    elsif ( $tags[-2] eq 'h2' ) {
                        $p = 1;
                    }

                    # <pre> stores method prototype; only the first one seen
                    # for an entry is kept.
                    elsif ($tags[-1] eq 'pre'
                        && $current_field
                        && $current_field->{class} )
                    {
                        $current_field->{prototype} ||= $dtext;
                    }

                    # In <p> mode, every text is part of description.
                    elsif ($p) {
                        $description .= $dtext;
                    }
                }
            },
            'dtext'
        ],
        end_h => [
            sub {
                my ($tagname) = @_;

                # Mirror the start handler: a stray closing tag of a void
                # element must not pop an unrelated entry off the stack.
                return if $is_void{$tagname};

                # An unmatched closing tag can find the stack empty; treat
                # that as "not a <p>" rather than comparing undef.
                my $closed = pop @tags;

                # If current tag is <p> then turn off <p> mode.
                if ( defined $closed && $closed eq 'p' && $p ) {

                    # Without a preceding <h1> there is no entry to attach
                    # the description to. If that happens, either the
                    # documentation format changed or the page is unusual.
                    $current_field->{description} = $description
                        if $current_field;
                    undef $description;
                    $p = 0;
                }
            },
            'tagname'
        ],
    )->parse_file($file);

    # parse_file returns undef (with $! set) when the file cannot be opened.
    warn "Could not parse '$file': $!\n" unless defined $parser;
}
|
||
# Print one tab-separated record per collected entry:
# "<class>.<method> (Perl 6)", an empty column, the documentation URL,
# the description, the prototype, and two trailing empty columns.
for my $entry (@fields) {
    my ( $class, $method ) = @{$entry}{qw(class method)};

    # Missing description or prototype is written as an empty column.
    my $description = $entry->{description} || q[];
    my $prototype   = $entry->{prototype}   || q[];

    my $record
        = duck_escape($class) . '.'
        . duck_escape($method)
        . " (Perl 6)\t\thttp://doc.perl6.org/type/"
        . uri_escape_utf8($class) . '#'
        . uri_escape_utf8($method) . "\t"
        . duck_escape($description) . "\t"
        . duck_escape($prototype)
        . "\t\t\t\n";

    print $record;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
#!/bin/bash
# Run the Perl parser and capture its standard output in output.txt.
perl parse.pl > output.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
perl 6 chr | ||
perl6 Version.new |