forked from duckduckgo/zeroclickinfo-fathead
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.pl
100 lines (91 loc) · 2.96 KB
/
parse.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
use 5.006;
use strictures 1;
use autodie;
use Encode;
use HTML::Parser;
use URI::Escape;
# All output is character data; have the STDOUT layer encode it as UTF-8.
binmode( STDOUT, ':encoding(UTF-8)' );

# Work from inside the directory holding the downloaded routine pages, so
# the bare names returned by readdir can be used as paths without a prefix.
chdir('download/doc.perl6.org/routine');
opendir( my $dh, '.' );
# Escape one value so it fits on a single line of tab-separated output.
#
# Real newline and tab characters are replaced by the two-character
# sequences \n and \t. A backslash is doubled only when it belongs to a run
# of backslashes that is immediately followed by a real newline or tab;
# every other backslash is passed through unchanged.
#
# Takes the string to escape and returns the escaped copy. The caller's
# variable is not modified.
#
# NOTE(review): the exact unescaping rules of the consumer of this output
# are not confirmed here; the limited backslash handling is a cautious
# guess meant to do as little damage as possible to unusual data.
sub duck_escape {
    my $text = shift;

    # Maps each single matched character to its escaped spelling.
    my %escaped_for = (
        "\t" => '\t',
        "\n" => '\n',
        '\\' => '\\\\',
    );

    # The lookahead restricts backslash doubling to backslashes that sit
    # directly in front of a newline or tab (possibly with more backslashes
    # in between).
    $text =~ s{ ( [\n\t] | \\ (?= \\* [\n\t] ) ) }{$escaped_for{$1}}gmsx;

    return $text;
}
# Records gathered from all routine pages. Each entry is a hash reference
# with the keys class and method, plus description and prototype when the
# page provided them.
my @fields;

# Elements that never have an end tag. They are kept off the tag stack,
# because no end event would ever pop them again and every later lookup of
# the enclosing elements would be shifted.
my %is_void = map { $_ => 1 }
    qw(area base br col embed hr img input link meta param source track wbr);

# Only files count, magical directories like '.' shouldn't
for my $file ( grep {-f} readdir $dh ) {

    # readdir returns the name as raw bytes. Decode it once so the method
    # name is character data like the class name read from the page;
    # otherwise a non-ASCII name would be encoded a second time by the
    # UTF-8 layer on STDOUT and by uri_escape_utf8.
    # NOTE(review): assumes file names on disk are UTF-8 encoded - confirm.
    my $method = decode 'UTF-8', $file;

    my @tags;             # stack of the elements currently open
    my $current_field;    # record for the class section being read
    my $description;      # text of the paragraph being collected
    my $p;                # true from an <h2> until its first <p> closes

    my $parser = HTML::Parser->new(
        api_version => 3,

        # Broken text could make parsing harder than it should be.
        unbroken_text => 1,
        utf8_mode     => 1,
        start_h       => [
            sub {
                my ($tagname) = @_;

                # Without empty_element_tags the parser treats the slash
                # of '<br/>' as part of the name, so strip it for the
                # void-element lookup.
                ( my $element = $tagname ) =~ s{/\z}{}xms;
                return if $is_void{$element};

                push @tags, $tagname;

                # The first <p> after an <h2> starts a fresh description.
                if ( $p && $tagname eq 'p' ) {
                    $description = q[];
                }
            },
            'tagname'
        ],
        text_h => [
            sub {
                my ($dtext) = @_;

                # utf8_mode delivers UTF-8 bytes; turn them into characters.
                $dtext = decode 'UTF-8', $dtext;
                if ( @tags > 2 ) {

                    # <h1> stores name of class.
                    if ( $tags[-2] eq 'h1' ) {
                        $current_field
                            = { class => $dtext, method => $method };
                        push @fields, $current_field;
                    }

                    # First paragraph after <h2> is description.
                    elsif ( $tags[-2] eq 'h2' ) {
                        $p = 1;
                    }

                    # <pre> stores method prototype; only the first one
                    # found for a class is kept.
                    elsif ( $tags[-1] eq 'pre' && $current_field->{class} ) {
                        $current_field->{prototype} ||= $dtext;
                    }

                    # In <p> mode, every text is part of description.
                    elsif ($p) {
                        $description .= $dtext;
                    }
                }
            },
            'dtext'
        ],
        end_h => [
            sub {
                # A stray end tag can arrive with an empty stack; guard
                # against comparing undef, which is fatal under strictures.
                my $closed = pop @tags;

                # If current tag is <p> then turn off <p> mode.
                if ( defined $closed && $closed eq 'p' && $p ) {
                    $current_field->{description} = $description;
                    undef $description;
                    $p = 0;
                }
            }
        ],
    );

    # parse_file returns undef when it cannot open the file, and autodie
    # does not wrap that method, so fail loudly instead of silently
    # producing no records for this page.
    $parser->parse_file($file)
        or die "Cannot parse '$file': $!\n";
}
# Write one tab-separated line per collected record: escaped
# "Class.method (Perl 6)" title, an empty column, the documentation URL,
# the description, the prototype, and three empty trailing columns.
foreach my $record (@fields) {
    my ( $class, $method, $summary, $signature )
        = @{$record}{qw(class method description prototype)};

    # Missing description or prototype is written as an empty column.
    my $line = join q[],
        duck_escape($class), '.', duck_escape($method),
        " (Perl 6)\t\thttp://doc.perl6.org/type/",
        uri_escape_utf8($class), '#', uri_escape_utf8($method), "\t",
        duck_escape( $summary || q[] ), "\t",
        duck_escape( $signature || q[] ), "\t\t\t\n";

    print $line;
}