Skip to content

Commit

Permalink
Update prosesize logic
Browse files (browse the repository at this point in the history)
* Iterate through paragraphs, like the script does
* Convert HTML references back to characters
* Drop deprecated disablepp option
  • Loading branch information
Jarry1250 committed Jun 10, 2017
1 parent d61a8ad commit 4e97094
Showing 1 changed file with 11 additions and 15 deletions.
26 changes: 11 additions & 15 deletions script.php
Original file line number Diff line number Diff line change
Expand Up @@ -226,27 +226,23 @@ function getApplicableLength( $pagename, $dykname ) {
$revId = $page['revisions'][0]['revid'];

// Count prose size of article at that time
$json = getJSON( $apiBase . "action=parse&oldid=$revId&prop=text&disablepp" );
$json = getJSON( $apiBase . "action=parse&oldid=$revId&prop=text" );
$text = str_replace( "\n", "", $json['parse']['text']['*'] );
$text = html_entity_decode( $text );

$tagsToStrip = array( 'div', 'ul', 'sub', 'sup' );
foreach( $tagsToStrip as $tagToStrip ){
$regex = "/[<]" . $tagToStrip . ".*?[<]\/" . $tagToStrip . "[>]/";
while( preg_match( $regex, $text ) ){
$text = preg_replace( $regex, '', $text );
}
}
$paras = preg_split( '/<p[> ]/', $text );
array_shift( $paras );
foreach( $paras as &$para ){
list( $para, ) = explode( '</p>', $para, 2 );
preg_match_all( '/<p( [^>]+)?>.*?<\/p>/', $text, $paras );
$count = 0;
foreach( $paras[0] as $para ){
// <sub> and <sup> should be removed in their entirety
$para = preg_replace( '/[<]su(b|p).*?[>].*?[<]\/su(b|p)[>]/', '', $para );

// For everything else, retain innerHTML
while( preg_match( '/[<].*?[>]/', $para ) ){
$para = preg_replace( '/[<].*?[>]/', '', $para );
}
$para = preg_replace( '/\[[0-9]{1,3}\]/', '', $para );
$count += mb_strlen( $para, 'UTF-8' );
}
$prose = implode( '', $paras );
return mb_strlen( $prose, 'UTF-8' );
return $count;
}

// n.b. getApplicableAge returns 4 for all ages less than 5 for performance reasons
Expand Down

0 comments on commit 4e97094

Please sign in to comment.