Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Redo BBC Weather source

Seems the BBC had the urge to prettify (and break) their weather pages.  The
search now returns a different ID than the RSS feeds that we have been using.
Luckily, the RSS ID number seems to still be buried in the HTML, and they are
saying they'll have the RSS feeds linked "very soon".  Meanwhile this should
work.

This adds another requirement for Perl modules: JSON.

Fixes #10204
(cherry picked from commit b7c2e38)
  • Loading branch information...
commit 623ae27f54f2c19bea0bfd9412b7b32632a51990 1 parent 662aa3d
@Beirdo Beirdo authored
View
1  mythplugins/configure
@@ -461,6 +461,7 @@ if ! disabled weather; then
check_pl_lib "DateTime::Format::ISO8601" ||
disable_weather "DateTime::Format::ISO8601"
check_pl_lib "SOAP::Lite" || disable_weather "SOAP::Lite"
+ check_pl_lib "JSON" || disable_weather "JSON"
fi
if test "$dcraw" != "no" ; then
View
270 mythplugins/mythweather/mythweather/scripts/uk_bbc/BBCLocation.pm
@@ -1,120 +1,212 @@
+#! /usr/bin/perl
+# vim:ts=4:sw=4:ai:et:si:sts=4
+
package BBCLocation;
use strict;
+use warnings;
require Exporter;
-use LWP::Simple;
-our @EXPORT = qw(Search);
-our $VERSION = 0.2;
+use utf8;
+use encoding 'utf8';
+use LWP::UserAgent;
+use JSON;
+use XML::XPath;
+use XML::XPath::XMLParser;
+use URI::Escape;
+
+
+our @EXPORT = qw(Search FindLoc);
+our $VERSION = 0.3;
my @searchresults;
my @resulturl;
my $resultcount = -1;
sub Search {
+ my ($search_string, $dir, $timeout, $logdir) = @_;
+ $search_string = uri_escape($search_string);
- my $base_url = 'http://news.bbc.co.uk/weather/util/search/Search.xhtml?';
- my $world_base_url = $base_url . 'lowgraphics=true&region=world&search=';
- my $local_base_url = $base_url . 'lowgraphics=true&region=uk&search=';
+ my $base_url = 'http://www.bbc.co.uk/locator/client/weather/en-GB/' .
+ 'search.json';
+ my $search_url = $base_url . '?ptrt=/&search=';
- my $search_string = shift;
- my $world_response = get $world_base_url . $search_string;
- my $local_response = get $local_base_url . $search_string;
+ my $file = $search_string;
+ getCachedJSON($search_url . $search_string, $dir, $file, $timeout, $logdir);
- &parseResults($world_response) if defined($world_response);
- &parseResults($local_response) if defined($local_response);
+ my $cachefile = "$dir/$file.json";
+ my $cachefile1 = "$dir/$file-results.html";
+ my $cachefile2 = "$dir/$file-pagination.html";
- if ( ($resultcount > 0 ) && ($#searchresults < 0) ) {
- foreach my $url (@resulturl) {
- my $url_response = get $base_url . $url;
- die unless defined $url_response;
- &parseResults($url_response);
- }
- }
+ open OF, "<:utf8", $cachefile or die "Can't read $cachefile: $!\n";
+ my $content = do { local $/; <OF> };
+ close OF;
- return @searchresults;
-}
+ my $decoded = decode_json $content;
+ $resultcount = $decoded->{"noOfResults"};
-sub parseResults {
- my $response = shift;
- my $isresults = 0;
- my $resultline = "";
-
- # Initialise a hash for the $locid & $locname results.
-
- # Use of a hash indexed by $locid ensures that more informative results
- # (e.g. "Sale, Australia" vs. "Sale") coming from <p class="response">
- # section will overwrite less informative results coming from
- # <span id="printbutton_Forecast"> section
-
my %loc_hash = ();
- foreach (split("\n", $response)) {
-
- # Declare a result if either the '<p class="response"> OR <span id="printbutton_Forecast"> strings are found
- # This ensures that single and multiple matches are caught
-
- if (/<p class=\"response\">/ || /<span id=\"printbutton_Forecast\">/) {
- $isresults = 1;
+ get_results($cachefile1, \%loc_hash);
+
+ if (exists $decoded->{"pagination"}) {
+ my %pages = ();
+ my $xp = XML::XPath->new(filename => $cachefile2);
+ my $nodeset = $xp->find("//ol/li/a");
+ foreach my $node ($nodeset->get_nodelist) {
+ my $url = $node->getAttribute("href");
+ my $num = $node->string_value;
+ $url =~ s/&amp;/&/;
+ $num =~ s/ //g;
+ $pages{$num} = $url;
}
- my $locname;
- my $locid;
- my $url;
-
- if ($isresults) {
- last if (/There are no forecasts matching/);
-
- $resultcount = $1 if (/<strong>There \w{2,3} (\d*) forecasts? matching/);
-
- # Collect result URLs
- if (/<a id=\"result_\d*\" .* href \=\"?.*search\=.*/) {
- $url = $_;
- $url =~ s/.*href \=\"(.*)\".*/$1/s;
- push (@resulturl, $url);
- }
-
- # Collect location IDs and location names
- elsif (/<a id=\"result_\d*\" .* href \=\"\/weather\/forecast\//) {
- $locid = $_;
- $locid =~ s/.*\/weather\/forecast\/(\d{0,5})\?.*/$1/s;
-
- $locname = $_;
- $locname =~ s/.*<a id=\"result_\d*\".*>(.*)<\/a>.*/$1/s;
-
- $resultline = $locid . "::" . $locname;
-
- $loc_hash{$locid} = $locname;
- }
-
- # Extract location ID and name from "Print <location>" link
-
- # This string is always present (provided valid search result - invalid results are caught above)
- # irresepective of whether single or multiple matches are returned
-
- elsif (/<span id=\"printbutton_Forecast\"><a title=\"Print (.+)\" href=\"\/weather\/forecast\/(\d{0,5})?/o)
- {
- $locid = $2;
- $locname = $1;
-
- $loc_hash{$locid} = $locname;
- }
+ foreach my $page (keys %pages)
+ {
+ getCachedJSON($base_url . $pages{$page}, $dir, $file . "-$page",
+ $timeout, $logdir);
+
+ my $cachefile3 = "$dir/$file-$page-results.html";
+ get_results($cachefile3, \%loc_hash);
}
}
-
- # Loop through contents of %loc_hash, check for existence within @searchresults, and add as necessary
-
+
+ my @searchresults = ();
foreach my $key (keys %loc_hash)
{
my $resultline = $key."::".$loc_hash{$key};
-
- if (! grep(/^$key/, @searchresults))
- {
- push (@searchresults, $resultline);
- }
+ push (@searchresults, $resultline);
}
-
return @searchresults;
}
+sub getCachedJSON {
+ my ($url, $dir, $file, $timeout, $logdir) = @_;
+
+ my $cachefile = "$dir/$file.json";
+ my $cachefile1 = "$dir/$file-results.html";
+ my $cachefile2 = "$dir/$file-pagination.html";
+
+ my $now = time();
+ my $decoded;
+
+ log_print( $logdir, "Loading URL: $url\n" );
+
+ if( (-e $cachefile) and ((stat($cachefile))[9] >= ($now - $timeout)) ) {
+ # File cache is still recent.
+ log_print( $logdir, "cached in $cachefile\n" );
+ } else {
+ log_print( $logdir, "$url\ncaching to $cachefile\n" );
+
+ my $ua = LWP::UserAgent->new;
+ $ua->timeout(30);
+ $ua->env_proxy;
+ $ua->default_header('Accept-Language' => "en");
+
+ my $response = $ua->get($url);
+ if ( !$response->is_success ) {
+ die $response->status_line;
+ }
+
+ open OF, ">:utf8", $cachefile or die "Can't open $cachefile: $!\n";
+ print OF $response->content;
+ close OF;
+
+ $decoded = decode_json $response->content;
+
+ open OF, ">:utf8", $cachefile1 or die "Can't open $cachefile1: $!\n";
+ print OF "<html>".$decoded->{"results"}."</html>";
+ close OF;
+
+ if (exists $decoded->{"pagination"}) {
+ open OF, ">:utf8", $cachefile2 or
+ die "Can't open $cachefile2: $!\n";
+ print OF "<html>".$decoded->{"pagination"}."</html>";
+ close OF;
+ } else {
+ unlink $cachefile2;
+ }
+ }
+}
+
+sub get_results {
+ my ($file, $outhash) = @_;
+
+ my $xp = XML::XPath->new(filename => $file);
+ my $nodeset = $xp->find("//ul/li/a");
+ foreach my $node ($nodeset->get_nodelist) {
+ my $url = $node->getAttribute("href");
+ my $loc = $node->string_value;
+
+ $url =~ s/^\/weather\///;
+ $outhash->{$url} = $loc;
+
+ print "$url"."::$loc\n";
+ }
+}
+
+sub log_print {
+ return if not defined $::opt_D;
+ my $dir = shift;
+
+ open OF, ">>$dir/uk_bbc.log";
+ print OF @_;
+ close OF;
+}
+
+sub FindLoc {
+ my ($locid, $dir, $timeout, $logdir) = @_;
+
+ my $url = "http://www.bbc.co.uk/weather/$locid";
+
+ my $file = "$locid.html";
+ getCachedHTML($url, $dir, $file, $timeout, $logdir);
+
+ my $cachefile = "$dir/$file";
+
+ open OF, "<:utf8", $cachefile;
+ my $contents = do { local $/; <OF>; };
+ close OF;
+
+ my ($rssid) = ($contents =~ /data-loc="(.*?)"/);
+ die "No RSS Location found for ID $locid!\n" unless defined $rssid;
+
+ $rssid =~ s/^LOC-//;
+ return $rssid;
+}
+
+
+sub getCachedHTML {
+ my ($url, $dir, $file, $timeout, $logdir) = @_;
+
+ my $cachefile = "$dir/$file";
+
+ my $now = time();
+
+ log_print( $logdir, "Loading URL: $url\n" );
+
+ if( (-e $cachefile) and ((stat($cachefile))[9] >= ($now - $timeout)) ) {
+ # File cache is still recent.
+ log_print( $logdir, "cached in $cachefile\n" );
+ } else {
+ log_print( $logdir, "$url\ncaching to $cachefile\n" );
+
+ my $ua = LWP::UserAgent->new;
+ $ua->timeout(30);
+ $ua->env_proxy;
+ $ua->default_header('Accept-Language' => "en");
+
+ my $response = $ua->get($url);
+ if ( !$response->is_success ) {
+ die $response->status_line;
+ }
+
+ my $content = $response->content;
+ open OF, ">:utf8", $cachefile or die "Can't open $cachefile: $!\n";
+ print OF $content;
+ close OF;
+ }
+}
+
+
1;
View
61 mythplugins/mythweather/mythweather/scripts/uk_bbc/bbccurrentxml.pl
@@ -1,13 +1,13 @@
#! /usr/bin/perl
-
-#
-# Based on nwsxml.pl by Lucien Dunning
-#
+# vim:ts=4:sw=4:ai:et:si:sts=4
use strict;
use warnings;
+use utf8;
+use encoding 'utf8';
use English;
+
use File::Basename;
use Cwd 'abs_path';
use lib dirname(abs_path($0 or $PROGRAM_NAME)),
@@ -16,47 +16,61 @@
use XML::Simple;
use LWP::Simple;
-# Ideally we would use the If-Modified-Since header
-# to reduce server load, but they ignore it
-#use HTTP::Cache::Transparent;
use Getopt::Std;
+use File::Path;
+
use File::Basename;
use lib dirname($0);
use BBCLocation;
-our ($opt_v, $opt_t, $opt_T, $opt_l, $opt_u, $opt_d);
+our ($opt_v, $opt_t, $opt_T, $opt_l, $opt_u, $opt_d, $opt_D);
my $name = 'BBC-Current-XML';
-my $version = 0.2;
-my $author = 'Stuart Morgan';
-my $email = 'stuart@tase.co.uk';
+my $version = 0.3;
+my $author = 'Gavin Hurlbut / Stuart Morgan';
+my $email = 'gjhurlbu@gmail.com / stuart@tase.co.uk';
my $updateTimeout = 120*60;
# 2 Hours, BBC updates infrequently ~3 hours
-# Given that the option to update in the background now exists
-# potentially we could be hitting the server 12 times in a day
my $retrieveTimeout = 30;
my @types = ('cclocation', 'station_id', 'copyright',
'observation_time', 'weather', 'temp', 'relative_humidity',
'wind_dir', 'pressure', 'visibility', 'weather_icon',
'appt', 'wind_spdgst');
-my $dir = "./";
+my $dir = "/tmp/uk_bbc";
+my $logdir = "/tmp/uk_bbc";
+
+binmode(STDOUT, ":utf8");
+
+if (!-d $logdir) {
+ mkpath( $logdir, {mode => 0755} );
+}
getopts('Tvtlu:d:');
if (defined $opt_v) {
print "$name,$version,$author,$email\n";
+ log_print( $logdir, "-v\n" );
exit 0;
}
if (defined $opt_T) {
print "$updateTimeout,$retrieveTimeout\n";
+ log_print( $logdir, "-t\n" );
exit 0;
}
-if (defined $opt_l) {
+if (defined $opt_d) {
+ $dir = $opt_d;
+}
+if (!-d $dir) {
+ mkpath( $dir, {mode => 0755} );
+}
+
+if (defined $opt_l) {
my $search = shift;
- my @results = BBCLocation::Search($search);
+ log_print( $logdir, "-l $search\n" );
+ my @results = BBCLocation::Search($search, $dir, $updateTimeout, $logdir);
my $result;
foreach (@results) {
@@ -71,13 +85,9 @@
exit 0;
}
-if (defined $opt_d) {
- $dir = $opt_d;
-}
-
# we get here, we're doing an actual retrieval, everything must be defined
-my $locid = shift;
+my $locid = BBCLocation::FindLoc(shift, $dir, $updateTimeout, $logdir);
if (!(defined $opt_u && defined $locid && !$locid eq "")) {
die "Invalid usage";
}
@@ -248,3 +258,12 @@
printf $datalabel . "::" . $datavalue . "\n";
}
+
+sub log_print {
+ return if not defined $::opt_D;
+ my $dir = shift;
+
+ open OF, ">>$dir/uk_bbc.log";
+ print OF @_;
+ close OF;
+}
View
59 mythplugins/mythweather/mythweather/scripts/uk_bbc/bbcthreedayxml.pl
@@ -1,13 +1,13 @@
#! /usr/bin/perl
-
-#
-# Based on nwsxml.pl by Lucien Dunning
-#
+# vim:ts=4:sw=4:ai:et:si:sts=4
use strict;
use warnings;
+use utf8;
+use encoding 'utf8';
use English;
+
use File::Basename;
use Cwd 'abs_path';
use lib dirname(abs_path($0 or $PROGRAM_NAME)),
@@ -16,44 +16,60 @@
use XML::Simple;
use LWP::Simple;
-# Ideally we would use the If-Modified-Since header
-# to reduce server load, but they ignore it
-#use HTTP::Cache::Transparent;
use Getopt::Std;
+use File::Path;
+
use File::Basename;
use lib dirname($0);
use BBCLocation;
-our ($opt_v, $opt_t, $opt_T, $opt_l, $opt_u, $opt_d);
+our ($opt_v, $opt_t, $opt_T, $opt_l, $opt_u, $opt_d, $opt_D);
my $name = 'BBC-3day-XML';
-my $version = 0.2;
-my $author = 'Stuart Morgan';
-my $email = 'stuart@tase.co.uk';
+my $version = 0.3;
+my $author = 'Gavin Hurlbut / Stuart Morgan';
+my $email = 'gjhurlbu@gmail.com / stuart@tase.co.uk';
my $updateTimeout = 360*60; # 6 Hours
my $retrieveTimeout = 30;
my @types = ('3dlocation', 'station_id', 'copyright', 'weather_icon',
'date-0', 'icon-0', 'low-0', 'high-0',
'date-1', 'icon-1', 'low-1', 'high-1',
'date-2', 'icon-2', 'low-2', 'high-2', 'updatetime');
-my $dir = "./";
+my $dir = "/tmp/uk_bbc";
+my $logdir = "/tmp/uk_bbc";
+
+binmode(STDOUT, ":utf8");
+
+if (!-d $logdir) {
+ mkpath( $logdir, {mode => 0755} );
+}
getopts('Tvtlu:d:');
if (defined $opt_v) {
print "$name,$version,$author,$email\n";
+ log_print( $logdir, "-v\n" );
exit 0;
}
if (defined $opt_T) {
print "$updateTimeout,$retrieveTimeout\n";
+ log_print( $logdir, "-t\n" );
exit 0;
}
-if (defined $opt_l) {
+if (defined $opt_d) {
+ $dir = $opt_d;
+}
+if (!-d $dir) {
+ mkpath( $dir, {mode => 0755} );
+}
+
+if (defined $opt_l) {
my $search = shift;
- my @results = BBCLocation::Search($search);
+ log_print( $logdir, "-l $search\n" );
+ my @results = BBCLocation::Search($search, $dir, $updateTimeout, $logdir);
my $result;
foreach (@results) {
@@ -68,13 +84,9 @@
exit 0;
}
-if (defined $opt_d) {
- $dir = $opt_d;
-}
-
# we get here, we're doing an actual retrieval, everything must be defined
-my $locid = shift;
+my $locid = BBCLocation::FindLoc(shift, $dir, $updateTimeout, $logdir);
if (!(defined $opt_u && defined $locid && !$locid eq "")) {
die "Invalid usage";
}
@@ -232,3 +244,12 @@
$i++;
}
+
+sub log_print {
+ return if not defined $::opt_D;
+ my $dir = shift;
+
+ open OF, ">>$dir/uk_bbc.log";
+ print OF @_;
+ close OF;
+}
Please sign in to comment.
Something went wrong with that request. Please try again.