Skip to content

Commit

Permalink
upload initial write scraper in php (have it somewhere else, rather t…
Browse files Browse the repository at this point in the history
…han write it in ruby) - quicker. :-)
  • Loading branch information
LoveMyData committed May 19, 2018
1 parent 6351137 commit 8e36b39
Show file tree
Hide file tree
Showing 4 changed files with 257 additions and 0 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -0,0 +1,16 @@
# Albury City Council Scraper

* Server - Unknown but I like it
* Cookie tracking - Yes
* Agreement - Yes
* Pagination - No
* JSON - Yes - Yay....

Set the MORPH_PERIOD environment variable to change the date range (useful for data recovery). Available options are:

* thisweek (default; from 10 days ago up to today)
* thismonth
* lastmonth
* a four-digit year (e.g. 2016)

Enjoy
16 changes: 16 additions & 0 deletions composer.json
@@ -0,0 +1,16 @@
{
"repositories": [
{
"url": "https://github.com/openaustralia/scraperwiki-php.git",
"type": "git"
}
],
"require": {
"openaustralia/scraperwiki": "dev-morph_defaults",
"ext-sqlite3": "*",
"ext-pdo_sqlite": "*",
"ext-gd": "*",
"ext-mbstring": "*",
"byjg/pgbrowser" : "1.0.*"
}
}
134 changes: 134 additions & 0 deletions composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

91 changes: 91 additions & 0 deletions scraper.php
@@ -0,0 +1,91 @@
<?php
### Albury City Council Scraper
require_once 'vendor/autoload.php';
require_once 'vendor/openaustralia/scraperwiki/scraperwiki.php';

use PGuardiario\PGBrowser;
date_default_timezone_set('Australia/Sydney');

# Default to 'thisweek', use MORPH_PERIOD to change to 'thismonth' or 'lastmonth' for data recovery
switch(getenv('MORPH_PERIOD')) {
case 'thismonth' :
$sdate = date('01/m/Y');
$edate = date('t/m/Y');
break;
case 'lastmonth' :
$sdate = date('01/m/Y', strtotime('-1 month'));
$edate = date('t/m/Y', strtotime('-1 month'));
break;
default :
if ( preg_match('/^(19[0-9]{2}|20[0-9]{2})$/', getenv('MORPH_PERIOD'), $matches) == true) {
$sdate = date('01/01/Y', strtotime($matches[0]. '-01-01'));
$edate = date('31/12/Y', strtotime($matches[0]. '-12-31'));
} else {
$sdate = date('d/m/Y', strtotime('-10 days'));
$edate = date('d/m/Y');
}
break;
}
print "Getting data between " .$sdate. " and " .$edate. ", changable via MORPH_PERIOD environment\n";

$url_base = "https://eservice.alburycity.nsw.gov.au/ApplicationTracker";
$comment_base = "mailto:info@alburycity.nsw.gov.au";

# Agreed Terms
$browser = new PGBrowser();
$page = $browser->get($url_base . "/");
$form = $page->form();
$form->set('agreed', 'true');
$page = $form->submit();

/* Request the actual payload
* Note: $junk has been modified to download 1000 records - Not for slow server!
*/
$headers = ["Accept: application/json, text/javascript, */*; q=0.01"];
$junk = "draw=1&columns%5B0%5D%5Bdata%5D=0&columns%5B0%5D%5Bname%5D=&columns%5B0%5D%5Bsearchable%5D=true&columns%5B0%5D%5Borderable%5D=false&columns%5B0%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B0%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B1%5D%5Bdata%5D=1&columns%5B1%5D%5Bname%5D=&columns%5B1%5D%5Bsearchable%5D=true&columns%5B1%5D%5Borderable%5D=false&columns%5B1%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B1%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B2%5D%5Bdata%5D=2&columns%5B2%5D%5Bname%5D=&columns%5B2%5D%5Bsearchable%5D=true&columns%5B2%5D%5Borderable%5D=false&columns%5B2%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B2%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B3%5D%5Bdata%5D=3&columns%5B3%5D%5Bname%5D=&columns%5B3%5D%5Bsearchable%5D=true&columns%5B3%5D%5Borderable%5D=false&columns%5B3%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B3%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B4%5D%5Bdata%5D=4&columns%5B4%5D%5Bname%5D=&columns%5B4%5D%5Bsearchable%5D=true&columns%5B4%5D%5Borderable%5D=false&columns%5B4%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B4%5D%5Bsearch%5D%5Bregex%5D=false&start=0&length=1000&search%5Bvalue%5D=&search%5Bregex%5D=false&json=";
$json = '{"ApplicationNumber":null,"ApplicationYear":null,"DateFrom":"01/04/2017","DateTo":"01/04/2017","DateType":"1","RemoveUndeterminedApplications":false,"ApplicationDescription":null,"ApplicationType":null,"UnitNumberFrom":null,"UnitNumberTo":null,"StreetNumberFrom":null,"StreetNumberTo":null,"StreetName":null,"SuburbName":null,"PostCode":null,"PropertyName":null,"LotNumber":null,"PlanNumber":null,"ShowOutstandingApplications":false,"ShowExhibitedApplications":false,"PropertyKeys":null,"PrecinctValue":null,"IncludeDocuments":false}';
$json = json_decode($json);
$json->DateFrom = $sdate;
$json->DateTo = $edate;
$json = json_encode($json);
$page = $browser->post($url_base. "/Application/GetApplications", $junk. urlencode($json), $headers);

# get payload from the HTTP respond
$payload = preg_split("#\n\s*\n#Uis", $page->html);
$payload = json_decode($payload[1]);

if ($payload->recordsTotal > 0) {
foreach ($payload->data as $record) {
$description = explode("<b>", $record[4])[1];
$description = strip_tags($description);
$description = empty($description) ? $record[2] : preg_replace('/\s+/', ' ', $description);

$date_received = explode("/", $record[3]);
$date_received = $date_received[2]. "-" .$date_received[1]. "-" .$date_received[0];

# Put all information in an array
$application = [
'council_reference' => $record[1],
'address' => explode(" <br/>", $record[4])[0],
'description' => $description,
'info_url' => $url_base . "/Application/ApplicationDetails/" .$record[0],
'comment_url' => $comment_base,
'date_scraped' => date('Y-m-d'),
'date_received' => $date_received
];

# Check if record exist, if not, INSERT, else do nothing
$existingRecords = scraperwiki::select("* from data where `council_reference`='" . $application['council_reference'] . "'");
if (count($existingRecords) == 0) {
print ("Saving record " . $application['council_reference'] . " - " .$application['address']. "\n");
print_r ($application);
scraperwiki::save(['council_reference'], $application);
} else {
print ("Skipping already saved record " . $application['council_reference'] . "\n");
}
}
}
else {
print ("No data returned from feed");
}
?>

0 comments on commit 8e36b39

Please sign in to comment.