Skip to content

Commit

Permalink
Changed to derive the suburb based on the street (where the street ex…
Browse files Browse the repository at this point in the history
…ists in only one suburb).
  • Loading branch information
MichaelBone committed May 11, 2019
1 parent 12954d7 commit 72f8a08
Show file tree
Hide file tree
Showing 3 changed files with 258 additions and 13 deletions.
1 change: 1 addition & 0 deletions package.json
Expand Up @@ -6,6 +6,7 @@
"main": "scraper.js",
"dependencies": {
"cheerio": "1.0.0-rc.2",
"didyoumean2": "^2.0.2",
"fs": "0.0.1-security",
"moment": "2.24.0",
"pdfjs-dist": "2.0.943",
Expand Down
122 changes: 115 additions & 7 deletions scraper.js

Large diffs are not rendered by default.

148 changes: 142 additions & 6 deletions scraper.ts
Expand Up @@ -13,6 +13,7 @@ import * as sqlite3 from "sqlite3";
import * as urlparser from "url";
import * as moment from "moment";
import * as pdfjs from "pdfjs-dist";
import didYouMean, * as didyoumean from "didyoumean2";

sqlite3.verbose();

Expand Down Expand Up @@ -46,7 +47,7 @@ async function initializeDatabase() {

async function insertRow(database, developmentApplication) {
return new Promise((resolve, reject) => {
let sqlStatement = database.prepare("insert or ignore into [data] values (?, ?, ?, ?, ?, ?, ?)");
let sqlStatement = database.prepare("insert or replace into [data] values (?, ?, ?, ?, ?, ?, ?)");
sqlStatement.run([
developmentApplication.applicationNumber,
developmentApplication.address,
Expand All @@ -60,10 +61,7 @@ async function insertRow(database, developmentApplication) {
console.error(error);
reject(error);
} else {
if (this.changes > 0)
console.log(` Inserted: application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\", description \"${developmentApplication.description}\" and received date \"${developmentApplication.receivedDate}\" into the database.`);
else
console.log(` Skipped: application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\", description \"${developmentApplication.description}\" and received date \"${developmentApplication.receivedDate}\" because it was already present in the database.`);
console.log(` Saved application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\", description \"${developmentApplication.description}\" and received date \"${developmentApplication.receivedDate}\" to the database.`);
sqlStatement.finalize(); // releases any locks
resolve(row);
}
Expand Down Expand Up @@ -356,6 +354,140 @@ async function parseElements(page) {
return elements;
}

// Formats (and corrects) an address.

function formatAddress(applicationNumber: string, address: string) {
address = address.trim();
if (address.startsWith("LOT:") || address.startsWith("No Residential Address"))
return "";

// Remove the comma in house numbers larger than 1000. For example, the following addresses:
//
// 4,665 Princes HWY MENINGIE 5264
// 11,287 Princes HWY SALT CREEK 5264
//
// would be converted to the following:
//
// 4665 Princes HWY MENINGIE 5264
// 11287 Princes HWY SALT CREEK 5264

if (/^\d,\d\d\d/.test(address))
address = address.substring(0, 1) + address.substring(2);
else if (/^\d\d,\d\d\d/.test(address))
address = address.substring(0, 2) + address.substring(3);

let tokens = address.split(" ");

let postCode = undefined;
let token = tokens.pop();
if (/^\d\d\d\d$/.test(token))
postCode = token;
else
tokens.push(token);

// Ensure that a state code is added before the post code if a state code is not present.

let state = "SA";
token = tokens.pop();
if ([ "ACT", "NSW", "NT", "QLD", "SA", "TAS", "VIC", "WA" ].includes(token.toUpperCase()))
state = token.toUpperCase();
else
tokens.push(token);

// Construct a fallback address to be used if the suburb name cannot be determined later.

let fallbackAddress = (postCode === undefined) ? address : [ ...tokens, state, postCode].join(" ");

// Pop tokens from the end of the array until a valid suburb name is encountered (allowing
// for a few spelling errors). Note that this starts by examining for longer matches
// (consisting of four tokens) before examining shorter matches. This approach ensures
// that the following address:
//
// 2,800 Woods Well RD COLEBATCH 5266
//
// is correctly converted to the following address:
//
// 2800 WOODS WELL ROAD, COLEBATCH SA 5266
//
// rather than (incorrectly) to the following address (notice that the street name has "BELL"
// instead of "WELL" because there actually is a street named "BELL ROAD").
//
// 2800 Woods BELL ROAD, COLEBATCH SA 5266

let suburbName = undefined;
for (let index = 4; index >= 1; index--) {
let suburbNameMatch = <string>didYouMean(tokens.slice(-index).join(" "), Object.keys(SuburbNames), { caseSensitive: false, returnType: didyoumean.ReturnTypeEnums.FIRST_CLOSEST_MATCH, thresholdType: didyoumean.ThresholdTypeEnums.EDIT_DISTANCE, threshold: 1, trimSpaces: true });
if (suburbNameMatch !== null) {
suburbName = SuburbNames[suburbNameMatch];
tokens.splice(-index, index); // remove elements from the end of the array
break;
}
}

// Expand any street suffix (for example, this converts "ST" to "STREET").

token = tokens.pop();
let streetSuffix = StreetSuffixes[token.toUpperCase()];
if (streetSuffix === undefined)
streetSuffix = Object.values(StreetSuffixes).find(streetSuffix => streetSuffix === token.toUpperCase()); // the street suffix is already expanded

if (streetSuffix === undefined)
tokens.push(token); // unrecognised street suffix
else
tokens.push(streetSuffix); // add back the expanded street suffix

// Pop tokens from the end of the array until a valid street name is encountered (allowing
// for a few spelling errors). Similar to the examination of suburb names, this examines
// longer matches before examining shorter matches (for the same reason).

let streetName = undefined;
for (let index = 5; index >= 1; index--) {
let streetNameMatch = <string>didYouMean(tokens.slice(-index).join(" "), Object.keys(StreetNames), { caseSensitive: false, returnType: didyoumean.ReturnTypeEnums.FIRST_CLOSEST_MATCH, thresholdType: didyoumean.ThresholdTypeEnums.EDIT_DISTANCE, threshold: 1, trimSpaces: true });
if (streetNameMatch !== null) {
streetName = streetNameMatch;
let suburbNames = StreetNames[streetNameMatch];
tokens.splice(-index, index); // remove elements from the end of the array

// If the suburb was not determined earlier then attempt to obtain the suburb based
// on the street (ie. if there is only one suburb associated with the street). For
// example, this would automatically add the suburb to "22 Jefferson CT 5263",
// producing the address "22 JEFFERSON COURT, WELLINGTON EAST SA 5263".

if (suburbName === undefined && suburbNames.length === 1)
suburbName = SuburbNames[suburbNames[0]];

break;
}
}

// If a post code was included in the original address then use it to override the post code
// included in the suburb name (because the post code in the original address is more likely
// to be correct).

if (postCode !== undefined && suburbName !== undefined)
suburbName = suburbName.replace(/\s+\d\d\d\d$/, " " + postCode);

// Do not allow an address that does not have a suburb name.

if (suburbName === undefined) {
console.log(`Ignoring the development application "${applicationNumber}" because a suburb name could not be determined for the address: ${address}`);
return "";
}

// Reconstruct the address with a comma between the street address and the suburb.

if (suburbName === undefined || suburbName.trim() === "")
address = fallbackAddress;
else {
if (streetName !== undefined && streetName.trim() !== "")
tokens.push(streetName);
let streetAddress = tokens.join(" ").trim();
address = streetAddress + (streetAddress === "" ? "" : ", ") + suburbName;
}

return address;
}

// Parses the development applications in the specified date range.

async function parsePdf(url: string) {
Expand Down Expand Up @@ -523,6 +655,10 @@ async function parsePdf(url: string) {
continue;
}

address = formatAddress(applicationNumber, address);
if (address === "")
continue;

// Construct the description.

let description = "";
Expand Down Expand Up @@ -643,7 +779,7 @@ async function main() {
if (global.gc)
global.gc();

console.log(`Inserting development applications into the database.`);
console.log(`Saving development applications to the database.`);
for (let developmentApplication of developmentApplications)
await insertRow(database, developmentApplication);
}
Expand Down

0 comments on commit 72f8a08

Please sign in to comment.