Skip to content

Commit

Permalink
Changed to include the state and postcode in addresses.
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelBone committed May 12, 2019
1 parent 6edc8a0 commit f1d85cb
Show file tree
Hide file tree
Showing 6 changed files with 372 additions and 42 deletions.
46 changes: 23 additions & 23 deletions hundrednames.txt
@@ -1,23 +1,23 @@
BOCKELBERG
CAMPBELL
CHANDADA
CUNGENA
FINLAYSON
FORREST
HASLAM
INKSTER
KALDOONERA
KARCULTABY
KOOLGERA
MURRAY
PERLUBIE
RIPON
ROUNSEVELL
SCOTT
TARLTON
WALLALA
WALPUPPIE
WITERA
WRENFORDSLEY
WRIGHT
YANTANABIE
BOCKELBERG,BOCKELBERG;LOCKES CLAYPAN
CAMPBELL,CHANDADA;INKSTER;MARYVALE;STREAKY BAY
CHANDADA,CHANDADA;CUNGENA;POOCHERA
CUNGENA,CUNGENA;KALDOONERA;YANTANABIE
FINLAYSON,HASLAM;PERLUBIE;PETINA
FORREST,MARYVALE;STREAKY BAY;YANERBIE
HASLAM,CARAWA;HASLAM;PETINA
INKSTER,CHANDADA;INKSTER;MOUNT COOPER
KALDOONERA,KALDOONERA
KARCULTABY,KARCULTABY;POOCHERA
KOOLGERA,KOOLGERA;PUREBA;WALLALA
MURRAY,CHANDADA;CHILPENUNDA;CUNGENA;PIEDNIPPIE
PERLUBIE,PETINA;PIMBAACLA;WIRRULLA
RIPON,STREAKY BAY;WESTALL;YANERBIE
ROUNSEVELL,COLLEY;MORTANA;TYRINGA;WITERA
SCOTT,EBA ANCHORAGE;PERLUBIE;PIEDNIPPIE;STREAKY BAY
TARLTON,CHILPENUNDA;CUNGENA;PETINA
WALLALA,PUREBA;WALLALA;WIRRULLA
WALPUPPIE,WALLALA;WIRRULLA;YANTANABIE
WITERA,COLLEY;INKSTER;MOUNT COOPER;WITERA
WRENFORDSLEY,BAIRD BAY;CALCA;MORTANA;SCEALE BAY;YANERBIE
WRIGHT,COLLEY;WITERA
YANTANABIE,KALDOONERA;KOOLGERA;PUREBA;WALLALA;YANTANABIE
1 change: 1 addition & 0 deletions package.json
Expand Up @@ -6,6 +6,7 @@
"main": "scraper.js",
"dependencies": {
"cheerio": "1.0.0-rc.2",
"didyoumean2": "^2.0.4",
"fs": "0.0.1-security",
"moment": "2.24.0",
"pdfjs-dist": "2.0.943",
Expand Down
167 changes: 157 additions & 10 deletions scraper.js

Large diffs are not rendered by default.

196 changes: 187 additions & 9 deletions scraper.ts
Expand Up @@ -13,6 +13,7 @@ import * as sqlite3 from "sqlite3";
import * as urlparser from "url";
import * as moment from "moment";
import * as pdfjs from "pdfjs-dist";
import didYouMean, * as didyoumean from "didyoumean2";

sqlite3.verbose();

Expand Down Expand Up @@ -44,7 +45,7 @@ async function initializeDatabase() {

async function insertRow(database, developmentApplication) {
return new Promise((resolve, reject) => {
let sqlStatement = database.prepare("insert or ignore into [data] values (?, ?, ?, ?, ?, ?, ?)");
let sqlStatement = database.prepare("insert or replace into [data] values (?, ?, ?, ?, ?, ?, ?)");
sqlStatement.run([
developmentApplication.applicationNumber,
developmentApplication.address,
Expand All @@ -58,10 +59,7 @@ async function insertRow(database, developmentApplication) {
console.error(error);
reject(error);
} else {
if (this.changes > 0)
console.log(` Inserted: application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\", description \"${developmentApplication.description}\" and received date \"${developmentApplication.receivedDate}\" into the database.`);
else
console.log(` Skipped: application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\", description \"${developmentApplication.description}\" and received date \"${developmentApplication.receivedDate}\" because it was already present in the database.`);
console.log(` Saved application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\", description \"${developmentApplication.description}\" and received date \"${developmentApplication.receivedDate}\" to the database.`);
sqlStatement.finalize(); // releases any locks
resolve(row);
}
Expand Down Expand Up @@ -184,6 +182,7 @@ function parseOldFormatApplicationElements(elements: Element[], informationUrl:
height: propertyAddressHeadingElement.height
};
let address = elements.filter(element => getPercentageOfElementInRectangle(element, addressBounds) > 10).map(element => element.text).join(" ").trim().replace(/\s\s+/g, " ");
address = formatAddress(applicationNumber, address);

if (address === "") {
let elementSummary = elements.map(element => `[${element.text}]`).join("");
Expand Down Expand Up @@ -288,6 +287,7 @@ function parseNewFormatApplicationElements(elements: Element[], informationUrl:
height: (developmentDescriptionHeadingElement === undefined) ? 2 * assessmentNumberHeadingElement.height : (developmentDescriptionHeadingElement.y - (assessmentNumberHeadingElement.y + assessmentNumberHeadingElement.height))
};
let address = elements.filter(element => getPercentageOfElementInRectangle(element, addressBounds) > 10).map(element => element.text).join(" ").trim().replace(/\s\s+/g, " ");
address = formatAddress(applicationNumber, address);

if (address === "") {
let elementSummary = elements.map(element => `[${element.text}]`).join("");
Expand Down Expand Up @@ -319,6 +319,182 @@ function parseNewFormatApplicationElements(elements: Element[], informationUrl:
}
}

// Formats (and corrects) an address so that it ends with a suburb, state and post code.
//
// The raw address text is cleaned up (trailing dashes, repeated white space and thousands
// separators in house numbers are removed), any street suffix is expanded (for example, "RD"
// becomes "ROAD") and the suburb is determined from, in order of preference, a hundred name
// that maps to exactly one suburb, a suburb name at the end of the address, or a street name
// that maps to exactly one suburb. Names are matched using didYouMean with an edit distance
// threshold of one.
//
// This relies on the module level lookup tables SuburbNames, HundredNames, StreetNames and
// StreetSuffixes having been populated before it is called.
//
// applicationNumber: only used to identify the application in log messages.
// address: the raw address text.
//
// Returns the formatted address, or an empty string if the address is blank, is a placeholder
// (such as "0 0, 0", "-" or "No Residential Address") or if no suburb could be determined.

function formatAddress(applicationNumber: string, address: string): string {
    // Finds the closest name (within an edit distance of one, ignoring case) or returns null.

    const findClosestName = (candidate: string, names: string[]) =>
        didYouMean(candidate, names, { caseSensitive: false, returnType: didyoumean.ReturnTypeEnums.FIRST_CLOSEST_MATCH, thresholdType: didyoumean.ThresholdTypeEnums.EDIT_DISTANCE, threshold: 1, trimSpaces: true }) as string | null;

    address = address.trim().replace(/[-–]+$/, "").replace(/\s\s+/g, " ").trim();  // remove trailing dashes and multiple white space characters
    if (address.replace(/[\s,0-]/g, "") === "" || address.startsWith("No Residential Address"))  // ignores addresses such as "0 0, 0" and "-"
        return "";

    // Remove the comma in house numbers larger than 1000. For example, the following addresses:
    //
    //     4,665 Princes HWY MENINGIE 5264
    //     11,287 Princes HWY SALT CREEK 5264
    //
    // would be converted to the following:
    //
    //     4665 Princes HWY MENINGIE 5264
    //     11287 Princes HWY SALT CREEK 5264

    if (/^\d,\d\d\d/.test(address))
        address = address.substring(0, 1) + address.substring(2);
    else if (/^\d\d,\d\d\d/.test(address))
        address = address.substring(0, 2) + address.substring(3);

    let tokens = address.split(" ");

    // Extract a trailing four digit post code (if there is one).

    let postCode: string | undefined = undefined;
    let token = tokens.pop();
    if (token === undefined)
        return address;  // cannot occur (split always returns at least one token); keeps "token" narrowed
    if (/^\d\d\d\d$/.test(token))
        postCode = token;
    else
        tokens.push(token);

    // Extract a trailing state code (if there is one), defaulting to "SA". If no tokens remain
    // at this point then the address consisted solely of a post code; such an address has no
    // suburb and so is rejected in the same way as any other address without a suburb (rather
    // than being returned as is).

    let state = "SA";
    token = tokens.pop();
    if (token === undefined) {
        console.log(`Ignoring the development application "${applicationNumber}" because a suburb name could not be determined for the address: ${address}`);
        return "";
    }
    if ([ "ACT", "NSW", "NT", "QLD", "SA", "TAS", "VIC", "WA" ].includes(token.toUpperCase()))
        state = token.toUpperCase();
    else
        tokens.push(token);

    // Construct a fallback address (this must be done before any tokens are removed below). It
    // is only used if the suburb lookup table yields a blank suburb.

    let fallbackAddress = (postCode === undefined) ? address : [ ...tokens, state, postCode ].join(" ").trim();

    // Pop tokens from the end of the array until a valid name is encountered (allowing for a
    // single character spelling error). Note that this starts by examining for longer matches
    // (consisting of four tokens) before examining shorter matches. This approach ensures
    // that the following address:
    //
    //     2,800 Woods Well RD COLEBATCH 5266
    //
    // is correctly converted to the following address:
    //
    //     2800 WOODS WELL ROAD, COLEBATCH SA 5266
    //
    // rather than (incorrectly) to the following address (notice that the street name has "BELL"
    // instead of "WELL" because there actually is a street named "BELL ROAD").
    //
    //     2800 Woods BELL ROAD, COLEBATCH SA 5266
    //
    // This also allows for addresses that contain hundred names such as the following:
    //
    //     Sec 26 Hd Palabie
    //     Lot no 1, Standley Road, Sect 16, Hundred of Pygery

    let suburbName: string | undefined = undefined;
    let hasHundredName = false;

    let hundredNameKeys = Object.keys(HundredNames);
    for (let index = 4; index >= 1; index--) {
        let tryHundredName = tokens.slice(-index).join(" ").toUpperCase();
        if (!/^(HD|HUNDRED) /.test(tryHundredName))  // also covers the "HD OF " and "HUNDRED OF " prefixes
            continue;
        tryHundredName = tryHundredName.replace(/^HD OF /, "").replace(/^HUNDRED OF /, "").replace(/^HD /, "").replace(/^HUNDRED /, "").trim();
        let hundredNameMatch = findClosestName(tryHundredName, hundredNameKeys);
        if (hundredNameMatch !== null) {
            hasHundredName = true;
            let suburbNames = HundredNames[hundredNameMatch];
            if (suburbNames.length === 1) {  // if a unique suburb exists for the hundred then use that suburb
                suburbName = SuburbNames[suburbNames[0]];
                tokens.splice(-index, index);  // remove elements from the end of the array
            }
            break;
        }
    }

    // Only search for a suburb name if there was no hundred name (because a suburb name is
    // unlikely to appear before a hundred name).

    if (!hasHundredName) {
        let suburbNameKeys = Object.keys(SuburbNames);
        for (let index = 4; index >= 1; index--) {
            let suburbNameMatch = findClosestName(tokens.slice(-index).join(" "), suburbNameKeys);
            if (suburbNameMatch !== null) {
                suburbName = SuburbNames[suburbNameMatch];
                tokens.splice(-index, index);  // remove elements from the end of the array
                break;
            }
        }
    }

    // Expand any street suffix (for example, this converts "ST" to "STREET").

    token = tokens.pop();
    if (token !== undefined) {
        token = token.trim().replace(/,+$/, "").trim();  // removes trailing commas
        let upperCaseToken = token.toUpperCase();
        let streetSuffix = StreetSuffixes[upperCaseToken];
        if (streetSuffix === undefined)
            streetSuffix = Object.values(StreetSuffixes).find(suffix => suffix === upperCaseToken);  // the street suffix is already expanded
        tokens.push(streetSuffix === undefined ? token : streetSuffix);  // an unrecognised street suffix is added back unchanged
    }

    // Pop tokens from the end of the array until a valid street name is encountered (allowing
    // for a single character spelling error). Similar to the examination of suburb names, this
    // examines longer matches before examining shorter matches (for the same reason).

    let streetName: string | undefined = undefined;
    let streetNameKeys = Object.keys(StreetNames);
    for (let index = 5; index >= 1; index--) {
        let tryStreetName = tokens.slice(-index).join(" ").trim().replace(/,+$/, "").trim();  // allows for commas after the street name
        let streetNameMatch = findClosestName(tryStreetName, streetNameKeys);
        if (streetNameMatch !== null) {
            streetName = streetNameMatch;
            let suburbNames = StreetNames[streetNameMatch];
            tokens.splice(-index, index);  // remove elements from the end of the array

            // If the suburb was not determined earlier then attempt to obtain the suburb based
            // on the street (ie. if there is only one suburb associated with the street). For
            // example, this would automatically add the suburb to "22 Jefferson CT 5263",
            // producing the address "22 JEFFERSON COURT, WELLINGTON EAST SA 5263".

            if (suburbName === undefined && suburbNames.length === 1)
                suburbName = SuburbNames[suburbNames[0]];

            break;
        }
    }

    // Do not allow an address that does not have a suburb name.

    if (suburbName === undefined) {
        console.log(`Ignoring the development application "${applicationNumber}" because a suburb name could not be determined for the address: ${address}`);
        return "";
    }

    // If a post code was included in the original address then use it to override the post code
    // included in the suburb name (because the post code in the original address is more likely
    // to be correct).

    if (postCode !== undefined)
        suburbName = suburbName.replace(/\s+\d\d\d\d$/, " " + postCode);

    // Reconstruct the address with a comma between the street address and the suburb.

    if (suburbName.trim() === "")
        address = fallbackAddress;
    else {
        if (streetName !== undefined && streetName.trim() !== "")
            tokens.push(streetName);
        let streetAddress = tokens.join(" ").trim().replace(/,+$/, "").trim();  // removes trailing commas
        address = streetAddress + (streetAddress === "" ? "" : ", ") + suburbName;
    }

    // Ensure that the address includes the state "SA".

    if (address !== "" && !/\bSA\b/.test(address))
        address += " SA";

    return address;
}

// Parses the development applications in the specified date range.

async function parsePdf(url: string) {
Expand Down Expand Up @@ -427,9 +603,11 @@ async function main() {
SuburbNames[suburbTokens[0].trim()] = suburbTokens[1].trim();
}

HundredNames = [];
for (let line of fs.readFileSync("hundrednames.txt").toString().replace(/\r/g, "").trim().split("\n"))
HundredNames.push(line.trim().toUpperCase());
HundredNames = {};
for (let line of fs.readFileSync("hundrednames.txt").toString().replace(/\r/g, "").trim().split("\n")) {
let hundredNameTokens = line.toUpperCase().split(",");
HundredNames[hundredNameTokens[0].trim()] = hundredNameTokens[1].trim().split(";");
}

// Read the main page that has links to each year of development applications.

Expand Down Expand Up @@ -525,7 +703,7 @@ async function main() {
if (global.gc)
global.gc();

console.log(`Inserting development applications into the database.`);
console.log(`Saving development applications to the database.`);
for (let developmentApplication of developmentApplications)
await insertRow(database, developmentApplication);
}
Expand Down
3 changes: 3 additions & 0 deletions streetnames.txt
Expand Up @@ -223,6 +223,9 @@ MORONEY ROAD,CHANDADA
MOUNT COOPER ROAD,COLLEY
MOUNT HALL ROAD,MORTANA
MOUNT MARIA DRIVE,STREAKY BAY
MT COOPER ROAD,COLLEY
MT HALL ROAD,MORTANA
MT MARIA DRIVE,STREAKY BAY
MUDGE ROAD,CHANDADA
MUDGE ROAD,STREAKY BAY
MUDGE TERRACE,STREAKY BAY
Expand Down
1 change: 1 addition & 0 deletions suburbnames.txt
Expand Up @@ -15,6 +15,7 @@ KOOLGERA,KOOLGERA SA 5661
MARYVALE,MARYVALE SA 5680
MORTANA,MORTANA SA 5671
MOUNT COOPER,MOUNT COOPER SA 5671
MT COOPER,MOUNT COOPER SA 5671
PERLUBIE,PERLUBIE SA 5680
PETINA,PETINA SA 5680
PIEDNIPPIE,PIEDNIPPIE SA 5680
Expand Down

0 comments on commit f1d85cb

Please sign in to comment.