-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b41abc9
commit 8bfb70b
Showing
7 changed files
with
280 additions
and
63 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,9 @@ | ||
# Ignore output of scraper | ||
# Ignore output of scraper. | ||
data.sqlite | ||
|
||
# Ignore dependency cache. | ||
node_modules | ||
package-lock.json | ||
|
||
# Ignore Visual Studio Code files. | ||
.vscode |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation) | ||
This is a scraper that runs on [Morph](https://morph.io). It scrapes lodged development applications from the South Australian [City of Playford web site](https://www.playford.sa.gov.au). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,20 @@ | ||
{ | ||
"version": "0.0.1", | ||
"version": "1.0.1", | ||
"engines": { | ||
"node": "10.6.0" | ||
}, | ||
"main": "scraper.js", | ||
"dependencies": { | ||
"cheerio": "latest", | ||
"request": "latest", | ||
"sqlite3": "latest" | ||
"@types/node": "^10.5.6", | ||
"cheerio": "^0.22.0", | ||
"csv-parse": "^2.5.0", | ||
"moment": "^2.22.2", | ||
"request": "^2.87.0", | ||
"request-promise-native": "^1.0.5", | ||
"sqlite3": "^4.0.1" | ||
}, | ||
"keywords": [ | ||
"scraper", | ||
"morph" | ||
] | ||
} | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
// Parses the development applications at the South Australian City of Playford web site and places | ||
// them in a database. | ||
// | ||
// In each VSCode session: to automatically compile this TypeScript script into JavaScript whenever | ||
// the TypeScript is changed and saved, press Ctrl+Shift+B and select "tsc:watch - tsconfig.json". | ||
// This starts a task that watches for changes to the TypeScript script. | ||
// | ||
// Michael Bone | ||
// 5th August 2018 | ||
|
||
"use strict"; | ||
|
||
import * as cheerio from "cheerio"; | ||
import * as parse from "csv-parse/lib/sync"; | ||
import * as request from "request-promise-native"; | ||
import * as sqlite3 from "sqlite3"; | ||
import * as moment from "moment"; | ||
|
||
sqlite3.verbose(); | ||
|
||
// Page that main() downloads and scans for "a.resource-url-analytics" links.
const DevelopmentApplicationsUrl = "https://data.sa.gov.au/data/dataset/development-application-register";
// Address for comments on an application; at present only referenced by the commented-out
// insertRow call in main().
const CommentUrl = "mailto:Playford@playford.sa.gov.au";
|
||
// Sets up an SQLite database.
//
// Opens "data.sqlite" and ensures that the [data] table exists.  The returned promise resolves
// with the database handle only after the "create table" statement has completed, and rejects if
// the database cannot be opened or the statement fails.

async function initializeDatabase() {
    return new Promise((resolve, reject) => {
        // Report a failure to open the database through the promise.
        const database = new sqlite3.Database("data.sqlite", error => {
            if (error)
                reject(error);
        });
        database.serialize(() => {
            database.run("create table if not exists [data] ([council_reference] text primary key, [address] text, [description] text, [info_url] text, [comment_url] text, [date_scraped] text, [date_received] text, [on_notice_from] text, [on_notice_to] text)", error => {
                // Only hand the database to the caller once the table is known to exist.
                if (error)
                    reject(error);
                else
                    resolve(database);
            });
        });
    });
}
|
||
// Inserts a row in the database if it does not already exist.
//
// database: an open database handle (as resolved by initializeDatabase).
// developmentApplication: an object providing applicationNumber, address, reason, informationUrl,
//     commentUrl, scrapeDate and receivedDate.  The two "on notice" columns are stored as null.
//
// The returned promise resolves (with no value) once the statement has run, whether the row was
// inserted or ignored, and rejects with the database error otherwise.  The prepared statement is
// finalized in both cases.

async function insertRow(database, developmentApplication) {
    return new Promise((resolve, reject) => {
        const sqlStatement = database.prepare("insert or ignore into [data] values (?, ?, ?, ?, ?, ?, ?, ?, ?)");
        sqlStatement.run([
            developmentApplication.applicationNumber,
            developmentApplication.address,
            developmentApplication.reason,
            developmentApplication.informationUrl,
            developmentApplication.commentUrl,
            developmentApplication.scrapeDate,
            developmentApplication.receivedDate,
            null,
            null
        ], function(error) {
            // A regular function is used (not an arrow function) because the number of affected
            // rows is read from "this".
            const isInserted = !error && this.changes > 0;

            sqlStatement.finalize();  // releases any locks, on failure as well as on success

            if (error) {
                console.error(error);
                reject(error);
                return;
            }

            if (isInserted)
                console.log(`    Inserted: application "${developmentApplication.applicationNumber}" with address "${developmentApplication.address}" and reason "${developmentApplication.reason}" into the database.`);
            else
                console.log(`    Skipped: application "${developmentApplication.applicationNumber}" with address "${developmentApplication.address}" and reason "${developmentApplication.reason}" because it was already present in the database.`);
            resolve();
        });
    });
}
|
||
// Parses the development applications.
//
// Downloads the page at DevelopmentApplicationsUrl, follows every "a.resource-url-analytics" link
// found on it, parses each downloaded resource as CSV and logs the parsed rows.  Links without an
// href are skipped, and relative hrefs are resolved against DevelopmentApplicationsUrl.  Nothing
// is written to the database yet.

async function main() {
    // Ensure that the database exists (the handle is only needed by the code that is still
    // commented out below).

    const database = await initializeDatabase();

    // Retrieve the main page.

    console.log(`Retrieving page: ${DevelopmentApplicationsUrl}`);
    const pageBody = await request({ url: DevelopmentApplicationsUrl });
    const $ = cheerio.load(pageBody);

    // Retrieve and parse each resource linked from the main page.

    for (const element of $("a.resource-url-analytics").get()) {
        const href = element.attribs.href;
        if (typeof href !== "string" || href.trim() === "") {
            console.log("Skipped: a resource link without an href.");
            continue;
        }

        const resourceUrl = new URL(href, DevelopmentApplicationsUrl).href;
        console.log(`Retrieving: ${resourceUrl}`);
        const resourceBody = await request({ url: resourceUrl });
        const rows = parse(resourceBody);
        console.log(rows);
    }

    // TODO: the commented-out code below has not been adapted to this scraper yet; it refers to
    // names (DevelopmentApplicationSearchUrl, DevelopmentApplicationMainUrl, jar) that are not
    // defined in this script.

    // Retrieve the results of a search for the last month.

    // let dateFrom = encodeURIComponent(moment().subtract(1, "months").format("DD/MM/YYYY"));
    // let dateTo = encodeURIComponent(moment().format("DD/MM/YYYY"));
    // let developmentApplicationSearchUrl = DevelopmentApplicationSearchUrl.replace(/\{0\}/g, dateFrom).replace(/\{1\}/g, dateTo);
    // console.log(`Retrieving search results for: ${developmentApplicationSearchUrl}`);
    // let body = await request({ url: developmentApplicationSearchUrl, jar: jar, rejectUnauthorized: false });  // the cookie jar contains the JSESSIONID_live cookie
    // let $ = cheerio.load(body);

    // // Parse the search results.

    // for (let headerElement of $("h4.non_table_headers").get()) {
    //     let address: string = $(headerElement).text().trim().replace(/\s\s+/g, " ");  // reduce multiple consecutive spaces in the address to a single space
    //     let applicationNumber = "";
    //     let reason = "";
    //     let receivedDate = moment.invalid();

    //     for (let divElement of $(headerElement).next("div").get()) {
    //         for (let paragraphElement of $(divElement).find("p.rowDataOnly").get()) {
    //             let key: string = $(paragraphElement).children("span.key").text().trim();
    //             let value: string = $(paragraphElement).children("span.inputField").text().trim();
    //             if (key === "Type of Work")
    //                 reason = value;
    //             else if (key === "Application No.")
    //                 applicationNumber = value;
    //             else if (key === "Date Lodged")
    //                 receivedDate = moment(value, "D/MM/YYYY", true);  // allows the leading zero of the day to be omitted
    //         }
    //     }

    //     // Ensure that at least an application number and address have been obtained.

    //     if (applicationNumber !== "" && address !== "") {
    //         await insertRow(database, {
    //             applicationNumber: applicationNumber,
    //             address: address,
    //             reason: reason,
    //             informationUrl: DevelopmentApplicationMainUrl,
    //             commentUrl: CommentUrl,
    //             scrapeDate: moment().format("YYYY-MM-DD"),
    //             receivedDate: receivedDate.isValid ? receivedDate.format("YYYY-MM-DD") : ""
    //         });
    //     }
    // }
}
|
||
// Run the scraper.  A failure is logged and also reflected in the process exit code, so that
// whatever launched the script can tell that the run did not complete.
main()
    .then(() => console.log("Complete."))
    .catch(error => {
        console.error(error);
        process.exitCode = 1;
    });
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
"compileOnSave": true, | ||
"compilerOptions": { | ||
"lib": [ | ||
"es2018", | ||
"dom" | ||
], | ||
"target": "es2018", | ||
"module": "commonjs", | ||
"sourceMap": true | ||
}, | ||
"exclude": [ | ||
"node_modules" | ||
] | ||
} |