Skip to content

Commit

Permalink
Added parsing of CSV data.
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelBone committed Aug 5, 2018
1 parent b41abc9 commit 8bfb70b
Show file tree
Hide file tree
Showing 7 changed files with 280 additions and 63 deletions.
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,9 @@
# Ignore output of scraper
# Ignore output of scraper.
data.sqlite

# Ignore dependency cache.
node_modules
package-lock.json

# Ignore Visual Studio Code files.
.vscode
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
This is a scraper that runs on [Morph](https://morph.io). It scrapes lodged development applications from the South Australian [City of Playford web site](https://www.playford.sa.gov.au).
17 changes: 12 additions & 5 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
{
"version": "0.0.1",
"version": "1.0.1",
"engines": {
"node": "10.6.0"
},
"main": "scraper.js",
"dependencies": {
"cheerio": "latest",
"request": "latest",
"sqlite3": "latest"
"@types/node": "^10.5.6",
"cheerio": "^0.22.0",
"csv-parse": "^2.5.0",
"moment": "^2.22.2",
"request": "^2.87.0",
"request-promise-native": "^1.0.5",
"sqlite3": "^4.0.1"
},
"keywords": [
"scraper",
"morph"
]
}
}
166 changes: 110 additions & 56 deletions scraper.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions scraper.js.map

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

133 changes: 133 additions & 0 deletions scraper.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Parses the development applications at the South Australian City of Playford web site and places
// them in a database.
//
// In each VSCode session: to automatically compile this TypeScript script into JavaScript whenever
// the TypeScript is changed and saved, press Ctrl+Shift+B and select "tsc:watch - tsconfig.json".
// This starts a task that watches for changes to the TypeScript script.
//
// Michael Bone
// 5th August 2018

"use strict";

import * as cheerio from "cheerio";
import * as parse from "csv-parse/lib/sync";
import * as request from "request-promise-native";
import * as sqlite3 from "sqlite3";
import * as moment from "moment";

// Switch the sqlite3 module into its verbose mode before any database is opened.
sqlite3.verbose();

// The page that main() downloads and searches for "a.resource-url-analytics" links.
const DevelopmentApplicationsUrl = "https://data.sa.gov.au/data/dataset/development-application-register";
// Only referenced by the commented-out code in main(), where it is supplied as the commentUrl of
// each development application.
const CommentUrl = "mailto:Playford@playford.sa.gov.au";

// Sets up an sqlite database.
//
// Opens (creating it if necessary) the file "data.sqlite" and ensures that the [data] table
// exists.  The returned promise resolves with the open database once the "create table" statement
// has completed, and is rejected with the underlying error if the file cannot be opened or the
// statement fails (previously such failures were never reported to the caller).

async function initializeDatabase() {
    return new Promise((resolve, reject) => {
        let database = new sqlite3.Database("data.sqlite", error => {
            // Report a failure to open the file.  A later reject() for the same promise (from
            // the statement below) is then harmlessly ignored.
            if (error)
                reject(error);
        });
        database.serialize(() => {
            database.run("create table if not exists [data] ([council_reference] text primary key, [address] text, [description] text, [info_url] text, [comment_url] text, [date_scraped] text, [date_received] text, [on_notice_from] text, [on_notice_to] text)", error => {
                // Only hand the database back once the table is known to exist.
                if (error)
                    reject(error);
                else
                    resolve(database);
            });
        });
    });
}

// Inserts a row in the database if it does not already exist.
//
// database: must provide prepare(sql), returning a statement with run(parameters, callback) and
// finalize().
//
// developmentApplication: supplies applicationNumber, address, reason, informationUrl,
// commentUrl, scrapeDate and receivedDate.  The last two columns (on_notice_from and
// on_notice_to) are always stored as null.
//
// The returned promise resolves with the second argument passed to the run callback once the
// statement has run, and is rejected with the error if the statement fails.  The prepared
// statement is finalized in both cases (previously it was left open when the statement failed).

async function insertRow(database, developmentApplication) {
    return new Promise((resolve, reject) => {
        let sqlStatement = database.prepare("insert or ignore into [data] values (?, ?, ?, ?, ?, ?, ?, ?, ?)");
        sqlStatement.run([
            developmentApplication.applicationNumber,
            developmentApplication.address,
            developmentApplication.reason,
            developmentApplication.informationUrl,
            developmentApplication.commentUrl,
            developmentApplication.scrapeDate,
            developmentApplication.receivedDate,
            null,
            null
        ], function(error, row) {
            if (error) {
                sqlStatement.finalize();  // releases any locks, even though the insert failed
                console.error(error);
                reject(error);
                return;
            }

            // "insert or ignore" changes no rows when the application number already exists.
            if (this.changes > 0)
                console.log(`    Inserted: application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\" and reason \"${developmentApplication.reason}\" into the database.`);
            else
                console.log(`    Skipped: application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\" and reason \"${developmentApplication.reason}\" because it was already present in the database.`);
            sqlStatement.finalize();  // releases any locks
            resolve(row);
        });
    });
}

// Parses the development applications.
//
// At present this makes sure the database exists, downloads the page at
// DevelopmentApplicationsUrl, and then (one at a time) downloads every resource linked from an
// "a.resource-url-analytics" anchor, parses it as CSV and logs the parsed rows.  Nothing is
// written to the database yet.

async function main() {
    // Ensure that the database exists.

    let database = await initializeDatabase();

    // Retrieve the main page and load it so that it can be queried.

    console.log(`Retrieving page: ${DevelopmentApplicationsUrl}`);
    const indexPageHtml = await request({ url: DevelopmentApplicationsUrl });
    let $ = cheerio.load(indexPageHtml);

    // Download each linked resource in turn and log its parsed CSV rows.

    const resourceLinks = $("a.resource-url-analytics").get();
    for (const resourceLink of resourceLinks) {
        const resourceUrl = resourceLink.attribs.href;
        console.log(`Retrieving: ${resourceUrl}`);
        const csvText = await request({ url: resourceUrl });
        console.log(parse(csvText));
    }

    // NOTE(review): the commented-out code below refers to DevelopmentApplicationSearchUrl, jar
    // and DevelopmentApplicationMainUrl, none of which is defined in the visible part of this
    // file, and it re-declares body and $.  It also tests "receivedDate.isValid" without calling
    // it, which is always truthy.  Resolve these before re-enabling it.

    // Retrieve the results of a search for the last month.

    // let dateFrom = encodeURIComponent(moment().subtract(1, "months").format("DD/MM/YYYY"));
    // let dateTo = encodeURIComponent(moment().format("DD/MM/YYYY"));
    // let developmentApplicationSearchUrl = DevelopmentApplicationSearchUrl.replace(/\{0\}/g, dateFrom).replace(/\{1\}/g, dateTo);
    // console.log(`Retrieving search results for: ${developmentApplicationSearchUrl}`);
    // let body = await request({ url: developmentApplicationSearchUrl, jar: jar, rejectUnauthorized: false }); // the cookie jar contains the JSESSIONID_live cookie
    // let $ = cheerio.load(body);

    // // Parse the search results.

    // for (let headerElement of $("h4.non_table_headers").get()) {
    // let address: string = $(headerElement).text().trim().replace(/\s\s+/g, " "); // reduce multiple consecutive spaces in the address to a single space
    // let applicationNumber = "";
    // let reason = "";
    // let receivedDate = moment.invalid();

    // for (let divElement of $(headerElement).next("div").get()) {
    // for (let paragraphElement of $(divElement).find("p.rowDataOnly").get()) {
    // let key: string = $(paragraphElement).children("span.key").text().trim();
    // let value: string = $(paragraphElement).children("span.inputField").text().trim();
    // if (key === "Type of Work")
    // reason = value;
    // else if (key === "Application No.")
    // applicationNumber = value;
    // else if (key === "Date Lodged")
    // receivedDate = moment(value, "D/MM/YYYY", true); // allows the leading zero of the day to be omitted
    // }
    // }

    // // Ensure that at least an application number and address have been obtained.

    // if (applicationNumber !== "" && address !== "") {
    // await insertRow(database, {
    // applicationNumber: applicationNumber,
    // address: address,
    // reason: reason,
    // informationUrl: DevelopmentApplicationMainUrl,
    // commentUrl: CommentUrl,
    // scrapeDate: moment().format("YYYY-MM-DD"),
    // receivedDate: receivedDate.isValid ? receivedDate.format("YYYY-MM-DD") : ""
    // });
    // }
    // }
}

// Run the scraper.  A failure is logged and is also recorded in the process exit code, so that
// whatever launched the script can tell that the run did not succeed (previously the process
// still exited with status 0 after an error).

main()
    .then(() => console.log("Complete."))
    .catch(error => {
        console.error(error);
        process.exitCode = 1;
    });
15 changes: 15 additions & 0 deletions tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"compileOnSave": true,
"compilerOptions": {
"lib": [
"es2018",
"dom"
],
"target": "es2018",
"module": "commonjs",
"sourceMap": true
},
"exclude": [
"node_modules"
]
}

0 comments on commit 8bfb70b

Please sign in to comment.