Skip to content

Commit

Permalink
Added packet tracing.
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelBone committed Aug 9, 2018
1 parent 34f71b9 commit fb20e35
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 8 deletions.
3 changes: 2 additions & 1 deletion package.json
Expand Up @@ -14,7 +14,8 @@
"request": "^2.87.0",
"request-promise-native": "^1.0.5",
"sqlite3": "^4.0.1",
"url": "^0.11.0"
"url": "^0.11.0",
"pcap": "^latest"
},
"keywords": [
"scraper",
Expand Down
28 changes: 21 additions & 7 deletions scraper.ts
Expand Up @@ -13,6 +13,7 @@ import * as urlparser from "url";
import * as moment from "moment";
import * as fs from "fs";
import pdf2json = require("pdf2json");
import pcap = require("pcap");

// import path = require("path");
// let loader = require.extensions[".js"];
Expand Down Expand Up @@ -48,8 +49,6 @@ async function readPDF() {
console.log(`Complete: page count is ${pdf.numPages}.`);
}

readPDF();

sqlite3.verbose();

const DevelopmentApplicationsUrl = "https://www.mountbarker.sa.gov.au/developmentregister";
Expand Down Expand Up @@ -110,8 +109,21 @@ async function main() {

// Retrieve the page that contains the links to the PDFs.

let pcapSession = pcap.createSession("", "tcp");
pcapSession.on('packet', function (raw_packet) {
var packet = pcap.decode.packet(raw_packet),
data = packet.link.ip.tcp.data;

if (data) {
console.log(pcap.print.packet(packet));
console.log(data.toString());
}
});

console.log(`Retrieving page: ${DevelopmentApplicationsUrl}`);
let body = await request({ url: DevelopmentApplicationsUrl, proxy: process.env.MORPH_PROXY, headers: {

// let body = await request({ url: DevelopmentApplicationsUrl, proxy: process.env.MORPH_PROXY, headers: {
let body = await request({ url: DevelopmentApplicationsUrl, headers: {
"Accept": "text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8",
"Accept-Encoding": "",
"Accept-Language": "en-AU, en-US; q=0.7, en; q=0.3",
Expand All @@ -125,6 +137,9 @@ async function main() {

let $ = cheerio.load(body);

console.log("Stopping early.");
return;

let pdfUrls: string[] = [];
for (let element of $("td.uContentListDesc a[href$='.pdf']").get()) {
let pdfUrl = new urlparser.URL(element.attribs.href, DevelopmentApplicationsUrl).href;
Expand All @@ -143,9 +158,8 @@ async function main() {

let selectedPdfUrls: string[] = [];
selectedPdfUrls.push(pdfUrls.shift());
console.log("Just selecting one PDF.");
// if (pdfUrls.length > 0)
// selectedPdfUrls.push(pdfUrls[getRandom(1, pdfUrls.length)]);
if (pdfUrls.length > 0)
selectedPdfUrls.push(pdfUrls[getRandom(1, pdfUrls.length)]);

for (let pdfUrl of selectedPdfUrls) {
console.log(`Retrieving document: ${pdfUrl}`);
Expand Down Expand Up @@ -411,4 +425,4 @@ function convertPdfToText(pdf) {
return rows;
}

// main().catch(error => console.error(error));
main().catch(error => console.error(error));

0 comments on commit fb20e35

Please sign in to comment.