Skip to content

Commit

Permalink
Added a user agent string and updated the request.js version.
Browse files (browse the repository at this point in the history)
  • Loading branch information
MichaelBone committed Sep 13, 2018
1 parent 7807fb4 commit d39f096
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 3 deletions.
2 changes: 1 addition & 1 deletion package.json
Expand Up @@ -5,7 +5,7 @@
"cheerio": "^0.22.0",
"moment": "^2.22.2",
"pdf2json": "^1.1.7",
"request": "^2.87.0",
"request": "^2.88.0",
"sqlite3": "^4.0.1",
"url": "^0.11.0"
},
Expand Down
17 changes: 15 additions & 2 deletions scraper.js
Expand Up @@ -62,7 +62,13 @@ function insertRow(database, pdfFileName, developmentApplication) {

function requestPage(url, callback) {
console.log(`${moment().format("YYYY-MM-DD HH:mm:ss")} Requesting page: ${url}`);
request({ url: url, headers: { "Connection": "keep-alive" } }, function(error, response, body) {
request({
url: url,
headers: {
"Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"
}
}, function(error, response, body) {
console.log(`${moment().format("YYYY-MM-DD HH:mm:ss")} Request for page complete: ${url}`);
if (error)
console.log(`${moment().format("YYYY-MM-DD HH:mm:ss")} Error requesting page ${url}: ${error}`);
Expand Down
Expand Up @@ -115,7 +121,14 @@ function parsePdfs(database, url) {

count++;
console.log(`${moment().format("YYYY-MM-DD HH:mm:ss")} Requesting data from PDF ${count} of ${selectedPdfUrls.length} at: ${pdfUrl}`);
request({ url: pdfUrl, encoding: null, headers: { "Connection": "keep-alive" } }, function(error, response, pdfBuffer) {
request({
url: pdfUrl,
encoding: null,
headers: {
"Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"
}
}, function(error, response, pdfBuffer) {
console.log(`${moment().format("YYYY-MM-DD HH:mm:ss")} Obtained data from PDF at: ${pdfUrl}`);
let pdfParser = new pdf2json();
pdfParser
Expand Down

0 comments on commit d39f096

Please sign in to comment.