# forked from planningalerts-scrapers/unley
# scraper.rb
# Dependencies: scraperwiki for persistence, mechanize for site navigation.
%w[scraperwiki mechanize date logger].each { |lib| require lib }

# ePathway general-enquiry endpoint for the City of Unley.
base_url = "https://online.unley.sa.gov.au/ePathway/Production/Web/GeneralEnquiry/"
url = "#{base_url}enquirylists.aspx"

agent = Mechanize.new do |a|
  # Persistent connections avoid a "Net::HTTP::Persistent::Error: too many
  # connection resets" condition.
  # https://github.com/tenderlove/mechanize/issues/123#issuecomment-6432074
  a.keep_alive = true
  # Debugging aids, left disabled:
  # a.log = Logger.new $stderr
  # a.agent.http.debug_output = $stderr
  # a.verify_mode = OpenSSL::SSL::VERIFY_NONE
end
p "Getting first page"
landing_page = agent.get url
# The site embeds a js=<number> token in the first response; it must be
# echoed back as a query string before the enquiry page will render.
js_token = landing_page.body.scan(/js=-?\d+/)[0]
p "Getting first page again with " + js_token
first_page = agent.get(url + '?' + js_token)
# p first_page.title.strip
enquiry_form = first_page.forms.first
# Radio button 0 selects the "List of Development Applications" enquiry.
enquiry_form.radiobuttons[0].click
search_page = enquiry_form.click_button
# Select the "Date Lodged" tab.  NOTE(review): the explicit ASP.NET postback
# fields (__EVENTTARGET = 'ctl00$MainBodyContent$mGeneralEnquirySearchControl$
# mTabControl$tabControlMenu', __EVENTARGUMENT = '3', __LASTFOCUS, the
# mEnquiryListsDropDownList / street / suburb text boxes and the
# ctl00$mHeight / ctl00$mWidth fields) were all commented out in the original,
# so a plain resubmit of the form is what actually happens here.
search_form = search_page.forms.first
p "Clicking Date Lodged tab"
search_page = agent.submit(search_form)
p "Searching"
# p search_page.title.strip
search_form = search_page.forms.first
# Locate the form's "Search" button and submit with it explicitly.
search_button = search_form.button_with(:value => "Search")
summary_page = agent.submit(search_form, search_button)
# p summary_page.title.strip
count = 0      # number of "next page" hops taken; bounds runaway paging
das_data = []  # one Array of cell strings per result row, across all pages

# Walk the paginated results table, accumulating every row's cells.
# `headers` is (re)captured on each page and is used after the loop to
# look up columns by caption.
while summary_page
  table = summary_page.root.at_css('.ContentPanel')
  headers = table.css('th').collect { |th| th.inner_text.strip }
  das_data = das_data + table.css('.ContentPanel, .AlternateContentPanel').collect do |tr|
    tr.css('td').collect { |td| td.inner_text.strip }
  end

  next_page_img = summary_page.root.at_xpath("//td/input[contains(@src, 'nextPage')]")
  break unless next_page_img

  count += 1
  if count > 10
    # Safety valve.  Checking BEFORE fetching avoids the wasted HTTP request
    # the previous version made for a page it then never scraped; the set of
    # pages scraped (and this message) are unchanged.
    p "Stopping paging after " + count.to_s + " pages."
    break
  end

  # The next-page control posts back through an onclick handler; extract the
  # argument carrying the PageNumber query fragment.  Guard against a layout
  # change leaving no match (the old chained .gsub would raise an opaque
  # NoMethodError on nil).
  page_arg = next_page_img['onclick'].split(',').find { |e| e =~ /.*PageNumber=\d+.*/ }
  raise "Could not extract next-page path from onclick: #{next_page_img['onclick']}" if page_arg.nil?
  next_page_path = page_arg.gsub('"', '').strip
  p "Found another page: " + next_page_path
  summary_page = agent.get "#{base_url}#{next_page_path}"
end
comment_url = 'mailto:pobox1@unley.sa.gov.au'
# Turn each raw row of cell text into a PlanningAlerts-style record,
# locating columns by header caption so column order doesn't matter.
das = das_data.collect do |row|
  {
    'council_reference' => row[headers.index('Number')],
    # There is a direct link but you need a session to access it :(
    'info_url'          => url,
    'description'       => row[headers.index('Description')],
    'date_received'     => Date.strptime(row[headers.index('Lodgement Date')], '%d/%m/%Y').to_s,
    'address'           => row[headers.index('Location')],
    'date_scraped'      => Date.today.to_s,
    'comment_url'       => comment_url
  }
end
# Persist every record, keyed on council_reference so re-runs upsert
# rather than duplicate.
das.each { |record| ScraperWiki.save_sqlite(['council_reference'], record) }
p "Complete."