# crs_scraper.py - CRS reports scraper.
import sys
import re
import os
import os.path
import datetime
import time
import json
import hashlib
import base64
import sqlite3
import scrapelib
import dropbox
# Create a scraper that automatically throttles our requests
# so that we don't overload the CRS server.
scraper = scrapelib.Scraper(
    requests_per_minute=35,
    retry_attempts=2,
    retry_wait_seconds=10)
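
# Note: at 35 requests per minute, scrapelib will space requests roughly
# 60 / 35 ≈ 1.7 seconds apart.
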
# Open our local database that remembers what we've already
# fetched & uploaded so that we don't repeatedly fetch the
# same thing.
db = sqlite3.connect('crs_scraper_database.db')
#####################################################################
def run_in_background():
    def MessageBox(message, buttons=0):
        from ctypes import windll
        return windll.user32.MessageBoxW(0, message, "CRS Reports Scraper", buttons)

    # Ask the user whether they want to start the scraper in the background.
    # buttons=4 is MB_YESNO; a return value of 6 (IDYES) means the user clicked Yes.
    if MessageBox("Would you like to start the background CRS reports scraper?", buttons=4) != 6:
        MessageBox("Okay, program stopping.")
        return

    try:
        # Periodically run the scraper in the background around 6pm. But start
        # one run immediately so that we can test that it works.
        first = True
        while True:
            # Is it close to 6pm?
            if not first and datetime.datetime.now().hour != 18:
                # Pause and check again.
                wait_time = datetime.timedelta(minutes=23)
                time.sleep(wait_time.total_seconds())
                continue

            # Run the scraper once.
            first = False
            run_scraper()

            # Delay 12 hours to make sure we don't scrape more than once
            # on the same day. Then start waiting again until it's the
            # right time for another scrape.
            wait_time = datetime.timedelta(hours=12)
            time.sleep(wait_time.total_seconds())
    except Exception as e:
        MessageBox("The CRS scraper ran into an error and is terminating: " + str(e))
        raise

def run_scraper():
    # Loop through the pages of the listing of CRS reports.
    pageNumber = 1
    while True:
        print("Fetching page", pageNumber, "...")
        had_stuff = fetch_from_json_listing(pageNumber)
        if had_stuff:
            pageNumber += 1
        else:
            break

#####################################################################
def create_db_tables():
    # Create database tables on first run. Ignore errors about
    # tables already existing if this isn't the first run.
    c = db.cursor()
    for table_name, table_def in [
        ("fetched", "filename text, fetch_date datetime, content_hash text"),
    ]:
        try:
            c.execute("CREATE TABLE %s (%s)" % (table_name, table_def))
            print("Initialized database table '%s'." % table_name)
        except sqlite3.OperationalError as e:
            if ("table %s already exists" % table_name) not in str(e):
                raise e
    db.commit()

def fetch_from_json_listing(pageNumber):
if "--test" not in sys.argv:
# Requests one of the CRS JSON URLs that lists documents.
url = "http://www.crs.gov/search/results?term=orderBy=Date&navIds=4294952831&navIds=4294938681&pageNumber=%d" \
% pageNumber
body = scraper.get(url).content
documents = json.loads(body.decode("utf8"))
else:
# Use test data from a file on disk.
documents = json.load(open(os.path.join("test", "SearchResults_%d.json" % pageNumber)))
# process each document
for doc in documents["SearchResults"]:
fetch_document(doc)
# If this JSON file had any reports, return True. If we got back an
# empty list, then this was the last page of results, so return False.
return len(documents["SearchResults"]) > 0
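
# A sketch of the listing JSON shape this scraper assumes, inferred only from
# the fields accessed above and in fetch_document() below (all values here are
# hypothetical):
#
#   {
#     "SearchResults": [
#       {
#         "ProductNumber": "R40000",
#         "CoverDate": "2016-05-02T00:00:00",
#         "FormatList": [
#           { "FormatType": "PDF", "Url": "products/..." }
#         ]
#       }
#     ]
#   }
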
def fetch_document(document):
# Fetch a "document". Each document is composed of one or more files.
# We'll treat the metadata record itself as a file, and then each
# file mentioned in its FormatList is another file.
# Create a filename for the metadata. We'll use the CoverDate and SHA1
# of the JSON to form a unique filename that changes whenever the
# metadata changes. Ensure that weird data in CRS fields doesn't
# create invalid filenames for us by sanitizing the fields.
crs_product_id = re.sub(r"[^A-Za-z0-9]", "", document['ProductNumber'])
crs_report_date = re.sub(r"\D", "", document['CoverDate'])[0:8]
metadata_filename = \
"documents/" \
+ "_".join([
crs_report_date,
crs_product_id,
sha1(json.dumps(document, sort_keys=True).encode("utf8"))
]) \
+ ".json"
    # Don't bother fetching files that are very old. We already have
    # these.
    if crs_report_date < "201604":
        return False

    # If we've already seen this, then we're done.
    if has_gotten_file(metadata_filename):
        print("Already got", metadata_filename)
        return False

    # Add some stuff to the metadata.
    document["_fetched"] = datetime.datetime.utcnow().isoformat() # UTC

    # Fetch the files mentioned in the metadata and augment the metadata
    # with the SHA1 of each file.
    for file in document["FormatList"]:
        if "--test" not in sys.argv:
            # Construct full URL.
            file_url = "http://crs.gov/" + file["Url"]

            # Fetch content.
            print(file_url, '...')
            response = scraper.get(file_url)
            file_url = response.url
            file_content = response.content
            file_encoding = response.encoding
            file_headers = dict(response.headers)
        else:
            # Load test data.
            file_url = "file://local/path"
            file_content = open(os.path.join("test", file["Url"]), "rb").read()
            file_encoding = "unknown"
            file_headers = { }

        # Assign it a filename using the document's cover date, the document's
        # (sanitized) product identifier, the SHA1 hash of the file's content,
        # and the file's type as a file extension (for convenience).
        file_filename = \
            "files/" \
            + "_".join([
                crs_report_date,
                crs_product_id,
                sha1(file_content)
            ]) \
            + { "PDF": ".pdf", "HTML": ".html" }.get(file["FormatType"], "")
        # Add file metadata to the document.
        file["_"] = {
            "encoding": file_encoding,
            "url": file_url,
            "headers": file_headers,
            "sha1": sha1(file_content),
            "filename": file_filename,
            "images": { },
        }

        # Save the file to disk and add it to our database so we know we don't
        # need to fetch it again.
        save_file(file_filename, file_content)

        # Scan HTML documents for images and fetch those too.
        if file["FormatType"] == "HTML":
            for img in re.findall(rb'src="(/products/Getimages/\?directory=[^"]+&id=/[^"]+\.png)"', file_content):
                img = img.decode("ascii")

                # Fetch and save the image, and add a mapping in the
                # metadata from CRS.gov image paths to the filename
                # we stored it in.
                if "--test" not in sys.argv:
                    image_url = "http://crs.gov" + img
                    print(image_url, '...')
                    response = scraper.get(image_url)
                    image_content = response.content
                else:
                    # Load test data.
                    image_content = b"someinvalidrawdata"

                image_filename = \
                    "files/" \
                    + "_".join([
                        crs_report_date,
                        crs_product_id,
                        "images",
                        sha1(image_content)
                    ]) \
                    + ".png"
                file["_"]["images"][img] = image_filename
                save_file(image_filename, image_content)

    # Save the metadata record as a file as well, and that's how we'll know what
    # PDF/HTML file corresponds with what report.
    save_file(
        metadata_filename,
        json.dumps(document, indent=2).encode("utf8"))
    return True

def sha1(stuff):
    h = hashlib.sha1()
    h.update(stuff)
    return h.hexdigest()
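
# Quick sanity check of the helper above; this is the well-known SHA-1 digest
# of the ASCII bytes b"hello":
#   sha1(b"hello") == "aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d"
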
def has_gotten_file(filename):
    # Returns the content_hash stored in our database for the named file.
    # If the file isn't in our database, returns None.
    cur = db.cursor()
    cur.execute("SELECT content_hash FROM fetched WHERE filename = ?", (filename,))
    r = cur.fetchone()
    if r:
        r = r[0]
    return r

def save_file(filename, payload):
print(">", filename)
if not dropbox_client:
# Save the file. Make a directory for it if the directory doesn't exist.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "wb") as f:
f.write(payload)
else:
# Upload to Dropbox.
dropbox_client.files_upload(payload, dropbox_root_path + '/' + filename)
# Record the file in our database. Cleear an existing record in case
# we're saving a file with a changed hash (although that'd be strange
# since the hash is in the filename).
content_hash = sha1(payload)
cur = db.cursor()
cur.execute("DELETE FROM fetched WHERE content_hash = ?", (content_hash,))
cur.execute("INSERT INTO fetched values (?, ?, ?)", (
filename,
datetime.datetime.utcnow(),
content_hash,
))
db.commit()
#####################################################################
# Initialize the database on first use.
create_db_tables()
# Load Dropbox auth from dropbox_access_token.txt, which should look
# like:
#   TOKEN={generate an access token from the app page}
#   PATH=/name
# If the app has permission to upload to an app directory only, the
# uploads will be put in Apps\CRS-Scraper-Uploads\name.
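#
# For example, dropbox_access_token.txt might contain (both values are
# hypothetical):
#
#   TOKEN=sl.AbCd0123ExampleToken
#   PATH=/crs-reports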
dropbox_client = None
dropbox_root_path = None
if os.path.exists("dropbox_access_token.txt"):
    dropbox_authz = dict(line.strip().split("=", 1) for line in open("dropbox_access_token.txt") if line.strip())
    dropbox_client = dropbox.Dropbox(dropbox_authz["TOKEN"])
    dropbox_user = dropbox_client.users_get_current_account()
    dropbox_root_path = dropbox_authz["PATH"]
    print("Uploading to Dropbox account", dropbox_user.name.display_name, dropbox_user.email, "at", dropbox_root_path)
if sys.stdin is not None:
    # We're running on a console.
    run_scraper()
else:
    # We're running inside pythonw without a console.
    run_in_background()