This repository has been archived by the owner on Oct 17, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 41
/
traffic_pubsub_generator.py
294 lines (264 loc) · 12.7 KB
/
traffic_pubsub_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#!/usr/bin/env python
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script reads traffic sensor data from a file and publishes that data
to PubSub. If you run it on a GCE instance, this instance must be
created with the "Cloud Platform" Project Access enabled. Click on
"Show advanced options" when creating the image to find this setting.
Before you run the script, create two PubSub topics, in the same project as the
GCE instance you will run on, to publish to. Edit the TRAFFIC_TOPIC and
INCIDENT_TOPIC variables in the script to point to these topics, or you can
pass in their names as command-line arguments.
Before you run this script, download an input data file (~2GB):
curl -O \
http://storage.googleapis.com/aju-sd-traffic/unzipped/Freeways-5Minaa2010-01-01_to_2010-02-15.csv
Or, for a smaller test file, you can use:
http://storage.googleapis.com/aju-sd-traffic/unzipped/Freeways-5Minaa2010-01-01_to_2010-02-15_test2.csv
These files contain real traffic sensor data from San Diego freeways.
See this file for copyright info:
http://storage.googleapis.com/aju-sd-traffic/freeway_detector_config/Freeways-Metadata-2010_01_01/copyright(san%20diego).txt
Usage:
Run the script like this to 'replay', with pauses in data publication
consistent with pauses in the series of data timestamps, which arrive every 5
minutes:
% python traffic_pubsub_generator.py --filename 'yourdatafile.csv' --replay
To restrict to N lines, do something like this:
% python traffic_pubsub_generator.py --filename 'yourdatafile.csv' \
--num_lines 10 --replay
To alter the data timestamps so that they start from the script's start time,
add the --current flag.
If you want to set the topics from the command line, use
the --topic and --incident_topic flags.
Run 'python traffic_pubsub_generator.py -h' for more information.
"""
import argparse
import base64
import csv
import datetime
import random
import sys
import time
from apiclient import discovery
from dateutil.parser import parse
from oauth2client.client import GoogleCredentials
# Default 'traffic' topic. Edit it to point at your own topic, or override it
# with the --topic command-line flag.
TRAFFIC_TOPIC = 'projects/your-project/topics/your-topic'

# Default 'incident' topic. Edit it to point at your own topic, or override it
# with the --incident_topic command-line flag.
INCIDENT_TOPIC = 'projects/your-project/topics/your-incident-topic'

# A progress message is printed every time this many lines have been read.
LINE_BATCHES = 100

# OAuth scope(s) requested when the credentials need to be scoped.
PUBSUB_SCOPES = ['https://www.googleapis.com/auth/pubsub']

# Retry count passed to execute() for each publish request.
NUM_RETRIES = 3

# The causes a randomly generated incident can be given.
INCIDENT_TYPES = [
    'Traffic Hazard - Vehicle',
    'Traffic Collision - No Details',
    'Traffic Collision - No Injuries',
    'Traffic Collision - Ambulance Responding',
    'Hit and Run - No Injuries',
    'Vehicle Fire',
    'Pedestrian on a Highway',
    'Animal on Road',
]

# Upper bound, in minutes, used when drawing a random incident duration.
INCIDENT_DURATION_RANGE = 60

# Probability that an 'incident' is generated for a given reading. Raise or
# lower this value to make incidents more or less likely.
INCIDENT_THRESH = 0.005
def create_pubsub_client():
    """Return a pubsub API client built with the application default
    credentials."""
    creds = GoogleCredentials.get_application_default()
    # Only scope the credentials when they report that scoping is required.
    needs_scoping = creds.create_scoped_required()
    if needs_scoping:
        creds = creds.create_scoped(PUBSUB_SCOPES)
    service = discovery.build('pubsub', 'v1beta2', credentials=creds)
    return service
def publish(client, pubsub_topic, data_line, msg_attributes=None):
    """Publish a single message to the given pubsub topic.

    Args:
      client: the pubsub API client used to issue the request.
      pubsub_topic: name of the topic to publish to.
      data_line: the message payload, as text or bytes. Text is encoded as
        UTF-8 before being base64-encoded.
      msg_attributes: optional dict of attributes attached to the message.

    Returns:
      The response returned by executing the publish request.
    """
    # b64encode() operates on bytes: encode text first so that a Python 3
    # str (or a non-ASCII Python 2 unicode string) does not make it raise.
    if not isinstance(data_line, bytes):
        data_line = data_line.encode('utf-8')
    # Decode the base64 output (pure ASCII) to text so that the payload is a
    # plain string on both Python 2 and Python 3 rather than a bytes object.
    data = base64.b64encode(data_line).decode('ascii')
    msg_payload = {'data': data}
    if msg_attributes:
        msg_payload['attributes'] = msg_attributes
    body = {'messages': [msg_payload]}
    resp = client.projects().topics().publish(
        topic=pubsub_topic, body=body).execute(num_retries=NUM_RETRIES)
    return resp
def maybe_add_delay(line, ts_int, threshold=.005, ms_delay=600000):
    """Randomly decide whether to simulate a publishing delay for this data
    element.

    With probability 'threshold', the timestamp is moved 'ms_delay'
    milliseconds into the past and the first field of 'line' is rewritten
    (in place) to the matching UTC date string.

    Args:
      line: list of fields for one reading; line[0] is the date string.
      ts_int: the reading's timestamp, in integer milliseconds since the
        epoch.
      threshold: probability (0..1) that a delay is applied.
      ms_delay: size of the apparent delay, in milliseconds. The default is
        10 minutes.

    Returns:
      A (line, ts_int) tuple; both are unchanged when no delay is applied.
    """
    if random.random() < threshold:
        ts_int -= ms_delay  # generate the apparent delay
        print("%s" % (line,))
        # Floor division keeps whole seconds on both Python 2 and Python 3,
        # so the rewritten date string never gains a fractional part.
        delayed = (datetime.datetime(1970, 1, 1) +
                   datetime.timedelta(seconds=ts_int // 1000))
        line[0] = "%s" % delayed
        print("Delaying ts attr %s, %s, %s" % (ts_int, delayed, line))
    return (line, ts_int)
def process_current_mode(orig_date, diff, line, replay, random_delays):
    """When using the --current flag, modify the original data to carry
    updated time information.

    Args:
      orig_date: datetime parsed from the reading's date field.
      diff: timedelta added to orig_date when replaying.
      line: list of fields for one reading; line[0] is overwritten (in
        place) with the new date string.
      replay: if true, the new date is orig_date + diff; otherwise it is the
        current UTC time.
      random_delays: if true, an apparent publishing delay may be applied
        via maybe_add_delay.

    Returns:
      A (line, timestamp) tuple, where timestamp is the new date expressed
      as a string of integer milliseconds since the epoch.
    """
    epoch = datetime.datetime(1970, 1, 1)
    if replay:  # use adjusted date from line of data
        new_date = orig_date + diff
    else:  # simply using current time
        new_date = datetime.datetime.utcnow()
    line[0] = new_date.strftime("%Y-%m-%d %H:%M:%S")
    # Build the millisecond timestamp with integer arithmetic. The timedelta
    # already carries the sub-second part, so it must be counted only once.
    elapsed = new_date - epoch
    ts_int = ((elapsed.days * 86400 + elapsed.seconds) * 1000 +
              elapsed.microseconds // 1000)
    # 'random_delays' indicates whether to include random apparent delays
    # in published data
    if random_delays:
        (line, ts_int) = maybe_add_delay(line, ts_int)
    return (line, str(ts_int))
def process_noncurrent_mode(orig_date, line, random_delays):
    """Called when not using the --current flag; retains the original time
    information in the data.

    Args:
      orig_date: datetime parsed from the reading's date field.
      line: list of fields for one reading.
      random_delays: if true, an apparent publishing delay may be applied
        via maybe_add_delay (which may rewrite line[0]).

    Returns:
      A (line, timestamp) tuple, where timestamp is orig_date expressed as a
      string of integer milliseconds since the epoch.
    """
    epoch = datetime.datetime(1970, 1, 1)
    # Build the millisecond timestamp with integer arithmetic. The timedelta
    # already carries the sub-second part, so it must be counted only once.
    elapsed = orig_date - epoch
    ts_int = ((elapsed.days * 86400 + elapsed.seconds) * 1000 +
              elapsed.microseconds // 1000)
    # 'random_delays' indicates whether to include random apparent delays
    # in published data
    if random_delays:
        (line, ts_int) = maybe_add_delay(line, ts_int)
    return (line, str(ts_int))
def publish_random_incident(client, incident_topic, incident_id,
                            timestamp, station_id, freeway, travel_direction,
                            msg_attributes=None):
    """Build a random traffic 'incident' from the details of the given
    traffic reading and publish it to the given 'incidents' pubsub topic."""
    # Draw the duration (in minutes) first, then the cause.
    minutes = random.randrange(INCIDENT_DURATION_RANGE)
    cause_index = random.randrange(len(INCIDENT_TYPES))
    fields = (incident_id, timestamp, minutes, station_id, freeway,
              travel_direction, INCIDENT_TYPES[cause_index])
    # Comma-separated record, one '%s'-formatted value per field.
    data_line = ','.join(['%s' % (field,) for field in fields])
    print("incident data: %s" % data_line)
    publish(client, incident_topic, data_line, msg_attributes)
def _build_arg_parser():
    """Return the argparse parser describing the script's command-line
    flags."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--replay", help="Replay in 'real time'",
                        action="store_true")
    parser.add_argument("--current",
                        help="Use date adjusted from script start time.",
                        action="store_true")
    parser.add_argument("--incidents",
                        help="Whether to generate and publish (fake) " +
                        "traffic incidents. Requires a second PubSub topic " +
                        "to be specified.",
                        action="store_true")
    parser.add_argument("--random_delays",
                        help="Whether to randomly alter the data to " +
                        "sometimes introduce delays between log date and " +
                        "publish timestamp.",
                        action="store_true")
    # Without a filename there is nothing to read, so fail at parse time with
    # a usage message instead of later inside open().
    parser.add_argument("--filename", required=True, help="input filename")
    parser.add_argument("--num_lines", type=int, default=0,
                        help="The number of lines to process. " +
                        "0 indicates all.")
    parser.add_argument("--topic", default=TRAFFIC_TOPIC,
                        help="The pubsub 'traffic' topic to publish to. " +
                        "Should already exist.")
    parser.add_argument("--incident_topic", default=INCIDENT_TOPIC,
                        help="The pubsub 'incident' topic to publish to. " +
                        "Only used if the --incidents flag is set. " +
                        "If so, should already exist.")
    return parser


def _replay_pause(orig_date, prev_date, restart_time):
    """Sleep so that publication keeps pace with the gap between two data
    timestamps, and return the time at which publishing resumes."""
    date_delta = orig_date - prev_date
    print("date delta: %s" % date_delta.total_seconds())
    timelapse = datetime.datetime.utcnow() - restart_time
    print("timelapse: %s" % timelapse.total_seconds())
    # Publishing the previous batch may have taken longer than the gap in the
    # data (or the data may be out of order). time.sleep() rejects negative
    # values, so never sleep for less than zero seconds.
    sleeptime = max(0.0, (date_delta - timelapse).total_seconds())
    print("sleeping %s" % sleeptime)
    time.sleep(sleeptime)
    restart_time = datetime.datetime.utcnow()
    print("restart_time is set to: %s" % restart_time)
    return restart_time


def main(argv):
    """Read traffic readings from a CSV file and publish them to pubsub.

    Each row is published to the 'traffic' topic with a 'timestamp'
    attribute (milliseconds since the epoch, as a string). Depending on the
    flags, dates are shifted to the script start time (--current),
    publication is paused to follow the gaps between data timestamps
    (--replay), random apparent delays are introduced (--random_delays), and
    random incidents are published to a second topic (--incidents).

    Rows that raise ValueError or IndexError while being processed (for
    example an unparseable date or an empty row) are reported on stderr and
    skipped.

    Args:
      argv: the full argument vector, program name first (as in sys.argv).
        If empty or None, argparse falls back to sys.argv.
    """
    args = _build_arg_parser().parse_args(argv[1:] if argv else None)

    pubsub_topic = args.topic
    print("Publishing to pubsub 'traffic' topic: %s" % pubsub_topic)
    incidents = args.incidents
    random_delays = args.random_delays
    incident_topic = None  # only used when --incidents is set
    if incidents:
        incident_topic = args.incident_topic
        print("Publishing to pubsub 'incident' topic: %s" % incident_topic)
    filename = args.filename
    print("filename: %s" % filename)
    replay = args.replay
    print("replay mode: %s" % replay)
    current = args.current
    print("current date mode: %s" % current)
    num_lines = args.num_lines
    if num_lines:
        print("processing %s lines" % num_lines)

    client = create_pubsub_client()
    dt = parse('01/01/2010 00:00:00')  # earliest date in the traffic files
    now = datetime.datetime.utcnow()
    # used if altering date to replay from start time
    diff = now - dt
    # used if running in 'replay' mode, reflecting pauses in the data
    prev_date = dt
    restart_time = now
    line_count = 0
    incident_count = 0

    print("processing %s" % filename)  # process the traffic data file
    with open(filename) as data_file:
        reader = csv.reader(data_file)
        for line in reader:
            # If terminating after num_lines, stop only once that many lines
            # have actually been handled (check before counting this one).
            if num_lines and line_count >= num_lines:
                print("Have processed %s lines" % num_lines)
                break
            line_count += 1
            if (line_count % LINE_BATCHES) == 0:
                print("%s lines processed" % line_count)
            try:
                timestring = line[0]
                orig_date = parse(timestring)
                if current:  # if using --current flag
                    (line, ts) = process_current_mode(
                        orig_date, diff, line, replay, random_delays)
                else:  # not using --current flag
                    (line, ts) = process_noncurrent_mode(
                        orig_date, line, random_delays)

                # In replay mode, pause whenever the data timestamp changes.
                if replay and orig_date != prev_date:
                    restart_time = _replay_pause(
                        orig_date, prev_date, restart_time)
                prev_date = orig_date
                msg_attributes = {'timestamp': ts}
                publish(client, pubsub_topic, ",".join(line), msg_attributes)
                # If generating traffic 'incidents' as well, randomly
                # determine whether to generate one for this reading.
                if incidents and random.random() < INCIDENT_THRESH:
                    print("Generating a traffic incident for %s." % (line,))
                    # Pass on the timestring, station id, freeway, and
                    # direction of travel, then generate some 'incident'
                    # data and publish it to the incident topic. Use the
                    # incident count as a simplistic id.
                    incident_count += 1
                    publish_random_incident(client, incident_topic,
                                            incident_count,
                                            line[0], line[1], line[2],
                                            line[3], msg_attributes)
            except (ValueError, IndexError) as e:
                sys.stderr.write("---Error: %s for %s\n" % (e, line))
# Script entry point: run the generator only when this file is executed
# directly, passing along the full command-line argument vector.
if __name__ == "__main__":
    main(argv=sys.argv)