forked from marktnoonan/marta-mobility-component
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.php
336 lines (271 loc) · 12.4 KB
/
scrape.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
<?php
// logging in with cURL based on http://thisinterestsme.com/php-login-to-website-with-curl/
// this returns the DOM of the page it fetched as a string
// simple HTML DOM turns that string back into a tree that we can traverse with selectors.
require_once('simple_html_dom.php');
date_default_timezone_set('America/New_York');

// Reference date/time values. The parsing helpers further down read these through `global`.
$currentDay = date('m-d-Y'); // m includes leading zero, we need this for comparison later.
$tomorrow = date("m-d-Y", strtotime('tomorrow'));
$currentTime = explode(":", date('H:i'));
$currentTimeInMinutes = (intval($currentTime[0]) * 60) + intval($currentTime[1]);

// Read the credentials defensively: a request without these fields (or with
// array-valued fields) is treated as "nothing posted" instead of raising notices.
$providedUsername = (isset($_POST['providedUsername']) && is_string($_POST['providedUsername']))
    ? $_POST['providedUsername']
    : '';
$providedPassword = (isset($_POST['providedPassword']) && is_string($_POST['providedPassword']))
    ? $_POST['providedPassword']
    : '';

$isTestUsername = strtolower($providedUsername) === 'test';
$isTestPassword = $providedPassword === 'test';

$html = false;
if ($providedUsername !== '' && $providedPassword !== '' && !$isTestUsername && !$isTestPassword) {
    // Real credentials: log in and fetch the trips page.
    try {
        $html = martaLogin();
    } catch (Exception $e) {
        // martaLogin() throws on cURL errors; report it the same way as an unusable page below.
        $html = false;
    }
} elseif ($isTestUsername && $isTestPassword) {
    // fetch dummy data for test user Joanna M Customer
    $html = file_get_html('MARTAEXAMPLE.html');
} else {
    // nothing posted (or only one of the two fields was "test")
    exit("username or password is missing!");
}

// Both loaders return false when no document could be built; stop before calling find() on it.
if (!is_object($html)) {
    exit("unable to load trip data!");
}

$customerInfo = $html->find('div[class=portletContent even]', 0);
$clientName = is_object($customerInfo) ? strip_tags($customerInfo->plaintext) : '';

// Seed one entry per booking header; the helpers below fill in the remaining fields by position.
$bookingIDs = $html->find('td[class=tripHeader]');
$justBookingIDNumbers = [];
$arrayOfBookings = [];
foreach (array_values($bookingIDs) as $i => $bookingID) {
    // The header text carries a 12-character prefix before the ID itself.
    $justBookingIDNumbers[$i] = substr($bookingID->plaintext, 12);
    $arrayOfBookings[$i]["bookingID"] = $justBookingIDNumbers[$i];
    $arrayOfBookings[$i]["iteratorBookingID"] = $i;
}

$datesAndTimes = $html->find('td[valign=middle]');
$arrayOfBookings = datesAndTimes($arrayOfBookings, $datesAndTimes);
$locations = $html->find('td[width=5]');
$arrayOfBookings = locations($arrayOfBookings, $locations);
$arrayOfBookings = removePastBookings($arrayOfBookings);

$json = [
    (object) ['clientName' => $clientName, 'bookings' => $arrayOfBookings, 'updatedAt' => date('g:i A')]
];
header('Content-type:application/json');
echo json_encode($json);
/**
 * Log in to the MARTA Mobility site with the posted credentials and fetch the trips page.
 *
 * Based on http://thisinterestsme.com/php-login-to-website-with-curl/
 *
 * Reads $_POST['providedUsername'] and $_POST['providedPassword'].
 *
 * @return mixed Whatever str_get_html() produces for the fetched page body.
 * @throws Exception When either HTTP request fails at the cURL level.
 */
function martaLogin()
{
    // Plain local variables instead of define(): constants would keep the credentials
    // visible process-wide and would warn if this function were ever called twice.
    $username = isset($_POST['providedUsername']) ? $_POST['providedUsername'] : '';
    $password = isset($_POST['providedPassword']) ? $_POST['providedPassword'] : '';

    // Set a user agent. This basically tells the server that we are using Chrome.
    $userAgent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36';
    // URL of the login form, and the URL the form posts to (the same here).
    $loginFormUrl = 'http://mobility.itsmarta.com/hiwire';
    $loginActionUrl = 'http://mobility.itsmarta.com/hiwire';
    // Password-protected page listing the customer's trips.
    $tripsUrl = 'http://mobility.itsmarta.com/hiwire?.a=pViewTrips&.s=8ff56be8';

    // Keys must match the field names of the login form.
    $postValues = array(
        'UN' => $username,
        'PW' => $password
    );

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $loginActionUrl);
    curl_setopt($curl, CURLOPT_POST, true);
    curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($postValues));
    // An empty cookie file name turns cookie handling on without loading or saving a file,
    // so the session cookie lives only on this handle instead of in a shared cookie.txt.
    curl_setopt($curl, CURLOPT_COOKIEFILE, '');
    // Some websites attempt to block bot user agents, hence the Chrome user agent.
    curl_setopt($curl, CURLOPT_USERAGENT, $userAgent);
    // Return the response body from curl_exec() instead of printing it.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    // Present the login form as the referring page.
    curl_setopt($curl, CURLOPT_REFERER, $loginFormUrl);
    // Redirects are not followed.
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, false);
    // Both URLs are plain http, so no TLS options are set; certificate verification
    // stays at cURL's defaults should these URLs ever move to https.

    // Execute the login request; only the cookies it sets matter, not the body.
    curl_exec($curl);
    if (curl_errno($curl)) {
        throw new Exception(curl_error($curl));
    }

    // We should be logged in by now. Request the trips page on the same handle.
    // Options set above stay in effect, so this request carries the same POST
    // settings, user agent and cookies as the login request.
    curl_setopt($curl, CURLOPT_URL, $tripsUrl);
    $tripsPage = curl_exec($curl);
    if (curl_errno($curl) || $tripsPage === false) {
        throw new Exception(curl_error($curl));
    }

    // Turn the returned markup into a tree that can be queried with selectors.
    $html = str_get_html($tripsPage);
    return $html;
}
/**
 * Normalise a scraped clock time to a zero-padded form.
 *
 * All space characters are removed; a result that is exactly four characters
 * long (e.g. "7:05") gets a leading "0". Anything else is returned as stripped.
 *
 * @param string $time Raw time text, possibly with a leading space.
 * @return string The stripped, possibly zero-prefixed time.
 */
function formatTime($time) {
    // Shorter times arrive with a leading space; drop every space before measuring.
    $compact = preg_replace("/ /", "", $time);

    return mb_strlen($compact) === 4 ? "0" . $compact : $compact;
}
/**
 * Fill date, time and status fields into the bookings from the scraped table cells.
 *
 * Cells are consumed in document order. Each cell is classified by a label
 * fragment found in its text, and its value is cut out at a fixed offset.
 * A status cell (Booked / Cancelled / No-Show) closes the current booking and
 * moves on to the next one.
 *
 * Note: strpos() is used as a boolean here, so a label fragment only counts
 * when it is found after the first character of the cell text.
 *
 * @param array $arrayOfBookings Bookings indexed 0..n-1, extended in place and returned.
 * @param array $datesAndTimes   Elements exposing a `plaintext` property.
 * @return array The bookings with the extracted fields added.
 */
function datesAndTimes($arrayOfBookings, $datesAndTimes)
{
    global $currentDay, $currentTimeInMinutes, $tomorrow;

    // Position of the booking currently being filled in.
    $slot = 0;

    foreach ($datesAndTimes as $cell) {
        $text = $cell->plaintext;

        if (strpos($text, 'Ready')) {
            // The time is not always 5 chars long (7:05 vs 10:05), so 5 chars are
            // captured and a carriage return that slipped in is removed.
            $arrival = preg_replace("/\r/", "", substr($text, 35, 5));
            if (isset($arrival) && $arrival !== '') {
                $arrival = formatTime($arrival);
                $arrayOfBookings[$slot]["displayEta"] = date("g:i A", strtotime($arrival));
            }
            $arrayOfBookings[$slot]["eta"] = $arrival;
            $arrayOfBookings[$slot]["iteratorreadynugget"] = $slot;
            continue;
        }

        if (strpos($text, 'ate')) {
            $rawDate = substr($text, 6);
            if ($rawDate === $tomorrow) {
                $label = "Tomorrow";
            } elseif ($rawDate === $currentDay) {
                $label = "Today";
            } else {
                // Hyphens become slashes so strtotime() reads month/day/year
                // rather than the European day-month order.
                $label = date("l, M j", strtotime(str_replace("-", "/", $rawDate)));
            }
            $arrayOfBookings[$slot]["displayDate"] = $label;
            $arrayOfBookings[$slot]["date"] = $rawDate;
            $arrayOfBookings[$slot]["iteratorDate"] = $slot;
            continue;
        }

        if (strpos($text, 'tart')) {
            $windowStart = formatTime(substr($text, 14));
            $arrayOfBookings[$slot]['readyTime'] = $windowStart;
            $arrayOfBookings[$slot]["displayReadyTime"] = date("g:i A", strtotime($windowStart));
            $arrayOfBookings[$slot]["iteratorreadyTime"] = $slot;
            continue;
        }

        if (strpos($text, 'nd Window')) {
            $windowEnd = formatTime(substr($text, 12));
            $arrayOfBookings[$slot]['displayEndWindow'] = date("g:i A", strtotime($windowEnd));
            $arrayOfBookings[$slot]['endWindow'] = $windowEnd;
            continue;
        }

        // The three status variants below are always the last cell of a booking,
        // so each of them advances $slot.
        if (strpos($text, 'Booked')) {
            // The status text contains spaces (including a trailing one); strip them all.
            $state = preg_replace("/ /", "", substr($text, 17));
            if ($state == "NotScheduled") {
                $state = "Van not assigned.";
            }
            $arrayOfBookings[$slot]["status"] = $state;
            $slot++;
            continue;
        }

        if (strpos($text, 'Cancelled')) {
            $arrayOfBookings[$slot]["status"] = substr($text, 8);
            $slot++;
            continue;
        }

        if (strpos($text, 'o-Show')) {
            $state = substr($text, 7);
            $arrayOfBookings[$slot]["status"] = "! $state !";
            $slot++;
        }
    }

    return $arrayOfBookings;
}
/**
 * Attach pick-up and drop-off addresses to the bookings.
 *
 * $locations holds the td[width=5] spacer cells. A spacer sits between a label
 * ("Pick-up:" / "Drop-off: ") and the address, so it serves as a reference
 * point: the label is the previous sibling and the address the next sibling.
 * A drop-off closes the current booking and advances to the next one.
 *
 * @param array $arrayOfBookings Bookings indexed 0..n-1, extended and returned.
 * @param array $locations       Spacer elements exposing prev_sibling() / next_sibling().
 * @return array The bookings with "pickupAddress" / "dropOffAddress" added.
 */
function locations($arrayOfBookings, $locations)
{
    $i = 0;
    foreach ($locations as $location) {
        // A spacer without a preceding cell cannot be an address row; skip it
        // instead of reading ->plaintext from null.
        $labelCell = $location->prev_sibling();
        if ($labelCell === null) {
            continue;
        }

        $label = $labelCell->plaintext;
        // The drop-off label is compared with its trailing space, as scraped.
        $isPickup = $label === "Pick-up:";
        $isDropOff = $label === "Drop-off: ";
        if (!$isPickup && !$isDropOff) {
            continue;
        }

        // A missing address cell yields null rather than a property read on null.
        $addressCell = $location->next_sibling();
        $address = $addressCell === null ? null : $addressCell->plaintext;

        if ($isPickup) {
            $arrayOfBookings[$i]["pickupAddress"] = $address;
            $arrayOfBookings[$i]["iteratorLocation"] = $i;
        } else {
            $arrayOfBookings[$i]["dropOffAddress"] = $address;
            $i++;
        }
    }
    return $arrayOfBookings;
}
/**
 * Convert an "H:MM" / "HH:MM" string to minutes since midnight.
 *
 * A missing or empty minutes part counts as 0, so empty or malformed input
 * yields a number instead of an undefined-offset notice.
 */
function bookingTimeToMinutes($time)
{
    $parts = explode(":", (string) $time);
    $hours = intval($parts[0]);
    $minutes = isset($parts[1]) ? intval($parts[1]) : 0;
    return ($hours * 60) + $minutes;
}

/**
 * Add the derived timing and status fields to every booking.
 *
 * Despite its name, this function does not remove anything: every booking is
 * returned, extended with "endWindowInMinutes", "delayInMinutes", "currentDay",
 * "currentTimeInMinutes", "etaInMinutes", "math", "delayInMinutesDescription",
 * "statusDescription" and "statusColor". A booking with no usable "eta" gets
 * its "readyTime" copied into "eta".
 *
 * @param array $arrayOfBookings Bookings as built by the parsing helpers.
 * @return array The same bookings with the derived fields added.
 */
function removePastBookings($arrayOfBookings)
{
    global $currentDay, $currentTimeInMinutes;

    if (!is_array($arrayOfBookings)) {
        return $arrayOfBookings;
    }

    // Iterate by key and write back, which avoids leaving a dangling reference behind.
    foreach ($arrayOfBookings as $key => $booking) {
        $readyTime = isset($booking["readyTime"]) ? $booking["readyTime"] : null;
        $endWindow = isset($booking["endWindow"]) ? $booking["endWindow"] : null;
        $status = isset($booking["status"]) ? $booking["status"] : null;

        // An ETA that is absent or an empty string is not usable; fall back to the ready time.
        if (!isset($booking["eta"]) || $booking["eta"] === '') {
            $booking["eta"] = $readyTime;
        }
        $bookingEtaInMinutes = bookingTimeToMinutes($booking["eta"]);

        $booking["endWindowInMinutes"] = bookingTimeToMinutes($endWindow);
        // 30 minus the gap between ETA and end of window: 0 when the ETA is
        // 30 minutes before the window ends, 30 when it is exactly at the end.
        $booking["delayInMinutes"] = 30 - ($booking["endWindowInMinutes"] - $bookingEtaInMinutes);
        $booking["currentDay"] = $currentDay;
        $booking["currentTimeInMinutes"] = $currentTimeInMinutes;
        $booking["etaInMinutes"] = $bookingEtaInMinutes;
        // True when the ETA is more than 60 minutes before the current time of day.
        $booking["math"] = ($bookingEtaInMinutes + 60) < $currentTimeInMinutes;
        $booking["delayInMinutesDescription"] = getDelayInMinutesDescription($booking["delayInMinutes"]);
        $booking["statusDescription"] = getStatusDescription($booking["delayInMinutes"], $status);
        $booking["statusColor"] = getStatusColor($booking["delayInMinutes"], $status);

        $arrayOfBookings[$key] = $booking;
    }
    return $arrayOfBookings;
}
/**
 * Pick the display colour for a booking.
 *
 * For a 'Scheduled' booking the colour depends on the delay in minutes:
 * more than 30 is "red", 1 through 30 is "yellow", 0 or less is "green".
 * A status containing "ancelled" is "blue" and one containing "No-Show" is "orange".
 *
 * @param int|float   $delay  Delay in minutes (the "delayInMinutes" booking field).
 * @param string|null $status Booking status text.
 * @return string|null Colour name, or null for any other status.
 */
function getStatusColor($delay, $status){
    if ($status === 'Scheduled') {
        if ($delay > 30) {
            return "red";
        }
        // Upper bound is inclusive: a delay of exactly 30 used to fall through to "green".
        if ($delay > 0) {
            return "yellow";
        }
        return "green";
    }

    $statusText = (string) $status;
    // Compare against false explicitly so a match at position 0 also counts.
    if (strpos($statusText, 'ancelled') !== false) {
        return "blue";
    }
    if (strpos($statusText, 'No-Show') !== false) {
        return "orange";
    }
    return null;
}
/**
 * Build the text appended to the status of a booking.
 *
 * Only 'Scheduled' bookings get a description: more than 30 minutes of delay
 * is ", running late.", 1 through 30 is ", arriving in window.", and 0 or
 * less is ", arriving on time.". Every other status yields an empty string.
 *
 * @param int|float   $delay  Delay in minutes (the "delayInMinutes" booking field).
 * @param string|null $status Booking status text.
 * @return string Description suffix, possibly empty.
 */
function getStatusDescription($delay, $status){
    if ($status !== 'Scheduled') {
        return "";
    }
    if ($delay > 30) {
        return ", running late.";
    }
    // Upper bound is inclusive: a delay of exactly 30 used to be reported as on time.
    if ($delay > 0) {
        return ", arriving in window.";
    }
    return ", arriving on time.";
}
/**
 * Describe a delay relative to the Ready Time in words.
 *
 * Positive delays read "<n> minute(s) after Ready Time.", negative delays
 * "<n> minute(s) before Ready Time." and zero "right on time.".
 *
 * @param int|float $delay Delay in minutes; negative means early.
 * @return string Human-readable description.
 */
function getDelayInMinutesDescription($delay)
{
    if ($delay == 0) {
        return "right on time.";
    }

    $minutes = abs($delay);
    $unit = $minutes == 1 ? "minute" : "minutes";
    $direction = $delay > 0 ? "after" : "before";

    // Built from parts so every variant is spaced the same way
    // (the -1 case previously read "1minute before Ready Time.").
    return "$minutes $unit $direction Ready Time.";
}
?>