/
Parsers.php
204 lines (166 loc) · 6.28 KB
/
Parsers.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
<?php
namespace App\Helpers;
use XPathSelector\Selector;
class Parsers
{
    /**
     * Extract contract data sorted in standard key/value pairs in an HTML <table>, using XPath.
     *
     * The nodes matched by $keyXpath and $valueXpath are paired up by position:
     * the Nth key node is assumed to describe the Nth value node. Each key node
     * is cleaned with Cleaners::cleanLabelText() and looked up against the
     * (equally cleaned) labels in $keyArray; matching values are passed through
     * Cleaners::cleanHtmlValue() and Cleaners::removeLinebreaks().
     *
     * @param string $html The contract HTML to search through.
     * @param string $keyXpath The XPath selector for keys from contract key/value pairs.
     * @param string $valueXpath The XPath selector for values from contract key/value pairs.
     * @param string $periodSplitString The string that marks a contract date range. (Often has "to" in it.)
     * @param array $keyArray The list of keys (from contract key/value pairs) to collect values for,
     *                        as outputKey => label. Entries with an empty label are never matched
     *                        against the HTML. When empty, a built-in default list is used.
     *
     * @return array The contract data, sorted into key/value pairs. Only keys whose label was found
     *               are present. 'contractPeriodStart' and 'contractPeriodEnd' are added only when
     *               'contractPeriodRange' was found and contains $periodSplitString.
     */
    public static function extractContractDataViaGenericXpathParser($html, $keyXpath, $valueXpath, $periodSplitString, $keyArray = [])
    {
        $values = [];

        // Loose comparison kept on purpose so that null, like [], selects the defaults.
        if ($keyArray == []) {
            $keyArray = [
                'vendorName' => 'Vendor Name:',
                'referenceNumber' => 'Reference Number:',
                'contractDate' => 'Contract Date:',
                'description' => 'Description of work:',
                'extraDescription' => 'Description (more details):',
                'contractPeriodStart' => '',
                'contractPeriodEnd' => '',
                'contractPeriodRange' => 'Contract Period:',
                'deliveryDate' => 'Delivery Date:',
                'originalValue' => 'Original Contract Value:',
                'contractValue' => 'Contract Value:',
                'comments' => 'Comments:',
            ];
        }

        // Build the cleaned-label => output-key lookup. Empty labels (such as the
        // derived contractPeriodStart/End entries) are left out: otherwise a blank
        // header cell in the HTML would be stored under whichever output key
        // happened to have an empty label. If two keys share a label, the later
        // one wins.
        $labelToKey = [];
        foreach ($keyArray as $key => $label) {
            $cleanLabel = Cleaners::cleanLabelText($label);
            if (!is_string($cleanLabel) || $cleanLabel === '') {
                continue;
            }
            $labelToKey[$cleanLabel] = $key;
        }

        $xs = Selector::loadHTML($html);

        // Keys first, then values, hopefully in the same document order.
        $keyNodes = self::nodeStrings($xs, $keyXpath);
        $valueNodes = self::nodeStrings($xs, $valueXpath);

        foreach ($keyNodes as $index => $keyNode) {
            $label = Cleaners::cleanLabelText($keyNode);

            // Skip unknown labels, falsy output keys (e.g. 0 from a plain list),
            // and keys that have no value node at the same position.
            if (!isset($labelToKey[$label]) || !$labelToKey[$label] || !isset($valueNodes[$index])) {
                continue;
            }

            $values[$labelToKey[$label]] = Cleaners::removeLinebreaks(Cleaners::cleanHtmlValue($valueNodes[$index]));
        }

        // Change the "to" range into start and end values. explode() rejects an
        // empty separator, so the split is skipped in that case.
        if (!empty($values['contractPeriodRange']) && is_string($periodSplitString) && $periodSplitString !== '') {
            $split = explode($periodSplitString, $values['contractPeriodRange']);
            if (count($split) >= 2) {
                $values['contractPeriodStart'] = trim($split[0]);
                $values['contractPeriodEnd'] = trim($split[1]);
            }
        }

        return $values;
    }

    /**
     * Run an XPath query on a chunk of HTML, optionally filtering the result
     * via a RegEx pattern.
     *
     * NOTE(review): what happens when the query matches no node is decided by
     * Selector::find() and is not handled here - confirm against the library.
     *
     * @param string $html The HTML to search.
     * @param string $xpathQuery The XPath query to search with.
     * @param string $regexPattern The (optional) RegEx pattern to filter the result with.
     *                             Its first capture group is what gets returned.
     *
     * @return string The inner HTML of the matched node when no pattern is given; otherwise the
     *                first capture group, or '' when the pattern does not match or has no group.
     */
    public static function xpathRegexComboSearch($html, $xpathQuery, $regexPattern = null)
    {
        $xs = Selector::loadHTML($html);
        $text = $xs->find($xpathQuery)->innerHTML();

        if (null === $regexPattern) {
            return $text;
        }

        return self::xpathReturnSingle($text, $regexPattern);
    }

    /**
     * Return the first capture group of a RegEx pattern applied to a string.
     *
     * @param string $text The text to search.
     * @param string $regexPattern The RegEx pattern; it should contain at least one capture group.
     *
     * @return string The first capture group, or '' when the pattern does not match
     *                or does not define a capture group.
     */
    public static function xpathReturnSingle($text, $regexPattern)
    {
        $matches = [];
        preg_match($regexPattern, $text, $matches);

        // $matches[1] is absent both on "no match" and on "matched, but the
        // pattern has no capture group"; both cases yield an empty string.
        return isset($matches[1]) ? $matches[1] : '';
    }

    /**
     * Pull an array of items, selected via an XPath selector, from an
     * HTML page.
     *
     * @param $htmlSource string The HTML to run the XPath selector on.
     * @param $xpath string The XPath selector to extract the items.
     *
     * @return string[] The items converted to strings, stored in an array, and deduped.
     *                  array_unique() keeps the key of each first occurrence, so the
     *                  keys are not necessarily consecutive.
     */
    public static function getArrayFromHtmlViaXpath($htmlSource, $xpath)
    {
        $xs = Selector::loadHTML($htmlSource);

        return array_unique(self::nodeStrings($xs, $xpath));
    }

    /**
     * Extract a year from a date string.
     *
     * The year is the first run of four digits that starts with 1 or 2
     * (i.e. 1000-2999), wherever it appears in the input.
     *
     * @param string $dateInput The date.
     *
     * @return int|false The year as an integer, or false when none is found.
     */
    public static function extractYearFromDate($dateInput)
    {
        $matches = [];
        preg_match('/([1-2][0-9][0-9][0-9])/', $dateInput, $matches);

        if (empty($matches)) {
            return false;
        }

        return intval($matches[1]);
    }

    /**
     * Extract a Chart of Accounts Object Code from a contract description.
     *
     * For example:
     * "514- Rental of other buildings" -> 0514
     * "1228 - Computer software" -> 1228
     *
     * The code is taken from the first run of three or four digits found
     * anywhere in the description (a longer run contributes its first four).
     *
     * The full list of Chart of Accounts Object Codes is available here,
     * https://www.tpsgc-pwgsc.gc.ca/recgen/pceaf-gwcoa/1718/ressource-resource-eng.html
     * as the last link on the page.
     *
     * @param string $description The contract description.
     *
     * @return string The object code, left-padded to four digits, or '' when none is found.
     */
    public static function extractObjectCodeFromDescription($description)
    {
        $matches = [];
        preg_match('/([0-9]{3,4})/', $description, $matches);

        if (!$matches) {
            return '';
        }

        // Get the matching pattern, and left-pad it with zeroes
        // Sometimes these show up as eg. 514 and sometimes 0514
        return str_pad($matches[1], 4, '0', STR_PAD_LEFT);
    }

    /**
     * Run an XPath query on an already-loaded selector and cast every matched node to a string.
     *
     * @param mixed $selector The object returned by Selector::loadHTML().
     * @param string $xpath The XPath selector to run.
     *
     * @return array The string form of each matched node, in the order the library returns them.
     *               NOTE(review): assumes findAll()->map() returns a plain array - confirm.
     */
    private static function nodeStrings($selector, $xpath)
    {
        return $selector->findAll($xpath)->map(function ($node, $index) {
            return (string)$node;
        });
    }
}