// Package iconscraper provides a robust solution to get icons from domains.
//
// # Icon Sources
//
// - `/favicon.ico`
// - [Icon (`<link rel="icon" href="favicon.ico">`)](https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel#icon)
// - [Web app manifest (`<link rel="manifest" href="manifest.json">`)](https://developer.mozilla.org/en-US/docs/Web/Manifest)
// - [`link rel="shortcut icon"`](https://stackoverflow.com/questions/13211206/html5-link-rel-shortcut-icon)
// - [`link rel="apple-touch-icon"`](https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel#non-standard_values)
// - [`link rel="msapplication-TileImage"`](https://stackoverflow.com/questions/61686919/what-is-the-use-of-the-msapplication-tileimage-meta-tag)
// - [`link rel="mask-icon"`](http://microformats.org/wiki/existing-rel-values)
// - [`link rel="image_src"`](http://microformats.org/wiki/existing-rel-values) (also [this post](https://www.niallkennedy.com/blog/2009/03/enhanced-social-share.html))
// - [`meta itemprop="image"`](https://schema.org/image)
//
// # Other sources
//
// These aren't currently scraped, but might be of interest:
//
// - [`link rel="apple-touch-startup-image"`](http://microformats.org/wiki/existing-rel-values)
// - [`meta property="og:image"`](https://ogp.me/)
//
// # Get icons from multiple domains
//
//	import "github.com/MeVitae/iconscraper"
//
//	config := iconscraper.Config{
//		SquareOnly:            true,
//		TargetHeight:          128,
//		MaxConcurrentRequests: 32,
//		AllowSvg:              false,
//	}
//
//	domains := []string{"mevitae.com", "example.com", "gov.uk", "golang.org", "rust-lang.org"}
//
//	icons := iconscraper.GetIcons(config, domains)
//
//	for domain, icon := range icons {
//		fmt.Println("Domain: " + domain + ", Icon URL: " + icon.URL)
//	}
//
// # Handle errors and warnings
//
// Errors related to decoding images or resources not being found on a web server (but the connection
// being ok) will be reported as warnings instead of errors.
//
// By default, errors and warnings are only logged to the console. You can handle errors yourself by
// adding your own channel in the config, for example:
//
//	import "github.com/MeVitae/iconscraper"
//
//	config := iconscraper.Config{
//		SquareOnly:            true,
//		TargetHeight:          128,
//		MaxConcurrentRequests: 32,
//		AllowSvg:              false,
//		Errors:                make(chan error),
//	}
//
//	go func() {
//		for err := range config.Errors {
//			// Handle err
//		}
//	}()
//
//	domains := []string{"mevitae.com", "example.com", "gov.uk", "golang.org", "rust-lang.org"}
//
//	icons := iconscraper.GetIcons(config, domains)
//
//	for domain, icon := range icons {
//		fmt.Println("Domain: " + domain + ", Icon URL: " + icon.URL)
//	}
//
// Warnings can be similarly handled using the `Warnings` field.
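//
// For example:
//
//	config.Warnings = make(chan error)
//	go func() {
//		for warn := range config.Warnings {
//			// Handle warn
//		}
//	}()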
//
// # Get icon from a single domain
//
// Icons can be scraped for a single domain using `GetIcon`. Errors and warnings are handled in the
// same way.
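//
// For example:
//
//	icon := iconscraper.GetIcon(config, "mevitae.com")
//	if icon != nil {
//		fmt.Println("Icon URL: " + icon.URL)
//	}
//
// A nil result means no suitable icon was found for the domain.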
package iconscraper

import (
	"bytes"
	"fmt"
	"image"
	"log"
	"regexp"

	"golang.org/x/net/html"
)

// logErrors logs all the errors sent on the channel to stderr.
func logErrors(errors chan error) {
for err := range errors {
log.Println(err.Error())
}
}

// logWarnings logs all the warnings sent on the channel to stderr.
func logWarnings(warnings chan error) {
	for warn := range warnings {
		log.Println("Warning:", warn.Error())
	}
}

// Icon is a scraped icon along with its source metadata.
type Icon struct {
	// URL is the source location from which the data was fetched or derived.
	URL string

	// Type is the sniffed MIME type of the image.
	Type string

	// ImageConfig holds the parsed image config. This is the zero value for SVGs (type
	// image/svg+xml).
	ImageConfig image.Config

	// Source is the image source as downloaded.
	Source []byte
}

// Config is the config used for GetIcons and GetIcon.
type Config struct {
	// SquareOnly determines if only square icons are considered.
	SquareOnly bool

	// TargetHeight of the icon to be fetched. The shortest image larger than this size will be
	// returned and, if none are available, the tallest image smaller than this will be returned.
	// For example, with TargetHeight 128 and icons of heights 64, 120 and 256 available, the
	// 256px icon is picked.
	TargetHeight int

	// AllowSvg allows SVGs to be returned. An SVG will always supersede a non-vector image.
	AllowSvg bool

	// MaxConcurrentRequests sets the maximum number of concurrent HTTP requests.
	MaxConcurrentRequests int

	// Errors is the channel for receiving errors.
	//
	// If nil, errors will instead be logged to the default logger.
	//
	// The channel must not block.
	Errors chan error

	// Warnings is the channel for receiving warnings. Errors related to decoding images or
	// resources not being found on a web server (but the connection being ok) will be reported
	// as warnings instead of errors.
	//
	// If nil, warnings will instead be logged to the default logger.
	//
	// The channel must not block.
	Warnings chan error
}

// GetIcons scrapes icons from the provided domains concurrently and returns the results as a map
// from domain to the best image for the given target.
//
// It finds the smallest icon taller than config.TargetHeight or, if there are none, the tallest
// icon.
//
// If no icon is found for a domain (or no square icon, if config.SquareOnly is set), that domain
// is omitted from the output map.
func GetIcons(config Config, domains []string) map[string]Icon {
	// Create error and warning handler channels if not provided. By default, these are consumed and logged.
	if config.Errors == nil {
		config.Errors = make(chan error)
		go logErrors(config.Errors)
		defer close(config.Errors)
	}
	if config.Warnings == nil {
		config.Warnings = make(chan error)
		go logWarnings(config.Warnings)
		defer close(config.Warnings)
	}

	// HTTP worker pool
	http := newHttpWorkerPool(config.MaxConcurrentRequests)
	defer http.close()

	// Channel to collect results
	results := make(chan processReturn)
	defer close(results)

	// Spawn a goroutine for every domain; these are rate limited by the HTTP pool.
	for _, domain := range domains {
		go processDomain(config, domain, http, results)
	}

	// Collect results. Each processDomain sends exactly one result, so receive len(domains) times.
	resultMap := make(map[string]Icon, len(domains))
	for idx := 0; idx < len(domains); idx++ {
		res := <-results
		if res.result != nil {
			resultMap[res.domain] = *res.result
		}
	}
	return resultMap
}

// GetIcon scrapes icons from the provided domain and finds the smallest icon taller than
// config.TargetHeight or, if there are none, the tallest icon. It returns nil if no icon was
// found.
//
// Errors that occur are sent to config.Errors unless it is nil, in which case they are logged.
func GetIcon(config Config, domain string) *Icon {
	// Create error and warning handler channels if not provided. By default, these are consumed and logged.
	if config.Errors == nil {
		config.Errors = make(chan error)
		go logErrors(config.Errors)
		defer close(config.Errors)
	}
	if config.Warnings == nil {
		config.Warnings = make(chan error)
		go logWarnings(config.Warnings)
		defer close(config.Warnings)
	}

	// HTTP worker pool
	http := newHttpWorkerPool(config.MaxConcurrentRequests)
	defer http.close()

	// Channel to collect results
	results := make(chan processReturn, 1)
	defer close(results)

	go processDomain(config, domain, http, results)
	return (<-results).result
}

// processReturn is the output of processDomain.
type processReturn struct {
	// domain is the domain that was processed.
	domain string

	// result holds the result, or nil if there isn't one.
	result *Icon
}
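
// domainNameRegexp matches strings that could plausibly be valid domain names: dot-separated
// labels of letters, digits, underscores and hyphens, each label at most 65 characters long and
// not starting with a hyphen.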
var domainNameRegexp = regexp.MustCompile(`^([a-zA-Z0-9_][a-zA-Z0-9_-]{0,64})(\.[a-zA-Z0-9_][a-zA-Z0-9_-]{0,64})*[\._]?$`)

// couldBeDomain returns false if domain definitely isn't a valid domain.
func couldBeDomain(domain string) bool {
return len(domain) <= 512 && domainNameRegexp.MatchString(domain)
}

// processDomain is a worker function that processes getting images for a domain.
//
// It fetches the page at https://<domain>, parses the HTML, and spawns workers to download every
// icon referenced in it. It then picks the best of the downloaded images according to the config
// and sends it on the result channel, or, if no image was found, sends a nil result.
func processDomain(
config Config,
domain string,
http *httpWorkerPool,
result chan processReturn,
) {
	// Check for obvious cases where the domain passed is invalid.
	if !couldBeDomain(domain) {
		config.Errors <- fmt.Errorf("invalid domain name %s", domain)
		result <- processReturn{
			domain: domain,
			result: nil,
		}
		return
	}

	url := "https://" + domain
	httpResult := http.get(url)
	// Only check for network errors while fetching; if the server returned an error page, that'll do.
	if httpResult.err != nil {
		config.Errors <- fmt.Errorf("failed to get %s: %w", url, httpResult.err)
		result <- processReturn{
			domain: domain,
			result: nil,
		}
		return
	}

	// Parse the fetched HTML.
	doc, err := html.Parse(bytes.NewReader(httpResult.body))
	if err != nil {
		config.Errors <- fmt.Errorf("failed to parse HTML from %s: %w", url, err)
		result <- processReturn{
			domain: domain,
			result: nil,
		}
		return
	}

	// Our requests will now be rooted at the domain we were redirected to.
	redirectDomain := httpResult.url.Host
	url = "https://" + redirectDomain

	workers := newImageWorkers(redirectDomain, http, config.Errors, config.Warnings)

	// Always check for `/favicon.ico`; it's not always linked from the HTML.
	workers.spawn(url + "/favicon.ico")

	// Spawn workers scraping all the linked icons.
	getImagesFromHTML(doc, redirectDomain, &workers)

	// Pick the best image from all the results.
	result <- processReturn{
		domain: domain,
		result: pickBestImage(config, workers.results()),
	}
}