forked from geziyor/geziyor
/
options.go
127 lines (94 loc) · 3.41 KB
/
options.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
package geziyor
import (
"net/http"
"net/url"
"time"
"github.com/Garykom/geziyor/cache"
"github.com/Garykom/geziyor/client"
"github.com/Garykom/geziyor/export"
"github.com/Garykom/geziyor/metrics"
"github.com/Garykom/geziyor/middleware"
"github.com/chromedp/chromedp"
)
// Options holds the configuration options for a Geziyor scraper.
// Defaults mentioned on individual fields are the documented defaults
// that apply when the field is left at its zero value.
type Options struct {
// AllowedDomains lists the domains that requests are allowed to be made to.
// If empty, any domain is allowed.
AllowedDomains []string
// BrowserEndpoint is the WebSocket endpoint of a headless Chrome browser.
// If you want to run your own Chrome browser runner, provide its endpoint here.
// For example: ws://localhost:3000
BrowserEndpoint string
// Cache is the cache storage backend. Available backends:
// - Memory
// - Disk
// - LevelDB
Cache cache.Cache
// CachePolicy is the policy used for caching. Available policies:
// - Dummy policy (default)
// - RFC2616 policy
CachePolicy cache.Policy
// CharsetDetectDisabled disables response charset detection,
// which is used for decoding responses to UTF-8.
CharsetDetectDisabled bool
// ConcurrentRequests is the limit on concurrent requests.
ConcurrentRequests int
// ConcurrentRequestsPerDomain is the limit on concurrent requests per domain.
// The domain is taken from request.URL.Host, so a subdomain is treated
// as a different domain than its top domain.
ConcurrentRequestsPerDomain int
// CookiesDisabled, if true, prevents cookies from being sent.
CookiesDisabled bool
// ErrorFunc is the callback invoked on errors.
// If not defined, all errors will be logged.
ErrorFunc func(g *Geziyor, r *client.Request, err error)
// Exporters are the exporters used for extracting data.
Exporters []export.Exporter
// LogDisabled disables logging when set to true.
LogDisabled bool
// MaxBodySize is the maximum body reading size in bytes. Default: 1GB
MaxBodySize int64
// MaxRedirect is the maximum number of redirections. Default: 10
MaxRedirect int
// MetricsType is the scraper metrics exporting type. See metrics.Type
MetricsType metrics.Type
// ParseFunc is the callback invoked with the responses of StartURLs requests.
ParseFunc func(g *Geziyor, r *client.Response)
// ParseHTMLDisabled, if true, disables HTML parsing to improve performance.
ParseHTMLDisabled bool
// ProxyFunc sets the proxy for each request.
// It has the same signature as the Proxy field of net/http.Transport.
ProxyFunc func(*http.Request) (*url.URL, error)
// PreActions are the pre-actions of rendered requests. Setting this will override the existing default,
// and you'll need to handle all rendered actions yourself, like navigation, waiting, response etc.
// If you need to make custom actions in addition to the defaults, use Request.Actions instead of this.
PreActions []chromedp.Action
// RequestDelay is the delay applied to requests.
RequestDelay time.Duration
// RequestDelayRandomize uses a random interval between 0.5 * RequestDelay and 1.5 * RequestDelay.
RequestDelayRandomize bool
// RequestMiddlewares are called before requests are made, to manipulate the requests.
RequestMiddlewares []middleware.RequestProcessor
// ResponseMiddlewares are called after a response is received.
ResponseMiddlewares []middleware.ResponseProcessor
// RequestsPerSecond limits the number of requests made per second. Default: No limit
RequestsPerSecond float64
// RetryHTTPCodes lists the HTTP response codes to retry.
// Other errors (DNS lookup issues, connections lost, etc) are always retried.
// Default: []int{500, 502, 503, 504, 522, 524, 408}
RetryHTTPCodes []int
// RetryTimes is the maximum number of times to retry, in addition to the first download.
// Set -1 to disable retrying.
// Default: 2
RetryTimes int
// RobotsTxtDisabled, if true, disables robots.txt checks.
RobotsTxtDisabled bool
// StartRequestsFunc is called on scraper start.
StartRequestsFunc func(g *Geziyor)
// StartURLs are the URLs the first requests are made to (concurrently).
StartURLs []string
// Timeout is the global request timeout.
Timeout time.Duration
// URLRevisitEnabled enables revisiting the same URLs, which is disabled by default.
URLRevisitEnabled bool
// UserAgent is the User Agent.
// Default: "Geziyor 1.0"
UserAgent string
}