diff --git a/src/CrawlerDetect.php b/src/CrawlerDetect.php index a83d42c..5cbe6a6 100644 --- a/src/CrawlerDetect.php +++ b/src/CrawlerDetect.php @@ -89,7 +89,7 @@ public function __construct(array $headers = null, string $userAgent = null) $this->compiledExclusions = $this->compileRegex($this->exclusions->getAll()); $this->setHttpHeaders($headers); - $this->setUserAgent($userAgent); + $this->userAgent = $this->setUserAgent($userAgent); } /** @@ -107,7 +107,7 @@ public function compileRegex($patterns): string /** * Set HTTP headers. * - * @param array|null $httpHeaders + * @param void */ public function setHttpHeaders(array $httpHeaders = null): void { @@ -141,22 +141,19 @@ public function getUaHttpHeaders(): array /** * Set the user agent. * - * @param string|null $userAgent + * @param string $userAgent */ - public function setUserAgent(string $userAgent = null): void + public function setUserAgent(string $userAgent = null): ?string { - if (false === empty($userAgent)) { - $this->userAgent = $userAgent; - } else { - $this->userAgent = null; + if (is_null($userAgent)) { foreach ($this->getUaHttpHeaders() as $altHeader) { - if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. - $this->userAgent .= $this->httpHeaders[$altHeader].' '; + if (isset($this->httpHeaders[$altHeader])) { + $userAgent .= $this->httpHeaders[$altHeader].' '; } } - - $this->userAgent = ! empty($this->userAgent) ? trim($this->userAgent) : null; } + + return $userAgent; } /** diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 7928bd1..5fcb913 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -51,6 +51,7 @@ class Crawlers extends AbstractProvider 'a3logics\.in', 'A6-Indexer', 'a\.pr-cy\.ru', + 'Abonti\/', 'Aboundex', 'aboutthedomain', 'Accoona-AI-Agent', @@ -70,6 +71,7 @@ class Crawlers extends AbstractProvider 'alertra', 'alexa site audit', 'Alibaba\.Security\.Heimdall', + 'allloadin\.com', 'alyze\.info', 'amagit', 'AndroidDownloadManager', @@ -133,6 +135,7 @@ class Crawlers extends AbstractProvider 'CERT\.at-Statistics-Survey', 'cg-eye', 'changedetection', + 'ChangesMeter\/', 'Charlotte', 'CheckHost', 'checkprivacy', @@ -142,6 +145,7 @@ class Crawlers extends AbstractProvider 'CJNetworkQuality', 'clips\.ua\.ac\.be', 'Cloud mapping experiment', + 'CloudEndure', 'CloudFlare-AlwaysOnline', 'Cloudinary\/[0-9]', 'cmcm\.com', @@ -189,6 +193,8 @@ class Crawlers extends AbstractProvider 'ec2linkfinder', 'eCairn-Grabber', 'ECCP', + 'echocrawl', + 'eContext\/', 'ElectricMonk', 'elefent', 'EMail Exractor', @@ -210,6 +216,7 @@ class Crawlers extends AbstractProvider 'facebookplatform', 'fairshare', 'Faraday v', + 'fasthttp', 'Faveeo', 'Favicon downloader', 'FavOrg', @@ -239,6 +246,7 @@ class Crawlers extends AbstractProvider 'free thumbnails', 'FreeWebMonitoring SiteChecker', 'Funnelback', + 'G-i-g-a-b-o-t', 'g00g1e\.net', 'GAChecker', 'ganarvisitas\/[0-9]', @@ -251,6 +259,7 @@ class Crawlers extends AbstractProvider 'GetURLInfo\/[0-9]', 'Ghost Inspector', 'GigablastOpenSource', + 'GIS-LABS', 'github\.com\/', 'Go [\d\.]* package http', 'Go-http-client', @@ -279,6 +288,7 @@ class Crawlers extends AbstractProvider 'GoogleDocs', 'GoogleHC\/', 'GoogleProducer', + 'Gookey', 'GoScraper', 'GoSpotCheck', 'GoSquared-Status-Checker', @@ -289,6 +299,7 @@ class Crawlers extends AbstractProvider 'grouphigh', 'grub-client', 'GTmetrix', + 'GuzzleHttp', 'gvfs\/', 'HAA(A)?RTLAND http client', 'Hatena', @@ -304,8 +315,10 @@ class Crawlers extends AbstractProvider 'ht:\/\/check', 'htdig', 'HTMLParser\/', + 'http-get', 'HTTP-Header-Abfrage', 'http-kit', + 'http-request\/', 'HTTP-Tiny', 'HTTP_Compression_Test', 'http_request2', @@ -348,6 +361,7 @@ class Crawlers extends AbstractProvider 'internet_archive', 'InternetSeer', 'internetVista monitor', + 'intraVnews', 'IODC', 'IOI', 'iplabel', @@ -360,6 +374,7 @@ class Crawlers extends AbstractProvider 'iskanie', 'iZSearch', 'janforman', + 'Jaunt\/', 'Jigsaw', 'Jobboerse', 'jobo', @@ -374,7 +389,6 @@ class Crawlers extends AbstractProvider 'Kml-Google', 'knows\.is', 'kouio', - 'KrOWLer', 'kulturarw3', 'KumKie', 'L\.webis', @@ -402,16 +416,18 @@ class Crawlers extends AbstractProvider 'LongURL API', 'looksystems\.net', 'ltx71', + 'lua-resty-http', 'lwp-trivial', 'lycos', 'LYT\.SR', 'mabontland', 'MagpieRSS', 'Mail.Ru', - 'MailChimp\.com', + 'MailChimp', 'Mandrill', 'MapperCmd', 'marketinggrader', + 'masscan\/[0-9]', 'Mediapartners-Google', 'MegaIndex\.ru', 'Melvil Rawi\/', @@ -437,6 +453,7 @@ class Crawlers extends AbstractProvider 'Moreover', 'Morning Paper', 'mowser', + 'MovableType', 'Mrcgiguy', 'mShots', 'MVAClient', @@ -483,6 +500,7 @@ class Crawlers extends AbstractProvider 'Optimizer', 'Orbiter', 'OrgProbe\/[0-9]', + 'Owler', 'ow\.ly', 'ownCloud News', 'OxfordCloudService\/[0-9]', @@ -501,6 +519,7 @@ class Crawlers extends AbstractProvider 'peerindex', 'Peew', 'PhantomJS\/', + 'PhantomJS Screenshoter', 'Photon\/', 'phpcrawl', 'phpservermon', @@ -566,6 +585,7 @@ class Crawlers extends AbstractProvider 'SalesIntelligent', 'SauceNAO', 'SBIder', + 'scalaj-http', 'Scoop', 'scooter', 'ScoutJet', @@ -610,6 +630,7 @@ class Crawlers extends AbstractProvider 'SiteTruth', 'sitexy\.com', 'SkypeUriPreview', + 'Slack\/', 'slider\.com', 'slurp', 'SMRF URL Expander', @@ -620,6 +641,7 @@ class Crawlers extends AbstractProvider 'Snoopy', 'sogou web', 'SortSite', + 'sovereign\.ai', 'spaziodati', 'Specificfeeds', 'speedy', @@ -627,6 +649,7 @@ class Crawlers extends AbstractProvider 'Spinn3r', 'spray-can', 'Sprinklr ', + 'sqlmap', 'spyonweb', 'Sqworm', 'SSL Labs', @@ -644,6 +667,7 @@ class Crawlers extends AbstractProvider 'Symfony2 BrowserKit', 'SynHttpClient-Built', 'Sysomos', + 'Symfony BrowserKit', 'T0PHackTeam', 'Tarantula\/', 'Taringa UGC', @@ -659,10 +683,13 @@ class Crawlers extends AbstractProvider 'ThumbSniper', 'TinEye', 'Tiny Tiny RSS', + 'TLSProbe\/', 'topster', 'touche.com', 'Traackr.com', + 'TrapitAgent', 'truwoGPS', + 'TulipChain', 'tweetedtimes\.com', 'Tweetminster', 'Tweezler\/', @@ -671,6 +698,7 @@ class Crawlers extends AbstractProvider 'ubermetrics-technologies', 'uclassify', 'UdmSearch', + 'UniversalFeedParser', 'Untiny', 'UnwindFetchor', 'updated', diff --git a/tests/crawlers.txt b/tests/crawlers.txt index 607ca3a..9feef01 100644 --- a/tests/crawlers.txt +++ b/tests/crawlers.txt @@ -3082,3 +3082,55 @@ Netpursual/1.0 Kaspersky Lab CFR link resolver cfradmins@kaspersky.com Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453 Safari/537.36 (compatible; Google-HotelAdsVerifier/2.0) b0t +"echocrawl 2.0" +masscan/1.0 +Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexAccessibilityBot/3.0; +http://yandex.com/bots) +Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexDirectDyn/1.0; +http://yandex.com/bots +Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexPagechecker/1.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexImageResizer/2.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YaDirectFetcher/1.0; Dyatel; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexCalendar/1.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexSitelinks; Dyatel; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexMetrika/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexAntivirus/2.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexVertis/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexBot/3.0; MirrorDetector; +http://yandex.com/bots) +jetmon/1.0 (Jetpack Site Uptime Monitor by WordPress.com) +Mozilla/5.0 (compatible; Abonti/0.8 - http://www.abonti.com) +CloudEndure Scanner (ops@cloudendure.com) +AFB/3.0 (+http://allloadin.com) +TLSProbe/1.0 (+https://scan.trustnet.venafi.com/) +Sens.ai http://sovereign.ai/contact +eContext/1.0 (eContext Classification Engine) +Owler +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Slack/1.2.6 Chrome/45.0.2454.85 AtomShell/0.34.3 Safari/537.36 Slack_SSB/1.2.6 +Mozilla/5.0 (compatible; ChangesMeter/1.9.1; http://intuiware.com/apps/changes-meter) +Mozilla/5.0 (compatible; Gookey.co/1.0; +http://gookey.co/) +TrapitAgent/0.1 (feed processor; +http://trapit.com/about) +sqlmap/1.0-dev-nongit-201612050a8c (http://sqlmap.org) +UniversalFeedParser/3.3 +http://feedparser.org/ +TulipChain/5.xx (http://ostermiller.org/tulipchain/) Java/1.x.1_0x (http://apple.com/) Mac_OS_X/10.2.8 +scalaj-http/1.0 +HggH PhantomJS Screenshoter +fasthttp +fasthttp, Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36 +lua-resty-http/0.10 (Lua) ngx_lua/10000 +GIS-LABS:CertMon +GuzzleHttp/6.2.1 PHP/7.0.15-0ubuntu0.16.04.4 +http-request/v0.7.0 (http://git.io/tl_S2w) node.js/v0.10.29 +intraVnews/1.x +MailChimp +G-i-g-a-b-o-t +Symfony BrowserKit +VCPP71 http-get 1.0a +Jaunt/1.2 +Mozilla/5.0 (compatible; Owler/0.4; +; ) +MovableType/x.x