From c145518076811a3fe4eea7de034eac084fd0a40d Mon Sep 17 00:00:00 2001 From: Mark Beech Date: Tue, 9 May 2017 22:08:30 +0100 Subject: [PATCH 1/7] Code cleanup --- src/CrawlerDetect.php | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/CrawlerDetect.php b/src/CrawlerDetect.php index c28aaf9..f03591e 100644 --- a/src/CrawlerDetect.php +++ b/src/CrawlerDetect.php @@ -86,7 +86,7 @@ public function __construct(array $headers = null, $userAgent = null) $this->compiledExclusions = $this->compileRegex($this->exclusions->getAll()); $this->setHttpHeaders($headers); - $this->setUserAgent($userAgent); + $this->userAgent = $this->setUserAgent($userAgent); } /** @@ -106,7 +106,7 @@ public function compileRegex($patterns) * * @param array|null $httpHeaders */ - public function setHttpHeaders($httpHeaders = null) + public function setHttpHeaders($httpHeaders) { // Use global _SERVER if $httpHeaders aren't defined. if (! is_array($httpHeaders) || ! count($httpHeaders)) { @@ -138,22 +138,19 @@ public function getUaHttpHeaders() /** * Set the user agent. * - * @param string|null $userAgent + * @param string $userAgent */ - public function setUserAgent($userAgent = null) + public function setUserAgent($userAgent) { - if (false === empty($userAgent)) { - $this->userAgent = $userAgent; - } else { - $this->userAgent = null; + if (is_null($userAgent)) { foreach ($this->getUaHttpHeaders() as $altHeader) { - if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow. - $this->userAgent .= $this->httpHeaders[$altHeader].' '; + if (isset($this->httpHeaders[$altHeader])) { + $userAgent .= $this->httpHeaders[$altHeader].' '; } } - - $this->userAgent = (! empty($this->userAgent) ? trim($this->userAgent) : null); } + + return $userAgent; } /** From d2b03b2dbd0a55ac4b6118cc45edc697225abd7d Mon Sep 17 00:00:00 2001 From: Wirtz Date: Thu, 18 May 2017 09:18:31 +0200 Subject: [PATCH 2/7] Added echocrawl bot --- src/Fixtures/Crawlers.php | 1 + tests/crawlers.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 88280fa..3d467e2 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -189,6 +189,7 @@ class Crawlers extends AbstractProvider 'ec2linkfinder', 'eCairn-Grabber', 'ECCP', + 'echocrawl', 'ElectricMonk', 'elefent', 'EMail Exractor', diff --git a/tests/crawlers.txt b/tests/crawlers.txt index 607ca3a..2629c32 100644 --- a/tests/crawlers.txt +++ b/tests/crawlers.txt @@ -3082,3 +3082,4 @@ Netpursual/1.0 Kaspersky Lab CFR link resolver cfradmins@kaspersky.com Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453 Safari/537.36 (compatible; Google-HotelAdsVerifier/2.0) b0t +"echocrawl 2.0" \ No newline at end of file From f6154cb856f20823b83e1df5cd9c99f3ed94fb49 Mon Sep 17 00:00:00 2001 From: Mark Beech Date: Thu, 18 May 2017 08:54:50 +0000 Subject: [PATCH 3/7] Apply fixes from StyleCI --- src/Fixtures/Crawlers.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 3d467e2..339f661 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -189,7 +189,7 @@ class Crawlers extends AbstractProvider 'ec2linkfinder', 'eCairn-Grabber', 'ECCP', - 'echocrawl', + 'echocrawl', 'ElectricMonk', 'elefent', 'EMail Exractor', From dd985b2169905a970f4718734ff74c2142747717 Mon Sep 17 00:00:00 2001 From: Mark Beech Date: Mon, 29 May 2017 21:17:58 +0100 Subject: [PATCH 4/7] Add bot - closes #198 --- src/Fixtures/Crawlers.php | 1 + tests/crawlers.txt | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 339f661..1c506f3 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -413,6 +413,7 @@ class Crawlers extends AbstractProvider 'Mandrill', 'MapperCmd', 'marketinggrader', + 'masscan\/[0-9]', 'Mediapartners-Google', 'MegaIndex\.ru', 'Melvil Rawi\/', diff --git a/tests/crawlers.txt b/tests/crawlers.txt index 2629c32..e89d6ef 100644 --- a/tests/crawlers.txt +++ b/tests/crawlers.txt @@ -3082,4 +3082,5 @@ Netpursual/1.0 Kaspersky Lab CFR link resolver cfradmins@kaspersky.com Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453 Safari/537.36 (compatible; Google-HotelAdsVerifier/2.0) b0t -"echocrawl 2.0" \ No newline at end of file +"echocrawl 2.0" +masscan/1.0 \ No newline at end of file From dd14c05b3aed6ca75acfc77fbd864e37d83777bc Mon Sep 17 00:00:00 2001 From: MaxGiting Date: Sat, 3 Jun 2017 11:19:31 +0100 Subject: [PATCH 5/7] Fix HHVM tests and add more user agents and tests (#201) * Add Abounti user agent * Add CloudEndure #200 * Add 5 more user agents * Try fixing HHVM failure https://github.com/travis-ci/travis-ci/issues/7712 --- .travis.yml | 1 + src/Fixtures/Crawlers.php | 8 +++++++- tests/crawlers.txt | 29 ++++++++++++++++++++++++++++- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index a740bac..7ce2ce5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ matrix: - php: 7.0 - php: 7.1 - php: hhvm + dist: trusty - php: nightly allow_failures: - php: nightly diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 1c506f3..996e824 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -51,6 +51,7 @@ class Crawlers extends AbstractProvider 'a3logics\.in', 'A6-Indexer', 'a\.pr-cy\.ru', + 'Abonti\/', 'Aboundex', 'aboutthedomain', 'Accoona-AI-Agent', @@ -70,6 +71,7 @@ class Crawlers extends AbstractProvider 'alertra', 'alexa site audit', 'Alibaba\.Security\.Heimdall', + 'allloadin\.com', 'alyze\.info', 'amagit', 'AndroidDownloadManager', @@ -142,6 +144,7 @@ class Crawlers extends AbstractProvider 'CJNetworkQuality', 'clips\.ua\.ac\.be', 'Cloud mapping experiment', + 'CloudEndure', 'CloudFlare-AlwaysOnline', 'Cloudinary\/[0-9]', 'cmcm\.com', @@ -190,6 +193,7 @@ class Crawlers extends AbstractProvider 'eCairn-Grabber', 'ECCP', 'echocrawl', + 'eContext\/', 'ElectricMonk', 'elefent', 'EMail Exractor', @@ -375,7 +379,6 @@ class Crawlers extends AbstractProvider 'Kml-Google', 'knows\.is', 'kouio', - 'KrOWLer', 'kulturarw3', 'KumKie', 'L\.webis', @@ -485,6 +488,7 @@ class Crawlers extends AbstractProvider 'Optimizer', 'Orbiter', 'OrgProbe\/[0-9]', + 'Owler', 'ow\.ly', 'ownCloud News', 'OxfordCloudService\/[0-9]', @@ -622,6 +626,7 @@ class Crawlers extends AbstractProvider 'Snoopy', 'sogou web', 'SortSite', + 'sovereign\.ai', 'spaziodati', 'Specificfeeds', 'speedy', @@ -661,6 +666,7 @@ class Crawlers extends AbstractProvider 'ThumbSniper', 'TinEye', 'Tiny Tiny RSS', + 'TLSProbe\/', 'topster', 'touche.com', 'Traackr.com', diff --git a/tests/crawlers.txt b/tests/crawlers.txt index e89d6ef..1e22add 100644 --- a/tests/crawlers.txt +++ b/tests/crawlers.txt @@ -3083,4 +3083,31 @@ Kaspersky Lab CFR link resolver cfradmins@kaspersky.com Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453 Safari/537.36 (compatible; Google-HotelAdsVerifier/2.0) b0t "echocrawl 2.0" -masscan/1.0 \ No newline at end of file +masscan/1.0 +Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexAccessibilityBot/3.0; +http://yandex.com/bots) +Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexDirectDyn/1.0; +http://yandex.com/bots +Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexPagechecker/1.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexImageResizer/2.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YaDirectFetcher/1.0; Dyatel; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexCalendar/1.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexSitelinks; Dyatel; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexMetrika/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexAntivirus/2.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexVertis/3.0; +http://yandex.com/bots) +Mozilla/5.0 (compatible; YandexBot/3.0; MirrorDetector; +http://yandex.com/bots) +jetmon/1.0 (Jetpack Site Uptime Monitor by WordPress.com) +Mozilla/5.0 (compatible; Abonti/0.8 - http://www.abonti.com) +CloudEndure Scanner (ops@cloudendure.com) +AFB/3.0 (+http://allloadin.com) +TLSProbe/1.0 (+https://scan.trustnet.venafi.com/) +Sens.ai http://sovereign.ai/contact +eContext/1.0 (eContext Classification Engine) +Owler From 187bda4027d8b0fc2ead560f102a49a8e876017f Mon Sep 17 00:00:00 2001 From: Mark Beech Date: Wed, 7 Jun 2017 20:59:42 +0100 Subject: [PATCH 6/7] Add bot - closes #202 --- src/Fixtures/Crawlers.php | 1 + tests/crawlers.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 996e824..6adf55b 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -616,6 +616,7 @@ class Crawlers extends AbstractProvider 'SiteTruth', 'sitexy\.com', 'SkypeUriPreview', + 'Slack\/', 'slider\.com', 'slurp', 'SMRF URL Expander', diff --git a/tests/crawlers.txt b/tests/crawlers.txt index 1e22add..68d94a3 100644 --- a/tests/crawlers.txt +++ b/tests/crawlers.txt @@ -3111,3 +3111,4 @@ TLSProbe/1.0 (+https://scan.trustnet.venafi.com/) Sens.ai http://sovereign.ai/contact eContext/1.0 (eContext Classification Engine) Owler +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Slack/1.2.6 Chrome/45.0.2454.85 AtomShell/0.34.3 Safari/537.36 Slack_SSB/1.2.6 From 55e98a6c733a7f99912f6239c8d7ee25ed4f9eb5 Mon Sep 17 00:00:00 2001 From: MaxGiting Date: Mon, 19 Jun 2017 22:29:50 +0100 Subject: [PATCH 7/7] Add lots of bot user agents (#205) --- src/Fixtures/Crawlers.php | 21 ++++++++++++++++++++- tests/crawlers.txt | 22 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 6adf55b..231fa73 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -135,6 +135,7 @@ class Crawlers extends AbstractProvider 'CERT\.at-Statistics-Survey', 'cg-eye', 'changedetection', + 'ChangesMeter\/', 'Charlotte', 'CheckHost', 'checkprivacy', @@ -215,6 +216,7 @@ class Crawlers extends AbstractProvider 'facebookplatform', 'fairshare', 'Faraday v', + 'fasthttp', 'Faveeo', 'Favicon downloader', 'FavOrg', @@ -244,6 +246,7 @@ class Crawlers extends AbstractProvider 'free thumbnails', 'FreeWebMonitoring SiteChecker', 'Funnelback', + 'G-i-g-a-b-o-t', 'g00g1e\.net', 'GAChecker', 'ganarvisitas\/[0-9]', @@ -256,6 +259,7 @@ class Crawlers extends AbstractProvider 'GetURLInfo\/[0-9]', 'Ghost Inspector', 'GigablastOpenSource', + 'GIS-LABS', 'github\.com\/', 'Go [\d\.]* package http', 'Go-http-client', @@ -284,6 +288,7 @@ class Crawlers extends AbstractProvider 'GoogleDocs', 'GoogleHC\/', 'GoogleProducer', + 'Gookey', 'GoScraper', 'GoSpotCheck', 'GoSquared-Status-Checker', @@ -294,6 +299,7 @@ class Crawlers extends AbstractProvider 'grouphigh', 'grub-client', 'GTmetrix', + 'GuzzleHttp', 'gvfs\/', 'HAA(A)?RTLAND http client', 'Hatena', @@ -309,8 +315,10 @@ class Crawlers extends AbstractProvider 'ht:\/\/check', 'htdig', 'HTMLParser\/', + 'http-get', 'HTTP-Header-Abfrage', 'http-kit', + 'http-request\/', 'HTTP-Tiny', 'HTTP_Compression_Test', 'http_request2', @@ -353,6 +361,7 @@ class Crawlers extends AbstractProvider 'internet_archive', 'InternetSeer', 'internetVista monitor', + 'intraVnews', 'IODC', 'IOI', 'iplabel', @@ -365,6 +374,7 @@ class Crawlers extends AbstractProvider 'iskanie', 'iZSearch', 'janforman', + 'Jaunt\/', 'Jigsaw', 'Jobboerse', 'jobo', @@ -406,13 +416,14 @@ class Crawlers extends AbstractProvider 'LongURL API', 'looksystems\.net', 'ltx71', + 'lua-resty-http', 'lwp-trivial', 'lycos', 'LYT\.SR', 'mabontland', 'MagpieRSS', 'Mail.Ru', - 'MailChimp\.com', + 'MailChimp', 'Mandrill', 'MapperCmd', 'marketinggrader', @@ -442,6 +453,7 @@ class Crawlers extends AbstractProvider 'Moreover', 'Morning Paper', 'mowser', + 'MovableType', 'Mrcgiguy', 'mShots', 'MVAClient', @@ -507,6 +519,7 @@ class Crawlers extends AbstractProvider 'peerindex', 'Peew', 'PhantomJS\/', + 'PhantomJS Screenshoter', 'Photon\/', 'phpcrawl', 'phpservermon', @@ -572,6 +585,7 @@ class Crawlers extends AbstractProvider 'SalesIntelligent', 'SauceNAO', 'SBIder', + 'scalaj-http', 'Scoop', 'scooter', 'ScoutJet', @@ -635,6 +649,7 @@ class Crawlers extends AbstractProvider 'Spinn3r', 'spray-can', 'Sprinklr ', + 'sqlmap', 'spyonweb', 'Sqworm', 'SSL Labs', @@ -652,6 +667,7 @@ class Crawlers extends AbstractProvider 'Symfony2 BrowserKit', 'SynHttpClient-Built', 'Sysomos', + 'Symfony BrowserKit', 'T0PHackTeam', 'Tarantula\/', 'Taringa UGC', @@ -671,7 +687,9 @@ class Crawlers extends AbstractProvider 'topster', 'touche.com', 'Traackr.com', + 'TrapitAgent', 'truwoGPS', + 'TulipChain', 'tweetedtimes\.com', 'Tweetminster', 'Tweezler\/', @@ -680,6 +698,7 @@ class Crawlers extends AbstractProvider 'ubermetrics-technologies', 'uclassify', 'UdmSearch', + 'UniversalFeedParser', 'Untiny', 'UnwindFetchor', 'updated', diff --git a/tests/crawlers.txt b/tests/crawlers.txt index 68d94a3..9feef01 100644 --- a/tests/crawlers.txt +++ b/tests/crawlers.txt @@ -3112,3 +3112,25 @@ Sens.ai http://sovereign.ai/contact eContext/1.0 (eContext Classification Engine) Owler Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Slack/1.2.6 Chrome/45.0.2454.85 AtomShell/0.34.3 Safari/537.36 Slack_SSB/1.2.6 +Mozilla/5.0 (compatible; ChangesMeter/1.9.1; http://intuiware.com/apps/changes-meter) +Mozilla/5.0 (compatible; Gookey.co/1.0; +http://gookey.co/) +TrapitAgent/0.1 (feed processor; +http://trapit.com/about) +sqlmap/1.0-dev-nongit-201612050a8c (http://sqlmap.org) +UniversalFeedParser/3.3 +http://feedparser.org/ +TulipChain/5.xx (http://ostermiller.org/tulipchain/) Java/1.x.1_0x (http://apple.com/) Mac_OS_X/10.2.8 +scalaj-http/1.0 +HggH PhantomJS Screenshoter +fasthttp +fasthttp, Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36 +lua-resty-http/0.10 (Lua) ngx_lua/10000 +GIS-LABS:CertMon +GuzzleHttp/6.2.1 PHP/7.0.15-0ubuntu0.16.04.4 +http-request/v0.7.0 (http://git.io/tl_S2w) node.js/v0.10.29 +intraVnews/1.x +MailChimp +G-i-g-a-b-o-t +Symfony BrowserKit +VCPP71 http-get 1.0a +Jaunt/1.2 +Mozilla/5.0 (compatible; Owler/0.4; +; ) +MovableType/x.x