Skip to content

Commit

Permalink
Merge branch 'master' into 2.0
Browse files Browse the repository at this point in the history
# Conflicts:
#	.travis.yml
#	src/CrawlerDetect.php
  • Loading branch information
JayBizzle committed Jun 20, 2017
2 parents 63d8d7f + 55e98a6 commit 8668c74
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 14 deletions.
21 changes: 9 additions & 12 deletions src/CrawlerDetect.php
Expand Up @@ -89,7 +89,7 @@ public function __construct(array $headers = null, string $userAgent = null)
$this->compiledExclusions = $this->compileRegex($this->exclusions->getAll());

$this->setHttpHeaders($headers);
$this->setUserAgent($userAgent);
$this->userAgent = $this->setUserAgent($userAgent);
}

/**
Expand All @@ -107,7 +107,7 @@ public function compileRegex($patterns): string
/**
* Set HTTP headers.
*
* @param array|null $httpHeaders
* @param void
*/
public function setHttpHeaders(array $httpHeaders = null): void
{
Expand Down Expand Up @@ -141,22 +141,19 @@ public function getUaHttpHeaders(): array
/**
* Set the user agent.
*
* @param string|null $userAgent
* @param string $userAgent
*/
public function setUserAgent(string $userAgent = null): void
public function setUserAgent(string $userAgent = null): ?string
{
if (false === empty($userAgent)) {
$this->userAgent = $userAgent;
} else {
$this->userAgent = null;
if (is_null($userAgent)) {
foreach ($this->getUaHttpHeaders() as $altHeader) {
if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow.
$this->userAgent .= $this->httpHeaders[$altHeader].' ';
if (isset($this->httpHeaders[$altHeader])) {
$userAgent .= $this->httpHeaders[$altHeader].' ';
}
}

$this->userAgent = ! empty($this->userAgent) ? trim($this->userAgent) : null;
}

return $userAgent;
}

/**
Expand Down
32 changes: 30 additions & 2 deletions src/Fixtures/Crawlers.php
Expand Up @@ -51,6 +51,7 @@ class Crawlers extends AbstractProvider
'a3logics\.in',
'A6-Indexer',
'a\.pr-cy\.ru',
'Abonti\/',
'Aboundex',
'aboutthedomain',
'Accoona-AI-Agent',
Expand All @@ -70,6 +71,7 @@ class Crawlers extends AbstractProvider
'alertra',
'alexa site audit',
'Alibaba\.Security\.Heimdall',
'allloadin\.com',
'alyze\.info',
'amagit',
'AndroidDownloadManager',
Expand Down Expand Up @@ -133,6 +135,7 @@ class Crawlers extends AbstractProvider
'CERT\.at-Statistics-Survey',
'cg-eye',
'changedetection',
'ChangesMeter\/',
'Charlotte',
'CheckHost',
'checkprivacy',
Expand All @@ -142,6 +145,7 @@ class Crawlers extends AbstractProvider
'CJNetworkQuality',
'clips\.ua\.ac\.be',
'Cloud mapping experiment',
'CloudEndure',
'CloudFlare-AlwaysOnline',
'Cloudinary\/[0-9]',
'cmcm\.com',
Expand Down Expand Up @@ -189,6 +193,8 @@ class Crawlers extends AbstractProvider
'ec2linkfinder',
'eCairn-Grabber',
'ECCP',
'echocrawl',
'eContext\/',
'ElectricMonk',
'elefent',
'EMail Exractor',
Expand All @@ -210,6 +216,7 @@ class Crawlers extends AbstractProvider
'facebookplatform',
'fairshare',
'Faraday v',
'fasthttp',
'Faveeo',
'Favicon downloader',
'FavOrg',
Expand Down Expand Up @@ -239,6 +246,7 @@ class Crawlers extends AbstractProvider
'free thumbnails',
'FreeWebMonitoring SiteChecker',
'Funnelback',
'G-i-g-a-b-o-t',
'g00g1e\.net',
'GAChecker',
'ganarvisitas\/[0-9]',
Expand All @@ -251,6 +259,7 @@ class Crawlers extends AbstractProvider
'GetURLInfo\/[0-9]',
'Ghost Inspector',
'GigablastOpenSource',
'GIS-LABS',
'github\.com\/',
'Go [\d\.]* package http',
'Go-http-client',
Expand Down Expand Up @@ -279,6 +288,7 @@ class Crawlers extends AbstractProvider
'GoogleDocs',
'GoogleHC\/',
'GoogleProducer',
'Gookey',
'GoScraper',
'GoSpotCheck',
'GoSquared-Status-Checker',
Expand All @@ -289,6 +299,7 @@ class Crawlers extends AbstractProvider
'grouphigh',
'grub-client',
'GTmetrix',
'GuzzleHttp',
'gvfs\/',
'HAA(A)?RTLAND http client',
'Hatena',
Expand All @@ -304,8 +315,10 @@ class Crawlers extends AbstractProvider
'ht:\/\/check',
'htdig',
'HTMLParser\/',
'http-get',
'HTTP-Header-Abfrage',
'http-kit',
'http-request\/',
'HTTP-Tiny',
'HTTP_Compression_Test',
'http_request2',
Expand Down Expand Up @@ -348,6 +361,7 @@ class Crawlers extends AbstractProvider
'internet_archive',
'InternetSeer',
'internetVista monitor',
'intraVnews',
'IODC',
'IOI',
'iplabel',
Expand All @@ -360,6 +374,7 @@ class Crawlers extends AbstractProvider
'iskanie',
'iZSearch',
'janforman',
'Jaunt\/',
'Jigsaw',
'Jobboerse',
'jobo',
Expand All @@ -374,7 +389,6 @@ class Crawlers extends AbstractProvider
'Kml-Google',
'knows\.is',
'kouio',
'KrOWLer',
'kulturarw3',
'KumKie',
'L\.webis',
Expand Down Expand Up @@ -402,16 +416,18 @@ class Crawlers extends AbstractProvider
'LongURL API',
'looksystems\.net',
'ltx71',
'lua-resty-http',
'lwp-trivial',
'lycos',
'LYT\.SR',
'mabontland',
'MagpieRSS',
'Mail.Ru',
'MailChimp\.com',
'MailChimp',
'Mandrill',
'MapperCmd',
'marketinggrader',
'masscan\/[0-9]',
'Mediapartners-Google',
'MegaIndex\.ru',
'Melvil Rawi\/',
Expand All @@ -437,6 +453,7 @@ class Crawlers extends AbstractProvider
'Moreover',
'Morning Paper',
'mowser',
'MovableType',
'Mrcgiguy',
'mShots',
'MVAClient',
Expand Down Expand Up @@ -483,6 +500,7 @@ class Crawlers extends AbstractProvider
'Optimizer',
'Orbiter',
'OrgProbe\/[0-9]',
'Owler',
'ow\.ly',
'ownCloud News',
'OxfordCloudService\/[0-9]',
Expand All @@ -501,6 +519,7 @@ class Crawlers extends AbstractProvider
'peerindex',
'Peew',
'PhantomJS\/',
'PhantomJS Screenshoter',
'Photon\/',
'phpcrawl',
'phpservermon',
Expand Down Expand Up @@ -566,6 +585,7 @@ class Crawlers extends AbstractProvider
'SalesIntelligent',
'SauceNAO',
'SBIder',
'scalaj-http',
'Scoop',
'scooter',
'ScoutJet',
Expand Down Expand Up @@ -610,6 +630,7 @@ class Crawlers extends AbstractProvider
'SiteTruth',
'sitexy\.com',
'SkypeUriPreview',
'Slack\/',
'slider\.com',
'slurp',
'SMRF URL Expander',
Expand All @@ -620,13 +641,15 @@ class Crawlers extends AbstractProvider
'Snoopy',
'sogou web',
'SortSite',
'sovereign\.ai',
'spaziodati',
'Specificfeeds',
'speedy',
'SPEng',
'Spinn3r',
'spray-can',
'Sprinklr ',
'sqlmap',
'spyonweb',
'Sqworm',
'SSL Labs',
Expand All @@ -644,6 +667,7 @@ class Crawlers extends AbstractProvider
'Symfony2 BrowserKit',
'SynHttpClient-Built',
'Sysomos',
'Symfony BrowserKit',
'T0PHackTeam',
'Tarantula\/',
'Taringa UGC',
Expand All @@ -659,10 +683,13 @@ class Crawlers extends AbstractProvider
'ThumbSniper',
'TinEye',
'Tiny Tiny RSS',
'TLSProbe\/',
'topster',
'touche.com',
'Traackr.com',
'TrapitAgent',
'truwoGPS',
'TulipChain',
'tweetedtimes\.com',
'Tweetminster',
'Tweezler\/',
Expand All @@ -671,6 +698,7 @@ class Crawlers extends AbstractProvider
'ubermetrics-technologies',
'uclassify',
'UdmSearch',
'UniversalFeedParser',
'Untiny',
'UnwindFetchor',
'updated',
Expand Down
52 changes: 52 additions & 0 deletions tests/crawlers.txt
Expand Up @@ -3082,3 +3082,55 @@ Netpursual/1.0
Kaspersky Lab CFR link resolver cfradmins@kaspersky.com
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453 Safari/537.36 (compatible; Google-HotelAdsVerifier/2.0)
b0t
"echocrawl 2.0"
masscan/1.0
Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexAccessibilityBot/3.0; +http://yandex.com/bots)
Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexDirectDyn/1.0; +http://yandex.com/bots
Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexPagechecker/1.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexImageResizer/2.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YaDirectFetcher/1.0; Dyatel; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexCalendar/1.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexSitelinks; Dyatel; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexMetrika/3.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexAntivirus/2.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexVertis/3.0; +http://yandex.com/bots)
Mozilla/5.0 (compatible; YandexBot/3.0; MirrorDetector; +http://yandex.com/bots)
jetmon/1.0 (Jetpack Site Uptime Monitor by WordPress.com)
Mozilla/5.0 (compatible; Abonti/0.8 - http://www.abonti.com)
CloudEndure Scanner (ops@cloudendure.com)
AFB/3.0 (+http://allloadin.com)
TLSProbe/1.0 (+https://scan.trustnet.venafi.com/)
Sens.ai http://sovereign.ai/contact
eContext/1.0 (eContext Classification Engine)
Owler
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Slack/1.2.6 Chrome/45.0.2454.85 AtomShell/0.34.3 Safari/537.36 Slack_SSB/1.2.6
Mozilla/5.0 (compatible; ChangesMeter/1.9.1; http://intuiware.com/apps/changes-meter)
Mozilla/5.0 (compatible; Gookey.co/1.0; +http://gookey.co/)
TrapitAgent/0.1 (feed processor; +http://trapit.com/about)
sqlmap/1.0-dev-nongit-201612050a8c (http://sqlmap.org)
UniversalFeedParser/3.3 +http://feedparser.org/
TulipChain/5.xx (http://ostermiller.org/tulipchain/) Java/1.x.1_0x (http://apple.com/) Mac_OS_X/10.2.8
scalaj-http/1.0
HggH PhantomJS Screenshoter
fasthttp
fasthttp, Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36
lua-resty-http/0.10 (Lua) ngx_lua/10000
GIS-LABS:CertMon
GuzzleHttp/6.2.1 PHP/7.0.15-0ubuntu0.16.04.4
http-request/v0.7.0 (http://git.io/tl_S2w) node.js/v0.10.29
intraVnews/1.x
MailChimp
G-i-g-a-b-o-t
Symfony BrowserKit
VCPP71 http-get 1.0a
Jaunt/1.2
Mozilla/5.0 (compatible; Owler/0.4; +; )
MovableType/x.x

0 comments on commit 8668c74

Please sign in to comment.