From f9c591e0f0ef0773870329a3ecf124c543450344 Mon Sep 17 00:00:00 2001
From: Ilian Iliev
Date: Tue, 31 Jan 2012 17:02:14 +0200
Subject: [PATCH 1/2] Sitemapindex support added

XMLSitemapParser class created and sitemap processing split into several methods
Dependencies list added to README
---
 README.rst             |  16 +++-
 simple_site_checker.py | 152 ++++++++++++++++++++++++++---------------
 sitemap.xml            |  28 ++++++++
 3 files changed, 136 insertions(+), 60 deletions(-)
 create mode 100644 sitemap.xml

diff --git a/README.rst b/README.rst
index 1a493ae..6a06594 100644
--- a/README.rst
+++ b/README.rst
@@ -17,7 +17,15 @@ Usage:
 
 python simple_site_checker.py http://example.com/sitemap.xml
 
-Todo:
-=====
-
-* Enable XML sitemapindex processing
+Dependencies:
+=============
+
+Simple Site Checker depends on the following modules:
+* argparse
+* lxml
+
+
+Feel free to add any issues, bug reports, comments or advice to
+https://github.com/IlianIliev/Simple-Site-Checker
+or
+http://ilian.i-n-i.org/simple-site-checker/
\ No newline at end of file
diff --git a/simple_site_checker.py b/simple_site_checker.py
index aa8c2ca..9accd9e 100644
--- a/simple_site_checker.py
+++ b/simple_site_checker.py
@@ -8,7 +8,8 @@
 
 from lxml import etree
 
-XMLNS = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
+SITEMAP_NAMESPACE = 'http://www.sitemaps.org/schemas/sitemap/0.9'
+XMLNS = {'sitemap': SITEMAP_NAMESPACE}
 
 DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
 
@@ -27,80 +28,119 @@
 }
 
 
+logger = logging.getLogger(__name__)
+
+
 class HeadRequest(urllib2.Request):
     def get_method(self):
         return "HEAD"
 
 
+class XMLSitemapParser(object):
+    total = 0
+    succeeded = 0
+    failed = []
+    sitemaps = {}
+
+    def load_sitemap(self, url):
+        logger.debug('Loading sitemap %s' % url)
+        if '://' in url:
+            try:
+                sitemap = urllib2.urlopen(url)
+            except urllib2.HTTPError, e:
+                if e.code == 404:
+                    logger.error('Sitemap not found as %s' % url)
+                elif e.code == 500:
+                    logger.error('Server error when accessing sitemap as %s' % url)
+                sys.exit(1)
+            except Exception, e:
+                logger.debug('Unexpected error', e)
+                logger.error('Unexpected error while loading sitemap.')
+                sys.exit(1)
+        else:
+            try:
+                path = os.path.abspath(url)
+                sitemap = open(url)
+            except Exception, e:
+                logger.error('Unable to load sitemap file from %s' % path)
+                logger.debug(e)
+                sys.exit(1)
+        try:
+            tree = etree.parse(sitemap)
+        except Exception, e:
+            logger.debug('Unexpected error', e)
+            logger.error('Unexpected error while parsing sitemap XML from %s' % url)
+        else:
+            root = tree.getroot()
+            if root.tag == '{%s}sitemapindex' % SITEMAP_NAMESPACE:
+                self.process_sitemapindex(tree)
+            else:
+                self.sitemaps[url] = tree
+
+    def process_sitemapindex(self, tree):
+        logger.debug('Processing sitemapindex')
+        for tag in tree.xpath('//sitemap:sitemap/sitemap:loc', namespaces=XMLNS):
+            sitemap_loc = tag.text
+            self.load_sitemap(sitemap_loc)
+
+    def process_sitemap(self, sitemap):
+        tree = self.sitemaps[sitemap]
+        logger.debug('Processing sitemap %s' % sitemap)
+        loc_tags = tree.xpath('//sitemap:loc', namespaces=XMLNS)
+        urls_found = len(loc_tags)
+        self.total += urls_found
+        logger.info('%i URLs found' % urls_found)
+
+        for tag in loc_tags:
+            loc_url = tag.text
+            logger.debug('Checking %s' % loc_url)
+            try:
+                response = urllib2.urlopen(HeadRequest(loc_url))
+                self.succeeded += 1
+            except Exception, e:
+                self.failed.append((loc_url, e))
+                logger.debug(loc_url, e)
+                logger.error('%s -> %s' % (loc_url, e))
+
+    def process_sitemaps(self):
+        for sitemap in self.sitemaps:
+            self.process_sitemap(sitemap)
+
+
+def time_info(start, end):
+    hours, remainder = divmod((end-start).seconds, 3600)
+    minutes, seconds = divmod(remainder, 60)
+    logger.info('Start - %s' % start.strftime(DATETIME_FORMAT))
+    logger.info('End - %s' % end.strftime(DATETIME_FORMAT))
+    logger.info('Time elapsed %s:%s:%s' % (hours, minutes, seconds))
+
+
 def main():
-    parser = argparse.ArgumentParser(description='Simple Site Checker',
+    arg_parser = argparse.ArgumentParser(description='Simple Site Checker',
                         formatter_class=argparse.RawTextHelpFormatter)
-
-    parser.add_argument('sitemap', metavar='s', type=str,
+    arg_parser.add_argument('sitemap', metavar='s', type=str,
                         help='XML sitemap URL/path')
-
-    parser.add_argument('-v', '--verbose', type=int, required=False,
+    arg_parser.add_argument('-v', '--verbose', type=int, required=False,
                         help=VERBOSE_HELP, default = 0, choices=LOGGING_LEVELS)
-    args = parser.parse_args()
+    args = arg_parser.parse_args()
 
-    logger = logging.getLogger(__name__)
     logging.basicConfig(format='%(levelname)s: %(message)s',
                         level = LOGGING_LEVELS[args.verbose])
-
-    url = args.sitemap
-    if '://' in url:
-        try:
-            sitemap = urllib2.urlopen(url)
-        except urllib2.HTTPError, e:
-            if e.code == 404:
-                logger.error('Sitemap not found as %s' % url)
-            elif e.code == 500:
-                logger.error('Server error when accessing sitemap as %s' % url)
-            sys.exit(1)
-        except Exception, e:
-            logger.debug('Unexpected error', e)
-            logger.error('Unexpected error while loading sitemap.')
-            sys.exit(1)
-    else:
-        try:
-            path = os.path.abspath(url)
-            sitemap = open(url)
-        except Exception, e:
-            logger.error('Unable to load sitemap file from %s' % path)
-            logger.debug(e)
-            sys.exit(1)
 
     start = datetime.now()
-    tree = etree.parse(sitemap)
-    loc_tags = tree.xpath('//sitemap:loc', namespaces=XMLNS)
-    total = len(loc_tags)
-    logger.info('%i URLs found' % total)
-    succeeded = 0
-    failed = []
-    for tag in loc_tags:
-        loc_url = tag.text
-        logger.debug('Checking %s' % loc_url)
-        try:
-            response = urllib2.urlopen(HeadRequest(loc_url))
-            succeeded += 1
-        except Exception, e:
-            failed.append((loc_url, e))
-            logger.debug(loc_url, e)
-            logger.error('%s -> %s' % (loc_url, e))
+    url = args.sitemap
+    parser = XMLSitemapParser()
+    parser.load_sitemap(url)
+    parser.process_sitemaps()
     end = datetime.now()
-
-    hours, remainder = divmod((end-start).seconds, 3600)
-    minutes, seconds = divmod(remainder, 60)
-    failed_number = len(failed)
+
+    failed_number = len(parser.failed)
     logger.info('Result - Checked %i, succeeded %i, failed %i' %
-                (total, succeeded, failed_number))
-    logger.info('Start - %s' % start.strftime(DATETIME_FORMAT))
-    logger.info('End - %s' % end.strftime(DATETIME_FORMAT))
-    logger.info('Time elapsed %s:%s:%s' % (hours, minutes, seconds))
-
+                (parser.total, parser.succeeded, failed_number))
+    time_info(start, end)
 
 if __name__ == '__main__':
     main()
diff --git a/sitemap.xml b/sitemap.xml
new file mode 100644
index 0000000..6a6fcb1
--- /dev/null
+++ b/sitemap.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset
+    xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url>
+    <loc>http://ilian.i-n-i.org/</loc>
+    <lastmod>2012-01-30T08:00:18+00:00</lastmod>
+    <changefreq>daily</changefreq>
+    <priority>1.0</priority>
+  </url>
+  <url>
+    <loc>http://ilian.i-n-i.org/faking-attributes-in-python-classes/</loc>
+    <lastmod>2012-01-27T12:48:03+00:00</lastmod>
+    <changefreq>daily</changefreq>
+    <priority>0.2</priority>
+  </url>
+  <url>
+    <loc>http://ilian.i-n-i.org/connecting-django-models-with-outer-applications/</loc>
+    <lastmod>2012-01-24T09:53:05+00:00</lastmod>
+    <changefreq>daily</changefreq>
+    <priority>0.2</priority>
+  </url>
+  <url>
+    <loc>http://ilian.i-n-i.org/software-for-business/</loc>
+    <lastmod>2012-01-24T02:56:08+00:00</lastmod>
+    <changefreq>daily</changefreq>
+    <priority>0.2</priority>
+  </url>
+</urlset>
\ No newline at end of file

From 69da22d384ac0ba723430d459b807dc6078641b4 Mon Sep 17 00:00:00 2001
From: Ilian Iliev
Date: Tue, 31 Jan 2012 18:41:03 +0200
Subject: [PATCH 2/2] Merge branch 'master' of
 https://github.com/IlianIliev/Simple-Site-Checker
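
For reference, a minimal sketch of how the XMLSitemapParser class introduced in PATCH 1/2 might be driven from another Python script, assuming simple_site_checker.py is importable from the working directory. The class, method, and attribute names are taken from the diff above; the driver itself is illustrative and not part of the patches:

    import logging

    # Illustrative driver; XMLSitemapParser is defined in simple_site_checker.py above.
    from simple_site_checker import XMLSitemapParser

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

    checker = XMLSitemapParser()
    # load_sitemap() accepts a URL (anything containing '://') or a local file path.
    # If the document root is a sitemapindex, every referenced sub-sitemap is loaded
    # as well; a plain urlset sitemap is stored for checking directly.
    checker.load_sitemap('http://example.com/sitemap.xml')
    # Issue a HEAD request for every <loc> URL collected from the loaded sitemaps.
    checker.process_sitemaps()

    print('Checked %i, succeeded %i, failed %i' %
          (checker.total, checker.succeeded, len(checker.failed)))

Note that load_sitemap() calls sys.exit(1) when a sitemap cannot be fetched or opened, mirroring the behaviour of the command-line entry point.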