import re, urlparse
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from datetime import date

# Configuration for the English-language site.
language = 'en'
site_url = 'http://www.infoq.com/'
title_prefix = 'InfoQ'
# One pattern per month, in calendar order: parse_date() below derives the
# month number from a pattern's position in this list.
date_regexes = [
    r'Jan\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Feb\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Mar\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Apr\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'May\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Jun\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Jul\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Aug\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Sep\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Oct\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Nov\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Dec\s+(?P<day>\d{2}),\s+(?P<year>\d{4})'
]

# Alternative configuration for the Chinese-language site ("InfoQ中国站"
# means "InfoQ China site"); uncomment this block to use it instead of
# the English configuration above.
'''
language = 'zh'
site_url = 'http://www.infoq.com/cn/'
title_prefix = 'InfoQ中国站'
date_regexes = [
    r'一月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'二月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'三月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'四月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'五月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'六月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'七月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'八月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'九月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'十月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'十一月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'十二月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})'
]
'''

# The sections to download.
sections = ['news', 'articles', 'interviews']
# The date range (both endpoints inclusive) to download.
date_range = (date(2013, 6, 20), date(2013, 6, 22))
# Optional per-section date ranges; an entry here overrides date_range
# for that section.
section_date_ranges = {
    # 'news': (date(2013, 6, 21), date(2013, 6, 22)),
    # 'articles': (date(2013, 6, 5), date(2013, 6, 10)),
    # 'interviews': (date(2013, 1, 1), date(2013, 3, 1))
}
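
# A typical way to build an ebook from this recipe is calibre's
# command-line converter (the file name infoq.recipe below is
# illustrative, not prescribed by the recipe):
#   ebook-convert infoq.recipe InfoQ.epub --test
# where --test fetches only a couple of items per section, which is
# handy for checking the date ranges above before a full run.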

# Do NOT touch the code below unless you know what you are doing.

def range2str(rng, shorten):
    # Render a (begin, end) date range as text. When shorten is True,
    # the end date drops the leading parts it shares with the begin date.
    year_fmt = '%Y%m%d'
    month_fmt = '%m%d'
    day_fmt = '%d'
    begin, end = rng
    if begin == end:
        return begin.strftime(year_fmt)
    text = begin.strftime(year_fmt) + '~'
    if not shorten:
        return text + end.strftime(year_fmt)
    if begin.year == end.year and begin.month == end.month:
        return text + end.strftime(day_fmt)
    if begin.year == end.year:
        return text + end.strftime(month_fmt)
    return text + end.strftime(year_fmt)
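
# For example:
#   range2str((date(2013, 6, 20), date(2013, 6, 22)), True)  -> '20130620~22'
#   range2str((date(2013, 6, 20), date(2013, 6, 22)), False) -> '20130620~20130622'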

def generate_title(prefix):
    # Build the ebook title: the prefix, the global date range, then one
    # '<section initial><range>' suffix for each overridden section.
    text = prefix + ' ' + range2str(date_range, True)
    for sec in sections:
        rng = section_date_ranges.get(sec)
        if rng:
            text = text + ' ' + sec[0].upper() + range2str(rng, True)
    return text
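
# For example, with the settings above and no overrides,
# generate_title('InfoQ') -> 'InfoQ 20130620~22'; uncommenting the 'news'
# override shown earlier would append ' N20130621~22'.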

def parse_date(text):
    # Try each month's pattern in turn; the pattern's index in
    # date_regexes gives the month number. Returns None if nothing matches.
    for i in xrange(len(date_regexes)):
        m = re.search(date_regexes[i], text)
        if not m:
            continue
        year = int(m.group('year'))
        month = i + 1
        day = int(m.group('day'))
        return date(year, month, day)
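
# For example, parse_date('Jun 21, 2013') -> date(2013, 6, 21).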

def get_text(tag):
    # Recursively concatenate all text inside a tag, dropping the markup.
    text = ''
    for c in tag.contents:
        if isinstance(c, NavigableString):
            text = text + str(c)
        else:
            text = text + get_text(c)
    return text.strip()
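
# For example, get_text() on the tag <p>Hello <b>world</b></p>
# returns 'Hello world'.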

def find_by_class(tag, name, cls):
    # Yield every <name> descendant whose class attribute contains cls.
    # Note this is a substring test on the attribute value (calibre's
    # bundled BeautifulSoup returns it as a plain string), not an exact
    # class-token match.
    for c in tag.findAll(name):
        c_cls = c.get('class')
        if not c_cls:
            continue
        if cls not in c_cls:
            continue
        yield c
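
# For example, find_by_class(soup, 'div', 'qa') yields each
# <div class="qa">...</div> (and any div whose class string contains 'qa').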

# Section heading text scraped from each section's index page; filled in
# lazily by InfoQ.get_items().
_section_texts = {}
# The CSS classes that mark one item block on each section's index page.
_section_item_classes = {
    'news': ['news_type_block'],
    'articles': ['news_type1', 'news_type2'],
    'interviews': ['news_type_video']
}

class InfoQ(BasicNewsRecipe):
    title = title_prefix
    language = language
    no_stylesheets = True
    # Keep only the main content area; drop comment counts, share
    # widgets and the right-hand sidebar.
    keep_only_tags = [{'id': 'content'}]
    remove_tags = [
        {'id': 'noOfComments'},
        {'class': 'share_this'},
        {'class': 'article_page_right'}
    ]

    def get_items(self, section):
        # Generate the section's items, newest first, by walking the
        # paginated index at <site_url><section>/0, /1, /2, ...
        # This generator never terminates on its own; parse_index() simply
        # stops consuming once items fall before the requested date range.
        print '>>> Retrieving items for section:', section
        text_retrieved = False
        count = 0
        while True:
            print '>>> Loading items from ' + section + '/' + str(count)
            root = self.index_to_soup(site_url + section + '/' + str(count))
            content_div = root.find('div', {'id': 'content'})
            if not text_retrieved:
                # Remember the section's heading text from its first page.
                text_retrieved = True
                text = content_div.h2.string.strip()
                _section_texts[section] = text
                print '>>> Text for section "' + section + '": ' + text
            for item_class in _section_item_classes[section]:
                for item_div in find_by_class(content_div, 'div', item_class):
                    item = {}
                    link = item_div.h2.a
                    item['title'] = link.string.strip()
                    item['url'] = urlparse.urljoin(site_url, link['href'])
                    item['description'] = get_text(item_div.p)
                    author_span = item_div.find('span', {'class': 'author'})
                    date_text = str(author_span.contents[-1])
                    item['date'] = parse_date(date_text)
                    print '>>> Item parsed:', item
                    yield item
            count = count + 1
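
    # Each yielded item is a dict along these lines (values depend on
    # the page being scraped):
    #   {'title': '...', 'url': 'http://www.infoq.com/...',
    #    'description': '...', 'date': date(2013, 6, 21)}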

    def parse_index(self):
        self.title = generate_title(self.title)
        index = []
        for sec in sections:
            item_list = []
            rng = section_date_ranges.get(sec)
            if not rng:
                rng = date_range
            begin, end = rng
            for item in self.get_items(sec):
                item_date = item['date']
                # Items arrive newest first: skip any that are newer than
                # the range, and stop once they are older than it.
                if item_date > end:
                    continue
                if item_date < begin:
                    break
                item_list.append(item)
            index.append((_section_texts[sec] + ' (' + range2str(rng, False) + ')', item_list))
        return index
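
    # parse_index() returns the (title, items) pairs calibre expects,
    # e.g. [('News (20130620~20130622)', [item, ...]), ...] where the
    # section titles are whatever get_items() scraped from the site.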

    def postprocess_html(self, soup, first_fetch):
        author_general = soup.find('span', {'class': 'author_general'})
        author_general.em.extract()
        # full_div will end up holding the complete article content.
        full_div = None
        transcript_div = soup.find('div', {'id': 'transcript'})
        if transcript_div:
            # This is an interview page: pull out every <div class="qa">.
            qa_div_list = list(find_by_class(transcript_div, 'div', 'qa'))
            for qa_div in qa_div_list:
                qa_div.extract()
                # Replace <a class="question_link">...</a> with <strong>...</strong>.
                question_link = qa_div.find('a', {'class': 'question_link'})
                question_strong = Tag(soup, 'strong')
                question_strong.append(question_link.string)
                question_link.replaceWith(question_strong)
            full_div = find_by_class(soup.find('div', {'id': 'content'}), 'div', 'presentation_full').next()
            # Clean up the <h1>.
            full_div.h1.span.extract()
            title_div = full_div.h1.div
            title_div.replaceWith(title_div.string)
            # Clear the presentation area...
            for div in full_div.findAll('div'):
                div.extract()
            # ...then put the Q&A list back into it.
            for qa_div in qa_div_list:
                full_div.append(qa_div)
        else:
            # News/article page: text_div is the body text without the title.
            text_div = find_by_class(soup, 'div', 'text_info').next()
            text_div.extract()
            for other in text_div.findAll('div'):
                other.extract()
            # full_div contains the title.
            full_div = soup.find('div', {'id': 'content'})
            for other in full_div.findAll('div'):
                other.extract()
            full_div.append(text_div)
        full_div.extract()
        nav_div = soup.body.div
        nav_div.extract()
        # Keep only nav_div and full_div in <body>.
        for other in soup.body:
            other.extract()
        soup.body.append(nav_div)
        soup.body.append(full_div)
        return soup