In [1]:
from __future__ import unicode_literals

import sys
import os
from bs4 import BeautifulSoup
import urllib2
import cv2
from matplotlib import pyplot as plt
import matplotlib
from PIL import Image, ImageDraw, ImageFont

In [2]:
# from https://stackoverflow.com/questions/4460921/extract-the-first-paragraph-from-a-wikipedia-article-python
import re
import textwrap
import yaml
import urllib
import urllib2
import random

import wikipedia

In [3]:
# set utf-8
stdi, stdo, stde = sys.stdin, sys.stdout, sys.stderr
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdin, sys.stdout, sys.stderr = stdi, stdo, stde
sys.getdefaultencoding()

'utf-8'


In [4]:
OUTPUT_PATH = '/home/will/sdb1/cvdata/wikipages500_2_samesize/'
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)

In [5]:
DATA_SIZE = 500
IMG_SIZE = (227, 227)

In [6]:
LANG = {
    'en': 'https://en.wikipedia.org',
    'zh': 'https://zh.wikipedia.org'
}

In [7]:
class WikipediaError(Exception):
    pass

In [8]:
class Wikipedia:
    url_article = 'http://%s.wikipedia.org/w/index.php?action=raw&title=%s'
    url_image = 'http://%s.wikipedia.org/w/index.php?title=Special:FilePath&file=%s'
    url_search = 'http://%s.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&sroffset=%d&srlimit=%d&format=yaml'
   
    def __init__(self, lang):
        self.lang = lang
   
    def __fetch(self, url):
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')
       
        try:
            result = urllib2.urlopen(request)
        except urllib2.HTTPError, e:
            raise WikipediaError(e.code)
        except urllib2.URLError, e:
            raise WikipediaError(e.reason)
       
        return result
   
    def article(self, article):
        url = self.url_article % (self.lang, urllib.quote_plus(article))
        content = self.__fetch(url).read()
       
        if content.upper().startswith('#REDIRECT'):
            match = re.match('(?i)#REDIRECT \[\[([^\[\]]+)\]\]', content)
           
            if not match == None:
                return self.article(match.group(1))
           
            raise WikipediaError('Can\'t found redirect article.')
       
        return content
   
    def image(self, image, thumb=None):
        url = self.url_image % (self.lang, image)
        result = self.__fetch(url)
        content = result.read()
       
        if thumb:
            url = result.geturl() + '/' + thumb + 'px-' + image
            url = url.replace('/commons/', '/commons/thumb/')
            url = url.replace('/' + self.lang + '/', '/' + self.lang + '/thumb/')
           
            return self.__fetch(url).read()
       
        return content
   
    def search(self, query, page=1, limit=10):
        offset = (page - 1) * limit
        url = self.url_search % (self.lang, urllib.quote_plus(query), offset, limit)
        content = self.__fetch(url).read()
       
        parsed = yaml.load(content)
        search = parsed['query']['search']
       
        results = []
       
        if search:
            for article in search:
                title = article['title'].strip()
               
                snippet = article['snippet']
                snippet = re.sub(r'(?m)<.*?>', '', snippet)
                snippet = re.sub(r'\s+', ' ', snippet)
                snippet = snippet.replace(' . ', '. ')
                snippet = snippet.replace(' , ', ', ')
                snippet = snippet.strip()
               
                wordcount = article['wordcount']
               
                results.append({
                    'title' : title,
                    'snippet' : snippet,
                    'wordcount' : wordcount
                })
       
        # yaml.dump(results, default_style='', default_flow_style=False,
        #     allow_unicode=True)
        return results

In [9]:
def unwiki(wiki):
    """
   Remove wiki markup from the text.
   """
    wiki = re.sub(r'(?i)\{\{IPA(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', lambda m: m.group(2), wiki)
    wiki = re.sub(r'(?i)\{\{Lang(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', lambda m: m.group(2), wiki)
    wiki = re.sub(r'\{\{[^\{\}]+\}\}', '', wiki)
    wiki = re.sub(r'(?m)\{\{[^\{\}]+\}\}', '', wiki)
    wiki = re.sub(r'(?m)\{\|[^\{\}]*?\|\}', '', wiki)
    wiki = re.sub(r'(?i)\[\[Category:[^\[\]]*?\]\]', '', wiki)
    wiki = re.sub(r'(?i)\[\[Image:[^\[\]]*?\]\]', '', wiki)
    wiki = re.sub(r'(?i)\[\[File:[^\[\]]*?\]\]', '', wiki)
    wiki = re.sub(r'\[\[[^\[\]]*?\|([^\[\]]*?)\]\]', lambda m: m.group(1), wiki)
    wiki = re.sub(r'\[\[([^\[\]]+?)\]\]', lambda m: m.group(1), wiki)
    wiki = re.sub(r'\[\[([^\[\]]+?)\]\]', '', wiki)
    wiki = re.sub(r'(?i)File:[^\[\]]*?', '', wiki)
    wiki = re.sub(r'\[[^\[\]]*? ([^\[\]]*?)\]', lambda m: m.group(1), wiki)
    wiki = re.sub(r"''+", '', wiki)
    wiki = re.sub(r'(?m)^\*$', '', wiki)

    return wiki

def unhtml(html):
    """
   Remove HTML from the text.
   """
    html = re.sub(r'(?i)&nbsp;', ' ', html)
    html = re.sub(r'(?i)<br[ \\]*?>', '\n', html)
    html = re.sub(r'(?m)<!--.*?--\s*>', '', html)
    html = re.sub(r'(?i)<ref[^>]*>[^>]*<\/ ?ref>', '', html)
    html = re.sub(r'(?m)<.*?>', '', html)
    html = re.sub(r'(?i)&amp;', '&', html)

    return html

def punctuate(text):
    """
   Convert every text part into well-formed one-space
   separate paragraph.
   """
    text = re.sub(r'\r\n|\n|\r', '\n', text)
    text = re.sub(r'\n\n+', '\n\n', text)

    parts = text.split('\n\n')
    partsParsed = []

    for part in parts:
        part = part.strip()

        if len(part) == 0:
            continue

        partsParsed.append(part)

    return '\n\n'.join(partsParsed)


In [10]:
def article_urls_crawl(prefix, root_url, count):
    atc_list = set([])
    link_list = set([root_url])
    while True:
        url = random.choice(tuple(link_list))
        try:
            content = urllib.urlopen(url)
        except:
            continue
#         print urllib.urlopen(url).headers.getheader('Content-Type')
        soup = BeautifulSoup(content, "html.parser")
        base=soup.find('div', id="bodyContent")

        for link in BeautifulSoup(str(base), "html.parser").findAll("a"):
            if 'href' in link.attrs:  # Make sure it is link
                if link['href'].startswith('/wiki/') and prefix + link['href'] != url:
                    link_list.add(prefix + link['href'])
                if ( 
                    link['href'].startswith('/wiki/') and  # Make sure it is a wiki page
                    len(link['href'].split('/')) == 3 and  # Make sure it is an article
                    len(link['href'].split(':')) == 1      # Make sure it is not category
                ):
#                     print(link['href'])
                    title = str(link['href'])
                    title = urllib.unquote(title).decode('utf8')
                    print title
                    atc_list.add(prefix + title)
                    if len(atc_list) == count:
                        print('fetched ' + str(count) + ' article urls from ' + prefix + '\n')
                        return atc_list

urls_by_lang = {}

for key in LANG:
    urls_by_lang[key] = article_urls_crawl(LANG[key], LANG[key], DATA_SIZE)

/wiki/Wikipedia
/wiki/Free_content
/wiki/Encyclopedia
/wiki/English_language
/wiki/Loss_of_MV_Darlwyne
/wiki/Cornwall
/wiki/Picket_boat
/wiki/Mylor,_Cornwall
/wiki/Fowey
/wiki/Board_of_Trade
/wiki/Loss_of_MV_Darlwyne
/wiki/Bill_Brown_(cricketer)
/wiki/Octopus
/wiki/Maurice_Wilder-Neligan
/wiki/Benjamin_Goodwin_Seielstad
/wiki/Popular_Science
/wiki/Incendiary_balloon
/wiki/Incendiary_kite
/wiki/Gaza_Strip
/wiki/Catherine_Gayer
/wiki/Coloratura_soprano
/wiki/Deutsche_Oper_Berlin
/wiki/Intolleranza_1960
/wiki/Melusine_(Reimann)
/wiki/Schwetzingen_Festival
/wiki/Cloud_Kingdoms
/wiki/Li_Lin_(biochemist)
/wiki/Wet_market
/wiki/World_Trade_Center_station_(PATH)
/wiki/Averroes
/wiki/Philosophy
/wiki/Fiqh
/wiki/Medicine
/wiki/Astronomy
/wiki/An_Unearthly_Child
/wiki/TARDIS
/wiki/2018_Lombok_earthquake
/wiki/Lombok
/wiki/President_of_Indonesia
/wiki/Joko_Widodo
/wiki/Geraint_Thomas
/wiki/2018_Tour_de_France
/wiki/Theodore_Edgar_McCarrick
/wiki/College_of_Cardinals
/wiki/July_2018_lunar_eclipse
/

/wiki/自由內容
/wiki/卢克·P·布莱克本
/wiki/美国
/wiki/慈善家
/wiki/肯塔基州
/wiki/政治家
/wiki/肯塔基州州长列表
/wiki/医生
/wiki/厄尼·弗莱彻
/wiki/特兰西瓦尼亚大学
/wiki/纳奇兹
/wiki/黄热病
/wiki/隔離檢疫
/wiki/密西西比河
/wiki/美国国会
/wiki/百慕大
/wiki/维多利亚_(英国君主)
/wiki/史家
/wiki/德克萨斯州
/wiki/路易斯安那州
/wiki/墨西哥湾
/wiki/田纳西州
/wiki/孟菲斯_(田納西州)
/wiki/佛罗里达州
/wiki/费南迪纳比奇_(佛罗里达州)
/wiki/希克曼_(肯塔基州)
/wiki/民主党_(美国)
/wiki/普遍選舉
/wiki/共和黨_(美國)
/wiki/特赦
/wiki/埃迪维尔_(肯塔基州)
/wiki/纳税人
/wiki/列克星敦_(肯塔基州)
/wiki/南華足球隊歷史
/wiki/德国足球甲级联赛
/wiki/赫尔穆特·舍恩
/wiki/香港博物館列表
/wiki/纽约市立大学诺贝尔奖得主列表
/wiki/哥倫比亞大學諾貝爾獎得主列表
/wiki/GNU_Mailman
/wiki/GNU計劃
/wiki/郵遞論壇
/wiki/桑黃
/wiki/德意志帝国
/wiki/舰队巡洋舰
/wiki/艦隊
/wiki/蒋母墓
/wiki/蒋中正
/wiki/王采玉
/wiki/喬治·華盛頓紀念大橋
/wiki/自殺
/wiki/臺灣
/wiki/桃園市
/wiki/大溪區
/wiki/大溪永昌宮
/wiki/詔安客家人
/wiki/開漳聖王
/wiki/2017年中国朝鲜族灯光节
/wiki/延边朝鲜族自治州
/wiki/中华人民共和国国庆节
/wiki/中秋节
/wiki/中华人民共和国
/wiki/吉林省
/wiki/延边朝鲜族自治州
/wiki/延吉市
/wiki/中國朝鮮族
/wiki/中國朝鮮族
/wiki/勃兰登堡州
/wiki/海利根拉贝
/wiki/洪森
/wiki/柬埔寨人民黨
/wiki/2018年柬埔寨大选
/wiki/龍目島
/wiki/矩震級
/wiki/2018年龙目岛地震
/wiki/葉里溫
/wiki/2008年亞美尼亞總統選舉
/wiki/羅伯特·科恰良

In [11]:
def getChinese(context):
    context = context.decode("utf-8") # convert context from str to unicode
    filtrate = re.compile(u'[^\u4E00-\u9FA5]') # non-Chinese unicode range
    context = filtrate.sub(r'', context) # remove all non-Chinese characters
    context = context.encode("utf-8") # convert unicode back to str
    return context

In [12]:
print "Заглавная_страница"

Заглавная_страница


In [13]:
def generate_imgs(langs, urls_by_lang, count, fontsizerange):
    for key in langs:
        print 'generating imgs for ' + key
        i = 0
        while i < count:
            wikipedia.set_lang(key)
            res = None
            try:
                res = wikipedia.summary(random.choice(tuple(urls_by_lang[key])).split('/')[-1])
            except:
                continue
            
            text = res
        #     if key == 'en':
        #         for n in re.findall(ur'[a-zA-Z ]+',res):
        #             text = text + n
        #     elif key == 'zh':
        #         for n in re.findall(ur'[\u4e00-\u9fff]+',res):
        #             text = text + n
#             print text


            image = Image.new('L', IMG_SIZE, 255)
            draw = ImageDraw.Draw(image)
            if key == 'en':
                font = ImageFont.truetype('/usr/share/fonts/MyFonts/consola.ttf', random.randint(fontsizerange[0], fontsizerange[1]))
                margin = 0
                offset = 0
                for line in textwrap.wrap(text, width=23):
                    draw.text((margin, offset), line, font=font, fill="#000000")
                    offset += font.getsize(line)[1]
            elif key == 'zh':
                font = ImageFont.truetype('/usr/share/fonts/MyFonts/msyh.ttf', random.randint(fontsizerange[0], fontsizerange[1]))
                margin = 0
                offset = 0
                for line in textwrap.wrap(text, width=23):
                    draw.text((margin, offset), line, font=font, fill="#000000")
                    offset += font.getsize(line)[1]
            image.save(OUTPUT_PATH + key + '-' + str(i) + '-wiki.jpg')
            i += 1

In [14]:
generate_imgs(LANG, urls_by_lang, DATA_SIZE, (40, 40))

generating imgs for en
generating imgs for zh




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html.parser")

  markup_type=markup_type))
