In [1]:
from __future__ import unicode_literals

import sys
import os
from bs4 import BeautifulSoup
import urllib2
import cv2
from matplotlib import pyplot as plt
import matplotlib
from PIL import Image, ImageDraw, ImageFont

In [2]:
# from https://stackoverflow.com/questions/4460921/extract-the-first-paragraph-from-a-wikipedia-article-python
import re
import textwrap
import yaml
import urllib
import urllib2
import random

import wikipedia

In [3]:
# set utf-8
# Python 2 only: switch the interpreter-wide default string encoding to UTF-8.
# The three standard streams are captured first and put back afterwards.
# NOTE(review): presumably reload(sys) rebinds sys.std* and would otherwise
# detach the notebook's output -- confirm before removing the save/restore.
stdi, stdo, stde = sys.stdin, sys.stdout, sys.stderr
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdin, sys.stdout, sys.stderr = stdi, stdo, stde
# Last expression of the cell: displays the encoding now in effect.
sys.getdefaultencoding()

'utf-8'


In [4]:
# Directory the generated images are written to. generate_imgs() builds each
# file path by plain string concatenation, so the trailing '/' is required.
OUTPUT_PATH = '/home/will/sdb1/cvdata/wikipages500_2/'

In [5]:
# Number of article URLs crawled per language, and of images generated per language.
DATA_SIZE = 500
# Size handed to Image.new() for every generated image.
IMG_SIZE = (227, 227)

In [6]:
# Language code -> site origin used as crawl root and link prefix.
# The code is also passed to wikipedia.set_lang() and used as the
# file-name prefix of the generated images.
LANG = {
    'en': 'https://en.wikipedia.org',
    'zh': 'https://zh.wikipedia.org'
}

In [7]:
class WikipediaError(Exception):
    """Raised by the Wikipedia client when a request or redirect lookup fails."""

In [8]:
class Wikipedia:
    """
    Minimal HTTP client for one language edition of Wikipedia.

    Provides raw article wikitext, image bytes and search results.
    Every HTTP/URL failure is re-raised as WikipediaError.
    """
    # URL templates; the first %s is always the language code.
    url_article = 'http://%s.wikipedia.org/w/index.php?action=raw&title=%s'
    url_image = 'http://%s.wikipedia.org/w/index.php?title=Special:FilePath&file=%s'
    url_search = 'http://%s.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&sroffset=%d&srlimit=%d&format=yaml'

    def __init__(self, lang):
        """Store the language code (e.g. 'en') used to build every URL."""
        self.lang = lang

    def __fetch(self, url):
        """
        Open `url` with a browser-like User-Agent and return the response object.

        Raises:
            WikipediaError: carrying the HTTP status code, or the URLError
                reason, when the request fails.
        """
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')

        # HTTPError is a subclass of URLError, so it has to be handled first.
        try:
            result = urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            raise WikipediaError(e.code)
        except urllib2.URLError as e:
            raise WikipediaError(e.reason)

        return result

    def article(self, article):
        """
        Return the raw wikitext of `article`, following #REDIRECT pages.

        Raises:
            WikipediaError: on a fetch failure, or when the page starts with
                '#REDIRECT' but no '[[target]]' link can be extracted.
        """
        url = self.url_article % (self.lang, urllib.quote_plus(article))
        content = self.__fetch(url).read()

        if content.upper().startswith('#REDIRECT'):
            match = re.match(r'(?i)#REDIRECT \[\[([^\[\]]+)\]\]', content)

            if match is not None:
                # Recurse so that chains of redirects are resolved as well.
                return self.article(match.group(1))

            raise WikipediaError('Can\'t found redirect article.')

        return content

    def image(self, image, thumb=None):
        """
        Return the bytes of the file `image`.

        Args:
            image: File name as used on the wiki (inserted into the URL as is).
            thumb: Optional thumbnail width as a string (e.g. '120'). When
                given, the thumbnail is downloaded instead of the original.

        Raises:
            WikipediaError: when any of the requests fails.
        """
        url = self.url_image % (self.lang, image)
        result = self.__fetch(url)

        if thumb:
            # Only the final (post-redirect) URL of the original is needed to
            # derive the thumbnail URL, so the full-size body is not downloaded.
            url = result.geturl() + '/' + thumb + 'px-' + image
            url = url.replace('/commons/', '/commons/thumb/')
            url = url.replace('/' + self.lang + '/', '/' + self.lang + '/thumb/')

            return self.__fetch(url).read()

        return result.read()

    def search(self, query, page=1, limit=10):
        """
        Run a full-text search and return one page of results.

        Args:
            query: Search string.
            page: 1-based page number.
            limit: Results per page.

        Returns:
            A list of dicts with the keys 'title', 'snippet' (markup removed,
            whitespace normalised) and 'wordcount'. Empty when nothing matched.

        Raises:
            WikipediaError: when the request fails.
        """
        offset = (page - 1) * limit
        url = self.url_search % (self.lang, urllib.quote_plus(query), offset, limit)
        content = self.__fetch(url).read()

        # The payload comes from the network: safe_load builds plain Python
        # data only and never instantiates arbitrary objects.
        parsed = yaml.safe_load(content)
        search = parsed['query']['search']

        results = []

        for article in search or []:
            title = article['title'].strip()

            # Drop highlight tags, collapse whitespace, and re-attach
            # punctuation that was left floating after tag removal.
            snippet = article['snippet']
            snippet = re.sub(r'(?m)<.*?>', '', snippet)
            snippet = re.sub(r'\s+', ' ', snippet)
            snippet = snippet.replace(' . ', '. ')
            snippet = snippet.replace(' , ', ', ')
            snippet = snippet.strip()

            wordcount = article['wordcount']

            results.append({
                'title' : title,
                'snippet' : snippet,
                'wordcount' : wordcount
            })

        return results

In [9]:
def unwiki(wiki):
    """
    Strip MediaWiki markup from `wiki` and return the remaining text.

    The substitutions run strictly in the listed order: templates first
    (two passes, so one level of nesting is removed), then tables,
    category/image/file links, internal links, external links, quote
    runs used for bold/italic, and finally lines holding a lone '*'.
    """
    def keep(index):
        # Replacement callback that keeps only the captured group `index`.
        return lambda found: found.group(index)

    rules = (
        (r'(?i)\{\{IPA(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', keep(2)),
        (r'(?i)\{\{Lang(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', keep(2)),
        (r'\{\{[^\{\}]+\}\}', ''),
        (r'(?m)\{\{[^\{\}]+\}\}', ''),
        (r'(?m)\{\|[^\{\}]*?\|\}', ''),
        (r'(?i)\[\[Category:[^\[\]]*?\]\]', ''),
        (r'(?i)\[\[Image:[^\[\]]*?\]\]', ''),
        (r'(?i)\[\[File:[^\[\]]*?\]\]', ''),
        (r'\[\[[^\[\]]*?\|([^\[\]]*?)\]\]', keep(1)),
        (r'\[\[([^\[\]]+?)\]\]', keep(1)),
        (r'\[\[([^\[\]]+?)\]\]', ''),
        (r'(?i)File:[^\[\]]*?', ''),
        (r'\[[^\[\]]*? ([^\[\]]*?)\]', keep(1)),
        (r"''+", ''),
        (r'(?m)^\*$', ''),
    )

    for pattern, replacement in rules:
        wiki = re.sub(pattern, replacement, wiki)

    return wiki

def unhtml(html):
    """
    Remove HTML from the text.

    Non-breaking-space entities become spaces, every <br> variant becomes
    a newline, comments and <ref>...</ref> footnotes are dropped, all
    remaining tags are stripped, and '&amp;' is finally decoded to '&'.
    """
    html = re.sub(r'(?i)&nbsp;', ' ', html)
    # Accept <br>, <br/>, <br /> and the malformed <br\>. Without the slash in
    # the class, self-closing breaks fell through to the generic tag stripper
    # below and vanished without leaving a line break.
    html = re.sub(r'(?i)<br[\s/\\]*>', '\n', html)
    # (?s) lets '.' cross line breaks so multi-line comments are removed too.
    html = re.sub(r'(?s)<!--.*?--\s*>', '', html)
    html = re.sub(r'(?i)<ref[^>]*>[^>]*<\/ ?ref>', '', html)
    # Deliberately single-line: a stray '<' must not swallow following lines.
    html = re.sub(r'(?m)<.*?>', '', html)
    # Decoded last so an '&amp;lt;' in the input is not mistaken for a tag.
    html = re.sub(r'(?i)&amp;', '&', html)

    return html

def punctuate(text):
    """
    Normalise paragraph separation.

    Line endings are unified to '\\n', runs of blank lines collapse to a
    single blank line, every paragraph is stripped of surrounding
    whitespace and empty paragraphs are dropped. The paragraphs are
    returned joined by exactly one blank line.
    """
    unified = re.sub(r'\r\n|\n|\r', '\n', text)
    collapsed = re.sub(r'\n\n+', '\n\n', unified)

    stripped = (chunk.strip() for chunk in collapsed.split('\n\n'))
    return '\n\n'.join(chunk for chunk in stripped if len(chunk) != 0)


In [10]:
def article_urls_crawl(prefix, root_url, count):
    """
    Random-walk a wiki starting at `root_url` and collect article URLs.

    On every step one already-seen link is picked at random and fetched,
    and all '/wiki/...' anchors inside its 'bodyContent' div are harvested.

    Args:
        prefix: Site origin prepended to every '/wiki/...' href
            (e.g. 'https://en.wikipedia.org').
        root_url: First page to fetch.
        count: Number of distinct article URLs to collect.

    Returns:
        A set of strings of the form prefix + '/wiki/<title>' with the
        title percent-decoded; empty when `count` is not positive.

    Note:
        There is no retry limit. The walk only ends once `count` articles
        were found, so it keeps retrying while pages cannot be fetched.
    """
    atc_list = set()
    if count <= 0:
        # Nothing requested; without this guard the walk would never stop.
        return atc_list

    link_list = set([root_url])
    while True:
        url = random.choice(tuple(link_list))
        try:
            content = urllib.urlopen(url)
        except Exception:
            # Best effort: skip pages that cannot be fetched. Catching
            # Exception rather than everything keeps Ctrl-C working.
            continue

        try:
            soup = BeautifulSoup(content, "html.parser")
        finally:
            content.close()

        base = soup.find('div', id="bodyContent")
        if base is None:
            # Not an article-style page; pick another link.
            continue

        for link in base.findAll("a"):
            href = link.attrs.get('href')
            if href is None or not href.startswith('/wiki/'):
                continue

            # Every internal link is a candidate for a later step of the walk.
            link_list.add(prefix + href)

            # Exactly '/wiki/<name>' with no ':' means a plain article,
            # not a sub-page or a namespaced page such as Category: or File:.
            if len(href.split('/')) != 3 or len(href.split(':')) != 1:
                continue

            title = str(href)
            title = urllib.unquote(title).decode('utf8')
            print(title)
            atc_list.add(prefix + title)
            if len(atc_list) == count:
                print('fetched ' + str(count) + ' article urls from ' + prefix + '\n')
                return atc_list

# Crawl DATA_SIZE article URLs for every configured language,
# using the site origin as both link prefix and crawl root.
urls_by_lang = {}

for key, site_root in LANG.items():
    urls_by_lang[key] = article_urls_crawl(site_root, site_root, DATA_SIZE)

/wiki/Wikipedia
/wiki/Free_content
/wiki/Encyclopedia
/wiki/English_language
/wiki/Shorwell_helmet
/wiki/Anglo-Saxons
/wiki/Shorwell
/wiki/Isle_of_Wight
/wiki/Anglo-Saxon_warfare
/wiki/Pattern-welded
/wiki/Hanging_bowl
/wiki/Benty_Grange_helmet
/wiki/Sutton_Hoo_helmet
/wiki/Coppergate_Helmet
/wiki/Pioneer_Helmet
/wiki/Staffordshire_helmet
/wiki/Franks
/wiki/Scandinavia
/wiki/Shorwell_helmet
/wiki/2017–18_Bergen_County_eruv_controversy
/wiki/Suillus_spraguei
/wiki/Megalodon
/wiki/Oriole_Park_at_Camden_Yards
/wiki/Hailey_Dawson
/wiki/Ceremonial_first_pitch
/wiki/Major_League_Baseball
/wiki/Robotics
/wiki/Biggs_jasper
/wiki/Silicon_dioxide
/wiki/Minna_Lammert
/wiki/Lilli_Lehmann
/wiki/Marie_Lehmann_(soprano)
/wiki/Rhinemaidens
/wiki/Das_Rheingold
/wiki/Bayreuth_Festival
/wiki/Richard_Wagner
/wiki/Arothron_reticularis
/wiki/Tetrodotoxin
/wiki/Saxitoxin
/wiki/Wang_Jian_(businessman)
/wiki/Hainan_Airlines
/wiki/Technological_University_Dublin
/wiki/Republic_of_Ireland
/wiki/Association_footb

/wiki/Nordic_countries
/wiki/Scandinavia_(disambiguation)
/wiki/Demonym
/wiki/Scandinavian_people
/wiki/Denmark
/wiki/Norway
/wiki/Sweden
/wiki/Finland
/wiki/Iceland
/wiki/Faroe_Islands
/wiki/Åland_Islands
/wiki/Danish_language
/wiki/Norwegian_language
/wiki/Swedish_language
/wiki/Finnish_language
/wiki/Icelandic_language
/wiki/Faroese_language
/wiki/German_language
/wiki/Kven_language
/wiki/Meänkieli_dialects
/wiki/Romani_language
/wiki/Sami_languages
/wiki/Yiddish
/wiki/UTC+1
/wiki/UTC+2
/wiki/DST
/wiki/Top-level_domain
/wiki/.dk
/wiki/.no
/wiki/.se
/wiki/.ax
/wiki/.fi
/wiki/.fo
/wiki/.gl
/wiki/.is
/wiki/.sj
/wiki/Scandinavian_Mountains
/wiki/Scandinavian_Peninsula
/wiki/Viking_Age
/wiki/Old_Norse
/wiki/Vikings
/wiki/Viking_art
/wiki/Norse_mythology
/wiki/Denmark
/wiki/Norway
/wiki/Sweden
/wiki/Denmark–Norway
/wiki/Sweden–Finland
/wiki/Union_between_Sweden_and_Norway
/wiki/Kalmar_Union
/wiki/History_of_Scandinavia
/wiki/History_of_Denmark
/wiki/History_of_Norway
/wiki/History_of_Swed

/wiki/中央研究院
/wiki/天文攝影
/wiki/台南一中
/wiki/國立臺灣大學
/wiki/國立臺灣大學物理學系
/wiki/國立中央大學
/wiki/國立中央大學天文研究所
/wiki/夏威夷大学
/wiki/中央研究院天文及天文物理研究所
/wiki/梁次震宇宙學與粒子天文物理學研究中心
/wiki/次毫米波陣列望遠鏡
/wiki/美国国家射电天文台
/wiki/國立臺灣大學物理學系
/wiki/荒勝文策
/wiki/河田末吉
/wiki/太田賴常
/wiki/戴運軌
/wiki/許雲基
/wiki/沃爾夫岡·克洛爾
/wiki/黃振麟
/wiki/崔伯銓
/wiki/鄭伯昆
/wiki/林清凉
/wiki/吳俊輝_(科學家)
/wiki/高涌泉
/wiki/孫維新
/wiki/朱國瑞
/wiki/闕志鴻
/wiki/王亢沛
/wiki/張國龍
/wiki/陳丕燊
/wiki/趙丰
/wiki/李太楓
/wiki/賀曾樸
/wiki/朱時宜
/wiki/洪銘輝
/wiki/朱有花
/wiki/張慶瑞
/wiki/木村毅一
/wiki/植村吉明
/wiki/許玉釧
/wiki/梁次震
/wiki/姚期智
/wiki/林孝信
/wiki/郭兆林
/wiki/張達文
/wiki/李世昌
/wiki/李靈峰_(物理學家)
/wiki/李羅權
/wiki/丘宏義
/wiki/沈君山
/wiki/徐大麟
/wiki/葉乃裳
/wiki/葉公杼
/wiki/鄭國順
/wiki/劉源俊
/wiki/盧志遠
/wiki/陳力俊
/wiki/洪文璞事件
/wiki/郭瑞年
/wiki/詹裕農
/wiki/姜民權
/wiki/國立臺灣大學物理文物廳
/wiki/梁次震宇宙學與粒子天文物理學研究中心
/wiki/中華民國物理學會
/wiki/中国物理学刊
/wiki/國立臺灣大學
/wiki/臺北帝國大學
/wiki/彩虹小馬：友情就是魔法
/wiki/七夕魚
/wiki/鮗
/wiki/片山牙花鮨
/wiki/金帶金花鮨
/wiki/彭氏豆丁海馬
/wiki/澎氏海龍
/wiki/双叶羊耳蒜
/wiki/玉簪羊耳蒜
/wiki/一叶羊耳蒜
/wiki/镰翅羊耳蒜
/wiki/弯柱羊耳蒜
/wiki/齿唇羊耳蒜
/wiki/長腳羊耳蒜
/wiki/细茎羊耳蒜
/wiki/

In [14]:
def getChinese(context):
    """
    Keep only the characters in U+4E00..U+9FA5 of a UTF-8 byte string.

    `context` is decoded from UTF-8, every character outside that range
    (the range this notebook treats as Chinese) is removed, and the
    result is returned re-encoded as UTF-8.
    """
    not_chinese = re.compile(u'[^\u4E00-\u9FA5]')
    as_unicode = context.decode("utf-8")
    return not_chinese.sub(r'', as_unicode).encode("utf-8")

In [15]:
# NOTE(review): looks like a scratch check that non-ASCII (Cyrillic) text
# prints correctly; nothing later in the notebook uses it -- confirm and remove.
print "Заглавная_страница"

Заглавная_страница


In [12]:
def generate_imgs(langs, urls_by_lang, count, fontsizerange):
    """
    Render Wikipedia article summaries as grayscale text images.

    For every language, `count` images of IMG_SIZE are written to
    OUTPUT_PATH as '<lang>-<index>-wiki.jpg'. Each image shows the summary
    of a randomly chosen article, wrapped at 23 characters per line and
    drawn in black on white with a randomly sized font.

    Args:
        langs: Iterable of language codes; each must be a key of `urls_by_lang`.
        urls_by_lang: Maps a language code to a collection of article URLs;
            the text after the last '/' is used as the article title.
        count: Number of images to generate per language.
        fontsizerange: (min, max) font size, both inclusive.

    Raises:
        KeyError: when a language has no entry in `urls_by_lang`.
        IndexError: when a language's URL collection is empty.

    Note:
        Titles whose summary cannot be fetched are skipped and another one
        is drawn, without any retry limit.
    """
    # Font used per language; a language without an entry gets blank images.
    font_paths = {
        'en': '/usr/share/fonts/MyFonts/consola.ttf',
        'zh': '/usr/share/fonts/MyFonts/msyh.ttf',
    }

    for key in langs:
        print('generating imgs for ' + key)
        # Loop-invariant per language, so done once instead of per image.
        wikipedia.set_lang(key)
        candidates = tuple(urls_by_lang[key])
        font_path = font_paths.get(key)

        i = 0
        while i < count:
            # Chosen outside the try block so an empty or missing URL set
            # fails loudly instead of spinning forever.
            title = random.choice(candidates).split('/')[-1]
            try:
                text = wikipedia.summary(title)
            except Exception:
                # Best effort: ambiguous, missing or unreachable pages are
                # skipped. Catching Exception keeps Ctrl-C working.
                continue

            image = Image.new('L', IMG_SIZE, 255)
            if font_path is not None:
                _draw_wrapped_text(
                    image,
                    text,
                    ImageFont.truetype(font_path, random.randint(fontsizerange[0], fontsizerange[1]))
                )
            image.save(OUTPUT_PATH + key + '-' + str(i) + '-wiki.jpg')
            i += 1


def _draw_wrapped_text(image, text, font):
    """Draw `text` in black from the top-left corner of `image`, 23 characters per line."""
    draw = ImageDraw.Draw(image)
    margin = 0
    offset = 0
    for line in textwrap.wrap(text, width=23):
        draw.text((margin, offset), line, font=font, fill="#000000")
        # Advance by the rendered height of the line just drawn.
        offset += font.getsize(line)[1]

In [13]:
# Generate DATA_SIZE images per language; (10, 50) is the inclusive
# range the font size of each image is drawn from.
generate_imgs(LANG, urls_by_lang, DATA_SIZE, (10, 50))

generating imgs for en




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html.parser")

  markup_type=markup_type))


generating imgs for zh
