forked from e9t/PyTagCloud-CJK
/
korean.py
39 lines (30 loc) · 1.06 KB
/
korean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-
from collections import Counter
import urllib
import random
import webbrowser
from konlpy.tag import Hannanum
from lxml import html
import pytagcloud # requires Korean font support
def r():
    """Return a random integer between 0 and 255, inclusive."""
    return random.randint(0, 255)


def color():
    """Return a random color as a 3-tuple of values produced by r().

    Each component is drawn independently, in order (presumably R, G, B as
    consumed by pytagcloud).
    """
    return tuple(r() for _ in range(3))
def get_bill_text(billnum):
    """Download a bill page from pokr.kr and return the bill's text.

    Args:
        billnum: Bill identifier interpolated into the request URL
            (``http://pokr.kr/bill/<billnum>/text``).

    Returns:
        The first text node found directly under a ``<pre>`` child of the
        ``<div id="bill-sections">`` element of the fetched page.

    Raises:
        IndexError: If the page contains no such text node (for example an
            unknown bill number or a changed page layout).
    """
    url = 'http://pokr.kr/bill/%s/text' % billnum
    # Always release the connection, even if read()/decode() fails. The
    # Python 2 urlopen result is not a context manager, hence try/finally.
    connection = urllib.urlopen(url)
    try:
        response = connection.read().decode('utf-8')
    finally:
        connection.close()
    page = html.fromstring(response)
    sections = page.xpath(".//div[@id='bill-sections']/pre/text()")
    if not sections:
        # Same exception type the bare [0] lookup raised, but with context.
        raise IndexError('no bill text found at %s' % url)
    return sections[0]
def get_tags(text, ntags=50, multiplier=10):
    """Build tag-cloud entries from the most frequent nouns in *text*.

    Nouns are extracted with KoNLPy's Hannanum tagger and counted; the
    ``ntags`` most common nouns are kept, in ``Counter.most_common`` order.

    Args:
        text: Text to extract nouns from.
        ntags: Maximum number of tags to return.
        multiplier: Factor applied to each noun's count to get its size.

    Returns:
        A list of dicts, one per noun, with the keys ``'color'`` (a fresh
        value from ``color()``), ``'tag'`` (the noun) and ``'size'``
        (count * multiplier).
    """
    tagger = Hannanum()
    frequencies = Counter(tagger.nouns(text))
    tags = []
    for noun, occurrences in frequencies.most_common(ntags):
        tags.append({
            'color': color(),
            'tag': noun,
            'size': occurrences * multiplier,
        })
    return tags
def draw_cloud(tags, filename, fontname='Noto Sans CJK', size=(800, 600)):
    """Render *tags* into an image file and open the result.

    Args:
        tags: Tag entries passed straight to ``pytagcloud.create_tag_image``.
        filename: Path of the image file to write and then open.
        fontname: Font name forwarded to pytagcloud.
        size: Image size forwarded to pytagcloud (presumably width, height).
    """
    render_options = {'fontname': fontname, 'size': size}
    pytagcloud.create_tag_image(tags, filename, **render_options)
    # Hand the written file to webbrowser.open for display.
    webbrowser.open(filename)
if __name__ == '__main__':
    # Demo run: fetch one bill, count its nouns and render the tag cloud to
    # korean.png. Guarded so that importing this module for its helpers does
    # not trigger a network request, write a file or open a browser.
    bill_num = '1904882'
    text = get_bill_text(bill_num)
    tags = get_tags(text)
    draw_cloud(tags, 'korean.png')