-
Notifications
You must be signed in to change notification settings - Fork 1
/
crawl_cikuapi.py
71 lines (60 loc) · 2.17 KB
/
crawl_cikuapi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#! /usr/bin/python
#-*- coding:utf-8 -*-
"""
Author(s): Qiqun Han
Company: EverString Technology Ltd.
File: crawling.crawl_cikuapi.py
Description: this program crawls words in http://cikuapi.com/.
Creation: 2013-11-12
Revision: 2013-11-12
Copyright (c) All Right Reserved, EverString Technology Ltd., http://www.everstring.com
"""
#===============================================================================
# Main objective:
# 1. Use http://cikuapi.com/ to crawl finance-related vocabulary
#===============================================================================
# import pdb
import re
import urllib
from bs4 import BeautifulSoup as BS
from recipes import get_html
# Seed keyword submitted to cikuapi.com ("证券" means "securities").
root = "证券"
# The keyword is percent-encoded before it is appended to the query string.
url = "".join(("http://cikuapi.com/index.php?content=", urllib.quote(root)))
# Fetched once, at import time; `text` is whatever get_html returns for the URL.
# NOTE(review): earlier debugging here inspected text.info(), text.msg and
# text.read(), so `text` may be a response object rather than a plain string
# -- confirm against recipes.get_html.
text = get_html(url)
# Parsed form of the page; soup.prettify() can be printed to inspect it.
soup = BS(text)
#===============================================================================
# Extract degrees of relevance level
#===============================================================================
def get_degrees(text):
    """Extract relevance-degree values from a piece of text.

    A degree is matched as a leading ``0`` or ``1``, a literal decimal
    point, and exactly six digits (for example ``0.812345``).

    Args:
        text: String to scan.

    Returns:
        List of the matched degree strings in order of appearance; the list
        is empty when nothing matches.
    """
    # The decimal point is escaped on purpose: a bare "." matches any
    # character, which made plain digit runs such as "01234567" count as
    # degrees.
    return re.findall(r"[01]\.[0-9]{6}", text)
#===============================================================================
# BeautifulSoup: inspect the page encoding
#===============================================================================
# soup = BS(text, from_encoding='utf-8')
# print soup.original_encoding
# print soup.declared_html_encoding
# print soup.from_encoding
#===============================================================================
# Scrape the related words
#===============================================================================
def get_related_words(soup):
    """Print and save the text of every ``<font>`` tag found in *soup*.

    Each tag's text is UTF-8 encoded, echoed to stdout, and appended as one
    line to ``../io/_temp.txt``.

    Args:
        soup: Parsed page object exposing ``find_all`` (this file builds it
            with BeautifulSoup).
    """
    with open('../io/_temp.txt', 'a') as out:
        for tag in soup.find_all('font'):
            word = tag.text.encode('utf-8')
            print(word)
            out.write(word + '\n')
        # NOTE(review): assumed to run once per call, after the loop, as a
        # blank-line separator between batches -- confirm against the
        # original layout.
        out.write('\n')
if __name__ == "__main__":
    # Script entry point: show the relevance degrees extracted from `text`.
    degrees = get_degrees(text)
    print(degrees)
    # Related-word extraction is currently switched off:
    # get_related_words(soup)