-
Notifications
You must be signed in to change notification settings - Fork 0
/
webcraw.py
139 lines (111 loc) · 3.95 KB
/
webcraw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import re
import sys
import bs4
import requests
from collections import defaultdict
from xml.etree import ElementTree
from xml.etree.ElementTree import *
from xml.dom import minidom
class Crawler(object):
    """Scraper for gourmet.com search results.

    ``search`` walks the paginated result listing for a query and collects
    one metadata dict per article. ``write_xml`` dumps those dicts to an XML
    file; for the ``url`` field it stores the article text fetched by
    ``extract_content`` instead of the URL itself.

    The code is written to run unchanged on Python 2 and Python 3.
    """

    # Alternative entry point: search by category
    # ("Chef & Categories .... Magazines").
    DEFAULT_PAGE = "http://www.gourmet.com/search/query?query="
    DOMAIN_PAGE = "http://www.gourmet.com"

    def __init__(self):
        pass

    def search(self, search, article_list=None):
        """Collect article metadata for a query, following pagination.

        Args:
            search: The query term. When a non-empty ``article_list`` is
                supplied, this is instead treated as the site-relative href
                of a result page to continue from.
            article_list: Optional list that results are appended to. A
                fresh list is created per call when omitted, so separate
                searches never share results.

        Returns:
            The list of article dicts (``defaultdict(str)``) with the keys
            ``title``, ``url``, ``date``, ``category``, ``contributor`` and
            ``keywords``.
        """
        # A non-empty accumulator marks a continuation call whose ``search``
        # argument is already a site-relative href.
        base_url = Crawler.DOMAIN_PAGE if article_list else Crawler.DEFAULT_PAGE
        if article_list is None:
            article_list = []

        page_url = base_url + search
        visited = set()  # guards against a pagination link that loops back
        while page_url and page_url not in visited:
            visited.add(page_url)
            response = requests.get(page_url)
            soup = bs4.BeautifulSoup(response.content)
            article_list.extend(self._parse_result_page(soup, page_url))
            href = self._next_page_href(soup)
            page_url = Crawler.DOMAIN_PAGE + href if href else None
        return article_list

    def _parse_result_page(self, soup, page_url):
        """Return the article dicts found on one parsed result page."""
        container = soup.find("div", {"class": "results"})
        if container is None:
            # No result listing on this page at all.
            return []

        articles = []
        results = container.find_all("div", {"class": "result"})
        for index, result in enumerate(results):
            info = result.find("div", {"class": re.compile("info")})
            article = self._parse_article(info)
            if article is None:
                print("Article without url and title is not valid {url} in result list {line}: ".format(url=page_url, line=index))
                continue
            articles.append(article)
        return articles

    @staticmethod
    def _node_text(node):
        """Return the text of a parsed node, or '' when the node is missing."""
        return node.getText() if node is not None else ''

    @staticmethod
    def _parse_article(info):
        """Build one article dict from a result's info node.

        Returns None when the title link or its href is missing, since an
        article without both is not usable.
        """
        article = defaultdict(str)
        try:
            link = info.find("h3", {"class": True}).find("a")
            # Title and category stay UTF-8 encoded, as callers received
            # them before.
            article['title'] = link.getText().encode('utf-8')
            article['url'] = link['href']
        except (AttributeError, KeyError, TypeError):
            # AttributeError/TypeError: a tag was not found (None);
            # KeyError: the link has no href attribute.
            return None

        date_node = info.find("div", {"class": re.compile("date")})
        category_node = info.find("h5", {"class": True})
        contributor_node = info.find("div", {"class": "contributor"})
        article['date'] = Crawler._node_text(date_node)
        article['category'] = Crawler._node_text(category_node).encode('utf-8')
        # Store the contributor's text rather than the parsed node so the
        # value can be serialised like every other field.
        article['contributor'] = Crawler._node_text(contributor_node)

        try:
            terms = info.find("div", {"class": "keywords"}).find_all("dd")
            article['keywords'] = ';'.join(
                term.contents[0].string for term in terms)
        except (AttributeError, IndexError, TypeError):
            # No keywords block, an empty <dd>, or a <dd> without plain text.
            article['keywords'] = None
        return article

    @staticmethod
    def _next_page_href(soup):
        """Return the site-relative href of the next result page, or None."""
        marker = soup.find("span", {"class": re.compile("paginationNext")})
        try:
            if "off" in marker['class'][0]:
                # The "next" control is disabled: this is the last page.
                return None
            return marker.contents[0]['href']
        except (AttributeError, IndexError, KeyError, TypeError):
            # Pagination control missing or not shaped as expected.
            return None

    def extract_content(self, url):
        """Fetch an article page and return the text of its paragraphs.

        Args:
            url: Site-relative path of the article; ``DOMAIN_PAGE`` is
                prepended to it.

        Returns:
            The paragraph texts joined by newlines, with runs of whitespace
            collapsed to single spaces. A single space is returned when the
            page cannot be fetched or has no ``div.text`` container.
        """
        page_url = Crawler.DOMAIN_PAGE + url
        try:
            response = requests.get(page_url)
            soup = bs4.BeautifulSoup(response.content)
            paragraphs = soup.find("div", {"class": "text"}).find_all("p")
        except Exception:
            # Best effort: one unreachable or oddly shaped page must not
            # abort the whole export (Ctrl-C still propagates).
            print('Unable to open webpage or find contents')
            return ' '

        lines = []
        for paragraph in paragraphs:
            # Only children that expose a single string contribute text.
            pieces = [part.string for part in paragraph.contents
                      if part.string]
            lines.append(' '.join(' '.join(pieces).split()))
        return '\n'.join(lines)

    @staticmethod
    def prettify(elem):
        """Return ``elem`` serialised as indented XML text."""
        rough_string = tostring(elem, 'utf-8', method='xml')
        reparsed = minidom.parseString(rough_string)
        return reparsed.toprettyxml(indent=" ")

    @staticmethod
    def _to_text(value):
        """Return ``value`` as text, or a single space when it is empty."""
        if not value:
            return ' '
        if isinstance(value, bytes):
            return value.decode('utf-8')
        text_type = type(u'')
        return value if isinstance(value, text_type) else text_type(value)

    # TODO: also change this part
    def write_xml(self, articles, filename):
        """Write the articles to ``filename`` as a pretty-printed XML file.

        The element names of every ``doc`` are the keys of the first
        article. The ``url`` element holds the article text returned by
        ``extract_content`` rather than the URL.

        Args:
            articles: Sequence of article dicts as produced by ``search``.
            filename: Path of the XML file to create.

        Raises:
            SystemExit: With status 1 when ``articles`` is empty. The file
                is not created or truncated in that case.
        """
        if not articles:
            print('No staff to write to file')
            sys.exit(1)

        root = Element('Gourmet')
        attributes = list(articles[0].keys())
        for article in articles:
            doc = SubElement(root, 'doc')
            for attribute in attributes:
                node = SubElement(doc, attribute)
                value = article.get(attribute)
                if attribute == 'url':
                    node.text = self.extract_content(value) if value else ' '
                else:
                    node.text = self._to_text(value)

        # Build the whole document first, then write it in one go so the
        # file handle is always closed and never left half written.
        payload = Crawler.prettify(root).encode('utf-8')
        with open(filename, 'wb') as handle:
            handle.write(payload)
def main(query="Italy", filename='test.xml'):
    """Crawl the search results for ``query`` and write them to ``filename``.

    Args:
        query: Search term passed to ``Crawler.search``.
        filename: Path of the XML file written by ``Crawler.write_xml``.

    Returns:
        None, so ``sys.exit(main())`` exits with status 0 on success.
    """
    crawler = Crawler()
    articles = crawler.search(query)
    crawler.write_xml(articles, filename)
    # To fetch a single article body, call crawler.extract_content() with a
    # site-relative path (the domain is prepended for you), e.g.
    # '/restaurants/2007/01/epi_colmans_italy' or
    # '/food/gourmetlive/2012/103112/welcome-italian-2012'.


if __name__ == '__main__':
    sys.exit(main())