/
scraper.py
121 lines (109 loc) · 3.26 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# This is a template for a Python scraper on Morph (https://morph.io)
# including some code snippets below that you should find helpful
# import scraperwiki
# import lxml.html
#
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
#
# # Write out to the sqlite database using scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where name='peter'")
# You don't have to do things with the ScraperWiki and lxml libraries. You can use whatever libraries are installed
# on Morph for Python (https://github.com/openaustralia/morph-docker-python/blob/master/pip_requirements.txt) and all that matters
# is that your final data is written to an Sqlite database called data.sqlite in the current working directory which
# has at least a table called data.
Skip to content
Explore
Gist
Blog
Help
NaefNaef NaefNaef
3
3
6
pallih/scraperwiki-scraper-vault
scraperwiki-scraper-vault / Users / T / tozzi / homegatech.py
Páll Hilmarsson pallih on 30 Sep 2013
Scrape on 30.9.2013
1 contributor
69 lines (62 sloc) 2.268 kb
import scraperwiki
from BeautifulSoup import BeautifulSoup
def scrape_table(soup):
i = 0
tds = soup.findAll('tr') # get all the <td> tags
for tr in tds:
k = 0
record = {}
tdr = tr.findAll('td') # get all the <td> tags
i = i + 1
record["i"] = i
for td in tdr:
k = k + 1
if k == 4:
record["address"] = td.text
if k == 5:
record["detail"] = td.text
if k == 6:
record["price"] = td.text
print record
scraperwiki.datastore.save(["i"], record)
def scrape_and_look_for_next_link(url):
    """Fetch one results page, scrape its table, and locate the next-page
    link.  Following that link is intentionally disabled below, so only
    the single page at *url* is scraped."""
    page_html = scraperwiki.scrape(url)
    page_soup = BeautifulSoup(page_html)
    scrape_table(page_soup)
    # The "next page" anchor; currently found but never followed.
    next_link = page_soup.find("a", {"class": "forward iconLink"})
    # Pagination deliberately switched off -- re-enable to walk all pages:
    # if next_link:
    #     scrape_and_look_for_next_link(next_link['href'])
# Entry point: seed URL for apartments for sale in the Zurich district
# (price <= 900000), then scrape the first results page.
starting_url = 'http://www.homegate.ch/kaufen/wohnung/bezirk-zuerich/trefferliste?a=default&tab=list&l=default&cid=3032967&aj=900000&ep=1&incsubs=default&tid=1&fromItem=ctn_zh'
scrape_and_look_for_next_link(starting_url)
import scraperwiki
from BeautifulSoup import BeautifulSoup
def scrape_table(soup):
    """Extract address/detail/price from each table row and save it.

    NOTE(review): this is an identical second copy of ``scrape_table`` --
    the whole script appears twice in this file (a paste artifact of the
    page scrape).  At import time this redefinition shadows the first.
    """
    i = 0
    tds = soup.findAll('tr')  # all table rows (original comment wrongly said <td>)
    for tr in tds:
        k = 0
        record = {}
        tdr = tr.findAll('td')  # the row's data cells
        i = i + 1
        record["i"] = i  # 1-based row counter, used as the unique key
        for td in tdr:
            k = k + 1
            # Columns 4-6 hold the fields we keep.
            if k == 4:
                record["address"] = td.text
            if k == 5:
                record["detail"] = td.text
            if k == 6:
                record["price"] = td.text
        print record
        scraperwiki.datastore.save(["i"], record)
def scrape_and_look_for_next_link(url):
    """Fetch one results page, scrape it, and find (but not follow) the
    next-page link.

    NOTE(review): identical second copy of this function -- the whole
    script appears twice in this file (paste artifact); this redefinition
    shadows the first.
    """
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    scrape_table(soup)
    next_link = soup.find("a", { "class" : "forward iconLink" })  # "next page" anchor, unused
    # Pagination deliberately disabled:
    # if next_link:
    #     next_url = next_link['href']
    #     scrape_and_look_for_next_link(next_url)
# NOTE(review): duplicate of the entry-point lines earlier in this file
# (paste artifact) -- executing the file as-is scrapes the page a second time.
starting_url = 'http://www.homegate.ch/kaufen/wohnung/bezirk-zuerich/trefferliste?a=default&tab=list&l=default&cid=3032967&aj=900000&ep=1&incsubs=default&tid=1&fromItem=ctn_zh'
scrape_and_look_for_next_link(starting_url)
Status
API
Training
Shop
Blog
About
© 2014 GitHub, Inc.
Terms
Privacy
Security
Contact