Skip to content

Commit

Permalink
Update scraper.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
Kooben209 committed Oct 11, 2018
1 parent 70f8d6b commit aad4940
Showing 1 changed file with 93 additions and 101 deletions.
scraper.py: 194 changes (93 additions, 101 deletions)
@@ -1,5 +1,3 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import scraperwiki
import sqlite3
import os
Expand All @@ -12,17 +10,8 @@
from decimal import Decimal
from dateutil.parser import parse
import math

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")

chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(chrome_options=chrome_options,executable_path='/usr/local/bin/chromedriver')
import requests
import urllib.parse as urlparse

def parseAskingPrice(aPrice):
try:
Expand All @@ -47,97 +36,100 @@ def saveToStore(data):

if os.environ.get("MORPH_DOMAIN") is not None:
domain = os.environ["MORPH_DOMAIN"]

with requests.session() as s:
s.headers['user-agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'

for k, v in filtered_dict.items():
checkURL = v
if os.environ.get('MORPH_DEBUG') == "1":
print(checkURL)

if os.environ.get('MORPH_MAXDAYS') == "0":
checkURL = checkURL.replace("added=24_hours&","")

driver.get(checkURL)
try:
numOfResults = driver.find_element_by_css_selector('.listing-results-utils-count').text.replace(" ", "").split("of")
numOfResults = int(numOfResults[1])
numOfPages = math.ceil(float(numOfResults)/100)
except ValueError:
numOfPages = 0

print("NumberOfPages:"+str(numOfPages))

page = 0
while page < numOfPages:
numResults=0
numPreFeat=0
numNormFeat=0
numFeat=0
for k, v in filtered_dict.items():
checkURL = v
if os.environ.get('MORPH_DEBUG') == "1":
print(checkURL)

if os.environ.get('MORPH_MAXDAYS') == "0":
checkURL = checkURL.replace("added=24_hours&","")

parsedURL = urlparse.urlparse(checkURL)
params = urlparse.parse_qs(parsedURL.query)
if 'page_size' in params:
pageSize = params['page_size'][0]
else:
pageSize = 25

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
r1 = s.get(checkURL)
soup = BeautifulSoup(r1.content, 'html.parser')

searchResults = soup.find("ul", {"class" : "listing-results clearfix js-gtm-list"})
matches = 0
if searchResults is not None:
adverts = searchResults.findAll("li", {"id" : lambda L: L and L.startswith('listing_')})
numResults = len(adverts)
try:
numOfResults = soup.find("span", {"class" : "listing-results-utils-count"}).text.replace(" ", "").split("of")
numOfResults = int(numOfResults[1])
numOfPages = math.ceil(float(numOfResults)/pageSize)
except:
numOfPages = 0
page = 0
while page < numOfPages:
numResults=0
numPreFeat=0
numNormFeat=0
numFeat=0

for advert in adverts:
reduced=False
if advert.find("div", {"class" : "listing-results-wrapper"}) is not None:
advertMatch = {}
agent = advert.find("p", {"class" : "top-half listing-results-marketed"}).find("span").text

if any(x in agent.lower() for x in excludeAgents):
continue;
if page > 0: #get next page
r1 = s.get(checkURL+"&pn="+str(page+1))
soup = BeautifulSoup(r1.content, 'html.parser')

searchResults = soup.find("ul", {"class" : "listing-results clearfix js-gtm-list"})
matches = 0
if searchResults is not None:
adverts = searchResults.findAll("li", {"id" : lambda L: L and L.startswith('listing_')})
numResults = len(adverts)

for advert in adverts:
reduced=False
if advert.find("div", {"class" : "listing-results-wrapper"}) is not None:
advertMatch = {}
agent = advert.find("p", {"class" : "top-half listing-results-marketed"}).find("span").text

if any(x in agent.lower() for x in excludeAgents):
continue;

location = k.replace("MORPH_URL_","").replace("_"," ").title()
propLink=domain+advert.find("a", {"class" : "listing-results-price text-price"}).get('href')
propId=re.search('\d+',propLink.split("?")[0])
if propId:
propId=propId.group(0)
title = advert.find("h2", {"class" : "listing-results-attr"}).text
address = advert.find("a", {"class" : "listing-results-address"}).text
price = parseAskingPrice(advert.find("a", {"class" : "listing-results-price text-price"}).text.strip())
displayPrice = advert.find("a", {"class" : "listing-results-price text-price"})
unwanted = displayPrice.find('span')
if unwanted is not None:
unwanted = displayPrice.find('span').extract()
displayPrice = displayPrice.text.strip()+" "+unwanted.text.strip()
else:
displayPrice = displayPrice.text.strip()
image1 = advert.find("a", {"class" : "photo-hover"}).find("img").get('src')
addedOrReduced = advert.find("p", {"class" : "top-half listing-results-marketed"}).find("small").text.replace("Listed on","").replace("by","").strip()
if addedOrReduced != None and addedOrReduced != "":
addedOrReduced = parse(addedOrReduced)
else:
addedOrReduced = datetime.now().date()
advertMatch['propId'] = propId
advertMatch['link'] = propLink
advertMatch['title'] = title
advertMatch['address'] = address
advertMatch['price'] = price
advertMatch['displayPrice'] = displayPrice
advertMatch['image1'] = image1
advertMatch['pubDate'] = datetime.now()
advertMatch['addedOrReduced'] = addedOrReduced
advertMatch['reduced'] = reduced
advertMatch['location'] = location

saveToStore(advertMatch)

matches += 1
print("Found "+str(matches)+" Matches from "+str(numResults)+" Items of which "+str(numFeat)+" are Featured")
if matches == 0 or (numResults-numFeat-2)>matches:
break
else:
print('No Search Results\n')

if page < (numOfPages-1):
next_page = driver.find_element_by_link_text('Next')
next_page.click()
time.sleep(sleepTime)
page +=1
time.sleep(sleepTime)
driver.quit()
location = k.replace("MORPH_URL_","").replace("_"," ").title()
propLink=domain+advert.find("a", {"class" : "listing-results-price text-price"}).get('href')
propId=re.search('\d+',propLink.split("?")[0])
if propId:
propId=propId.group(0)
title = advert.find("h2", {"class" : "listing-results-attr"}).text
address = advert.find("a", {"class" : "listing-results-address"}).text
price = parseAskingPrice(advert.find("a", {"class" : "listing-results-price text-price"}).text.strip())
displayPrice = advert.find("a", {"class" : "listing-results-price text-price"})
unwanted = displayPrice.find('span')
if unwanted is not None:
unwanted = displayPrice.find('span').extract()
displayPrice = displayPrice.text.strip()+" "+unwanted.text.strip()
else:
displayPrice = displayPrice.text.strip()
image1 = advert.find("a", {"class" : "photo-hover"}).find("img").get('src')
addedOrReduced = advert.find("p", {"class" : "top-half listing-results-marketed"}).find("small").text.replace("Listed on","").replace("by","").strip()
if addedOrReduced != None and addedOrReduced != "":
addedOrReduced = parse(addedOrReduced)
else:
addedOrReduced = datetime.now().date()
advertMatch['propId'] = propId
advertMatch['link'] = propLink
advertMatch['title'] = title
advertMatch['address'] = address
advertMatch['price'] = price
advertMatch['displayPrice'] = displayPrice
advertMatch['image1'] = image1
advertMatch['pubDate'] = datetime.now()
advertMatch['addedOrReduced'] = addedOrReduced
advertMatch['reduced'] = reduced
advertMatch['location'] = location

saveToStore(advertMatch)

matches += 1
if matches == 0 or (numResults-numFeat-2)>matches:
break
else:
print('No Search Results\n')
page +=1
time.sleep(sleepTime)
sys.exit(0)

0 comments on commit aad4940

Please sign in to comment.