Skip to content

Commit

Permalink
Update scraper.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
Kooben209 committed Oct 11, 2018
1 parent 70f8d6b commit aad4940
Showing 1 changed file with 93 additions and 101 deletions.
scraper.py: 194 changes (93 additions, 101 deletions)
@@ -1,5 +1,3 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import scraperwiki
import sqlite3
import os
Expand All @@ -12,17 +10,8 @@
from decimal import Decimal
from dateutil.parser import parse
import math

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")

chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(chrome_options=chrome_options,executable_path='/usr/local/bin/chromedriver')
import requests
import urllib.parse as urlparse

def parseAskingPrice(aPrice):
try:
Expand All @@ -47,97 +36,100 @@ def saveToStore(data):

if os.environ.get("MORPH_DOMAIN") is not None:
domain = os.environ["MORPH_DOMAIN"]

with requests.session() as s:
s.headers['user-agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'

for k, v in filtered_dict.items():
checkURL = v
if os.environ.get('MORPH_DEBUG') == "1":
print(checkURL)

if os.environ.get('MORPH_MAXDAYS') == "0":
checkURL = checkURL.replace("added=24_hours&","")

driver.get(checkURL)
try:
numOfResults = driver.find_element_by_css_selector('.listing-results-utils-count').text.replace(" ", "").split("of")
numOfResults = int(numOfResults[1])
numOfPages = math.ceil(float(numOfResults)/100)
except ValueError:
numOfPages = 0

print("NumberOfPages:"+str(numOfPages))

page = 0
while page < numOfPages:
numResults=0
numPreFeat=0
numNormFeat=0
numFeat=0
for k, v in filtered_dict.items():
checkURL = v
if os.environ.get('MORPH_DEBUG') == "1":
print(checkURL)

if os.environ.get('MORPH_MAXDAYS') == "0":
checkURL = checkURL.replace("added=24_hours&","")

parsedURL = urlparse.urlparse(checkURL)
params = urlparse.parse_qs(parsedURL.query)
if 'page_size' in params:
pageSize = params['page_size'][0]
else:
pageSize = 25

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
r1 = s.get(checkURL)
soup = BeautifulSoup(r1.content, 'html.parser')

searchResults = soup.find("ul", {"class" : "listing-results clearfix js-gtm-list"})
matches = 0
if searchResults is not None:
adverts = searchResults.findAll("li", {"id" : lambda L: L and L.startswith('listing_')})
numResults = len(adverts)
try:
numOfResults = soup.find("span", {"class" : "listing-results-utils-count"}).text.replace(" ", "").split("of")
numOfResults = int(numOfResults[1])
numOfPages = math.ceil(float(numOfResults)/pageSize)
except:
numOfPages = 0
page = 0
while page < numOfPages:
numResults=0
numPreFeat=0
numNormFeat=0
numFeat=0

for advert in adverts:
reduced=False
if advert.find("div", {"class" : "listing-results-wrapper"}) is not None:
advertMatch = {}
agent = advert.find("p", {"class" : "top-half listing-results-marketed"}).find("span").text

if any(x in agent.lower() for x in excludeAgents):
continue;
if page > 0: #get next page
r1 = s.get(checkURL+"&pn="+str(page+1))
soup = BeautifulSoup(r1.content, 'html.parser')

searchResults = soup.find("ul", {"class" : "listing-results clearfix js-gtm-list"})
matches = 0
if searchResults is not None:
adverts = searchResults.findAll("li", {"id" : lambda L: L and L.startswith('listing_')})
numResults = len(adverts)

for advert in adverts:
reduced=False
if advert.find("div", {"class" : "listing-results-wrapper"}) is not None:
advertMatch = {}
agent = advert.find("p", {"class" : "top-half listing-results-marketed"}).find("span").text

if any(x in agent.lower() for x in excludeAgents):
continue;

location = k.replace("MORPH_URL_","").replace("_"," ").title()
propLink=domain+advert.find("a", {"class" : "listing-results-price text-price"}).get('href')
propId=re.search('\d+',propLink.split("?")[0])
if propId:
propId=propId.group(0)
title = advert.find("h2", {"class" : "listing-results-attr"}).text
address = advert.find("a", {"class" : "listing-results-address"}).text
price = parseAskingPrice(advert.find("a", {"class" : "listing-results-price text-price"}).text.strip())
displayPrice = advert.find("a", {"class" : "listing-results-price text-price"})
unwanted = displayPrice.find('span')
if unwanted is not None:
unwanted = displayPrice.find('span').extract()
displayPrice = displayPrice.text.strip()+" "+unwanted.text.strip()
else:
displayPrice = displayPrice.text.strip()
image1 = advert.find("a", {"class" : "photo-hover"}).find("img").get('src')
addedOrReduced = advert.find("p", {"class" : "top-half listing-results-marketed"}).find("small").text.replace("Listed on","").replace("by","").strip()
if addedOrReduced != None and addedOrReduced != "":
addedOrReduced = parse(addedOrReduced)
else:
addedOrReduced = datetime.now().date()
advertMatch['propId'] = propId
advertMatch['link'] = propLink
advertMatch['title'] = title
advertMatch['address'] = address
advertMatch['price'] = price
advertMatch['displayPrice'] = displayPrice
advertMatch['image1'] = image1
advertMatch['pubDate'] = datetime.now()
advertMatch['addedOrReduced'] = addedOrReduced
advertMatch['reduced'] = reduced
advertMatch['location'] = location

saveToStore(advertMatch)

matches += 1
print("Found "+str(matches)+" Matches from "+str(numResults)+" Items of which "+str(numFeat)+" are Featured")
if matches == 0 or (numResults-numFeat-2)>matches:
break
else:
print('No Search Results\n')

if page < (numOfPages-1):
next_page = driver.find_element_by_link_text('Next')
next_page.click()
time.sleep(sleepTime)
page +=1
time.sleep(sleepTime)
driver.quit()
location = k.replace("MORPH_URL_","").replace("_"," ").title()
propLink=domain+advert.find("a", {"class" : "listing-results-price text-price"}).get('href')
propId=re.search('\d+',propLink.split("?")[0])
if propId:
propId=propId.group(0)
title = advert.find("h2", {"class" : "listing-results-attr"}).text
address = advert.find("a", {"class" : "listing-results-address"}).text
price = parseAskingPrice(advert.find("a", {"class" : "listing-results-price text-price"}).text.strip())
displayPrice = advert.find("a", {"class" : "listing-results-price text-price"})
unwanted = displayPrice.find('span')
if unwanted is not None:
unwanted = displayPrice.find('span').extract()
displayPrice = displayPrice.text.strip()+" "+unwanted.text.strip()
else:
displayPrice = displayPrice.text.strip()
image1 = advert.find("a", {"class" : "photo-hover"}).find("img").get('src')
addedOrReduced = advert.find("p", {"class" : "top-half listing-results-marketed"}).find("small").text.replace("Listed on","").replace("by","").strip()
if addedOrReduced != None and addedOrReduced != "":
addedOrReduced = parse(addedOrReduced)
else:
addedOrReduced = datetime.now().date()
advertMatch['propId'] = propId
advertMatch['link'] = propLink
advertMatch['title'] = title
advertMatch['address'] = address
advertMatch['price'] = price
advertMatch['displayPrice'] = displayPrice
advertMatch['image1'] = image1
advertMatch['pubDate'] = datetime.now()
advertMatch['addedOrReduced'] = addedOrReduced
advertMatch['reduced'] = reduced
advertMatch['location'] = location

saveToStore(advertMatch)

matches += 1
if matches == 0 or (numResults-numFeat-2)>matches:
break
else:
print('No Search Results\n')
page +=1
time.sleep(sleepTime)
sys.exit(0)

0 comments on commit aad4940

Please sign in to comment.