In [1]:
import os
import time
from datetime import date

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager

In [11]:
class Scraper_Yellowpages:
    """Selenium scraper for search-result listings on yellowpages.com.au.

    A single Firefox session is kept in ``self.driver``.  ``scrape_page``
    converts the listings of one results page into a DataFrame,
    ``scrape_all_pages`` keeps following the "Next" link, and
    ``scrape_multiple_searches`` runs every search term against every state
    and periodically checkpoints the merged result to a CSV file.

    The column name "Adress" (sic) is kept unchanged so the output schema
    stays the same.
    """

    # Column order of every DataFrame produced by this class.
    COLUMNS = ("Name", "Adress", "GeoLocation", "Phone", "Website", "Category",
               "Email", "Crops", "Source", "DateUpdated", "DateChecked")
    # (column, CSS class inside a listing, attribute to read or None for .text)
    FIELDS = (
        ("Name", "listing-name", None),
        ("Adress", "listing-address", None),
        ("Phone", "contact-text", None),
        ("Website", "contact-url", "href"),
        ("Category", "listing-heading", None),
        ("Email", "contact-email", "data-email"),
    )
    # Values substituted into the ``locationClue`` query parameter.
    STATES = ("NSW", "QLD", "VIC", "WA", "SA", "TAS", "ACT", "NT")
    MISSING = "N/V"            # placeholder stored when a field cannot be read
    MAX_PAGES = 30             # upper bound on result pages followed per search
    DEDUP_KEYS = ["Name", "Adress"]
    SAVE_EVERY = 32            # checkpoint whenever index % SAVE_EVERY == 0
    RESTART_EVERY = 8          # restart the browser whenever index % RESTART_EVERY == 0
    RESTART_PAUSE = 20         # seconds slept between quitting and relaunching

    def __init__(self):
        """Store the default search URL and start a Firefox WebDriver session."""
        # Not used by the methods of this class; kept as an attribute for callers.
        self.url = "https://www.yellowpages.com.au/search/listings?clue=farm&locationClue=All+States&lat=&lon=&referredBy=UNKNOWN&selectedViewMode=list&eventType=refinement&openNow=false&refinedCategory=35823"
        self.driver = self._new_driver()

    @staticmethod
    def _new_driver():
        """Launch Firefox using the geckodriver resolved by webdriver_manager."""
        return webdriver.Firefox(executable_path=GeckoDriverManager().install())

    def _restart_driver(self):
        """Replace ``self.driver`` with a fresh browser session.

        ``quit()`` is used rather than ``close()``: ``close()`` only closes the
        current window, whereas ``quit()`` also ends the driver session, so
        repeated restarts do not leave driver processes behind.
        """
        self.driver.quit()
        time.sleep(self.RESTART_PAUSE)
        self.driver = self._new_driver()

    @classmethod
    def _read_field(cls, listing, css_class, attribute=None):
        """Return the text (or ``attribute``) of the first child with ``css_class``.

        Any lookup error yields ``MISSING``.  ``Exception`` is caught instead of
        using a bare ``except`` so that ``KeyboardInterrupt`` (kernel interrupt)
        still stops a running scrape.
        """
        try:
            element = listing.find_element_by_class_name(css_class)
            if attribute is None:
                return element.text
            return element.get_attribute(attribute)
        except Exception:
            return cls.MISSING

    def scrape_page(self, url=""):
        """Scrape every element with class ``listing`` on one results page.

        Parameters
        ----------
        url : str, optional
            Page to load first.  When empty, the page currently shown by the
            driver is scraped and its URL is recorded as the source.

        Returns
        -------
        pandas.DataFrame
            One row per listing with the columns in ``COLUMNS``.  Unreadable
            fields hold "N/V"; "GeoLocation" and "Crops" are empty strings and
            "DateChecked" is "-".
        """
        driver = self.driver
        if url != "":
            driver.get(url)
        else:
            url = driver.current_url

        today = date.today().strftime("%d/%m/%Y")
        data = {column: [] for column in self.COLUMNS}
        for listing in driver.find_elements_by_class_name('listing'):
            for column, css_class, attribute in self.FIELDS:
                data[column].append(self._read_field(listing, css_class, attribute))
            # Columns that are not read from the page.
            data["GeoLocation"].append("")
            data["Crops"].append("")
            data["Source"].append(url)
            data["DateUpdated"].append(today)
            data["DateChecked"].append("-")
        return pd.DataFrame(data)

    def scrape_all_pages(self, url):
        """Load ``url`` and scrape it plus the pages reached through "Next".

        At most ``MAX_PAGES`` pages are scraped; the loop stops early as soon
        as the "Next" link cannot be found or clicked.

        Returns
        -------
        pandas.DataFrame
            All scraped pages concatenated (row index not reset).
        """
        driver = self.driver
        driver.get(url)

        pages = []
        for _ in range(self.MAX_PAGES):
            pages.append(self.scrape_page())
            try:
                driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
            except Exception:
                break
        # Concatenate once instead of growing a frame inside the loop.
        return pd.concat(pages, sort=False)

    def _save_checkpoint(self, frames):
        """Merge ``frames``, drop duplicate (Name, Adress) rows and write the CSV.

        The file is ``Data/Yellowpages_<dd-mm-YYYY>.csv``; the ``Data``
        directory is created when it does not exist yet.  Returns the merged,
        de-duplicated frame.
        """
        merged = pd.concat(frames, sort=False).drop_duplicates(subset=self.DEDUP_KEYS)
        os.makedirs("Data", exist_ok=True)
        merged.to_csv(f"Data/Yellowpages_{date.today().strftime('%d-%m-%Y')}.csv")
        return merged

    def scrape_multiple_searches(self, searches):
        """Run every search term in every state and return the merged listings.

        Parameters
        ----------
        searches : sized iterable of str
            Search terms inserted into the ``clue`` query parameter.

        Returns
        -------
        pandas.DataFrame
            All listings with duplicate (Name, Adress) rows removed.  The same
            frame is written to ``Data/Yellowpages_<dd-mm-YYYY>.csv``.  With no
            search terms an empty frame is returned and nothing is written.

        Notes
        -----
        ``index`` counts (term, state) pairs from 0.  A checkpoint is written
        when ``index % 32 == 0`` and the browser is restarted when
        ``index % 8 == 0``, so both also happen right after the first search.
        """
        total = len(searches) * len(self.STATES)
        # Holds the last de-duplicated checkpoint (if any) followed by the
        # frames scraped since then.
        frames = []
        index = 0
        for title in searches:
            for state in self.STATES:
                url = f"https://www.yellowpages.com.au/search/listings?clue={title}&locationClue={state}&pageNumber=1&referredBy=www.yellowpages.com.au&&eventType=pagination"
                frames.append(self.scrape_all_pages(url))

                if index % self.SAVE_EVERY == 0:
                    frames = [self._save_checkpoint(frames)]
                    print(f"Iteration number: {index+1}/{total}.")
                if index % self.RESTART_EVERY == 0:
                    self._restart_driver()

                index = index + 1

        if not frames:
            return pd.DataFrame(columns=list(self.COLUMNS))
        # De-duplicate before the final write as well, so the last file is not
        # a less clean version of the previous checkpoint.
        return self._save_checkpoint(frames)

In [3]:
class Scraper_Maps:
    """Selenium scraper for Google Maps search-result lists.

    A single Firefox session is kept in ``self.driver``.  ``scrape_maps``
    reads names and phone numbers from one result list,
    ``scrape_maps_detailed`` walks through result pages and opens each result
    to read its address and website, and ``scrape_maps_multiple_searches``
    runs every search term against every state with periodic CSV checkpoints.

    The column name "Adress" (sic) is kept unchanged so the output schema
    stays the same.
    """

    # Column order of the DataFrame built by ``scrape_maps_detailed``.
    COLUMNS = ("Name", "Phone", "Adress", "GeoLocation", "Website", "Category",
               "Crops", "Source", "DateUpdated", "DateChecked")
    # State names inserted into the "<term>+in+<state>" search query.
    STATES = ("New South Wales", "Queensland", "Victoria", "Western Australia",
              "South Australia", "Tasmania", "Northern Territory")
    # "<y>,<x>,<zoom>z" segment placed after "@" in the search URL.
    # NOTE(review): presumably latitude,longitude of the map centre - confirm.
    MAP_VIEW = "-30,117,4.67z"
    MISSING = "N/V"            # placeholder stored when a field cannot be read
    DEDUP_KEYS = ["Name", "Adress"]
    SAVE_EVERY = 12            # checkpoint + browser restart every 12th search
    RESTART_PAUSE = 3          # seconds slept between quitting and relaunching
    POLL_SECONDS = 1.5         # pause before each read of the result list

    CONSENT_XPATH = "//button[contains(@class, 'widget-consent-button-later')]"
    NEXT_XPATH = "//span[contains(@class, 'button-next')]"

    def __init__(self):
        """Start a Firefox WebDriver session."""
        self.driver = self._new_driver()

    @staticmethod
    def _new_driver():
        """Launch Firefox using the geckodriver resolved by webdriver_manager."""
        return webdriver.Firefox(executable_path=GeckoDriverManager().install())

    def _restart_driver(self):
        """Replace ``self.driver`` with a fresh browser session.

        ``quit()`` is used rather than ``close()``: ``close()`` only closes the
        current window, whereas ``quit()`` also ends the driver session.
        """
        self.driver.quit()
        time.sleep(self.RESTART_PAUSE)
        self.driver = self._new_driver()

    @staticmethod
    def _xpath_literal(text):
        """Quote ``text`` as an XPath 1.0 string literal.

        XPath 1.0 has no escape character, so a value containing both quote
        kinds is assembled with ``concat()``.  Text without a double quote is
        wrapped in double quotes.
        """
        if '"' not in text:
            return '"' + text + '"'
        if "'" not in text:
            return "'" + text + "'"
        pieces = ('"' + piece + '"' for piece in text.split('"'))
        return "concat(" + ", '\"', ".join(pieces) + ")"

    @staticmethod
    def _try_click(find, locator):
        """Click the element returned by ``find(locator)``; return success.

        ``Exception`` (not a bare ``except``) is caught so that a kernel
        interrupt is not swallowed.
        """
        try:
            find(locator).click()
            return True
        except Exception:
            return False

    @classmethod
    def _text_or_missing(cls, find, locator):
        """Return ``find(locator).text`` or ``MISSING`` when the lookup fails."""
        try:
            return find(locator).text
        except Exception:
            return cls.MISSING

    def scrape_maps(self, url):
        """Load ``url``, advance to the next result page and read names/phones.

        The consent dialog is dismissed when present.  The "next" button is
        clicked unconditionally, so a missing button raises the driver's
        lookup error.

        Returns
        -------
        pandas.DataFrame
            Columns "Name" and "Phone", one row per ``section-result``
            element; unreadable values hold "N/V".
        """
        driver = self.driver
        driver.get(url)
        driver.implicitly_wait(1)
        self._try_click(driver.find_element_by_xpath, self.CONSENT_XPATH)
        driver.find_element_by_xpath(self.NEXT_XPATH).click()

        data = {"Name": [], "Phone": []}
        for result in driver.find_elements_by_class_name('section-result'):
            data["Name"].append(self._text_or_missing(result.find_element_by_class_name, "section-result-title"))
            data["Phone"].append(self._text_or_missing(result.find_element_by_class_name, "section-result-phone-number"))
        return pd.DataFrame(data)

    def scrape_maps_detailed(self, url, length_of_search=100, max_stale_polls=20):
        """Walk through the result pages of ``url`` and collect one row per result.

        Parameters
        ----------
        url : str
            Google Maps search URL; also stored in the "Source" column.
        length_of_search : int
            Page counter limit.  The counter starts at 1 and scraping stops
            once it reaches this value, so at most ``length_of_search - 1``
            pages are scraped.
        max_stale_polls : int
            Number of consecutive polls (``POLL_SECONDS`` apart) in which the
            first result title may stay unchanged after a "next" click before
            the search is considered finished.  Without this bound a "next"
            button that can be clicked but does not change the list would keep
            the loop running forever.

        Returns
        -------
        pandas.DataFrame
            Columns as in ``COLUMNS``.  Unreadable values hold "N/V";
            "Crops", "GeoLocation" and "DateChecked" hold "-".
        """
        driver = self.driver
        driver.implicitly_wait(0.5)
        driver.get(url)

        data = {column: [] for column in self.COLUMNS}
        previous_first = ""   # first title of the page scraped last
        page = 1
        stale_polls = 0
        while True:
            try:
                time.sleep(self.POLL_SECONDS)
                results = driver.find_elements_by_class_name('section-result')
                current_first = results[0].find_element_by_class_name("section-result-title").text
            except Exception:
                # No result list (or no title) on the page: nothing left to read.
                break
            if page >= length_of_search:
                break
            if current_first == previous_first:
                # The list has not changed since the last scraped page.
                stale_polls += 1
                if stale_polls >= max_stale_polls:
                    break
                continue
            stale_polls = 0

            today = date.today().strftime("%d/%m/%Y")
            titles = []
            for result in results:
                find_child = result.find_element_by_class_name
                title = self._text_or_missing(find_child, "section-result-title")
                titles.append(title)
                data["Name"].append(title)
                data["Phone"].append(self._text_or_missing(find_child, "section-result-phone-number"))
                data["Category"].append(self._text_or_missing(find_child, "section-result-details"))
                data["Source"].append(url)
                data["DateUpdated"].append(today)
                data["DateChecked"].append("-")
                data["Crops"].append("-")
                data["GeoLocation"].append("-")

            # Result elements go stale after navigating, so each result is
            # re-located by its title before opening its detail view.
            for title in titles:
                title_xpath = "//span[contains(text(), " + self._xpath_literal(title) + ")]"
                if self._try_click(driver.find_element_by_xpath, title_xpath):
                    data["Adress"].append(self._text_or_missing(driver.find_element_by_xpath, "//div[@data-section-id='ad']"))
                    data["Website"].append(self._text_or_missing(driver.find_element_by_xpath, "//div[@data-section-id='ap']"))
                else:
                    # The detail view was not opened; do not attribute whatever
                    # is currently displayed to this result.
                    data["Adress"].append(self.MISSING)
                    data["Website"].append(self.MISSING)
                # Attempted in both cases so a detail view left open earlier is closed.
                self._try_click(driver.find_element_by_class_name, "section-back-to-list-button")

            self._try_click(driver.find_element_by_xpath, self.CONSENT_XPATH)
            if not self._try_click(driver.find_element_by_xpath, self.NEXT_XPATH):
                break
            previous_first = current_first
            page = page + 1
        return pd.DataFrame(data)

    def _save_checkpoint(self, frames):
        """Merge ``frames``, drop duplicate (Name, Adress) rows and write the CSV.

        The file is ``Data/Googlemaps_<dd-mm-YYYY>.csv``; the ``Data``
        directory is created when it does not exist yet.  Returns the merged,
        de-duplicated frame.
        """
        merged = pd.concat(frames, sort=False).drop_duplicates(subset=self.DEDUP_KEYS)
        os.makedirs("Data", exist_ok=True)
        merged.to_csv(f"Data/Googlemaps_{date.today().strftime('%d-%m-%Y')}.csv")
        return merged

    def scrape_maps_multiple_searches(self, searches):
        """Run every search term in every state and return the merged results.

        Parameters
        ----------
        searches : iterable of str
            Search terms; each is combined with every entry of ``STATES``.

        Returns
        -------
        pandas.DataFrame
            All results with duplicate (Name, Adress) rows removed.  The same
            frame is written to ``Data/Googlemaps_<dd-mm-YYYY>.csv``.  With no
            search terms an empty frame is returned and nothing is written.

        Notes
        -----
        ``index`` counts (term, state) pairs from 1.  Every 12th pair a
        checkpoint is written and the browser is restarted.  A final
        checkpoint is written after the loop so results gathered after the
        last multiple of 12 are de-duplicated and saved too.
        """
        # Holds the last de-duplicated checkpoint (if any) followed by the
        # frames scraped since then.
        frames = []
        index = 1
        for title in searches:
            for state in self.STATES:
                url = f"https://www.google.com/maps/search/{title}+in+{state}/@{self.MAP_VIEW}/data=!3m1!4b1?hl=en"
                frames.append(self.scrape_maps_detailed(url, 120))
                if index % self.SAVE_EVERY == 0:
                    frames = [self._save_checkpoint(frames)]
                    self._restart_driver()
                index = index + 1

        if not frames:
            return pd.DataFrame(columns=list(self.COLUMNS))
        return self._save_checkpoint(frames)

In [12]:
# Create the Yellow Pages scraper; its constructor starts a Firefox WebDriver session.
YP_Tool = Scraper_Yellowpages()


Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]


In [7]:
# Create the Google Maps scraper; its constructor starts a second Firefox WebDriver session.
MAPS_Tool = Scraper_Maps()

Getting latest mozilla release info for v0.26.0
Trying to download new driver from https://github.com/mozilla/geckodriver/releases/download/v0.26.0/geckodriver-v0.26.0-macos.tar.gz
Unpack archive /Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver-v0.26.0-macos.tar.gz


In [5]:
# Read the spreadsheet of search terms; header=1 takes the column labels from
# the second row of the sheet (row index 1).
input_file = pd.read_excel("Inputs2.xlsx", header=1)
# Series of search terms from the "Search" column; passed to both scrapers below.
Searches = input_file["Search"]

In [8]:
# Run every search term through the Google Maps scraper.
# NOTE(review): the Yellow Pages cell further down assigns to the same name `df`,
# so `df` only holds the result of whichever of the two cells was executed last.
df = MAPS_Tool.scrape_maps_multiple_searches(Searches)


Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]

Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]

Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]

Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]

Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]

Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]

Looking for [geckodriver v0

TimeoutException: Message: Timeout loading page after 300000ms


In [13]:
# Run every search term through the Yellow Pages scraper.
# NOTE(review): this rebinds `df`, which the Google Maps cell above also assigns.
df = YP_Tool.scrape_multiple_searches(Searches)

Iteration number: 1/320.

Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]

Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]

Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]

Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]
Iteration number: 33/320.

Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]

Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.2

In [None]:
# Preview the first rows of the scrape result most recently assigned to `df`.
df.head()

In [31]:
# Display the search terms loaded from Inputs2.xlsx.
Searches

0                  barn
1              barnyard
2              cropland
3                  farm
4             farm land
5             farmstead
6              farmyard
7              forestry
8             freerange
9          fruit garden
10               garden
11               grange
12            grapevine
13                grove
14                 grow
15             hacienda
16              harvest
17             hatchery
18            homestead
19               meadow
20              orchard
21         packing shed
22              paddock
23           plantation
24             planting
25                ranch
26              rearing
27                 shed
28           solar farm
29              spinney
30    vegetable nursery
31             vineyard
32                 wine
33               winery
34                 yard
35               alpaca
36                  bee
37                 beef
38              bufallo
39                 bull
Name: Search, dtype: object