In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from datetime import date

In [3]:
class Scraper_Yellowpages:
    def __init__(self):
        self.url = "https://www.yellowpages.com.au/search/listings?clue=farm&locationClue=All+States&lat=&lon=&selectedViewMode=list"
        #self.driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
        options = webdriver.ChromeOptions()
        options.add_argument("headless")
        self.driver = webdriver.Chrome(chrome_options=options)
        self.driver = webdriver.Chrome()
    def scrape_page(self, url = ""):
        driver = self.driver
        if url != "":
            driver.get(url)
        else:
            url = driver.current_url

        results = driver.find_elements_by_class_name('listing')

        columns = ["Name", "Adress", "Phone", "Website", "Email", "Source", "DateUpdated", "DateChecked"]
        webObjects = ["listing-name", "listing-address", "contact-text", "contact-url", "contact-email", 0, 0]
        dataDict = {"Name": [], "Adress": [], "Phone": [], "Website": [], "Email": [], "Source": [], "DateUpdated": [], "DateChecked": []}

        for result in results:
            for i in range(0, len(columns)):
                if columns[i] == "DateUpdated":
                    dataDict[columns[i]].append(date.today().strftime("%d/%m/%Y"))
                elif columns[i] == "Source":
                    dataDict[columns[i]].append(url)
                elif columns[i] == "DateChecked":
                    dataDict[columns[i]].append("-")
                elif columns[i] == "Phone":
                    try:
                        value = result.find_element_by_class_name(webObjects[i])
                        dataDict[columns[i]].append(value.text)
                    except:
                        dataDict[columns[i]].append("N/V")
                elif columns[i] == "Email":
                    try:
                        value = result.find_element_by_class_name(webObjects[i])
                        dataDict[columns[i]].append(value.get_attribute("data-email"))
                    except:
                        dataDict[columns[i]].append("N/V")
                elif columns[i] == "Website":
                    try:
                        value = result.find_element_by_class_name(webObjects[i])
                        dataDict[columns[i]].append(value.get_attribute("href"))
                    except:
                        dataDict[columns[i]].append("N/V")
                elif webObjects[i] != 0:    
                    try:
                        value = result.find_element_by_class_name(webObjects[i])
                        dataDict[columns[i]].append(value.text)
                    except:
                        dataDict[columns[i]].append("N/V") 
                else:
                    dataDict[columns[i]].append("N/V") 

        df = pd.DataFrame(dataDict)
        return df

    def scrape_all_pages(self, url):
        driver = self.driver
        driver.get(url)

        i = 0
        while i <= 50:
            df_page = self.scrape_page()

            if i != 0:
                df = pd.concat([df, df_page], sort=False)
            else:
                df = df_page
                
            try:
                driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
            except:
                break
            i = i + 1
        return df
    
    def scrape_multiple_searches(self, searches):
        for index, title in enumerate(searches):
            url = f"https://www.yellowpages.com.au/search/listings?clue={title}&locationClue=All+States&lat=&lon=&selectedViewMode=list"
            
            df_page = self.scrape_all_pages(url)
            if index == 0:  
                df = df_page
            else:
                df = pd.concat([df, df_page], sort=False)
        df.to_csv(f"Data/Yellowpages_{date.today().strftime('%d-%m-%Y')}.csv")
        return df     

In [39]:
class Scraper_Maps:
    def __init__(self):
        self.driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
    def scrape_maps(self, url):
        driver = self.driver
        driver.get(url)
        driver.implicitly_wait(1)
        driver.find_element_by_xpath("//button[contains(@class, 'widget-consent-button-later')]").click()
        driver.find_element_by_xpath("//span[contains(@class, 'button-next')]").click()

        results = driver.find_elements_by_class_name('section-result')
        dataDict = {"Name": [], "Phone": []}
        for result in results:
            dataDict["Name"].append(result.find_element_by_class_name("section-result-title").text)
            dataDict["Phone"].append(result.find_element_by_class_name("section-result-phone-number").text)
        df = pd.DataFrame(dataDict)
        return df

    def scrape_maps_detailed(self, url, length_of_search = 100):
        driver = self.driver
        driver.implicitly_wait(0.5)
        driver.get(url)
        firstItemT0 = ""
        i = 1
        dataDict = {"Name": [], "Phone": [], "Adress": [], "Website": [], "Source": [], "DateUpdated": [], "DateChecked": []}
        while True:
            results = driver.find_elements_by_class_name('section-result')
            firstItemT1 = results[0].find_element_by_class_name("section-result-title").text
            if i >= length_of_search/20:
                break
            if firstItemT1 != firstItemT0:
                for result in results:
                    dataDict["Name"].append(result.find_element_by_class_name("section-result-title").text)
                    dataDict["Phone"].append(result.find_element_by_class_name("section-result-phone-number").text)
                    dataDict["Source"].append(url)
                    dataDict["DateUpdated"].append(date.today().strftime("%d/%m/%Y"))
                    dataDict["DateChecked"].append("-")

                for title in dataDict["Name"][-20:]:
                    try:
                        results = driver.find_elements_by_class_name('section-result-title')
                        driver.find_element_by_xpath('//span[contains(text(), "' + title + '")]').click()
                    except:
                        pass

                    try:
                        dataDict["Adress"].append(driver.find_element_by_xpath("//div[@data-section-id='ad']").text)
                    except:
                        dataDict["Adress"].append("N/V")

                    try:
                        dataDict["Website"].append(driver.find_element_by_xpath("//div[@data-section-id='ap']").text)
                    except:
                        dataDict["Website"].append("N/V")

                    try:
                        driver.find_element_by_class_name("section-back-to-list-button").click()
                    except:
                        pass

                try:
                    driver.find_element_by_xpath("//button[contains(@class, 'widget-consent-button-later')]").click()
                except:
                    pass

                try:
                    driver.find_element_by_xpath("//span[contains(@class, 'button-next')]").click()
                    firstItemT0 = firstItemT1
                    i = i + 1
                except:
                    break

        df = pd.DataFrame(dataDict)
        return df

    def scrape_maps_multiple_searches(self, searches, length_of_search, locations):
        index = 1
        for location in locations:
            for title_index, title in enumerate(searches):
                url = f"https://www.google.com/maps/search/{title}/@{location}/data=!3m1!4b1?hl=en"            
                df_page = self.scrape_maps_detailed(url, length_of_search[title_index])
                if index == 1:  
                    df = df_page
                else:
                    df = pd.concat([df, df_page], sort=False)
                    if index % 20 == 0:
                        df.to_csv(f"Data/Googlemaps_{date.today().strftime('%d-%m-%Y')}.csv")
                index = index + 1
        return df    

In [5]:
YP_Tool = Scraper_Yellowpages()

  import sys


In [52]:
MAPS_Tool = Scraper_Maps()


Looking for [geckodriver v0.26.0 macos] driver in cache 
File found in cache by path [/Users/maximilianwitte/.wdm/drivers/geckodriver/v0.26.0/macos/geckodriver]


In [49]:
input_file = pd.read_excel("Inputs.xlsx", header=1)
Searches = input_file["Search"][:8]
Length_Of_Search = input_file["Length_of_Search"][:8]
Locations = input_file["Location"][:15]

In [53]:
df = MAPS_Tool.scrape_maps_multiple_searches(Searches, Length_Of_Search, Locations)

ValueError: arrays must all be same length

In [54]:
Locations

0      -22.8079219,113.3173924,6.3z
1     -28.3637621,113.6617329,7.39z
2      -30.907351,114.9286246,8.52z
3      -32.5815756,115.429443,8.52z
4     -33.9052434,115.2650522,8.52z
5      -34.0202397,117.154941,7.89z
6      -30.8924962,120.848444,6.42z
7     -31.1453231,134.5666518,6.42z
8     -35.3175626,138.6552691,7.86z
9      -37.374922,141.6841697,8.03z
10     -38.1478205,145.340854,9.16z
11     -37.7111722,146.7674402,8.2z
12    -36.6529314,147.5387489,7.55z
13    -35.0148433,148.9734658,8.51z
14        -33.881234,149.9353022,9z
Name: Location, dtype: object