In [None]:
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [None]:
#module-level accumulator for the crawl: get_businesses() appends one
#[phone_no, business_name, business_address, rating, no_of_reviews] list
#per business listing it finds
businesses_list = []

In [None]:
#site root that get_businesses() prepends to the 'next page' href
#to build the url of the next page to crawl
url_prefix = 'https://www.houzz.com'

In [None]:
#the search-results page the crawler starts from
#(passed to get_businesses() to commence crawling)
starting_url = 'https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784'

In [None]:
#a function to retrieve the relevant details for each business found

def get_attrs(business_listing):
    """Extract the contact and rating details of one business listing.

    Parameters
    ----------
    business_listing
        A parsed listing element. It only needs a BeautifulSoup-style
        ``find(name, attrs=...)`` method.

    Returns
    -------
    list
        ``[phone_no, business_name, business_address, rating, no_of_reviews]``.
        Each entry is the text of the matching ``<span>``, or ``None`` when
        the listing has no such span.
    """
    #css class of the <span> holding each detail, in the order of the returned list
    span_classes = (
        'hz-pro-search-result__contact-info',         #phone number
        'header-5 text-unbold mlm',                   #business name
        'hz-pro-search-result__location-info__text',  #business address
        'hz-star-rate__rating-number',                #rating on the Houzz platform
        'hz-star-rate__review-string',                #number of reviews
    )
    return [_span_text(business_listing, css_class) for css_class in span_classes]


def _span_text(business_listing, css_class):
    """Return the text of the first <span> with ``css_class``, or None if absent."""
    try:
        return business_listing.find('span', attrs={'class': css_class}).text
    except AttributeError:
        #nothing was found, so there is no .text to read; only this case is
        #treated as 'missing' so that real errors and Ctrl-C are not swallowed
        return None

In [None]:
def get_businesses(url):
    """Crawl the search-results pages starting at ``url`` and collect every listing.

    For each page: open it in a fresh Firefox session, click every
    'click to call' element so the phone numbers are present in the page
    source, parse that source with BeautifulSoup, and append the
    ``get_attrs()`` result of each listing to the module-level
    ``businesses_list``. The url of each finished page is printed as a
    simple progress log. The crawl follows the 'next page' link until a page
    has none.

    Parameters
    ----------
    url : str
        Url of the first results page to scrape.

    Returns
    -------
    None
        Results are accumulated in ``businesses_list``.
    """
    #loop over the pages instead of recursing, so the call stack stays flat
    #no matter how many pages the crawl visits
    while True:
        #initializing a selenium session
        browser = webdriver.Firefox()
        try:
            browser.implicitly_wait(10)
            #opening our target url with our selenium session
            browser.get(url)
            #NOTE(review): fixed pause, presumably to let the page finish loading - confirm
            time.sleep(10)

            #using selenium to click all the 'click to call' elements on the webpage
            #so that we can have access to the phone numbers
            for call_button in browser.find_elements(
                    By.CLASS_NAME, 'hz-pro-search-result__contact-info__cover'):
                call_button.click()

            #handing over the page in its clicked state to BeautifulSoup
            soup = BeautifulSoup(browser.page_source, 'lxml')
        finally:
            #end the selenium session even if loading or clicking failed,
            #so no browser is left running
            browser.quit()

        #calling get_attrs() on each business found on the webpage
        #and appending the details to 'businesses_list'
        for business_listing in soup.find_all('li', attrs={'class': 'hz-pro-search-results__item'}):
            businesses_list.append(get_attrs(business_listing))

        #print out the url of the webpage we just scraped so we know we're done with that page
        #this helps to monitor our crawler and acts as a log of sorts
        print(url)

        #find() returns None when the page has no 'next page' link,
        #so only read the href when the link exists
        next_link = soup.find('a', attrs={'class': 'hz-pagination-link hz-pagination-link--next'})
        next_link_ref = next_link.get('href') if next_link is not None else None

        #stop when there is no next page, otherwise continue with its url
        if not continue_crawl(next_link_ref):
            break
        url = url_prefix + next_link_ref

In [None]:
def continue_crawl(next_link_ref):
    """Tell whether there is a next page to crawl.

    Returns True when ``next_link_ref`` is truthy (e.g. a non-empty href)
    and False otherwise (e.g. None or an empty string).
    """
    return bool(next_link_ref)

In [None]:
#calling get_businesses() on starting_url to commence crawling
#(the scraped details accumulate in businesses_list)
get_businesses(starting_url)

In [None]:
#businesses_list

In [None]:
#build a Pandas DataFrame from businesses_list, one row per scraped listing
businesses_list_df = pd.DataFrame(
    data=businesses_list,
    columns=[
        'phone_no',
        'business_name',
        'business_address',
        'rating',
        'no_of_reviews',
    ],
)
businesses_list_df

In [None]:
#drop rows that are exact duplicates of another row
#(inplace=True modifies businesses_list_df itself rather than returning a new frame)
businesses_list_df.drop_duplicates(inplace=True)

In [None]:
#display the DataFrame again now that duplicate rows have been dropped
businesses_list_df

In [None]:
#store the ratings as floats instead of the scraped text
businesses_list_df['rating'] = businesses_list_df['rating'].astype(float)

In [None]:
#print a summary of the DataFrame (columns, non-null counts and dtypes)
businesses_list_df.info()

In [None]:
#write the DataFrame to 'businesses_list.csv';
#index=False leaves the row index out of the file
businesses_list_df.to_csv('businesses_list.csv', index=False)