In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

In [2]:
#create a list where the scraped businesses would be appended
businesses_list = []

In [3]:
url_prefix = 'https://www.houzz.com'

In [4]:
#the page the crawler would start from
starting_url = 'https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784'

In [5]:
#a function to retrieve the relevant details for each business found

def get_attrs(business_listing):
    #getting the business's phone number (returns 'None' if nothing is found)
    try:
        no_span = business_listing.find('span', attrs={'class': "hz-pro-search-result__contact-info"})
        phone_no = no_span.text
        
    except:
        phone_no = None
    
    #getting the business's name (returns 'None' if nothing is found)
    try:
        name_span = business_listing.find('span', attrs={'class':'header-5 text-unbold mlm'})
        business_name = name_span.text
        
    except:
        business_name = None
    
    #getting the business's address (returns 'None' if nothing is found)
    try:
        address_span = business_listing.find('span', attrs={'class':'hz-pro-search-result__location-info__text'})
        business_address = address_span.text
        
    except:
        business_address = None
    
    #getting the business's rating on the Houzz platform (returns 'None' if nothing is found)
    try:
        rating_span = business_listing.find('span', attrs={'class':'hz-star-rate__rating-number'})
        rating = rating_span.text
        
    except:
        rating = None
    
    #getting the number of reviews the business has (returns 'None' if nothing is found)
    try:
        no_of_reviews_span = business_listing.find('span', attrs={'class':'hz-star-rate__review-string'})
        no_of_reviews = no_of_reviews_span.text
        
    except:
        no_of_reviews = None
    
    #putting all the details gotten into a list
    attrs_list = [phone_no, business_name, business_address, rating, no_of_reviews]
    return attrs_list

In [6]:
def get_businesses(url):
    #initializing a selenium session
    browser = webdriver.Firefox()
    browser.implicitly_wait(10)
    #opening our target url with our selenium session
    browser.get(url)
    time.sleep(10)
    
    #using selenium to click all the 'click to call' elements on the webpage
    #so that we can have access to the phone numbers
    for call_button in browser.find_elements_by_class_name('hz-pro-search-result__contact-info__cover'):
        call_button.click()
    
    #handing over the page in its clicked state to BeautifulSoup
    soup = BeautifulSoup(browser.page_source, 'lxml')
    
    #calling get_attrs() on each business found on the webpage
    #and appending the details to 'businesses_list'
    for business_listing in soup.find_all('li', attrs={'class':'hz-pro-search-results__item'}):
        business_details = get_attrs(business_listing)
        businesses_list.append(business_details)
        
    #end the selenium session
    browser.close()
        
    #Print out the url of the webpage we just scraped so we know we're done with that page
    #this helps to monitor our crawler and acts as a log of sorts (to see that the crawler is working)
    print(url)
    
    #if a 'next page' exists, go to its url and continue crawling (recursive)
    next_link_ref = soup.find('a', attrs={'class':'hz-pagination-link hz-pagination-link--next'}).get('href')
    if continue_crawl(next_link_ref):
        new_url = url_prefix + next_link_ref
        get_businesses(new_url)

In [7]:
def continue_crawl(next_link_ref):
    if next_link_ref:
        return True
    else:
        return False

In [8]:
#calling get_business() on starting_url to commence crawling
get_businesses(starting_url)

https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784
https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784&p=15
https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784&p=30
https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784&p=45
https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784&p=60
https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784&p=75
https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784&p=90
https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784&p=105
https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784&p=120
https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784&p=135
https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784&p=150
https://www.houzz.com/professionals/searchDirectory?location=us&topicId=11784&p=165
http

AttributeError: 'NoneType' object has no attribute 'text'

In [9]:
#businesses_list

In [10]:
#convert the businesses_list to a Pandas DataFrame
businesses_list_df = pd.DataFrame(businesses_list, columns=['phone_no', 'business_name', 'business_address', 'rating', 'no_of_reviews'])
businesses_list_df

Unnamed: 0,phone_no,business_name,business_address,rating,no_of_reviews
0,(616) 285-9901,Visbeen Architects,"4139 Embassy Dr SE, Grand Rapids, Michigan 495...",5.0,61 Reviews
1,(512) 548-3087,Cornerstone Architects,"7000 Bee Caves Rd. Suite 200, Austin, Texas 78...",5.0,52 Reviews
2,(831) 292-3471,"Studio S Squared Architecture, Inc.","1000 South Winchester Blvd., San Jose, Califor...",5.0,95 Reviews
3,(410) 990-1700,Purple Cherry Architects,"One Melvin Avenue, Annapolis, Maryland 21401, ...",5.0,81 Reviews
4,(360) 629-3441,"Dan Nelson, Designs Northwest Architects","10031 SR 532, Ste B, Stanwood, Wa. 2316 Fairvi...",5.0,36 Reviews
...,...,...,...,...,...
756,(415) 536-9323,Architecture + Design,"566 Folsom Street, San Francisco, California 9...",5.0,3 Reviews
757,(902) 907-5388,Sweet Home Design,"1563 Wentworth Rd. RR1, Windsor, Nova Scotia B...",5.0,3 Reviews
758,(510) 420-0210,Amato Architecture,"1396 Park Avenue, Emeryville 94608, United States",5.0,2 Reviews
759,+34 619 04 40 68,Sergio Olazabal - 2arquitectos,"28100, Madrid, Spain",5.0,2 Reviews


In [11]:
#Drop duplicate entries
businesses_list_df.drop_duplicates(inplace=True)

In [12]:
businesses_list_df

Unnamed: 0,phone_no,business_name,business_address,rating,no_of_reviews
0,(616) 285-9901,Visbeen Architects,"4139 Embassy Dr SE, Grand Rapids, Michigan 495...",5.0,61 Reviews
1,(512) 548-3087,Cornerstone Architects,"7000 Bee Caves Rd. Suite 200, Austin, Texas 78...",5.0,52 Reviews
2,(831) 292-3471,"Studio S Squared Architecture, Inc.","1000 South Winchester Blvd., San Jose, Califor...",5.0,95 Reviews
3,(410) 990-1700,Purple Cherry Architects,"One Melvin Avenue, Annapolis, Maryland 21401, ...",5.0,81 Reviews
4,(360) 629-3441,"Dan Nelson, Designs Northwest Architects","10031 SR 532, Ste B, Stanwood, Wa. 2316 Fairvi...",5.0,36 Reviews
...,...,...,...,...,...
756,(415) 536-9323,Architecture + Design,"566 Folsom Street, San Francisco, California 9...",5.0,3 Reviews
757,(902) 907-5388,Sweet Home Design,"1563 Wentworth Rd. RR1, Windsor, Nova Scotia B...",5.0,3 Reviews
758,(510) 420-0210,Amato Architecture,"1396 Park Avenue, Emeryville 94608, United States",5.0,2 Reviews
759,+34 619 04 40 68,Sergio Olazabal - 2arquitectos,"28100, Madrid, Spain",5.0,2 Reviews


In [13]:
businesses_list_df.rating = businesses_list_df.rating.astype('float')

In [14]:
businesses_list_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 666 entries, 0 to 760
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   phone_no          662 non-null    object 
 1   business_name     666 non-null    object 
 2   business_address  666 non-null    object 
 3   rating            666 non-null    float64
 4   no_of_reviews     666 non-null    object 
dtypes: float64(1), object(4)
memory usage: 31.2+ KB


In [15]:
#create a csv file from the Pandas DataFrame
businesses_list_df.to_csv('businesses_list.csv', index=False)