In [1]:
from bs4 import BeautifulSoup
import requests
import time
import datetime
import pandas as pd
import regex as re
from splinter import Browser

In [2]:
url = "https://www.wayfair.com/appliances/sb0/built-in-dishwashers-c1874332.html?itemsperpage=96"

In [3]:
# Set up Splinter
browser = Browser('chrome')

In [4]:
# 
browser.visit(url)

In [5]:
# Parse the HTML
html = browser.html
html_soup = BeautifulSoup(html, 'html.parser')

# Start

In [6]:
def scrape_page(results):    
    dishwashers = []
    for result in results:
        dw_info = {}
        # Extract the product headline
        title = result.find('h2', class_='kb51y90_6101 kb51y91_6101 kb51y93_6101')
        
        if title:
            
            # Grab the maker
            dw_info["maker"] = result.find(class_="oakhm610g_6101 oakhm614l_6101 nhya890_6101").text

            #Grab the size
            size = re.findall(r'[0-9]+"|[0-9]+ in', title.text)
            if size:
                dw_info["size"]=size[0][:2]
            else:
                dw_info["size"]=None

            # Grab noise level; looked it up and all dishwashers have a noiselevel between 30-100 dBA
            noise_level = re.findall(r"[0-9]+ dBA|[0-9]+ Decibel dBA", title.text)
            if noise_level:
                dw_info["noise_level"]=noise_level[0][:2]
                
            else:
                dw_info["noise_level"] = None

            # Grab color options
            color_info = result.find(class_="MoreOptionsText")
            if color_info:
                dw_info["num_colors"] = re.findall(r"\+[0-9]+",color_info.text)
            else:
                dw_info["num_colors"] = 1

            # Grab the elements in the additional information box and store the elements in a list to check through 
            box = result.find_all("p", attrs={"data-hb-id":"Text"})
            box_elements = []
            for x in box:
                box_elements.append(x.text)

            # Grab the number of place settings
            if "Number of Place Settings" in box_elements:
                # Need to grab the index after that of the "Number of Place Settings"
                dw_info["num_place_settings"]= box_elements[box_elements.index("Number of Place Settings") +1]
            else:
                # If there isn't "Number of Place Settings" in the box, then put none
                dw_info["num_place_settings"] = None

            # Grab the number of cycles    
            if "Number of Cycles" in box_elements:
                # Need to grab the index after that of the "Number of Cycles"
                dw_info["num_cycles"]= box_elements[box_elements.index("Number of Cycles") +1]
            else:
                # If there isn't "Number of Cycles" in the box, then put none
                dw_info["num_cycles"] = None

            # Grab the price
            price_info = result.find_all(class_="oakhm627_6101 oakhm6y5_6101 oakhm610g_6101 oakhm6aj_6101")
            prices = []
            for x in price_info:
                y = x.text
                if re.search(r"\$", y):
                    prices.append(y.replace("$",""))
            prices_int = [float(p.replace(",","")) for p in prices]
            if len(prices) ==2:
                dw_info["price"] = prices_int[0]
                dw_info["original_price"] = prices_int[1]
                dw_info["discount"] = prices_int[1]-prices_int[0]
            else:
                dw_info["price"] = prices_int[0]
                dw_info["original_price"] = None
                dw_info["discount"] = 0
            

            # Grab the rating
            rating_info = result.find(class_="_1xxktfu7_6101 _1xxktfu3_6101 _1xxktfu8_6101")
            if rating_info:
                rating_style = re.findall(r"width:[0-9]+|width: [0-9]+", rating_info["style"]) 
                dw_info["rating"]=int(rating_style[0][-2:])/20
            else: 
                dw_info["rating"] =None

            # Grab the number of ratings:
            num_ratings_info = result.find(class_="_1xxktfua_6101 undefined")
            if num_ratings_info:
                dw_info["num_ratings"] = num_ratings_info.text.replace("(", "").replace(")","")
            else:
                dw_info["num_ratings"]=None

            # Add the dishwasher to the list
            dishwashers.append(dw_info)
            
    return dishwashers

  

In [7]:
results = html_soup.find_all(attrs={"data-hb-id": "Grid.Item", "class":"kzv0b81_6101"})

dishwashers = scrape_page(results)

# Make a dataframe

In [31]:
all_washers = pd.DataFrame(dishwashers)
all_washers

Unnamed: 0,maker,size,noise_level,num_colors,num_place_settings,num_cycles,price,original_price,discount,rating,num_ratings
0,GE Appliances,24,48,1,16,5,809.00,899.00,90.00,4.5,1446
1,Samsung,24,48,[+1],15,6,587.96,899.00,311.04,4.0,813
2,GE Appliances,24,52,[+3],16,4,611.00,679.00,68.00,4.5,1924
3,GE Appliances,24,50,[+3],16,5,701.00,779.00,78.00,4.5,1799
4,Frigidaire,24,49,[+1],14,8,629.00,1079.00,450.00,4.5,3744
...,...,...,...,...,...,...,...,...,...,...,...
89,Equator,24,51,1,15,8,689.00,839.00,150.00,,
90,ZLINE,24,51,[+2],14,6,1394.96,1549.95,154.99,0.0,8
91,ZLINE,18,,[+10],10,8,1124.96,1249.95,124.99,4.5,42
92,GE Appliances,24,59,[+2],12,4,494.00,549.00,55.00,3.5,2215


### Clean up the colors

In [32]:
### Clean Up the color options
def fix_colors(color):
    if color == "1" or color == 1:
        return 1
    elif re.findall(r"[0-9]+",str(color)):
        return int(re.findall(r"[0-9]+",str(color))[0]) + 1
    else:
        return 1


In [33]:
all_washers["num_colors"] = all_washers["num_colors"].apply(lambda x: fix_colors(x))
all_washers

Unnamed: 0,maker,size,noise_level,num_colors,num_place_settings,num_cycles,price,original_price,discount,rating,num_ratings
0,GE Appliances,24,48,1,16,5,809.00,899.00,90.00,4.5,1446
1,Samsung,24,48,2,15,6,587.96,899.00,311.04,4.0,813
2,GE Appliances,24,52,4,16,4,611.00,679.00,68.00,4.5,1924
3,GE Appliances,24,50,4,16,5,701.00,779.00,78.00,4.5,1799
4,Frigidaire,24,49,2,14,8,629.00,1079.00,450.00,4.5,3744
...,...,...,...,...,...,...,...,...,...,...,...
89,Equator,24,51,1,15,8,689.00,839.00,150.00,,
90,ZLINE,24,51,3,14,6,1394.96,1549.95,154.99,0.0,8
91,ZLINE,18,,11,10,8,1124.96,1249.95,124.99,4.5,42
92,GE Appliances,24,59,3,12,4,494.00,549.00,55.00,3.5,2215


### Clean up the ratings

If the dishwasher had a 5 star rating, that got put as a rating of 0. 

In [36]:
# Clean up the rating(
all_washers[all_washers["rating"]==0.0]
# Unique should be 5 stars
# 56 should be 5 stars
#90 should be 5 stars
# 50 should be 5 stars

Unnamed: 0,maker,size,noise_level,num_colors,num_place_settings,num_cycles,price,original_price,discount,rating,num_ratings
13,Samsung,,,1,15,3,879.0,1169.1,290.1,0.0,187
28,Unique Appliances,24.0,45.0,4,14,6,1499.0,1599.0,100.0,0.0,5
50,ZLINE,24.0,45.0,3,14,6,1304.96,1449.95,144.99,0.0,10
56,ZLINE,24.0,51.0,3,15,8,1259.96,1399.95,139.99,0.0,9
72,ZLINE,24.0,51.0,3,15,8,1349.96,1499.95,149.99,0.0,8
90,ZLINE,24.0,51.0,3,14,6,1394.96,1549.95,154.99,0.0,8


In [38]:
def fix_stars(r):
    if r == 0.0:
        return 5.0
    return r

all_washers["rating"] = all_washers["rating"].apply(lambda x: fix_stars(x))

In [42]:
all_washers[int(all_washers["num_ratings"]) > 2000]

TypeError: cannot convert the series to <class 'int'>

# Load

In [40]:
all_washers.to_csv("wayfair_dishwashers_clean.csv")

# Start parsing

In [35]:
browser.quit()