In [1]:
# Dependencies
import random
from pathlib import Path
from time import sleep

import pandas as pd
import splinter
from bs4 import BeautifulSoup
from splinter import Browser

# Setting up our scraper

In [2]:
# Start a splinter Browser session using the 'chrome' driver.
# The session stays open until browser.quit() is called in a later cell.
browser = Browser('chrome')

In [7]:
# Master DataFrame that the extraction cell below appends scraped reviews to.
# Re-running this cell discards everything collected so far, so
# ONLY RESTART the df when compiling new filter_ data.
df = pd.DataFrame()

In [4]:
# Specify an individual Walmart review page for testing.
# filter_ is interpolated into the URL's `filter` query parameter.
filter_ = 1
# Build the review-page URL string
website = f'https://www.walmart.com/reviews/product/389642865?filter={filter_}'
# Visit the website in our Browser session
browser.visit(website)


In [5]:
# Important to pause briefly after the landing page loads because the bots detect rapid activity
# and classify it as a spider. The pause is done in code (same 3 seconds that scrape() uses)
# so it also happens when the notebook is executed with "Run All".
sleep(3)

# Scrape the landing page html
html = browser.html
# Use Beautiful Soup to parse the webpage
soup = BeautifulSoup(html,"html.parser")

In [8]:
# Start from an empty list of review records for this page
review_list = []
# Every review on the page lives in its own <li> container
rows = soup.find_all("li", class_="dib w-100 mb3")

# Walk the containers and pull out the date, review text and star rating
for container in rows:
    try:
        # A missing element makes find() return None, so .text raises AttributeError
        record = {
            'date': container.find('div', class_='f7 gray mt1').text,
            'text': container.find('span', class_='tl-m mb3 db-m').text,
            'stars': container.find('span', class_='w_iUH7').text,
        }
    except AttributeError:
        # Incomplete review container: report it and skip the record
        print("empty text")
    else:
        # All three fields were found: keep the record
        review_list.append(record)

# Turn the list of dicts into a DataFrame
review_df = pd.DataFrame(review_list)
# Append this page's reviews to the master df
df = pd.concat([df, review_df])

In [9]:
# Manually close the browser session that was opened at the top of this section
browser.quit()

In [10]:
# Drop exact duplicate rows (they appear when the extraction cell is run more
# than once on the same page), then display the frame to check how many
# reviews were collected.
df = df.drop_duplicates()
display(df)

Unnamed: 0,date,text,stars
0,2/7/2024,Unfortunately the tv fell on me after purchasi...,1 out of 5 stars review
1,11/1/2023,I love Vizio that’s always my go to brand I ha...,1 out of 5 stars review
2,11/22/2023,"I ordered a Vizio 50"" Class V-Series 4L UHD LE...",1 out of 5 stars review
3,12/5/2023,It didn't work right out of the box. Nothing b...,1 out of 5 stars review
4,12/24/2023,"I ordered 75” Vizio TV and just put it up, bu...",1 out of 5 stars review
5,12/2/2023,Bought the 65inch Vizio online and it arrived ...,1 out of 5 stars review
6,9/7/2023,"From the moment the TV arrived, albeit quickly...",1 out of 5 stars review
7,8/4/2023,Ordered a 75 inch television as soon as the dr...,1 out of 5 stars review
8,8/11/2023,"Came broken, still waiting on refund. Took for...",1 out of 5 stars review
9,10/22/2023,If you buy a TV Get it from the store and don'...,1 out of 5 stars review


# Automating the scraping bot

In [11]:
# Let's create a function to do each of the actions outlined in the last section automatically

# Define scrape and take in the # of stars, the pages to loop over, and the master DataFrame
def scrape(filter_, page_num_list, df, product_id='1773088381'):
    """Scrape Walmart review pages for one star rating and append them to ``df``.

    For every page number in ``page_num_list`` a fresh Chrome session is
    opened, the review page is loaded and parsed, and the session is closed
    again. After each page the function sleeps for a random 110-140 seconds
    to simulate human activity.

    The URL is derived from the page number itself (page 1 is the landing
    page without a ``page`` parameter, every other page uses ``&page=N``).
    It no longer depends on whether ``df`` is still empty, so a page that
    comes back with no reviews cannot cause a later page to be skipped.

    Parameters
    ----------
    filter_ : int
        Star rating used for the ``filter`` query parameter.
    page_num_list : iterable of int
        Page numbers to visit, in order.
    df : pandas.DataFrame
        Master DataFrame that the scraped reviews are appended to.
    product_id : str or int, optional
        Walmart product id used in the URL. Defaults to the product that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one row per scraped review, with the columns
        ``date``, ``text`` and ``stars``. Remember to assign the result to a
        variable.
    """
    # Collect one frame per page and concatenate once at the end instead of
    # re-copying the growing master frame on every iteration
    frames = [df]
    total_rows = len(df)

    for page in page_num_list:
        website = _review_page_url(product_id, filter_, page)
        review_df = pd.DataFrame(_parse_reviews(_fetch_html(website)))
        frames.append(review_df)
        total_rows += len(review_df)
        # Print new length of master df to console
        print(f"-- Length of df -- {total_rows} --")
        # Sleep for a random # between 110 and 140 seconds to simulate human activity
        sleep(int(random.uniform(110,140)))

    # Skip empty frames (the initial df, pages without reviews) when concatenating
    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        return df
    return pd.concat(non_empty)


def _review_page_url(product_id, filter_, page):
    """Build the review URL for one star filter and page number."""
    website = f'https://www.walmart.com/reviews/product/{product_id}?filter={filter_}'
    if page == 1:
        # Page 1 is the landing page, which takes no page parameter
        return website
    return f'{website}&page={page}'


def _fetch_html(website):
    """Open a Chrome session, load ``website`` and return the page HTML."""
    browser = Browser('chrome')
    try:
        # Sleep for 3 seconds before and after loading to simulate human activity
        sleep(3)
        browser.visit(website)
        sleep(3)
        return browser.html
    finally:
        # Always kill the session, even if loading the page failed
        browser.quit()


def _parse_reviews(html):
    """Return a list of ``{'date', 'text', 'stars'}`` dicts parsed from ``html``."""
    soup = BeautifulSoup(html,"html.parser")
    review_list = []
    # Each review lives in its own <li> container
    for row in soup.find_all("li", class_="dib w-100 mb3"):
        try:
            # A missing element makes find() return None, so .text raises AttributeError
            review_list.append( {
                'date': row.find('div', class_='f7 gray mt1').text,
                'text': row.find('span', class_='tl-m mb3 db-m').text,
                'stars': row.find('span', class_='w_iUH7').text
            } )
        # If a field is missing, print "empty text" to console and skip the review
        except AttributeError:
            print("empty text")
    return review_list

In [12]:
# Scrape every star rating and write one CSV per rating
filter_ = [1,2,3,4,5]
# Review pages 1 through 12 are visited for each rating
page_num_list = list(range(1, 13))

# Create the output folder up front: a full scrape takes a long time because of the
# pauses in scrape(), and to_csv fails if the target directory is missing
Path('data').mkdir(parents=True, exist_ok=True)

for star_rating in filter_:
    # ONLY RESTART the df when compiling new filter_ data
    df = pd.DataFrame()
    df = scrape(star_rating, page_num_list, df)
    # Remove reviews that were collected more than once
    df = df.drop_duplicates()
    df.to_csv(f'data/cup_scraped_{star_rating}star_data.csv', index=False)
    print(f"--- Finished compiling {star_rating} star Data ---")

-- Length of df -- 0 --
-- Length of df -- 20 --
-- Length of df -- 40 --
-- Length of df -- 60 --
-- Length of df -- 80 --
-- Length of df -- 100 --
-- Length of df -- 120 --
-- Length of df -- 140 --
-- Length of df -- 160 --
-- Length of df -- 180 --
-- Length of df -- 200 --
empty text
empty text
empty text
empty text
empty text
-- Length of df -- 215 --
--- Finished compiling 1 star Data ---
-- Length of df -- 20 --
-- Length of df -- 40 --
-- Length of df -- 60 --
-- Length of df -- 80 --
-- Length of df -- 100 --
empty text
empty text
empty text
empty text
empty text
empty text
empty text
empty text
empty text
empty text
empty text
empty text
empty text
empty text
-- Length of df -- 106 --
empty text
empty text
empty text
empty text
-- Length of df -- 106 --
-- Length of df -- 106 --
-- Length of df -- 106 --
-- Length of df -- 106 --
-- Length of df -- 106 --
-- Length of df -- 106 --
--- Finished compiling 2 star Data ---
-- Length of df -- 20 --
-- Length of df -- 40 --
-- Le

In [13]:
# Kill any remaining session
# browser.quit()