# In [7]:




#import libraries and package
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Request headers sent with every page fetch; they imitate a desktop Safari
# browser with a British-English locale.
custom_headers = {
    "Accept-language": "en-GB,en;q=0.9",
    # Brotli ("br") is deliberately NOT advertised: requests/urllib3 can only
    # decode it when an optional brotli package is installed. Without that
    # package a br-compressed reply is left undecoded and response.text is
    # unusable for HTML parsing.
    "Accept-Encoding": "gzip, deflate",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}

#function to send GET request to website
def get_soup(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    The request is sent with the module-level ``custom_headers``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block the script
            forever; with one, requests raises a timeout exception instead.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        "lxml" parser.

    Raises:
        SystemExit: With status -1 when the response status code is not 200.
    """
    response = requests.get(url, headers=custom_headers, timeout=timeout)

    if response.status_code != 200:
        # Report which page failed and why, so a blocked or missing page can
        # be told apart from other failures.
        print(
            "Error in getting webpage: HTTP {} for {}".format(
                response.status_code, url
            )
        )
        # Raise SystemExit directly: the exit() helper is installed by the
        # site module for interactive sessions and is not meant for programs.
        raise SystemExit(-1)

    return BeautifulSoup(response.text, "lxml")

#function to scrape review from website
def get_reviews(soup):
    """Collect one record per ``div.review`` element found in *soup*.

    Args:
        soup: Parsed page exposing ``select``; each selected review element
            must expose ``select_one``.

    Returns:
        A list of dicts with the keys "author", "title", "content" and
        "date". A value is the matched element's ``.text`` exactly as found,
        or ``None`` when the element is missing from the review.
    """

    def text_or_none(node):
        # A missing (falsy) node yields None instead of raising on .text.
        if not node:
            return None
        return node.text

    def title_node_of(block):
        # The title text sits in the class-less <span> inside the title link.
        link = block.select_one("a.review-title")
        if not link:
            return None
        return link.select_one("span:not([class])")

    def parse(block):
        # Lookups run in the same order as the output keys.
        return {
            "author": text_or_none(block.select_one("span.a-profile-name")),
            "title": text_or_none(title_node_of(block)),
            "content": text_or_none(block.select_one("span.review-text")),
            "date": text_or_none(block.select_one("span.review-date")),
        }

    return [parse(block) for block in soup.select("div.review")]

#function to run the whole scraping pipeline
def main(max_pages=5, output_path="reviews.csv"):
    """Scrape the product's review pages and save the reviews to a CSV file.

    Pages 1 through *max_pages* are fetched with ``get_soup``, parsed with
    ``get_reviews``, and the combined records are written to *output_path*
    without an index column.

    Args:
        max_pages: Number of review pages to request, starting at page 1.
        output_path: Destination of the CSV file.
    """
    # "{}" is replaced by the page number.
    base_url = "https://www.amazon.co.uk/PUMA-Mens-Teamrise-Jersey-Shirt/product-reviews/B091732G4M/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&page={}"
    #store all scraped reviews across pages
    all_reviews = []

    #loop to scrape reviews from each page, up to max_pages
    for page in range(1, max_pages + 1):
        soup = get_soup(base_url.format(page))
        all_reviews.extend(get_reviews(soup))

    #export reviews to csv file
    if all_reviews:
        df = pd.DataFrame(data=all_reviews)
    else:
        # Nothing was scraped: an empty list gives a frame with no columns,
        # so name the columns explicitly to keep the CSV header row. These
        # match the keys of the records built by get_reviews.
        df = pd.DataFrame(columns=["author", "title", "content", "date"])
    df.to_csv(output_path, index=False)

#run the scraper only when executed as a script
if __name__ == '__main__':
    main()
