# In [5]:  (Jupyter notebook cell prompt, kept as a comment so the file is valid Python)
import csv
import requests
from bs4 import BeautifulSoup
import re

# URL of the Amazon product page to scrape. It already contains a query
# string, which is why main() appends the page number with '&' rather than '?'.
# NOTE(review): this is a product-detail (/dp/) URL; confirm that it actually
# honours the pageNumber parameter that main() adds.
url = 'https://www.amazon.com/dp/B0CD22ZPPX/ref=sspa_dk_detail_1?pd_rd_i=B0CD22ZPPX&pd_rd_w=vuCGD&content-id=amzn1.sym.386c274b-4bfe-4421-9052-a1a56db557ab&pf_rd_p=386c274b-4bfe-4421-9052-a1a56db557ab&pf_rd_r=TZS4FHVDV2QYJWD3YEBG&pd_rd_wg=vASRJ&pd_rd_r=98cb0ec1-3e92-4daa-9e1c-5327234dc32a&s=pc&sp_csd=d2lkZ2V0TmFtZT1zcF9kZXRhaWxfdGhlbWF0aWM&th=1'

# Function to extract reviews from a given page URL
def extract_reviews(page_url, timeout=10):
    """Download one page and return the reviews found in its HTML.

    Args:
        page_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list with one ``[reviewer, date, content]`` list per
        ``<div data-hook="review">`` element on the page:

        * ``reviewer`` is ``None`` when no ``span.a-profile-name`` exists.
        * ``date`` is ``''`` when the date element is missing and
          ``"Date not found"`` when the element exists but holds no
          "Month D, YYYY" style date.
        * ``content`` is ``''`` when the review body element is missing.

        An empty list is returned when the server answers with an HTTP
        error status.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    response = requests.get(page_url, timeout=timeout)
    if not response.ok:
        # An error page contains no reviews anyway; report the failure
        # instead of silently parsing it and returning nothing.
        print(f'Request failed with HTTP status {response.status_code}: {page_url}')
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Compiled once per page rather than once per review.
    date_pattern = re.compile(r"(\w+\s\d{1,2},\s\d{4})")

    reviews = []
    for review in soup.find_all('div', {'data-hook': 'review'}):
        # Reviewer's name, if present.
        reviewer_element = review.select_one("span.a-profile-name")
        if reviewer_element is not None:
            r_reviewer = reviewer_element.text.strip()
        else:
            r_reviewer = None

        # Review date: keep only the "Month D, YYYY" part of the text.
        date_element = review.find('span', {'data-hook': 'review-date'})
        if date_element is None:
            review_date = ''
        else:
            match = date_pattern.search(date_element.text.strip())
            review_date = match.group() if match else "Date not found"

        # Review body text, if present.
        body_element = review.find('span', {'data-hook': 'review-body'})
        review_content = body_element.text.strip() if body_element is not None else ''

        reviews.append([r_reviewer, review_date, review_content])

    return reviews

# Function to save extracted reviews to a CSV file
def save_reviews_to_csv(reviews, filename):
    """Write the reviews to ``filename`` as UTF-8 CSV, preceded by a header row.

    Args:
        reviews: Iterable of rows, each holding reviewer name, review date
            and review content in that order.
        filename: Path of the CSV file to create or overwrite.
    """
    header = ["Reviewer Name", "Review Date", "Review Content"]
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        for row in reviews:
            csv_out.writerow(row)

# Main function to orchestrate the scraping and saving process
def main():
    """Scrape pages 1 through 5 of ``url`` and store every review in reviews.csv."""
    first_page, last_page = 1, 5
    collected = []

    for page in range(first_page, last_page + 1):
        print(f'Scraping page {page}...')
        # The base URL already has a query string, so join with '&'.
        collected += extract_reviews(f'{url}&pageNumber={page}')

    # Persist everything gathered across the pages in one file.
    save_reviews_to_csv(collected, 'reviews.csv')
    print('Saved all reviews to reviews.csv')

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    # Call the main function
    main()
    
# Tharma Raj IS01081129
# Yovesh Varma IS01081505

# Output of the notebook cell:
# Scraping page 1...
# Scraping page 2...
# Scraping page 3...
# Scraping page 4...
# Scraping page 5...
# Saved all reviews to reviews.csv
