# Amazon Reviews Extraction for the product "Amazon Echo Studio" - Sep 2024

In [74]:
# Importing Required packages
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import nbformat

In [75]:
# Header to set the requests 
headers = {
    'authority': 'www.amazon.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
}

In [76]:
# URL of The amazon Review page
reviews_url = 'https://www.amazon.com/product-reviews/B07G9Y3ZMC/ref=cm_cr_arp_d_viewopt_srt?sortBy=recent&pageNumber=1'

In [77]:
# Defining number of review pages to be extracted
len_page = 10

## Data Extraction

In [79]:
# Extracting Data as Html object from amazon Review page
def Reviews_Html(url, len_page):
    # Empty List to store html data
    soups = []

    # Loop for gathering all reviews from defined number of pages
    for page_no in range(1, len_page + 1):
        print(f"Fetching page {page_no}")
        
        # Update the URL for each page dynamically by replacing the page number in the URL
        current_url = url.replace("pageNumber=1", f"pageNumber={page_no}")
        
        # Sending request to the updated URL
        response = requests.get(current_url, headers=headers)
        
        # Check for valid response
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            soups.append(soup)
        else:
            print(f"Error fetching page {page_no}: {response.status_code}")
    
    return soups


In [80]:
# Review name, description, date, stars, title from HTML
def getReviews(html_data):

    # Empty list to store the data
    data_dicts = []
    
    # Selecting all Reviews BOX html using css selector
    boxes = html_data.select('div[data-hook="review"]')
    
    for box in boxes:
        
        try:
            name = box.select_one('[class="a-profile-name"]').text.strip()
        except Exception as e:
            name = 'N/A'

        try:
            stars = box.select_one('[data-hook="review-star-rating"]').text.strip().split(' out')[0]
        except Exception as e:
            stars = 'N/A'   

        try:
            title = box.select_one('[data-hook="review-title"]').text.strip()
        except Exception as e:
            title = 'N/A'

        try:
            # Convert date str to dd/mm/yyy format
            datetime_str = box.select_one('[data-hook="review-date"]').text.strip().split(' on ')[-1]
            date = datetime.strptime(datetime_str, '%B %d, %Y').strftime("%d/%m/%Y")
        except Exception as e:
            date = 'N/A'

        try:
            description = box.select_one('[data-hook="review-body"]').text.strip()
        except Exception as e:
            description = 'N/A'

        # Creating data Dictionary 
        data_dict = {
            'Name' : name,
            'Stars' : stars,
            'Title' : title,
            'Date' : date,
            'Description' : description
        }

        data_dicts.append(data_dict)
    
    return data_dicts

## Converting to Dataframe

In [89]:
Extracted_html = Reviews_Html(reviews_url, len_page)

Fetching page 1
Fetching page 2
Fetching page 3
Fetching page 4
Fetching page 5
Fetching page 6
Fetching page 7
Fetching page 8
Fetching page 9
Fetching page 10


In [91]:
reviews = []
for html_data in Extracted_html:
    review = getReviews(html_data)
    reviews += review

In [93]:
# Create a dataframe with reviews Data
df_reviews = pd.DataFrame(reviews)

In [95]:
df_reviews

Unnamed: 0,Name,Stars,Title,Date,Description
0,Boss Barbie,5.0,5.0 out of 5 stars\nExcellent sound quality,22/09/2024,It sounds amazing !
1,RKREJ,5.0,5.0 out of 5 stars\nMANUAL POWER OFF SWITCH TO...,22/09/2024,INSTALLS BY INPUT PLUGThis Way You Don't Have ...
2,JDS,2.0,2.0 out of 5 stars\nDon’t waste your money,22/09/2024,Speakers sound decent enough but these loose i...
3,Ezra,2.0,2.0 out of 5 stars\nDO NOT WASTE YOUR TIME OR ...,20/09/2024,I've gone through 2 of these and have the same...
4,Christopher M. Rumley,5.0,5.0 out of 5 stars\nGreat Sound and classy look,19/09/2024,Went from an echo dot to this. Why did we wait...
5,William Nylic,5.0,5.0 out of 5 stars\nAwesome Speaker,18/09/2024,I purchased this speaker to replace an old ech...
6,CODOM,5.0,5.0 out of 5 stars\nThis Speaker is the Bomb!,17/09/2024,This speaker is the best stand-alone speaker I...
7,Justin in Florida,1.0,1.0 out of 5 stars\nDo not buy known issue,15/09/2024,Do not buy this after a year or so it will bec...
8,Dayana Suarez,5.0,5.0 out of 5 stars\nAlexa studio,15/09/2024,Muy buena ! excelente calidad de sonido
9,Gabriel Gachupin,1.0,1.0 out of 5 stars\nNot completely satisfied....,15/09/2024,I was expecting to receive an outstanding soun...


In [97]:
# Save data
df_reviews.to_csv('Echo_Reviews_24.csv', index=False)