# Web Scraping Product Reviews
This notebook scrapes reviews for a specified product from Amazon and stores the data in a CSV file.

In [10]:
import pandas as pd  # Import pandas for data manipulation and storage
from selenium import webdriver  # Import WebDriver from selenium for web scraping
from bs4 import BeautifulSoup  # Import BeautifulSoup for parsing HTML data

In [11]:
# Initialize the Chrome WebDriver
driver = webdriver.Chrome()

# Open the Amazon product reviews page for the specified product
driver.get("https://www.amazon.in/POCO-Snowstorm-White-RAM-Storage/product-reviews/B0CSSD9QJR/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews")

# Parse the HTML content using BeautifulSoup
html_data = BeautifulSoup(driver.page_source, 'html.parser')

import time  # Import time to introduce delays between requests

In [12]:
# Initialize variables and lists to store data
i = 0
names = []
ratings = []
rating_dates = []
titles = []
reviews_text = []

In [13]:
url= "https://www.amazon.in/POCO-Snowstorm-White-RAM-Storage/product-reviews/B0CSSD9QJR/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=1"

# Loop through pages to scrape reviews
while url != None:
    i = i + 1
    # Construct the URL for the next page
    url = "https://www.amazon.in/POCO-Snowstorm-White-RAM-Storage/product-reviews/B0CSSD9QJR/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=" + str(i)
    
    # Load the page in the browser
    driver.get(url)
    
    # Pause for 5 seconds to allow the page to load
    time.sleep(5)
    
    # Parse the HTML content of the current page
    html_data = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Find all the reviews on the current page
    reviews = html_data.find_all('div', {'data-hook': 'review'})
    
    # Loop through each review and extract the required information
    for review in reviews:
        # Extract the reviewer's name
        name = review.find('span', {'class': 'a-profile-name'}).text
        names.append(name.strip())
        
        # Extract the rating given by the reviewer
        rating = review.find('span', {'class': 'a-icon-alt'}).text
        ratings.append(rating)
        
        # Extract the date of the rating
        rating_date = review.find('span', {'data-hook': 'review-date'}).text
        rating_dates.append(rating_date)
        
        # Extract the title of the review
        title = review.find('a', {'data-hook': 'review-title'}).text
        titles.append(title)
        
        # Extract the review text
        review_text = review.find('span', {'data-hook': 'review-body'}).text
        reviews_text.append(review_text)
    
    # Check if there is a next page
    url_check = html_data.find('li', {'class': 'a-last'})
    if url_check == None:
        # If no next page, end the loop
        url = None
    else:
        # If there is a next page, construct the URL for it
        url = 'https://www.amazon.in' + html_data.find('li', {'class': 'a-last'}).a['href']
    
    # Print the URL of the next page (for debugging purposes)
    print(url)

https://www.amazon.in/POCO-Snowstorm-White-RAM-Storage/product-reviews/B0CSSD9QJR/ref=cm_cr_arp_d_paging_btm_2?ie=UTF8&pageNumber=2&reviewerType=all_reviews
https://www.amazon.in/POCO-Snowstorm-White-RAM-Storage/product-reviews/B0CSSD9QJR/ref=cm_cr_arp_d_paging_btm_3?ie=UTF8&pageNumber=3&reviewerType=all_reviews
https://www.amazon.in/POCO-Snowstorm-White-RAM-Storage/product-reviews/B0CSSD9QJR/ref=cm_cr_arp_d_paging_btm_4?ie=UTF8&pageNumber=4&reviewerType=all_reviews
https://www.amazon.in/POCO-Snowstorm-White-RAM-Storage/product-reviews/B0CSSD9QJR/ref=cm_cr_arp_d_paging_btm_5?ie=UTF8&pageNumber=5&reviewerType=all_reviews
https://www.amazon.in/POCO-Snowstorm-White-RAM-Storage/product-reviews/B0CSSD9QJR/ref=cm_cr_arp_d_paging_btm_6?ie=UTF8&pageNumber=6&reviewerType=all_reviews
https://www.amazon.in/POCO-Snowstorm-White-RAM-Storage/product-reviews/B0CSSD9QJR/ref=cm_cr_arp_d_paging_btm_7?ie=UTF8&pageNumber=7&reviewerType=all_reviews
https://www.amazon.in/POCO-Snowstorm-White-RAM-Storage/pro

TypeError: 'NoneType' object is not subscriptable

In [14]:
# Create a DataFrame from the extracted data
data = pd.DataFrame({'profile_name': names,
                     'rating': ratings,
                     'rating_date': rating_dates,
                     'title': titles,
                     'review_text': reviews_text})

# Display the distribution of ratings
data.rating.value_counts()

# Save the DataFrame to a CSV file
data.to_csv('../data/Product_Reviews.csv')