# Amazon Web Scraping

## 1. Import Libraries


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np


### 2. Define Functions to Extract Product Information

 Extract Product Title

In [2]:
# Function to extract Product Title
def get_title(soup):
    try:
        title = soup.find("span", attrs={"id": 'productTitle'}).get_text(strip=True)
    except AttributeError:
        title = ""
    return title


 Extract Product Price

In [3]:
# Function to extract Product Price
def get_price(soup):
    try:
        price = soup.find("span", attrs={'id': 'priceblock_ourprice'}).get_text(strip=True)
    except AttributeError:
        try:
            price = soup.find("span", attrs={'id': 'priceblock_dealprice'}).get_text(strip=True)
        except AttributeError:
            price = ""
    return price


 Extract Product Rating

In [4]:
# Function to extract Product Rating
def get_rating(soup):
    try:
        rating = soup.find("i", attrs={'class': 'a-icon a-icon-star a-star-4-5'}).get_text(strip=True)
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class': 'a-icon-alt'}).get_text(strip=True)
        except AttributeError:
            rating = ""
    return rating


 Extract Number of Reviews

In [5]:
# Function to extract the Number of Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id': 'acrCustomerReviewText'}).get_text(strip=True)
    except AttributeError:
        review_count = ""
    return review_count


 Extract Availability Status

In [6]:
# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id': 'availability'}).find("span").get_text(strip=True)
    except AttributeError:
        available = "Not Available"
    return available


### 3. Main Script for Web Scraping

Set Headers and URL

In [7]:
if __name__ == '__main__':
    # Add your user agent to avoid getting blocked by Amazon
    HEADERS = ({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    })
    
    # URL for searching PlayStation 4 on Amazon
    URL = "https://www.amazon.com/s?k=playstation+4&ref=nb_sb_noss_2"


Send HTTP Request and Parse Webpage

In [8]:
# Send HTTP request to get the webpage content
webpage = requests.get(URL, headers=HEADERS)
    
# Parse the webpage content using BeautifulSoup
soup = BeautifulSoup(webpage.content, "html.parser")


Extract Product Links

In [9]:
# Find all links for individual products
links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})
   
# Extract the href attribute (product links) and store them
links_list = [link.get('href') for link in links]


Loop Through Product Links and Extract Details

In [10]:
# Dictionary to store the product data
product_data = {"title": [], "price": [], "rating": [], "reviews": [], "availability": []}

# Loop through each product link to extract details
for link in links_list:
        # Create a new URL for each product
        new_url = "https://www.amazon.com" + link
        new_webpage = requests.get(new_url, headers=HEADERS)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Extract and store product information
        product_data['title'].append(get_title(new_soup))
        product_data['price'].append(get_price(new_soup))
        product_data['rating'].append(get_rating(new_soup))
        product_data['reviews'].append(get_review_count(new_soup))
        product_data['availability'].append(get_availability(new_soup))


### 4. Store Data in DataFrame 

In [11]:
# Create a pandas DataFrame from the dictionary
amazon_df = pd.DataFrame(product_data)

# Clean up empty titles (products with missing details)
amazon_df['title'].replace('', np.nan, inplace=True)
amazon_df.dropna(subset=['title'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  amazon_df['title'].replace('', np.nan, inplace=True)


In [12]:
# Display the first few rows of the DataFrame
amazon_df.head()

Unnamed: 0,title,price,rating,reviews,availability
16,"BRENDEZ Replacement Set of Cables,- HDMI Cable...",,4.7 out of 5 stars,52 ratings,
17,"GamingBoy 2 Pack Wireless Controller for PS4, ...",,,137 ratings,
18,"MOOGOLE PS4 Controller Wireless, with Vibratio...",,,"4,129 ratings",
19,"2 Pack Wireless Controller for PS4, Game Remot...",,,21 ratings,
20,"GamingBoy 2 Pack Wireless Controller for PS4, ...",,,137 ratings,


### 5. Save to CSV

In [13]:
# Save the DataFrame to a CSV file
#amazon_df.to_csv("amazon_data.csv", index=False)