# Web Scraping Products
This notebook scrapes the title, image URL, rating, and price of every mobile-phone search result on Amazon India (amazon.in) and stores the data in a CSV file.

In [1]:
import time  # Import time to introduce delays between page loads
from pathlib import Path  # Import Path to build the output file location and create its folder

import pandas as pd  # Import pandas for data manipulation and analysis
from bs4 import BeautifulSoup  # Import BeautifulSoup for parsing HTML data
from selenium import webdriver  # Import selenium for web automation and scraping
from tqdm import tqdm  # Import tqdm to show a progress bar for loops

In [2]:
# Search-results URL for mobile products on Amazon India
SEARCH_URL = 'https://www.amazon.in/s?k=mobiles&crid=2944PM084A0K4&sprefix=mobiles%2Caps%2C271&ref=nb_sb_noss_1'

# Initialize the Chrome WebDriver
driver = webdriver.Chrome()

# Open the Amazon page for mobile products
driver.get(SEARCH_URL)

# Allow some time for the page to load completely before reading its source
# (same delay the scraping loop uses for every page)
time.sleep(2)

# Parse the page source using BeautifulSoup
html_data = BeautifulSoup(driver.page_source, 'html.parser')

# Find the number of pages available for the search results.
# The lookup returns None when the page has no such pagination element
# (for example a single-page result or an unexpected page layout), and the
# element's text may not be a number; in both cases fall back to one page
# instead of crashing with an unhelpful AttributeError/ValueError.
last_page_marker = html_data.find('span', {'class': 's-pagination-item s-pagination-disabled'})
try:
    no_of_pages = int(last_page_marker.text) if last_page_marker is not None else 1
except ValueError:
    no_of_pages = 1
if last_page_marker is None:
    print('Pagination marker not found; assuming a single page of results.')

# Find all product containers on the first page.
# NOTE: the scraping loop requests page 1 again and rebuilds this list, so this
# value is only useful as a quick check that the selector matches anything.
products = html_data.find_all('div', {'data-component-type': 's-search-result'})

# Initialize lists to store product details (one entry per product, kept in step)
titles = []
images = []
ratings = []
prices = []

In [3]:
# Loop through each page to scrape the data.
# Every field falls back to a placeholder when its element is missing, so the
# four lists always grow together and one unusual result card cannot abort
# the whole run.

def _text_or(tag, default):
    """Return ``tag.text``, or ``default`` when the lookup found nothing (``tag`` is None)."""
    return default if tag is None else tag.text


try:
    for i in tqdm(range(no_of_pages)):
        # Construct the URL for each page (range() starts at 0, page numbers at 1)
        url = 'https://www.amazon.in/s?k=mobiles&crid=2944PM084A0K4&sprefix=mobiles%2Caps%2C271&ref=nb_sb_noss_1&page=' + str(i + 1)

        # Load the page in the browser
        driver.get(url)

        # Allow some time for the page to load completely
        time.sleep(2)

        # Parse the page source using BeautifulSoup
        html_data = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all product containers on the current page
        products = html_data.find_all('div', {'data-component-type': 's-search-result'})

        # Extract details for each product
        for product in products:
            # Extract product title (if available)
            title = _text_or(
                product.find('span', {'class': 'a-size-medium a-color-base a-text-normal'}),
                'No Title',
            )

            # Extract product image URL (if available); .get() also covers an
            # <img> tag that has no src attribute
            img_tag = product.find('img')
            img = 'No Image' if img_tag is None else img_tag.get('src', 'No Image')

            # Extract product rating (if available)
            rating = _text_or(product.find('span', {'class': 'a-icon-alt'}), 'No Rating')

            # Extract product price (if available), prefixed with the rupee sign
            price_tag = product.find('span', {'class': 'a-price-whole'})
            price = 'No Price' if price_tag is None else '₹' + price_tag.text

            # Append all four fields together so the lists never get out of step
            titles.append(title)
            images.append(img)
            ratings.append(rating)
            prices.append(price)
finally:
    # Close the browser and end the WebDriver session, even if scraping failed
    # part-way. Re-run the driver-initialisation cell before running this cell again.
    driver.quit()

100%|██████████| 20/20 [01:55<00:00,  5.77s/it]


In [4]:
# Assemble the four scraped lists into one table: one row per product,
# one column per field (titles, images, ratings, prices)
data = pd.DataFrame(
    dict(
        titles=titles,
        images=images,
        ratings=ratings,
        prices=prices,
    )
)

# Print the table as a quick check of what was scraped
print(data)

                                                titles  \
0             Pixel 8 5G (Obsidian, 256 GB) (8 GB RAM)   
1       Pixel 8a 5G (Obsidian, 8GB RAM, 128GB Storage)   
2    iQOO Z9s 5G (Onyx Green, 8GB RAM, 256GB Storag...   
3    Redmi 13C (Stardust Black, 4GB RAM, 128GB Stor...   
4         POCO C65 (Pastel Blue 4GB RAM 128GB Storage)   
..                                                 ...   
421        Coolpad Cool 5 (Red, 4GB RAM, 64GB Storage)   
422  pTron Dynamo Rush 10000mAh 22.5W PD Charging N...   
423  SmartLike Robot Ring Thunder Case, Shockproof ...   
424  WeConnect Care 1 Year Premium Extended Warrant...   
425       POCO C65 (Pastel Blue 4GB RAM 128GB Storage)   

                                                images             ratings  \
0    https://m.media-amazon.com/images/I/61FRNxeKcM...           No Rating   
1    https://m.media-amazon.com/images/I/61NsUY3izo...           No Rating   
2    https://m.media-amazon.com/images/I/61nO5YRaAx...           No R

In [5]:
# Save the DataFrame to a CSV file.
# Create the destination folder first: to_csv does not create missing
# directories and would otherwise fail after the whole scrape has finished.
output_path = Path('../data/Mobile_Products.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)