In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Base URL of the search whose result pages are scraped; "&page=N" is appended per page.
base_url = "https://www.amazon.com/s?k=mobile+phones"

# Browser-like headers sent with every request (search pages and product pages alike).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    # Only advertise encodings that can always be decoded. "br" (Brotli) bodies are
    # decoded only when an optional brotli package is installed; without it the raw
    # compressed bytes would reach the HTML parser and no products would be found.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Referer": "https://www.amazon.com",
    # NOTE(review): "Origin" and "Content-Type" are unusual on body-less GET requests;
    # kept as-is because removing them may change how the site responds -- confirm.
    "Origin": "https://www.amazon.com",
    "Content-Type": "text/html; charset=utf-8",
}


# Lowercase substrings; a search result whose title contains any of them is skipped
# (filters out accessories that show up in a phone search).
exclude_words = ['case', 'charger', 'cover', 'screen protector', 'accessory']

# Function to get product links from a page
def get_product_links(soup, exclude=None):
    """Collect product-page URLs from one parsed search-results page.

    Args:
        soup: Parsed search page; only its ``find_all`` method is used
            (a BeautifulSoup document in this notebook).
        exclude: Optional iterable of lowercase substrings. A result whose
            lowercased title contains any of them is skipped. Defaults to
            the module-level ``exclude_words`` list.

    Returns:
        list[str]: Absolute product URLs in page order. Results without a
        title, without a link tag, or without an ``href`` are skipped.
    """
    # Materialise once so a generator argument is not exhausted after the first item.
    blocked_terms = tuple(exclude_words if exclude is None else exclude)
    site_root = "https://www.amazon.com/"

    product_links = []
    product_items = soup.find_all('div', class_="a-section a-spacing-small a-spacing-top-small")

    for item in product_items:
        title = item.find('h2', class_='a-size-mini a-spacing-none a-color-base s-line-clamp-2')
        if not title:
            continue

        title_text = title.text.lower()
        if any(term in title_text for term in blocked_terms):
            continue

        product_link_tag = item.find('a', class_='a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal')
        # .get() tolerates an anchor that carries no href instead of raising KeyError.
        href = product_link_tag.get("href") if product_link_tag else None
        if not href:
            continue

        # urljoin resolves site-relative hrefs against the root and leaves hrefs that
        # are already absolute untouched (plain concatenation would corrupt those).
        product_links.append(urljoin(site_root, href))

    return product_links


# Function to check for the next page
def has_next_page(soup):
    """Report whether the parsed search page links to a following page.

    Args:
        soup: Parsed search page; only its ``find`` method is used.

    Returns:
        bool: True when the pagination container holds a "next" anchor.
        False when that anchor is absent, or when the page has no
        pagination container at all (e.g. an error or blocked page), so
        the caller stops paging instead of crashing.
    """
    pagination = soup.find('div', class_='a-section a-text-center s-pagination-container')
    if pagination is None:
        # Without this guard the lookup below raised AttributeError on None.
        return False

    next_link = pagination.find('a', class_= "s-pagination-item s-pagination-next s-pagination-button s-pagination-separator")
    return next_link is not None

# Function to extract text safely
def extract_text(soup, row_class, span_class):
    """Return the stripped text of a span nested in a table row, or None.

    Looks up a ``tr`` with class ``row_class`` via ``soup.find``, then a
    ``span`` with class ``span_class`` inside that row. If either lookup
    comes back empty, None is returned instead of raising.
    """
    table_row = soup.find("tr", class_=row_class)
    if not table_row:
        return None

    value_span = table_row.find('span', class_=span_class)
    return value_span.text.strip() if value_span else None

# Initialize
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the cell forever

# Output column -> "po-*" class of the <tr> row that holds the value on a product page.
# NOTE(review): the recorded df.head() output shows "Storage Capacity" equal to "RAM"
# for some rows (e.g. 16 GB / 16 GB) -- verify po-memory_storage_capacity is the right row.
DETAIL_ROW_CLASSES = {
    "Model Name": "po-model_name",
    "Brand": "po-brand",
    "Operating System": "po-operating_system",
    "RAM": "po-ram_memory.installed_size",
    "Storage Capacity": "po-memory_storage_capacity",
    "Screen Size": "po-display.size",
    "Resolution": "po-resolution",
    "Refresh Rate": "po-refresh_rate",
    "Cellular Technology": "po-cellular_technology",
}

page_number = 1
product_links = []
product_data = []
failed_links = []  # product pages that could not be fetched

while True:
    # Update URL for the current page
    url = f"{base_url}&page={page_number}"
    print(f"Processing page {page_number}: {url}")

    # Make the request; stop paging (but keep the links gathered so far) on a
    # network error or an HTTP error status instead of parsing an error page.
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Stopping at page {page_number}: request failed ({exc})")
        break

    soup = BeautifulSoup(response.content, 'lxml')

    # Get product links from the current page
    links = get_product_links(soup)
    product_links.extend(links)
    print(f"Page {page_number} processed, {len(links)} product links found.")

    # Check if there is a next page
    if not has_next_page(soup):
        break

    page_number += 1

# Print total number of product links found
print(f"Total number of product links found: {len(product_links)}")

# Now scrape product details from each product link
for product_link in product_links:
    try:
        response = requests.get(product_link, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Keep a row for the product so every link stays in the table; its
        # detail fields are simply empty.
        print(f"Could not fetch {product_link}: {exc}")
        failed_links.append(product_link)
        product_details = dict.fromkeys(DETAIL_ROW_CLASSES)
    else:
        soup2 = BeautifulSoup(response.content, 'lxml')
        product_details = {
            column: extract_text(soup2, f"a-spacing-small {row_class}", "a-size-base po-break-word")
            for column, row_class in DETAIL_ROW_CLASSES.items()
        }

    # Store the link in the row itself rather than attaching a parallel list later,
    # so details and link can never drift out of alignment.
    product_details["Product Link"] = product_link
    product_data.append(product_details)

# Combine the product details and links into a DataFrame; explicit columns keep the
# schema stable even when no product was found.
df = pd.DataFrame(product_data, columns=[*DETAIL_ROW_CLASSES, "Product Link"])

print(f"succefully scrapped {len(product_data) - len(failed_links)} products ")
if failed_links:
    print(f"{len(failed_links)} product pages could not be fetched.")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Processing page 1: https://www.amazon.com/s?k=mobile+phones&page=1
Page 1 processed, 21 product links found.
Processing page 2: https://www.amazon.com/s?k=mobile+phones&page=2
Page 2 processed, 21 product links found.
Processing page 3: https://www.amazon.com/s?k=mobile+phones&page=3
Page 3 processed, 20 product links found.
Processing page 4: https://www.amazon.com/s?k=mobile+phones&page=4
Page 4 processed, 20 product links found.
Processing page 5: https://www.amazon.com/s?k=mobile+phones&page=5
Page 5 processed, 19 product links found.
Processing page 6: https://www.amazon.com/s?k=mobile+phones&page=6
Page 6 processed, 21 product links found.
Processing page 7: https://www.amazon.com/s?k=mobile+phones&page=7
Page 7 processed, 19 product links found.
Processing page 8: https://www.amazon.com/s?k=mobile+phones&page=8
Page 8 processed, 19 product links found.
Processing page 9: https://www.amazon.com/s?k=mobile+phones&page=9
Page 9 processed, 18 product links found.
Processing page 10:

In [2]:
# Preview the first rows of the scraped table to sanity-check columns and values.
df.head()

Unnamed: 0,Model Name,Brand,Operating System,RAM,Storage Capacity,Screen Size,Resolution,Refresh Rate,Cellular Technology,Product Link
0,Nord N30,OnePlus,OxygenOS,8 GB,128 GB,6.7 Inches,2400 x 1080 pixels,120 Hz,5G,https://www.amazon.com/OnePlus-Unlocked-Dual-S...
1,Galaxy S24 Plus,SAMSUNG,"Android 14, One UI 6.1",12 GB,256 GB,6.7 Inches,3120 x 1440 pixels,120 Hz,5G,https://www.amazon.com/SAMSUNG-Smartphone-Unlo...
2,OnePlus 12R,OnePlus,Android 14.0,16 GB,16 GB,6.82 Inches,,120 Hz,5G,https://www.amazon.com/OnePlus-Dual-SIM-Unlock...
3,Moto G 5G (2023),Motorola,Android 13,6 GB,4 GB,6.5 Inches,1600 x 720,120 Hz,,https://www.amazon.com/Motorola-Unlocked-128GB...
4,OnePlus 12,OnePlus,Android 14.0,16 GB,16 GB,6.82 Inches,,120 Hz,5G,https://www.amazon.com/OnePlus-Dual-SIM-Smartp...


In [3]:
# Export the DataFrame 'df' to an Excel file named 'mobile_data.xlsx' (path is relative,
# so it lands in the current working directory), using 'Sheet1' as the sheet name and
# leaving out the index column.
# NOTE(review): presumably overwrites an existing mobile_data.xlsx, and pandas needs an
# Excel writer engine (e.g. openpyxl) installed for .xlsx output -- confirm both.
df.to_excel("mobile_data.xlsx", sheet_name='Sheet1', index=False)
