<a href="https://colab.research.google.com/github/Ishfak46/Mobile_WebScraping_at_Amazon/blob/main/Mobile_WS_at_Amazon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Base URL
# Search-results URL; the scraping loop appends "&page=N" to it.
base_url = "https://www.amazon.com/s?k=mobile+phones"

# Define headers
# Browser-like headers sent with every request made by this notebook.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    # "br" (Brotli) is intentionally NOT advertised: requests/urllib3 can only
    # decode Brotli when an optional brotli package is installed. Advertising it
    # without that package lets the server reply with a body that is handed to
    # the HTML parser still compressed.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Referer": "https://www.amazon.com",
    "Origin": "https://www.amazon.com",
    "Content-Type": "text/html; charset=utf-8",
}


# Words to exclude from results
# Matched as lowercase substrings of the product title.
exclude_words = ['case', 'charger', 'cover', 'screen protector', 'accessory']

# Function to get product links from a page
def get_product_links(soup, excluded_words=None):
    """Collect absolute product URLs from one parsed search-results page.

    Args:
        soup: Parsed search-results page; only its ``find_all`` method is used.
        excluded_words: Optional iterable of substrings. A result whose title
            contains any of them (case-insensitively) is skipped. When omitted,
            the module-level ``exclude_words`` list is used.

    Returns:
        list[str]: One absolute URL per accepted result, in page order.
    """
    if excluded_words is None:
        excluded_words = exclude_words
    blocked_terms = [word.lower() for word in excluded_words]

    product_links = []
    product_items = soup.find_all('div', class_="a-section a-spacing-small a-spacing-top-small")

    for item in product_items:
        title = item.find('h2', class_='a-size-mini a-spacing-none a-color-base s-line-clamp-2')
        if not title:
            continue

        title_text = title.text.lower()
        if any(term in title_text for term in blocked_terms):
            continue

        product_link_tag = item.find('a', class_='a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal')
        if not product_link_tag:
            continue

        # An anchor without an href cannot be followed; skip it instead of
        # raising KeyError and aborting the whole page.
        href = product_link_tag.get("href")
        if not href:
            continue

        # urljoin resolves relative hrefs against the site root and leaves
        # already-absolute hrefs untouched (plain concatenation would corrupt them).
        product_links.append(urljoin("https://www.amazon.com", href))

    return product_links


# Function to check for the next page
def has_next_page(soup):
    """Report whether the parsed search-results page links to a following page.

    Args:
        soup: Parsed search-results page; only its ``find`` method is used.

    Returns:
        bool: True when the pagination container exists and contains an anchor
        carrying the "next" button classes; False otherwise, including when the
        pagination container is missing from the page altogether.
    """
    pagination = soup.find('div', class_='a-section a-text-center s-pagination-container')
    if pagination is None:
        # Without this guard, calling .find on None raised AttributeError and
        # killed the scraping loop whenever a page had no pagination block.
        return False

    next_link = pagination.find('a', class_="s-pagination-item s-pagination-next s-pagination-button s-pagination-separator")
    return next_link is not None

# Function to extract text safely
def extract_text(soup, row_class, span_class):
    """Return the stripped text of a span located inside a given table row.

    Args:
        soup: Parsed page; the first ``tr`` with class ``row_class`` is looked up.
        row_class: Class string identifying the table row.
        span_class: Class string identifying the span within that row.

    Returns:
        The span's text with surrounding whitespace removed, or None when
        either the row or the span is not found.
    """
    table_row = soup.find("tr", class_=row_class)
    if not table_row:
        return None

    value_span = table_row.find('span', class_=span_class)
    if not value_span:
        return None

    return value_span.text.strip()

# Initialize
# Upper bound (seconds) for each HTTP request; without a timeout a stalled
# connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 30

# Output column -> class string of the product-overview table row holding it.
DETAIL_ROW_CLASSES = {
    "Model Name": "a-spacing-small po-model_name",
    "Brand": "a-spacing-small po-brand",
    "Operating System": "a-spacing-small po-operating_system",
    "RAM": "a-spacing-small po-ram_memory.installed_size",
    "Storage Capacity": "a-spacing-small po-memory_storage_capacity",
    "Screen Size": "a-spacing-small po-display.size",
    "Resolution": "a-spacing-small po-resolution",
    "Refresh Rate": "a-spacing-small po-refresh_rate",
    "Cellular Technology": "a-spacing-small po-cellular_technology",
}
# Class string of the span that carries the value inside each overview row.
DETAIL_VALUE_SPAN_CLASS = "a-size-base po-break-word"

page_number = 1
product_links = []
product_data = []

# Phase 1: walk the search-result pages and collect product links.
while True:
    # Update URL for the current page
    url = f"{base_url}&page={page_number}"
    print(f"Processing page {page_number}: {url}")

    # Make the request; stop paginating (but keep what was collected so far)
    # if the page cannot be fetched.
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Stopping at page {page_number}: request failed ({exc})")
        break

    soup = BeautifulSoup(response.content, 'lxml')

    # Get product links from the current page
    links = get_product_links(soup)
    product_links.extend(links)
    print(f"Page {page_number} processed, {len(links)} product links found.")

    # Check if there is a next page
    if not has_next_page(soup):
        break

    page_number += 1

# Print total number of product links found
print(f"Total number of product links found: {len(product_links)}")

# Phase 2: scrape product details from each product link.
for product_link in product_links:
    try:
        response = requests.get(product_link, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Keep one row per link so the table stays complete; the detail
        # fields are simply left empty for pages that could not be fetched.
        print(f"Could not fetch {product_link}: {exc}")
        product_details = dict.fromkeys(DETAIL_ROW_CLASSES)
        product_details["Product Link"] = product_link
        product_data.append(product_details)
        continue

    soup2 = BeautifulSoup(response.content, 'lxml')

    # Extract product details
    product_details = {
        column: extract_text(soup2, row_class, DETAIL_VALUE_SPAN_CLASS)
        for column, row_class in DETAIL_ROW_CLASSES.items()
    }
    # Store the link in the row itself so details and link can never get out of step.
    product_details["Product Link"] = product_link

    product_data.append(product_details)

# Build the DataFrame; explicit columns keep the schema stable even when
# nothing was scraped (later cells reference these columns by name).
df = pd.DataFrame(product_data, columns=[*DETAIL_ROW_CLASSES, "Product Link"])

print(f"succefully scrapped {len(product_links)} products ")

Processing page 1: https://www.amazon.com/s?k=mobile+phones&page=1
Page 1 processed, 16 product links found.
Processing page 2: https://www.amazon.com/s?k=mobile+phones&page=2
Page 2 processed, 20 product links found.
Processing page 3: https://www.amazon.com/s?k=mobile+phones&page=3
Page 3 processed, 21 product links found.
Processing page 4: https://www.amazon.com/s?k=mobile+phones&page=4
Page 4 processed, 19 product links found.
Processing page 5: https://www.amazon.com/s?k=mobile+phones&page=5
Page 5 processed, 22 product links found.
Processing page 6: https://www.amazon.com/s?k=mobile+phones&page=6
Page 6 processed, 21 product links found.
Processing page 7: https://www.amazon.com/s?k=mobile+phones&page=7
Page 7 processed, 22 product links found.
Processing page 8: https://www.amazon.com/s?k=mobile+phones&page=8
Page 8 processed, 22 product links found.
Processing page 9: https://www.amazon.com/s?k=mobile+phones&page=9
Page 9 processed, 22 product links found.
Processing page 10:

In [None]:
# Save the scraped table 'df' to disk as the Excel workbook 'mobile_data.xlsx'.
# Rows are written to a sheet called 'Sheet1' and the DataFrame index is left out.
df.to_excel("mobile_data.xlsx", index=False, sheet_name='Sheet1')

In [None]:
# Boolean mask of missing values (True where a field is missing).
df.isna()

Unnamed: 0,Model Name,Brand,Operating System,RAM,Storage Capacity,Screen Size,Resolution,Refresh Rate,Cellular Technology,Product Link
0,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,True,True,False
3,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...
399,True,True,True,True,True,True,True,True,True,False
400,False,False,False,False,False,False,True,False,False,False
401,False,False,False,False,False,False,False,False,False,False
402,True,False,True,True,True,True,True,True,False,False


In [None]:
# Same-shaped frame of booleans flagging every missing cell in 'df'.
pd.isna(df)

Unnamed: 0,Model Name,Brand,Operating System,RAM,Storage Capacity,Screen Size,Resolution,Refresh Rate,Cellular Technology,Product Link
0,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,True,True,False
3,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...
399,True,True,True,True,True,True,True,True,True,False
400,False,False,False,False,False,False,True,False,False,False
401,False,False,False,False,False,False,False,False,False,False
402,True,False,True,True,True,True,True,True,False,False


In [None]:

# Remove the "Cellular Technology" column from the table.
# errors="ignore" makes this cell safe to re-run: once the column is gone a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns="Cellular Technology", errors="ignore")
df.head(5)

Unnamed: 0,Model Name,Brand,Operating System,RAM,Storage Capacity,Screen Size,Resolution,Refresh Rate,Product Link
0,Moto G Play (2023),Motorola,Android 12.0,3 GB,32 GB,6.5 Inches,1600 x 720,90 Hz,https://www.amazon.com/Moto-3-Day-Battery-Unlo...
1,Moto G Power 5G 2023,Motorola,Android 13.0,6 GB,6 GB,6.5 Inches,1920 x 1080,120 Hz,https://www.amazon.com/Motorola-Unlocked-MPCam...
2,A04e,SAMSUNG,Android,4 GB,3 GB,6.5 Inches,720 x 1600,,https://www.amazon.com/Samsung-SM-A042M-DS-Unl...
3,Moto G Stylus 5G (2023),Motorola,Android 13.0,6 GB,6 GB,6.6 Inches,1920 x 1080,120 Hz,https://www.amazon.com/Motorola-Stylus-Unlocke...
4,Moto G Pure,TracFone,Android 11.0,3 GB,32 GB,6.5 Inches,1600×720,,https://www.amazon.com/TracFone-Motorola-Moto-...


In [None]:
# Write 'df' in its current state to the Excel workbook 'mobile_data.xlsx'.
# This reuses the filename of the earlier export cell, so that file is replaced.
# The data lands on sheet 'Sheet1' and the index column is omitted.
df.to_excel(
    "mobile_data.xlsx",
    sheet_name='Sheet1',
    index=False,
)