In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import random


In [6]:
def get_page_content(url, headers, retries=3, delay=5, timeout=10):
    """Fetch ``url`` and return the raw response body, retrying transient failures.

    A 503 response or a network-level error (``requests.RequestException``,
    e.g. a timeout or connection failure) is treated as transient: the request
    is retried up to ``retries`` times in total, waiting ``delay`` seconds
    before the second attempt and doubling the wait after each failure.
    Any other non-200 status aborts immediately.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with every attempt.
        retries: Maximum number of attempts.
        delay: Initial wait in seconds between attempts.
        timeout: Seconds to wait for the server before an attempt is
            abandoned; without it a stalled connection would block forever.

    Returns:
        The response body as ``bytes`` on HTTP 200, otherwise ``None``.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Network problems are reported and retried instead of crashing,
            # so callers only ever have to handle the ``None`` result.
            problem = f"Request error ({exc})"
        else:
            if response.status_code == 200:
                return response.content
            if response.status_code != 503:
                print(f"Failed to retrieve data: {response.status_code}")
                return None
            problem = "503 Service Unavailable"

        if attempt == retries:
            # No attempts left: do not sleep for a retry that never happens.
            print(f"{problem}. Giving up after {retries} attempts.")
            break

        print(f"{problem}. Retry {attempt}/{retries} after {delay} seconds.")
        time.sleep(delay)
        delay *= 2  # Exponential backoff
    return None


In [7]:
# Helpers and entry point for scraping laptop search results from Amazon.in
def _text_or_na(tag):
    """Return the stripped text of ``tag``, or "N/A" when ``tag`` is missing."""
    return tag.text.strip() if tag else "N/A"


def _parse_search_result(result):
    """Build the record dict for a single search-result ``div``."""
    image_tag = result.find('img', {'class': 's-image'})
    # .get() keeps the "N/A" fallback consistent with the other fields;
    # indexing with ['src'] would raise KeyError on an <img> without src.
    image = image_tag.get('src', "N/A") if image_tag else "N/A"

    ad_tag = result.find('span', {'class': 's-label-popover-default'})

    return {
        'Title': _text_or_na(result.h2),
        'Price': _text_or_na(result.find('span', 'a-price-whole')),
        'Rating': _text_or_na(result.find('span', {'class': 'a-icon-alt'})),
        'Image': image,
        'Ad/Organic': 'Ad' if ad_tag else 'Organic',
    }


def scrape_amazon_laptops(url):
    """Download a search page and extract one record per listed result.

    Args:
        url: Address of the search-results page to scrape.

    Returns:
        A list of dicts with the keys ``Title``, ``Price``, ``Rating``,
        ``Image`` and ``Ad/Organic``. Fields that cannot be found are set to
        "N/A". The list is empty when the page could not be downloaded or
        contains no result containers.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    page_content = get_page_content(url, headers)
    if not page_content:
        print("Failed to retrieve page content.")
        return []

    soup = BeautifulSoup(page_content, 'html.parser')
    laptops = soup.find_all('div', {'data-component-type': 's-search-result'})

    if not laptops:
        print("No laptops found on the page. The structure of the page might have changed.")
        # Show the start of the document to help diagnose what was returned.
        print(soup.prettify()[:1000])
        return []

    return [_parse_search_result(laptop) for laptop in laptops]

In [8]:
# Main function to save data to a CSV file with a timestamp
def main(url='https://www.amazon.in/s?k=laptop'):
    """Scrape ``url`` and write the results to a timestamped CSV file.

    Args:
        url: Search-results page to scrape. Defaults to the Amazon.in laptop
            search, so calling ``main()`` with no arguments is unchanged.

    The file is written to the current working directory as
    ``amazon_laptops_<YYYYMMDD_HHMMSS>.csv``. Nothing is written when the
    scrape returns no records.
    """
    laptops_data = scrape_amazon_laptops(url)

    if not laptops_data:
        print("No data to save.")
        return

    # Timestamp in the name keeps successive runs from overwriting each other.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"amazon_laptops_{timestamp}.csv"

    pd.DataFrame(laptops_data).to_csv(filename, index=False)

    print(f"Data saved to {filename}")


if __name__ == "__main__":
    main()


503 Service Unavailable. Retry 1/3 after 5 seconds.
Data saved to amazon_laptops_20240628_113731.csv
