In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Listing page for all mobiles on noon Egypt; individual pages are selected
# with the ?page=N query parameter.
base_url = "https://www.noon.com/egypt-en/electronics-and-mobiles/mobiles-and-accessories/mobiles-20905/eg-all-mobiles/"

# Network settings: without a timeout a stalled connection would hang the
# cell forever, and without a retry cap a permanently failing page (e.g. a
# block by the site) would loop endlessly.
REQUEST_TIMEOUT = 20   # seconds to wait for each response
MAX_RETRIES = 3        # consecutive failures tolerated for a single page
RETRY_DELAY = 5        # seconds to wait before retrying a failed page

# Absolute product URLs, in the order they appear on the listing pages
product_links = []

page = 1
failed_attempts = 0  # consecutive failures for the current page

# Walk the listing pages until one contains no product containers
while True:
    url = f"{base_url}?page={page}"
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        failed_attempts += 1
        print(f"Error on page {page}: {e}")
        if failed_attempts >= MAX_RETRIES:
            # Stop paginating but still export the links gathered so far.
            print(f"Giving up on page {page} after {failed_attempts} failed attempts, stopping.")
            break
        time.sleep(RETRY_DELAY)
        continue

    failed_attempts = 0
    soup = BeautifulSoup(response.text, 'html.parser')

    # Each product card is a <div> carrying this generated class pair
    products = soup.find_all('div', class_='sc-57fe1f38-0 eSrvHE')

    if not products:
        print("No products found, stopping.")
        break

    for product in products:
        # The first anchor with an href inside the card is taken as the product link
        link = product.find('a', href=True)
        if link:
            href = link['href']
            # Only prefix the host for site-relative hrefs; an already
            # absolute URL would be corrupted by blind concatenation.
            if href.startswith(('http://', 'https://')):
                full_link = href
            else:
                full_link = f"https://www.noon.com{href}"
            product_links.append(full_link)

    print(f"Processed page {page}, found {len(products)} products.")

    page += 1

# Persist the collected links as a single-column CSV
df = pd.DataFrame({
    'Product Link': product_links
})

df.to_csv('noon_mobile_links.csv', index=False)
print("Links exported to noon_mobile_links.csv")

Processed page 1, found 50 products.
Processed page 2, found 50 products.
Processed page 3, found 50 products.
Processed page 4, found 50 products.
Processed page 5, found 50 products.
Processed page 6, found 50 products.
Processed page 7, found 49 products.
Processed page 8, found 50 products.
Processed page 9, found 50 products.
Processed page 10, found 50 products.
Processed page 11, found 50 products.
Processed page 12, found 50 products.
Processed page 13, found 50 products.
Processed page 14, found 50 products.
Processed page 15, found 50 products.
Processed page 16, found 6 products.
No products found, stopping.
Links exported to noon_mobile_links.csv


In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import requests


# Load the CSV file with the product URLs to scrape
file_path = '/content/noon_mobile_links.csv'
urls_df = pd.read_csv(file_path)

# Seconds to wait for each product page; without a timeout a single stalled
# connection would hang the whole run.
REQUEST_TIMEOUT = 20

# Only <img> sources on this CDN host that end with this size suffix are kept
# as product images.
IMAGE_PREFIX = 'https://f.nooncdn.com/'
IMAGE_SUFFIX = 'width=240'


def _find_text(soup, tag, css_class, strip=True):
    """Return the text of the first matching element, or pd.NA if absent."""
    node = soup.find(tag, class_=css_class)
    if node is None:
        return pd.NA
    return node.text.strip() if strip else node.text


def _extract_images(soup):
    """Return matching product image URLs with their query strings removed."""
    sources = (img.get('src') for img in soup.find_all('img'))
    return [
        src.split('?')[0]
        for src in sources
        if src and src.startswith(IMAGE_PREFIX) and src.endswith(IMAGE_SUFFIX)
    ]


def _extract_specifications(soup):
    """Return {header: value} built from the first <table> on the page.

    Returns an empty dict when the page has no table. Rows with fewer than
    two <td> cells are skipped instead of raising, so one malformed row does
    not discard the whole product.
    """
    table = soup.find('table')
    if table is None:
        return {}

    # html.parser does not insert a missing <tbody>, so fall back to the
    # table itself when the markup omits it.
    body = table.find('tbody')
    if body is None:
        body = table

    specifications = {}
    for row in body.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            continue
        specifications[cells[0].text.strip()] = cells[1].text.strip()
    return specifications


# One dict per successfully scraped product
all_data = []

# Iterate over each URL in the DataFrame with a tqdm progress bar
for url in tqdm(urls_df['Product Link'], desc="Scraping Products", unit="product"):
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Report HTTP errors instead of parsing an error page into an
        # all-NA row.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        specifications = _extract_specifications(soup)

        # Fixed fields are applied after the table so they take precedence
        # over any specification row that happens to share a key.
        specifications.update({
            'Title': _find_text(soup, 'h1', 'sc-b5e5edab-22 cRYLTe'),
            'Price': _find_text(soup, 'div', 'priceNow'),
            'Brand': _find_text(soup, 'div', 'sc-b5e5edab-21 jjBJcR'),
            # Rating text is kept unstripped, as in the original extraction
            'Rating': _find_text(soup, 'div', 'sc-9cb63f72-2 dGLdNc', strip=False),
            'Product Image': _extract_images(soup),
            'Product Link': url
        })

        all_data.append(specifications)

    except Exception as e:
        # Best-effort scrape: report the failing product and keep going so
        # one bad page does not abort the whole run.
        print(f"Error scraping {url}: {e}")

# Convert the list of dictionaries to a DataFrame
specs_df = pd.DataFrame(all_data)

# Save the data to a CSV file.
# NOTE(review): this path is the same file the URLs were read from above, so
# the original links file is overwritten by the specifications table. The path
# is kept unchanged in case later cells read it; consider a dedicated output
# file.
specs_df.to_csv('/content/noon_mobile_links.csv', index=False)

Scraping Products: 100%|██████████| 806/806 [10:30<00:00,  1.28product/s]
