## Full Repo
https://github.com/MostafaBelo/Konecta_Assignments/tree/main

## Imports

In [1]:
import requests
import bs4
from bs4 import BeautifulSoup

import numpy as np

from tqdm.auto import tqdm

## Data Classes

In [2]:
class Product:
    details_fetched = 0

    def __init__(self, elem=None):
        self.title = None
        
        self.price = None
        
        self.rating = None
        self.ratings_count = None

        self.url = None
        self.img_url = None

        self.brand = None

        if elem is not None:
            self._extract_from_elem(elem)

    def _check_one_find(self, elem, find_css) -> bs4.element.Tag | None:
        finds = elem.select(find_css)
        if (len(finds) < 1 or len(finds) > 1):
            return None
        else:
            return finds[0]

    def _extract_from_elem(self, elem: bs4.element.Tag):
        try:
            title_css = "div > div > span > div > div > div.a-section.a-spacing-small.puis-padding-left-small.puis-padding-right-small > div.a-section.a-spacing-none.a-spacing-top-small.s-title-instructions-style > a > h2 > span"
            title = self._check_one_find(elem, title_css)
            self.title = title if title is None else title.text
        except:
            self.title = None

        try:
            price_css = "div > div > span > div > div > div.a-section.a-spacing-small.puis-padding-left-small.puis-padding-right-small > div.a-section.a-spacing-none.a-spacing-top-small.s-price-instructions-style > div > div:nth-child(1) > a > span.a-price > span:nth-child(2)"
            price = self._check_one_find(elem, price_css)
            self.price = price if price is None else price.text
        except:
            self.price = None

        try:
            rating_css = "div > div > span > div > div > div.a-section.a-spacing-small.puis-padding-left-small.puis-padding-right-small > div > div > span > a > i.a-icon.a-icon-star-small > span"
            rating = self._check_one_find(elem, rating_css)
            self.rating = rating if rating is None else float(rating.text.split(" ")[0])
        except:
            self.rating = None

        try:
            ratings_count_css = "div > div > span > div > div > div.a-section.a-spacing-small.puis-padding-left-small.puis-padding-right-small > div:nth-child(2) > div > a > span"
            ratings_count = self._check_one_find(elem, ratings_count_css)
            self.ratings_count = ratings_count if ratings_count is None else int(ratings_count.text)
        except:
            self.ratings_count = None

        try:
            url_css = "div > div > span > div > div > div.a-section.a-spacing-small.puis-padding-left-small.puis-padding-right-small > div.a-section.a-spacing-none.a-spacing-top-small.s-title-instructions-style > a"
            url = self._check_one_find(elem, url_css)
            self.url = url if url is None else f"https://www.amazon.eg/{url.get("href").strip("/")}"
        except:
            self.url = None

        try:
            img_url_css = "div > div > span > div > div > div.s-product-image-container.aok-relative.s-text-center.s-image-overlay-grey.puis-image-overlay-grey.s-padding-left-small.s-padding-right-small.puis-spacing-small.s-height-equalized.puis.puis-v3tv8pm51p8jkg2gibtq2lktwum > span > a > div > img"
            img_url = self._check_one_find(elem, img_url_css)
            self.img_url = img_url if img_url is None else img_url.get("src")
        except:
            self.img_url = None

        if self.title is None:
            self.brand = None
        else:
            known_brands = ["Lenovo", "Dell", "Hp", "ASUS", "MSI", "Apple", "acer", "Microsoft"]
            title_words = self.title.lower().split(" ")
            brands = []
            for brand in known_brands:
                if brand.lower() in title_words:
                    brands.append(brand)
            
            if len(brands) == 1:
                self.brand = brands[0]
            else:
                self.brand = None
        
        if self.brand is None:
            self._fetch_details()

    def _fetch_details(self):
        try:
            if (self.url is None):
                return
            
            headers = {
                "Referrer Policy": "strict-origin-when-cross-origin"
            }

            Product.details_fetched += 1
            page = requests.get(self.url, headers=headers)

            soup = BeautifulSoup(page.content, "html.parser")
            properties = soup.select("#poExpander > div.a-expander-content.a-expander-partial-collapse-content > div > table > tbody > tr.a-spacing-small.po-brand")
            brands = []
            for property in properties:
                try:
                    row_data = property.select("td > span")
                    if (len(row_data) != 2):
                        continue
                    
                    if (row_data[0].text.lower() != "brand name"):
                        continue

                    brands.append(row_data[1].text)
                except:
                    pass
            
            if len(brands) != 1:
                self.brand = None
            else:
                self.brand = brands[0]
        except:
            self.brand = None

    def __repr__(self) -> str:
        return f"""
Title: {self.title}
Brand: {self.brand}
Price: {self.price}
Rating: {self.rating}/5 ({self.ratings_count})
Product Url: {self.url}
Image Link: {self.img_url}
"""
    
    def __str__(self) -> str:
        items = [self.title, self.brand, self.price, self.rating, self.ratings_count, self.url, self.img_url]
        items = [str(item).replace(",", "") if item is not None else str(np.nan) for item in items]
        return ",".join(items)

## Constants

In [3]:
def get_page_url(page_count):
    return f"https://www.amazon.eg/s?k=laptops&i=electronics&rh=n%3A18018102031%2Cn%3A21832872031%2Cn%3A21832907031&dc&page={page_count}&language=en&crid=2IAIQTQAY773T&qid=1754418251&rnid=21832872031&sprefix=laptops%2Celectronics%2C123&xpid=xU2DCUvFseb8a"

def get_page(page_count):
    page_url = get_page_url(1)

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Referer": "https://www.google.com/",
        "DNT": "1",  # Do Not Track
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",

        "Referrer Policy": "strict-origin-when-cross-origin"
    }

    page = requests.get(page_url, headers=headers)

    with open(f"pages/page{page_count}.html", "wb") as f:
        f.write(page.content)

    return page

## Testing Page Fetch and Data Extraction

In [4]:
page = get_page(1)

In [5]:
soup = BeautifulSoup(page.content, "html.parser")

elems_selector = "#search > div.s-desktop-width-max.s-desktop-content.s-opposite-dir.s-wide-grid-style.sg-row > div.sg-col-4-of-4.sg-col-20-of-24.s-matching-dir.sg-col-16-of-20.sg-col.sg-col-12-of-12.sg-col-8-of-8.sg-col-12-of-16 > div > span.rush-component.s-latency-cf-section > div.s-main-slot.s-result-list.s-search-results.sg-row > div[role=\"listitem\"]"
elems = soup.select(elems_selector)

In [6]:
type(elems), len(elems)
# type(elems), len(elems), type(elems[0])

(bs4.element.ResultSet, 33)

In [7]:
products = []
for elem in elems:
    products.append(Product(elem))

extra_links_count = Product.details_fetched
Product.details_fetched = 0
extra_links_count

3

In [8]:
len(products), products

(33,
 [
  Title: Lenovo IdeaPad Slim 3 15IRH8 Laptop - 13th Intel Core i7-13620H, 16 GB DDR5-4800 MHz, 512 GB SSD, Integrated Intel UHD Graphics, 15.6" FHD (1920x1080) IPS 300nits Anti-glare, Dos - Arctic Grey
  Brand: Lenovo
  Price: EGP32,799.00
  Rating: 3.2/5 (7)
  Product Url: https://www.amazon.eg/-/en/sspa/click?ie=UTF8&spc=MToxMzA2ODQ3OTUxNTA3NzUyOjE3NTQ0MjQ1OTM6c3BfYXRmOjMwMDYyMDk5MjQ4NzQzMjo6MDo6&url=%2FLenovo-IdeaPad-Slim-15IRH8-Laptop%2Fdp%2FB0CWG78C3L%2Fref%3Dsr_1_1_sspa%3Fcrid%3D2IAIQTQAY773T%26dib%3DeyJ2IjoiMSJ9.NQ2InMkyDbIoa4RXk7IhrjxRbx__BHww9K-xq3c-kL-DBi4Lo7di8kyaUz-qspOAUJ_RHrfJ4rCcJK9z9qb7vf-ChUL7C81g56dqLW7muCNkPYydBemQhS_auYJoIkc0lle5pn6xKD9PU35WZHCvPjwhSKeRs73UJqo9xX-YIZz9pVM3thinXxw135X-ZWgkioFnXgKeXN3-41F50WwGv9nVX3vehsX5Momv3lHsefjrpxVbAD8qRRIQEUQ-36X-DWaLO8HTInLR32ofqPG2i2nwdLqKG7pPYIFiSw6YENw.WcFCvWB2NCPMs-PEDzcSzmDNuKppajfHKDi8DCHq-BQ%26dib_tag%3Dse%26keywords%3Dlaptops%26qid%3D1754424593%26rnid%3D21832872031%26s%3Delectronics%26sprefix%3Dlaptops%252Celect

## Running for 5 pages (120 products)

In [9]:
products = []
def fetch_pages(starting_page, ending_page):
    for i in tqdm(range(starting_page, ending_page+1)):
        page =  get_page(i)

        soup = BeautifulSoup(page.content, "html.parser")

        elems_selector = "#search > div.s-desktop-width-max.s-desktop-content.s-opposite-dir.s-wide-grid-style.sg-row > div.sg-col-4-of-4.sg-col-20-of-24.s-matching-dir.sg-col-16-of-20.sg-col.sg-col-12-of-12.sg-col-8-of-8.sg-col-12-of-16 > div > span.rush-component.s-latency-cf-section > div.s-main-slot.s-result-list.s-search-results.sg-row > div[role=\"listitem\"]"
        elems = soup.select(elems_selector)

        for elem in elems:
            products.append(Product(elem))

fetch_pages(1, 5)
extra_links_count = Product.details_fetched
Product.details_fetched = 0
extra_links_count

  0%|          | 0/5 [00:00<?, ?it/s]

15

In [10]:
len(products), products

(165,
 [
  Title: Lenovo IdeaPad Slim 3 15IRH8 Laptop - 13th Intel Core i7-13620H, 16 GB DDR5-4800 MHz, 512 GB SSD, Integrated Intel UHD Graphics, 15.6" FHD (1920x1080) IPS 300nits Anti-glare, Dos - Arctic Grey
  Brand: Lenovo
  Price: EGP32,799.00
  Rating: 3.2/5 (7)
  Product Url: https://www.amazon.eg/-/en/sspa/click?ie=UTF8&spc=MTo2OTY3OTUwMDU0MzM3MDQyOjE3NTQ0MjQ1OTY6c3BfYXRmOjMwMDYyMDk5MjQ4NzQzMjo6MDo6&url=%2FLenovo-IdeaPad-Slim-15IRH8-Laptop%2Fdp%2FB0CWG78C3L%2Fref%3Dsr_1_1_sspa%3Fcrid%3D2IAIQTQAY773T%26dib%3DeyJ2IjoiMSJ9.NQ2InMkyDbIoa4RXk7IhrrInA7tmhMVcmawaGq-20Wy2LQwg3eSOuxbWB-6QToxN_JxEt-G3cn73_XYzWd5b7Xio4fQ_uwtvH81_grAcXutkPYydBemQhS_auYJoIkc0VTqh0yUj2bJ9bTa97u-sok-3clsdAl8AZ1uK2in_-yb9pVM3thinXxw135X-ZWgk7I_vw3yv-sKIatnqndEhS7CDDO_t2fJHH4Si-Tp1LPvC8qoRQnNy6w5BnBmGqy45qJcAj4yZ2NXqNDt_ODce17GToGd4WWC8ZhIL4FJy18A.HQf9F-SD8wSQjsyHqGVODEjNf083k-owlLi1PxXd9_w%26dib_tag%3Dse%26keywords%3Dlaptops%26qid%3D1754424596%26rnid%3D21832872031%26s%3Delectronics%26sprefix%3Dlaptops%252Celec

## Saving Data to csv

In [11]:
headers = "Title,Brand,Price,Rating,Ratings Count,Product Link,Product Image Link\n"
csv_data = headers + "\n".join(str(product) for product in products)

with open("extracted_data.csv", "w") as f:
    f.write(csv_data)