# 1. Web Scraping (static pages) with BeautifulSoup

Import the required libraries

BeautifulSoup documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

Pandas documentation: https://pandas.pydata.org/docs/getting_started/intro_tutorials/01_table_oriented.html



**First, click the folder icon (the third one in the side menu) to connect Google Drive. We need the connection to save the data at the end.**

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
url = "https://www.setec.mk/"

Send an HTTP GET request to the URL

In [None]:
# Pass an explicit timeout (in seconds): without one, requests waits
# indefinitely, so a stalled server would hang this cell forever.
response = requests.get(url, timeout=30)

In [None]:
response

<Response [200]>

In [None]:
response.status_code

200

In [2]:
# response.text

In [None]:
raw_html = response.text

Parse the raw HTML with an HTML parser from BeautifulSoup

In [None]:
soup = BeautifulSoup(raw_html, "html.parser")

In [4]:
# print(soup.prettify())

* class selector - starts with '.' and continues with the name of the class
(example '.sale')

* id selector - starts with '#' and continues with the name of the id (example '#id')

* tag names - example 'p': paragraph, 'a': anchor (link), 'div' - division

In [None]:
regular_price = soup.select(".price-old-new")

In [None]:
len(regular_price)

81

In [None]:
happy_price = soup.select(".price-new-new")

In [None]:
len(happy_price)

81

In [None]:
discount = soup.select(".sale")

In [None]:
len(discount)

69

Notice that not all products have a discount

Tag selector

In [6]:
# soup.select('p')

Id selector

In [None]:
soup.select_one("#mora_da_ima_prazno_mesto")

<div class="name" id="mora_da_ima_prazno_mesto"><a href="https://setec.mk/tcl-65c74577352">
				TCL 65C745</a>
</div>

The find and find_all functions

In [None]:
soup.find("p")

<p class="close-menu"></p>

In [8]:
# soup.find_all("p")

In [10]:
# soup.find_all("span")

More specific selectors

In [12]:
# soup.select("div.category-price-akciska span.price-new-new")

Find tags with attributes. You can add multiple key-value pairs for the attribute names and values.

In [14]:
# soup.find_all('a', attrs={"data-original-title": "Додади во кошничка"})

Scrape all products from the home page

In [None]:
products = soup.select('.product')

In [None]:
print(products[0])

<div class="product clearfix product-hover">
<div class="left">
<div class="sale">-40%</div>
<div class="image">
<a href="https://setec.mk/tcl-65c74577352">
<!-- Como LABELS -->
<!--&& $manufacturer!=15 -->
<div class="comopl" id="comopl1"><div class="comopl_box">
<span class="comopl_label comopl_mod-270 comopl_page-other" style="left: 0%;bottom: 0%;opacity: 1;height: 90px;width: 80px;font-size: 1em;font-weight: 700;text-align: center;border-style: none;border: none;box-shadow: none;background-image: url('https://setec.mk/image/catalog/Promo/2+3TCL.png'); background-position: center; background-repeat: no-repeat;z-index: 1;">
</span>
</div>
<div class="comopl_box">
<span class="comopl_label comopl_mod-450 comopl_page-other" style="right: 0%;top: 0%;opacity: 1;height: 100px;width: 100px;font-size: 1em;font-weight: 700;text-align: center;border-style: none;border: none;box-shadow: none;background-image: url('https://setec.mk/image/catalog/Promo/iute_eye.png'); background-position: center

Let's test out the selectors for a single product. Then we will use them for all products on the home page.

In [None]:
product = products[0]

In [None]:
# Try every selector on one product before looping over all of them.
# select_one is called on the product element, so it only searches inside it.
# It returns None when nothing matches, in which case .text / .get below
# would raise AttributeError.

# Fields read from the element's text (still unstripped at this point).
name = product.select_one('.name').text
code = product.select_one('.shifra').text
# These three names previously held the page-wide result lists; from here on
# they hold the values of this single product.
regular_price = product.select_one('.price-old-new').text
happy_price = product.select_one('.price-new-new').text
discount = product.select_one('.sale').text
# Fields read from an attribute instead of the element's text.
link = product.select_one('.image a').get("href")
# The image URL is taken from the "data-echo" attribute, not from "src".
image_link = product.select_one('.image img').get("data-echo")

In [None]:
product.select_one('.image img').get("data-echo")

'https://setec.mk/image/cache/catalog/Product/56939_0-228x228.jpg'

In [None]:
name, code, regular_price, happy_price, discount, link, image_link

('\n\t\t\t\tLenovo DT Legion T7 34IRZ8 (Storm Grey)\n',
 '\n\t\t\t\tШифра: 56939\t\t\t',
 '199,999 Ден.',
 '179,999 Ден.',
 '-10%',
 'https://setec.mk/компјутери-и-it-опрема/конфигурации/lenovo-dt-legion-t7-34irz8-storm-grey56939',
 'https://setec.mk/image/cache/catalog/Product/56939_0-228x228.jpg')

In [None]:
name.strip()

'TCL 65C745'

In [None]:
len(products)

82

In [None]:
# Build one dict per product found on the home page.
parsed_products = []
for product in products:
    # Name and code are read unconditionally.
    name = product.select_one('.name').text.strip()
    code = product.select_one('.shifra').text.strip()

    # Prices and the discount badge can be missing. select_one then returns
    # None, and calling .text on None would raise, so None is kept as the value.
    old_price_tag = product.select_one('.price-old-new')
    regular_price = None if old_price_tag is None else old_price_tag.text.strip()

    new_price_tag = product.select_one('.price-new-new')
    happy_price = None if new_price_tag is None else new_price_tag.text.strip()

    sale_tag = product.select_one('.sale')
    discount = None if sale_tag is None else sale_tag.text.strip()

    # The URLs are stored in attributes rather than in the element text.
    link = product.select_one('.image a').get("href")
    image_link = product.select_one('.image img').get("data-echo")

    parsed_products.append({
        "ProductName": name,
        "ProductCode": code,
        "RegularPrice": regular_price,
        "HappyPrice": happy_price,
        "DiscountPercent": discount,
        "PageLink": link,
        "ImageLink": image_link,
    })

In [None]:
parsed_products

[{'ProductName': 'Apple Mac mini M2 PRO -',
  'ProductCode': 'Шифра: 55218',
  'RegularPrice': '99,999 Ден.',
  'HappyPrice': '89,999 Ден.',
  'DiscountPercent': '-10%',
  'PageLink': 'https://setec.mk/компјутери-и-it-опрема/конфигурации/apple-mac-mini-m2-pro-55218',
  'ImageLink': 'https://setec.mk/image/cache/catalog/Product/55218_0-228x228.jpg'},
 {'ProductName': 'ASUS GAMING PC G35DX-WB7730W  /  Win 11',
  'ProductCode': 'Шифра: 52738',
  'RegularPrice': '99,999 Ден.',
  'HappyPrice': '89,999 Ден.',
  'DiscountPercent': '-10%',
  'PageLink': 'https://setec.mk/компјутери-и-it-опрема/конфигурации/asus-gaming-pc-g35dx-wb7730w-win-1152738',
  'ImageLink': 'https://setec.mk/image/cache/catalog/Product/52738_0-228x228.jpg'},
 {'ProductName': 'Lenovo Legion T5 26ARA8',
  'ProductCode': 'Шифра: 56987',
  'RegularPrice': '99,999 Ден.',
  'HappyPrice': '92,999 Ден.',
  'DiscountPercent': '-7%',
  'PageLink': 'https://setec.mk/компјутери-и-it-опрема/конфигурации?product_id=56987',
  'ImageLin

In [None]:
df = pd.DataFrame(parsed_products)

In [None]:
df

Unnamed: 0,ProductName,ProductCode,RegularPrice,HappyPrice,DiscountPercent,PageLink,ImageLink
0,Apple Mac mini M2 PRO -,Шифра: 55218,"99,999 Ден.","89,999 Ден.",-10%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
1,ASUS GAMING PC G35DX-WB7730W / Win 11,Шифра: 52738,"99,999 Ден.","89,999 Ден.",-10%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
2,Lenovo Legion T5 26ARA8,Шифра: 56987,"99,999 Ден.","92,999 Ден.",-7%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
3,"Apple iMac 24"" ( Blue ) - Процесор:Apple M3 ch...",Шифра: 59281,"99,999 Ден.","94,999 Ден.",-5%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
4,"Apple iMac 24"" ( Green ) - Процесор:Apple M3 c...",Шифра: 59100,"99,999 Ден.","94,999 Ден.",-5%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
5,Конфигурација INTEL GAMER WARRIOR,Шифра: 57467,"97,999 Ден.","94,999 Ден.",-3%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
6,ASUS ROG STRIX GAMING PC G13CHR-M71470F16465,Шифра: 59715,"99,999 Ден.","97,999 Ден.",-2%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
7,ASUS GAMING PC G15DS-WB7722W / Win 11,Шифра: 55819,"109,999 Ден.","99,999 Ден.",-9%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
8,Конфигурација INTEL GAMER EXTREME,Шифра: 54337,"119,999 Ден.","99,999 Ден.",-17%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
9,"Apple iMac 24"" ( Blue ) - Процесор:Apple M3 ch...",Шифра: 59099,"115,999 Ден.","104,999 Ден.",-9%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...


In [None]:
df.head(10)

Unnamed: 0,ProductName,ProductCode,RegularPrice,HappyPrice,DiscountPercent,PageLink,ImageLink
0,Apple Mac mini M2 PRO -,Шифра: 55218,"99,999 Ден.","89,999 Ден.",-10%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
1,ASUS GAMING PC G35DX-WB7730W / Win 11,Шифра: 52738,"99,999 Ден.","89,999 Ден.",-10%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
2,Lenovo Legion T5 26ARA8,Шифра: 56987,"99,999 Ден.","92,999 Ден.",-7%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
3,"Apple iMac 24"" ( Blue ) - Процесор:Apple M3 ch...",Шифра: 59281,"99,999 Ден.","94,999 Ден.",-5%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
4,"Apple iMac 24"" ( Green ) - Процесор:Apple M3 c...",Шифра: 59100,"99,999 Ден.","94,999 Ден.",-5%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
5,Конфигурација INTEL GAMER WARRIOR,Шифра: 57467,"97,999 Ден.","94,999 Ден.",-3%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
6,ASUS ROG STRIX GAMING PC G13CHR-M71470F16465,Шифра: 59715,"99,999 Ден.","97,999 Ден.",-2%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
7,ASUS GAMING PC G15DS-WB7722W / Win 11,Шифра: 55819,"109,999 Ден.","99,999 Ден.",-9%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
8,Конфигурација INTEL GAMER EXTREME,Шифра: 54337,"119,999 Ден.","99,999 Ден.",-17%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...
9,"Apple iMac 24"" ( Blue ) - Процесор:Apple M3 ch...",Шифра: 59099,"115,999 Ден.","104,999 Ден.",-9%,https://setec.mk/компјутери-и-it-опрема/конфиг...,https://setec.mk/image/cache/catalog/Product/5...


In [None]:
df.to_csv('products-setec.csv', index=False)

# Additional

### How to scrape multiple pages?

Let's try to scrape all the keyboards from their category page. The page implements paging, so we can use the `page` query parameter to move between pages.

In [None]:
base_url = "https://setec.mk/computers--it/pc-accessories/keybords?page="

The same code from above is copied in a function `extract_product_characteristics_as_dict` for easy access

In [None]:
def _text_or_none(product, selector):
  """Return the stripped text of the first element matching `selector`, or None if there is no match."""
  element = product.select_one(selector)
  return None if element is None else element.text.strip()


def _attr_or_none(product, selector, attribute):
  """Return `attribute` of the first element matching `selector`, or None if there is no match."""
  element = product.select_one(selector)
  return None if element is None else element.get(attribute)


def extract_product_characteristics_as_dict(product):
  """Collect the characteristics of one product element into a dict.

  Args:
    product: a parsed product element (anything offering `select_one`,
      e.g. one item of `soup.select('.product')`).

  Returns:
    A dict with the keys ProductName, ProductCode, RegularPrice, HappyPrice,
    DiscountPercent, PageLink and ImageLink. Text values are stripped of
    surrounding whitespace. A value is None when the corresponding element
    (or attribute) is not present in the product.
  """
  # Every field goes through the same "missing element -> None" guard, so one
  # incomplete product card no longer aborts the scrape with AttributeError.
  return {
      "ProductName": _text_or_none(product, '.name'),
      "ProductCode": _text_or_none(product, '.shifra'),
      "RegularPrice": _text_or_none(product, '.price-old-new'),
      "HappyPrice": _text_or_none(product, '.price-new-new'),
      "DiscountPercent": _text_or_none(product, '.sale'),
      "PageLink": _attr_or_none(product, '.image a', "href"),
      # The image URL is stored in "data-echo", not in "src".
      "ImageLink": _attr_or_none(product, '.image img', "data-echo"),
  }

In [None]:
# Scrape every page of the category and collect all products in one list.
all_products = []
num_pages = 6  # there are 6 pages for keyboards
# range() excludes its end value, so "+ 1" is needed to include the last page.
for page in range(1, num_pages + 1):
  url = base_url + str(page)
  # Explicit timeout (seconds) so a stalled request cannot hang the loop.
  response = requests.get(url, timeout=30)
  soup = BeautifulSoup(response.text, "html.parser")
  products = soup.select('.product')
  all_products.extend(
      extract_product_characteristics_as_dict(product) for product in products
  )

In [None]:
# Turn the list of product dicts into a table: one row per product,
# one column per dict key.
all_pc_configs = pd.DataFrame(all_products)

In [None]:
# Save the scraped products as CSV; index=False leaves out the row-index column.
# NOTE(review): the names say "pc-configs", but base_url points at the
# keyboards category — confirm the intended variable and file name.
all_pc_configs.to_csv('pc-configs-setec.csv', index=False)

# Homework: Find all links to categories and scrape the entire site
Hint: Also find a way to automatically check the number of pages per category