# Ceneo Scraper

## Biblioteki

In [64]:
import os
import requests
from bs4 import BeautifulSoup
import json

In [65]:
def extract(ancestor, selector=None, attribute=None, returns_list=False):
    if selector:
        if returns_list:
            if attribute:
                return [tag[attribute].strip() for tag in ancestor.select(selector)]
            return [tag.get_text().strip() for tag in ancestor.select(selector)]
        
        if attribute:
            try:
                return ancestor.select_one(selector)[attribute].strip()
            except TypeError:
                return None

        try:
            return ancestor.select_one(selector).get_text().strip()
        except AttributeError:
            return None
    if attribute:
        return ancestor[attribute].strip()
    return ancestor.get_text().strip()


In [66]:
selectors = { 
                "Opinion_ID": (None, "data-entry-id",),
                "Author": ('span.user-post__author-name',),
                "Recommendation":('span.user-post__author-recommendation > em',),
                "Ratings": ('span.user-post__score-count',),
                "Content": ('div.user-post__text',),
                "Pros": ('div.review-feature__title--positives ~ div.review-feature__item', None, True,),
                "Cons": ('div.review-feature__title--negatives ~ div.review-feature__item', None, True,),
                "useful": ('span[id^="votes-yes"]',),
                "useless": ('span[id^="votes-no"]',),
                "post_date": ('span.user-post__published > time:nth-child(1)',"datetime",),
                "purchase_date": ('span.user-post__published > time:nth-child(2)',"datetime",)
            }

## Wysłanie żądania dostępu do zasobu do serwera

In [67]:
product_id = "114700014"
url = f"https://www.ceneo.pl/{product_id}/opinie-2"

In [68]:
all_opinions = []
while url:
    response = requests.get(url)
    page_dom = BeautifulSoup(response.text, "html.parser")
    opinions = page_dom.select("div.js_product-review")

    for opinion in opinions:
        single_opinion = {
            key: extract(opinion, *value)
                for key,value in selectors.items()
            
        }
        all_opinions.append(single_opinion)
    try:
        url = "https://www.ceneo.pl" + extract(page_dom, "a.pagination__next","href")
    except TypeError:
        url = None

## Ekstrakcja składowych pojedynczej opinii
|Składowa|Selektor|Zmienna|
|--------|--------|-------|
|ID opinii|data-entry-id |Opinion ID| 
|Autor|user-post__author-name |Author| 
|Rekomendacja|span.user-post__author-recomendation > em |Reccomendation| 
|gwiazdki|span.user-post__score|Ratings| 
|treść|div.user-post__text|Content| 
|Lista zalet|div.review-feature__title--positive ~ div.review-feature-item|Pros| 
|Lista wad|div.review-feature__title--negatives ~ div.review-feature__item|Cons| 
|Dla ilu przydatna|span[id^="votes-yes"]|useful| 
|Dla ilu nieprzydatna|span[id^="votes-no"]|uselles| 
|Data wystawienia|span.user-post__published > time:nth-child(1)["datetime"]|post date| 
|Data zakupu|span.user-post__published > time:nth-child(2)["datetime"]|purchase date| 

In [69]:
if not os.path.exists("opinions"):
    os.mkdir("opinions")
with open (f"opinions/{product_id}.json", "w", encoding ="UTF-8") as jf:
    json.dump(all_opinions, jf, indent = 4, ensure_ascii=False)

In [70]:
len(all_opinions)

170