# Ceneo Scraper

## Ekstrakcja składowych pojedynczej opinii
|Składowa|Selektor|Zmienna|
|--------|--------|-------|
|id opinii|["data-entry-id"]|opinion_id|
|autor|span.user-post__author-name|author|
|rekomendacja|span.user-post__author-recomendation > em|recommendation|
|gwiazdki|span.user-post__score-count|rating|
|treść|div.user-post__text|content|
|lista zalet|div.review-feature__title--positives ~ div.review-feature__item|pros|
|lista wad|div.review-feature__title--negatives ~ div.review-feature__item|cons|
|dla ilu przydatna|button.vote-yes > span|useful|
|dla ilu nieprzydatna|button.vote-no > span|useless|
|data wystawienia|span.user-post__published > time:nth-child(1)['datetime']|publish_date|
|data zakupu|span.user-post__published > time:nth-child(2)['datetime']|purchase_date|

# Biblioteki

In [9]:
import os
import json
import requests
from bs4 import BeautifulSoup


## Funkcja do ekstrakcji danych z kodu HTML

In [10]:
def extract(ancestor, selector=None, attribute=None, return_list=False):
    """Pull text or an attribute value out of a parsed HTML element.

    Args:
        ancestor: Element to read from or search within. It is used through
            ``select``, ``select_one``, item access and ``.text``.
        selector: CSS selector evaluated relative to ``ancestor``. When
            omitted, the value is read from ``ancestor`` itself.
        attribute: Name of the attribute to read. When omitted, the
            element's text is returned instead.
        return_list: When true (and ``selector`` is given), return one value
            per match instead of only the first match.

    Returns:
        A stripped string, or a list of stripped strings when
        ``return_list`` is true. A single-match lookup returns ``None`` when
        nothing matches or the matched element lacks ``attribute``. Without
        a ``selector``, an attribute read returns the value as stored (not
        stripped).
    """
    if not selector:
        # No selector: read directly from the element that was passed in.
        if attribute:
            return ancestor[attribute]
        return ancestor.text.strip()

    if return_list:
        matches = ancestor.select(selector)
        if attribute:
            # Attribute values are plain strings and have no ``.text``;
            # strip them directly.
            return [tag[attribute].strip() for tag in matches]
        return [tag.text.strip() for tag in matches]

    if attribute:
        try:
            return ancestor.select_one(selector)[attribute].strip()
        except (TypeError, KeyError):
            # TypeError: nothing matched, so None was subscripted.
            # KeyError: the matched element does not carry the attribute.
            return None

    try:
        return ancestor.select_one(selector).text.strip()
    except AttributeError:
        # Nothing matched, so ``.text`` was looked up on None.
        return None

## Słownik ze strukturą opinii z serwisu Ceneo.pl

In [11]:
# Extraction recipe for every field of a single opinion. Each value is the
# tuple of positional arguments that follow the opinion element in a call to
# extract(): (selector[, attribute[, return_list]]).
selectors = {
    # Read from an attribute of the opinion element itself (no selector).
    "opinion_id": (None, "data-entry-id"),
    # Text of the first element matching the selector.
    "author": ("span.user-post__author-name",),
    "recommendation": ("span.user-post__author-recomendation > em",),
    "rating": ("span.user-post__score-count",),
    "content": ("div.user-post__text",),
    # Text of every matching element, collected into a list.
    "pros": (
        "div.review-feature__title--positives ~ div.review-feature__item",
        None,
        True,
    ),
    "cons": (
        "div.review-feature__title--negatives ~ div.review-feature__item",
        None,
        True,
    ),
    "useful": ("button.vote-yes > span",),
    "useless": ("button.vote-no > span",),
    # Value of the "datetime" attribute on the matched <time> element.
    "publish_date": (
        "span.user-post__published > time:nth-child(1)",
        "datetime",
    ),
    "purchase_date": (
        "span.user-post__published > time:nth-child(2)",
        "datetime",
    ),
}

## Link do pierwszej strony z opiniami o wskazanym produkcie w serwisie Ceneo.pl

In [22]:
# Example product code: "156802519"
# Strip surrounding whitespace so a stray space or newline pasted along with
# the code does not end up in the request URL or the output file name.
product_id = input("Podaj kod produktu w serwisie Ceneo.pl").strip()
# Address of the first page of reviews for the chosen product.
url = f"https://www.ceneo.pl/{product_id}#tab=reviews"

## Ekstrakcja wszystkich opinii o wskazanym produkcie z serwisu Ceneo.pl

In [23]:
# Walk the review pages one by one, following the "next page" link until
# there is none, and collect every opinion as a dict keyed like `selectors`.
all_opinions = []
while url:
    # A timeout keeps the loop from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    page_dom = BeautifulSoup(response.text, "html.parser")
    opinions = page_dom.select("div.js_product-review")
    for opinion in opinions:
        # Each selectors entry holds the extract() arguments for one field.
        single_opinion = {
            key: extract(opinion, *value)
            for key, value in selectors.items()
        }
        all_opinions.append(single_opinion)
    # extract() gives None when the page has no "next" link; test for that
    # explicitly instead of relying on `str + None` raising TypeError.
    next_href = extract(page_dom, "a.pagination__next", "href")
    url = ("https://www.ceneo.pl" + next_href) if next_href else None


## Zapis pobranych opinii o wskazanym produkcie do pliku JSON

In [24]:
# makedirs(exist_ok=True) creates the output directory only when needed,
# without the check-then-create race of exists() followed by mkdir().
os.makedirs("opinions", exist_ok=True)
with open(f"opinions/{product_id}.json", "w", encoding="UTF-8") as opf:
    # ensure_ascii=False writes non-ASCII characters as-is rather than as
    # \uXXXX escapes.
    json.dump(all_opinions, opf, indent=4, ensure_ascii=False)
# Report how many opinions were collected.
print(len(all_opinions))
# Debug aid: uncomment to print the collected opinions as JSON.
#print(json.dumps(all_opinions,indent=4, ensure_ascii=False))

133


## OLD

In [7]:
# Legacy exploratory cell: inspects the types of the parsed page and of a
# single opinion element.
# Relies on `response` already existing from a previously executed cell.
page_dom = BeautifulSoup(response.text, "html.parser")
print(type(page_dom))
# All opinion containers found on the parsed page.
opinions = page_dom.select("div.js_product-review")

# The assignment below is commented out, so `opinion` is not set in this
# cell; the print that follows uses whatever value an earlier cell left behind.
#opinion = page_dom.select_one("div.js_product-review")
print(type(opinion))
# Debug aid: uncomment to dump the opinion element.
#print(opinion)

<class 'bs4.BeautifulSoup'>
<class 'bs4.element.Tag'>


In [33]:
#old
# Legacy single-page extraction with every selector written inline.
# Expects `opinions` and `page_dom` to exist from a previously executed cell.
all_opinions = []
for opinion in opinions:
    try:
        single_opinion = {
            "opinion_id" : opinion["data-entry-id"],
            "author" : opinion.select_one("span.user-post__author-name").text.strip(),
            "recommendation" : opinion.select_one("span.user-post__author-recomendation > em").text.strip(),
            "rating" : opinion.select_one("span.user-post__score-count").text.strip(),
            "content" : opinion.select_one("div.user-post__text").text.strip(),
            # The item class uses a double underscore ("review-feature__item"),
            # matching the `selectors` dictionary; with a single underscore
            # these lists always came back empty.
            "pros" : [p.text.strip() for p in opinion.select("div.review-feature__title--positives ~ div.review-feature__item")],
            "cons" : [c.text.strip() for c in opinion.select("div.review-feature__title--negatives ~ div.review-feature__item")],
            "useful" : opinion.select_one("button.vote-yes > span").text.strip(),
            "useless" : opinion.select_one("button.vote-no > span").text.strip(),
            "publish_date" : opinion.select_one("span.user-post__published > time:nth-child(1)")["datetime"].strip(),
            "purchase_date" : opinion.select_one("span.user-post__published > time:nth-child(2)")["datetime"].strip()
            }
    except (TypeError, AttributeError):
        # Best-effort: an opinion missing any of the elements above makes
        # select_one() return None, and the whole opinion is skipped.
        continue
    all_opinions.append(single_opinion)
# The last page has no "next" link, so select_one() returns None there;
# check for it rather than subscripting None.
next_link = page_dom.select_one("a.pagination__next")
url = ("https://www.ceneo.pl" + next_link["href"]) if next_link is not None else None