# Ceneo Scraper

## Biblioteki

In [36]:
import os
import json
import requests
from bs4 import BeautifulSoup

## Struktura opinii w serwisie Ceneo.pl

|Składowa|Selektor|Zmienna|
|--------|--------|-------|
|id_opinii|["data-entry-id"]|opinion_id|
|autor|span.user-post__author-name|author|
|rekomendacja|span.user-post__author-recomendation > em|recommendation|
|gwiazdki|span.user-post__score-count|rating|
|treść|div.user-post__text|content|
|lista zalet|div.review-feature__title--positives ~ div.review-feature__item|pros|
|lista wad|div.review-feature__title--negatives ~ div.review-feature__item|cons|
|dla ilu przydatna|button.vote-yes > span|useful|
|dla ilu nieprzydatna|button.vote-no > span|useless|
|data wystawienia|span.user-post__published > time:nth-child(1)["datetime"]|publish_date|
|data zakupu|span.user-post__published > time:nth-child(2)["datetime"]|purchase_date|


## Wysłanie do serwera żądania dostępu do zasobu

In [37]:
# Identifier of the Ceneo product whose opinions are scraped; it is also
# interpolated into the start URL below.
product_id = "94103927"

# Address of the first page requested by the scraper. The "#tab=reviews"
# part is a URL fragment appended after the product path.
url = f"https://www.ceneo.pl/{product_id}#tab=reviews"

In [38]:
def _first_text(node, selector):
    """Return the stripped text of the first element under `node` that matches
    the CSS `selector`, or None when nothing matches."""
    match = node.select_one(selector)
    return match.text.strip() if match is not None else None


def _first_attr(node, selector, attribute):
    """Return the stripped value of `attribute` on the first element under
    `node` that matches `selector`, or None when the element or the attribute
    is missing."""
    match = node.select_one(selector)
    value = match.get(attribute) if match is not None else None
    return value.strip() if value is not None else None


def _all_texts(node, selector):
    """Return the stripped texts of every element under `node` that matches
    `selector` (an empty list when nothing matches)."""
    return [item.text.strip() for item in node.select(selector)]


# Collect every opinion of the product, following the "next page" link until
# there is none. The loop walks a local copy of the start address so that
# `url` is left untouched and re-running this cell scrapes again from page 1.
all_opinions = []
page_url = url
while page_url:
    # The timeout keeps the cell from hanging on a stalled connection;
    # raise_for_status stops on an HTTP error instead of parsing an error page.
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    page_dom = BeautifulSoup(response.text, "html.parser")

    for opinion in page_dom.select('div.js_product-review'):
        # Each field is looked up independently: a missing element yields None
        # for that field instead of discarding the whole opinion.
        all_opinions.append({
            'opinion_id': opinion.get('data-entry-id'),
            'author': _first_text(opinion, 'span.user-post__author-name'),
            'recommendation': _first_text(opinion, 'span.user-post__author-recomendation > em'),
            'rating': _first_text(opinion, 'span.user-post__score-count'),
            'content': _first_text(opinion, 'div.user-post__text'),
            'pros': _all_texts(opinion, 'div.review-feature__title--positives ~ div.review-feature__item'),
            'cons': _all_texts(opinion, 'div.review-feature__title--negatives ~ div.review-feature__item'),
            'useful': _first_text(opinion, 'button.vote-yes > span'),
            'useless': _first_text(opinion, 'button.vote-no > span'),
            'publish_date': _first_attr(opinion, 'span.user-post__published > time:nth-child(1)', 'datetime'),
            'purchase_date': _first_attr(opinion, 'span.user-post__published > time:nth-child(2)', 'datetime'),
        })

    # A missing link or an empty href ends the pagination.
    next_href = _first_attr(page_dom, 'a.pagination__next', 'href')
    page_url = 'https://www.ceneo.pl' + next_href if next_href else None
    if page_url:
        print(page_url)

https://www.ceneo.pl/94103927/opinie-2
https://www.ceneo.pl/94103927/opinie-3
https://www.ceneo.pl/94103927/opinie-4
38


In [39]:
# Create the output directory if needed; exist_ok=True makes this a single
# call with no gap between checking for the directory and creating it.
os.makedirs('opinions', exist_ok=True)

# Write all collected opinions to opinions/<product_id>.json.
# ensure_ascii=False stores non-ASCII characters as-is rather than as \uXXXX
# escapes, so the file is opened explicitly as UTF-8.
with open(f'opinions/{product_id}.json', 'w', encoding='utf-8') as jf:  # json file
    json.dump(all_opinions, jf, indent=4, ensure_ascii=False)