#### 1.Imports


In [29]:
import os
import json
import requests
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator

### Utils


In [30]:
def extract(ancestor, selector=None, attribute=None, multiple=False):
    """Pull stripped text or an attribute value out of a parsed HTML node.

    Parameters
    ----------
    ancestor : tag-like object
        Node to search in. It must support ``select``, ``select_one`` and
        ``[]`` attribute access (as BeautifulSoup tags do).
    selector : str or None
        CSS selector evaluated relative to ``ancestor``. When falsy, the
        attribute is read from ``ancestor`` itself.
    attribute : str or None
        Name of the attribute to read. When falsy, the text of the matched
        tag(s) is returned instead.
    multiple : bool
        When true (and a selector is given), return a list with one entry
        per matched tag instead of a single value.

    Returns
    -------
    str, list of str, or None
        A stripped string for a single match, a list of stripped strings for
        ``multiple=True``, or ``None`` when the tag or attribute is missing.
        With ``multiple=True``, tags lacking the attribute are skipped.
    """
    if not selector:
        # No selector: the value lives directly on the ancestor node.
        try:
            return ancestor[attribute].strip()
        except (TypeError, KeyError):
            return None

    if multiple:
        tags = ancestor.select(selector)
        if not attribute:
            return [tag.get_text().strip() for tag in tags]
        values = []
        for tag in tags:
            try:
                values.append(tag[attribute].strip())
            except KeyError:
                # A matched tag without the attribute must not abort the
                # whole extraction; leave it out of the result.
                continue
        return values

    tag = ancestor.select_one(selector)
    if tag is None:
        return None
    if not attribute:
        return tag.get_text().strip()
    try:
        return tag[attribute].strip()
    except KeyError:
        # The tag exists but does not carry the attribute: treat it like a
        # missing tag instead of raising.
        return None

In [31]:
def translate(text, source="pl", target="en"):
    """Translate ``text`` from ``source`` to ``target`` with GoogleTranslator.

    Parameters
    ----------
    text : str or None
        Text to translate.
    source : str
        Source language code (default ``"pl"``).
    target : str
        Target language code (default ``"en"``).

    Returns
    -------
    str or None
        The translated text. ``None`` and the empty string are returned
        unchanged without calling the translator, because a missing review
        body arrives here as ``None`` and there is nothing to translate.
    """
    if not text:
        return text
    return GoogleTranslator(source, target).translate(text=text)

In [32]:
# Field name -> positional arguments for extract(): (selector, attribute, multiple).
# Trailing arguments that are left out fall back to extract()'s defaults.
selectors = dict(
    opinion_id=(None, "data-entry-id"),
    author=("span.user-post__author-name",),
    recommendation=("span.user-post__author-recomendation > em",),
    stars=("span.user-post__score-count",),
    content_pl=("div.user-post__text",),
    pros_pl=("div.review-feature__item--positive", None, True),
    cons_pl=("div.review-feature__item--negative", None, True),
    vote_yes=("button.vote-yes", "data-total-vote"),
    vote_no=("button.vote-no", "data-total-vote"),
    published=("span.user-post__published > time:nth-child(1)", "datetime"),
    purchased=("span.user-post__published > time:nth-child(2)", "datetime"),
)

### Extraction of opinions

In [33]:
# Read the HTTP request headers from the local cookie.json file so that they
# are not embedded in the notebook itself.
with open("./cookie.json", 'r') as cookie_file:
    headers = json.loads(cookie_file.read())

In [34]:
# Scrape every review page of one product: start at the product's reviews
# tab and follow the "next page" link until there is none.
product_id = input("ennter product code").strip()
next_page = f"https://www.ceneo.pl/{product_id}#tab=reviews"
all_opinions = []
# Maps the site's recommendation label to a boolean; any other value
# (including a missing label) becomes None.
recommendation_values = {'Polecam': True, 'Nie polecam': False}
while next_page:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(next_page, headers=headers, timeout=30)
    if response.status_code != 200:
        # Without this exit next_page would never change and the loop would
        # request the same failing URL forever.
        print(f"Stopping: {next_page} returned HTTP {response.status_code}")
        break
    print(next_page)
    page_dom = BeautifulSoup(response.text, 'html.parser')
    opinions = page_dom.select("div.js_product-review:not(.user-post--highlight)")
    print(len(opinions))
    for opinion in opinions:
        single_opinion = {
            key: extract(opinion, *value)
            for key, value in selectors.items()
        }
        # The review body can be missing; only translate when there is text.
        content_pl = single_opinion['content_pl']
        single_opinion['content_en'] = translate(content_pl) if content_pl else content_pl
        single_opinion['pros_en'] = [translate(pros) for pros in single_opinion['pros_pl']]
        single_opinion['cons_en'] = [translate(cons) for cons in single_opinion['cons_pl']]
        single_opinion['recommendation'] = recommendation_values.get(single_opinion['recommendation'])
        # The score text is split on "/" and uses a decimal comma; keep None
        # when the element was not found instead of crashing on it.
        stars = single_opinion['stars']
        if stars is not None:
            single_opinion['stars'] = float(stars.split("/")[0].replace(",", "."))
        for vote_key in ('vote_yes', 'vote_no'):
            if single_opinion[vote_key] is not None:
                single_opinion[vote_key] = int(single_opinion[vote_key])
        all_opinions.append(single_opinion)
    # No pagination link means this was the last page.
    next_link = page_dom.select_one("a.pagination__next")
    if next_link is None:
        next_page = None
    else:
        next_page = "https://www.ceneo.pl" + next_link["href"]

https://www.ceneo.pl/#tab=reviews
0


In [20]:
# Write the collected opinions to ./opinions/<product_id>.json.
# makedirs(exist_ok=True) creates the directory only when it is missing,
# without a separate existence check.
os.makedirs("./opinions", exist_ok=True)
with open(f"./opinions/{product_id}.json", "w", encoding="UTF-8") as jf:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(all_opinions, jf, ensure_ascii=False, indent=4)

#### 2. Sending HTTP request to access first page with opinions

In [21]:
# Request headers for www.ceneo.pl.
# The session cookie is deliberately NOT stored in the notebook: it contains a
# request-verification token and user identifiers. Provide it through the
# CENEO_COOKIE environment variable instead.
headers = {
    "Host" : "www.ceneo.pl",
    "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
}
if os.environ.get("CENEO_COOKIE"):
    headers["Cookie"] = os.environ["CENEO_COOKIE"]

In [22]:
# Replace the headers with the ones stored in the local cookie.json file.
with open("./cookie.json", 'r') as cookie_file:
    headers = json.loads(cookie_file.read())

In [23]:
# Fetch the first reviews page of the example product.
# The prepared headers are sent with the request (as the scraping loops do),
# and a timeout keeps the cell from hanging on a stalled connection.
url = "https://www.ceneo.pl/84514582#tab=reviews"
response = requests.get(url, headers=headers, timeout=30)

#### 3.Checking the code of HTTP response

In [None]:
# Show the HTTP status code of `response`; the scraping loops only process a
# page when this value is 200.
response.status_code

#### 4. Parse the HTML code of first page with opinions

In [25]:
# Parse the response body into a BeautifulSoup tree with the 'html.parser' backend.
# NOTE(review): the following step-by-step cells read this tree under the name
# `page_don`, while the scraping loops build their own `page_dom`.
page_don = BeautifulSoup(response.text, 'html.parser')

#### 5.Extract required data from parsed code

In [None]:
# Go through every non-highlighted review on the parsed page and print its
# fields one per line.
opinions = page_don.select('div.js_product-review:not(.user-post--highlight)')
for opinion in opinions:
    opinion_id = opinion['data-entry-id']
    author = opinion.select_one('span.user-post__author-name').get_text()

    # The recommendation label is optional: fall back to None when absent.
    recommendation_tag = opinion.select_one('span.user-post__author-recomendation > em')
    if recommendation_tag is None:
        reccommendation = None
    else:
        reccommendation = recommendation_tag.get_text()

    stars = opinion.select_one('span.user-post__score-count').get_text()
    content = opinion.select_one('div.user-post__text').get_text()
    pros = [item.get_text() for item in opinion.select('div.review-feature__item--positive')]
    cons = [item.get_text() for item in opinion.select('div.review-feature__item--negative')]
    vote_yes = opinion.select_one('button.vote-yes')['data-total-vote']
    vote_no = opinion.select_one('button.vote-no')['data-total-vote']
    published = opinion.select_one('span.user-post__published > time:nth-child(1)')["datetime"]

    # The second <time> element is optional as well.
    purchased_tag = opinion.select_one('span.user-post__published > time:nth-child(2)')
    if purchased_tag is None:
        purchased = None
    else:
        purchased = purchased_tag["datetime"]

    fields = (
        opinion_id, author, reccommendation, stars, content,
        pros, cons, vote_yes, vote_no, published, purchased,
    )
    print(*fields, sep='\n')

#### 6. If there are more pages, move to the next page and repeat steps 2-5 for it

In [27]:
# Build the absolute URL of the next reviews page, or None when the page has
# no "next" pagination link.
next_link = page_don.select_one('a.pagination__next')
if next_link is None:
    next_page = None
else:
    next_page = 'https://www.ceneo.pl' + next_link['href']


In [None]:
# Step-by-step version of the scraper: collect the raw (unconverted) fields of
# every review of one product, following the pagination links.
product_id = "84514582"
# The product code is interpolated exactly once; appending the literal code
# after the placeholder produced a URL with the id doubled.
next_page = f"https://www.ceneo.pl/{product_id}#tab=reviews"
all_opinions = []
while next_page:
    # A timeout keeps the cell from hanging on a stalled connection.
    response = requests.get(next_page, headers=headers, timeout=30)
    if response.status_code != 200:
        # Without this exit next_page would never change and the loop would
        # request the same failing URL forever.
        print(f"Stopping: {next_page} returned HTTP {response.status_code}")
        break
    print(next_page)
    page_dom = BeautifulSoup(response.text, 'html.parser')
    opinions = page_dom.select('div.js_product-review:not(.user-post--highlight)')
    print(len(opinions))
    for opinion in opinions:
        single_opinion = {}
        single_opinion['opinion_id'] = opinion['data-entry-id']
        single_opinion['author'] = opinion.select_one('span.user-post__author-name').get_text()
        try:
            single_opinion['reccommendation'] = opinion.select_one('span.user-post__author-recomendation > em').get_text()
        except AttributeError:
            # The recommendation label is optional.
            single_opinion['reccommendation'] = None
        single_opinion['stars'] = opinion.select_one('span.user-post__score-count').get_text()
        single_opinion['content'] = opinion.select_one('div.user-post__text').get_text()
        single_opinion['pros'] = [p.get_text() for p in opinion.select('div.review-feature__item--positive')]
        single_opinion['cons'] = [c.get_text() for c in opinion.select('div.review-feature__item--negative')]
        single_opinion['vote_yes'] = opinion.select_one('button.vote-yes')['data-total-vote']
        single_opinion['vote_no'] = opinion.select_one('button.vote-no')['data-total-vote']
        single_opinion['published'] = opinion.select_one('span.user-post__published > time:nth-child(1)')["datetime"]
        try:
            single_opinion['purchased'] = opinion.select_one('span.user-post__published > time:nth-child(2)')["datetime"]
        except TypeError:
            # The second <time> element is optional.
            single_opinion['purchased'] = None
        all_opinions.append(single_opinion)
    try:
        next_page = 'https://www.ceneo.pl' + page_dom.select_one('a.pagination__next')['href']
    except TypeError:
        # No pagination link: this was the last page.
        next_page = None
# Print the result once, after all pages are collected, instead of re-printing
# the whole growing list after every page.
print(all_opinions)

In [None]:
# Print the full body of the most recent response.
# NOTE(review): looks like a debugging aid; it prints the entire response
# text, so consider trimming or removing it before sharing the notebook.
print(response.text)

### 7.Save extracted data

In [36]:
# Write the collected opinions to ./opinions/<product_id>.json.
os.makedirs("./opinions", exist_ok=True)
# The path needs the "/" after the directory name; without it the file was
# written next to the ./opinions directory instead of inside it.
with open(f"./opinions/{product_id}.json", "w", encoding="UTF-8") as js:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(all_opinions, js, ensure_ascii=False, indent=4)

In [35]:
# Product code that the save step uses to build the output file name.
# NOTE(review): this cell sits after the save cell but carries an earlier
# execution count (35 vs 36), so it was presumably run first; confirm the
# intended cell order.
product_id = "84514582"