1. Access the first page of the product using requests

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import os
from deep_translator import GoogleTranslator
import datetime

### Utils


In [2]:
def _read_attribute(tag, attribute):
    """Return ``tag[attribute]``, stripped when it is a string, or None.

    None is returned both when ``tag`` itself is None (nothing matched the
    selector) and when the tag does not carry the attribute.
    """
    try:
        value = tag[attribute]
    except (TypeError, KeyError):
        # TypeError: tag is None; KeyError: the attribute is absent.
        return None
    # Non-string values (e.g. a list of values) are returned untouched
    # instead of failing on a missing ``.strip()``.
    return value.strip() if isinstance(value, str) else value


def extract(ancestor, selector=None, attribute=None, multiple=False):
    """Pull text or an attribute value out of a parsed HTML element.

    Parameters:
        ancestor: element to search in; must provide ``select`` /
            ``select_one`` when a selector is given, ``.text`` for text
            extraction and item access (``ancestor[name]``) for attributes.
        selector: CSS selector evaluated relative to ``ancestor``. When
            falsy, the value is read from ``ancestor`` itself.
        attribute: name of the attribute to read. When falsy, the element's
            stripped text is returned instead.
        multiple: when true (and a selector is given), return a list with
            one entry per matching element instead of a single value.

    Returns:
        A stripped string, a list of stripped strings (``multiple=True``),
        or None when the element or the attribute is missing. In list mode
        elements that lack the attribute are left out of the result.
    """
    if not selector:
        if attribute:
            return _read_attribute(ancestor, attribute)
        return ancestor.text.strip()

    if multiple:
        tags = ancestor.select(selector)
        if attribute:
            values = (_read_attribute(tag, attribute) for tag in tags)
            return [value for value in values if value is not None]
        return [tag.text.strip() for tag in tags]

    tag = ancestor.select_one(selector)
    if attribute:
        return _read_attribute(tag, attribute)
    try:
        return tag.text.strip()
    except AttributeError:
        # select_one found nothing, so there is no text to return.
        return None
    

In [3]:
# Extraction recipe for every field of a single opinion.
# Each value is the tuple of positional arguments handed to ``extract`` after
# the opinion element: (selector, attribute, multiple). Trailing items that
# are left out fall back to the defaults of ``extract``.
selectors = {
    # Read straight from the opinion element itself (no selector).
    "opinion_id": (None, "data-entry-id"),
    # Plain-text fields.
    "author": ("span.user-post__author-name",),
    "recommend": ("span.user-post__author-recomendation > em.recommended",),
    "stars": ("span.user-post__score-count",),
    "content_pl": ("div.user-post__text",),
    # List-valued fields: one entry per matching element.
    "pros_pl": ("div.review-feature__item--positive", None, True),
    "cons_pl": ("div.review-feature__item--negative", None, True),
    # Attribute-valued fields.
    "helpful": ("button.vote-yes", "data-total-vote"),
    "unhelpful": ("button.vote-no", "data-total-vote"),
    "published": ("span.user-post__published > time:nth-child(1)", "datetime"),
    "purchased": ("span.user-post__published > time:nth-child(2)", "datetime"),
}

In [4]:
def translate(text, source='pl', target='en'):
    """Translate ``text`` from ``source`` to ``target`` with GoogleTranslator.

    Parameters:
        text: the string to translate.
        source: source language code (default ``'pl'``).
        target: target language code (default ``'en'``).

    Returns:
        The translated string. Input that has nothing to translate (None,
        any other non-string value, an empty or whitespace-only string) is
        returned unchanged, so no translator is created for it.
    """
    if not isinstance(text, str) or not text.strip():
        # A scraped field may be missing (None) or empty; pass it through
        # instead of handing an invalid payload to the translator.
        return text
    return GoogleTranslator(source, target).translate(text=text)

### Extraction starts here

In [6]:
# Load the HTTP headers sent with every request from the local cookie file.
with open("./cookie.json", mode='r', encoding='UTF-8') as cookie_file:
    headers = json.load(cookie_file)

In [13]:
# Scrape every review page of the requested product, following the page's
# rel="next" link until there is none left. Each opinion becomes one dict in
# ``all_opinions``.
product_id = input("Please enter product id").strip()
url = f'https://www.ceneo.pl/{product_id}#tab=reviews'
all_opinions = []
while url:
    # The timeout keeps a stalled connection from blocking the run forever.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        # Without a fresh page there is no valid "next" link: on the first
        # page ``soup`` would be undefined, on later pages it would point at
        # the same URL again and loop endlessly. Stop with what we have.
        print(f"Request for {url} failed with status {response.status_code}; stopping.")
        break
    soup = BeautifulSoup(response.text, "html.parser")
    opinions = soup.select("div.js_product-review:not(.user-post--highlight)")
    for opinion in opinions:
        single_opinion = {
            key: extract(opinion, *values)
            for key, values in selectors.items()
        }
        # Fields that were not found come back as None (or an empty string);
        # those are kept as they are instead of being translated/converted.
        content_pl = single_opinion['content_pl']
        single_opinion['content_en'] = translate(content_pl) if content_pl else content_pl
        single_opinion['pros_en'] = [
            translate(pros) if pros else pros for pros in single_opinion['pros_pl']
        ]
        single_opinion['cons_en'] = [
            translate(cons) if cons else cons for cons in single_opinion['cons_pl']
        ]
        # Map the recommendation label to True / False / None. The comparison
        # is case-insensitive, so "Nie polecam" and "Nie Polecam" both count.
        recommendation = (single_opinion['recommend'] or '').casefold()
        single_opinion['recommend'] = {'polecam': True, 'nie polecam': False}.get(recommendation)
        # The score looks like "4,5/5": keep the part before the slash and
        # use a dot as the decimal separator.
        stars = single_opinion['stars']
        if stars is not None:
            single_opinion['stars'] = float(stars.split('/')[0].replace(',', '.'))
        for vote_key in ('helpful', 'unhelpful'):
            if single_opinion[vote_key] is not None:
                single_opinion[vote_key] = int(single_opinion[vote_key])
        all_opinions.append(single_opinion)
    try:
        url = 'https://www.ceneo.pl' + soup.select_one("link[rel='next']")['href']
    except (TypeError, KeyError):
        # No next-page link (or it has no href): this was the last page.
        url = None


In [15]:
# Make sure the output directories exist. ``exist_ok=True`` makes the call
# a no-op for directories that are already there, without a separate
# check-then-create step that could race with another process.
for output_dir in ('./opinions', './piecharts', './barcharts'):
    os.makedirs(output_dir, exist_ok=True)
    


In [14]:
# Store all scraped opinions as pretty-printed JSON named after the product;
# ensure_ascii=False keeps non-ASCII characters readable in the file.
with open(f'./opinions/{product_id}.json', mode='w', encoding='UTF-8') as opinions_file:
    json.dump(all_opinions, opinions_file, ensure_ascii=False, indent=4)