In this notebook we'll fetch the URLs of the companies that are referenced on https://www.trustpilot.com/

We'll use selenium because the content is dynamically rendered

We'll then scrape the reviews using Scrapy, feeding it the scraped URLs

In [1]:
%config Completer.use_jedi=False

In [2]:
import json
import time

from bs4 import BeautifulSoup
import requests
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from tqdm import tqdm_notebook

In [4]:
# Root URL of the site being scraped; the '/categories' path and the
# sub-category URIs collected from that page are appended to it.
base_url = "https://trustpilot.com"

In [5]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Number of seconds ``requests`` waits for the server before giving up.
        Without it a stalled connection would block the notebook forever.

    Returns
    -------
    BeautifulSoup
        The response content parsed with the ``lxml`` parser.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.RequestException
        On connection problems or when ``timeout`` is exceeded.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into empty data.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'lxml')

We first start by fetching sub-categories urls:

In [8]:
# Build a nested mapping from the categories index page:
#   {category name: {sub-category name: sub-category URI}}
data = {}

soup = get_soup(base_url + '/categories')
for category in soup.find_all('div', {'class': 'category-object'}):
    header = category.find('h3', {'class': 'sub-category__header'})
    if header is None:
        # Without a header there is no category name to key on.
        continue
    name = header.text.strip()
    data[name] = {}

    sub_categories = category.find('div', {'class': 'sub-category-list'})
    if sub_categories is None:
        # Category block without a sub-category list: keep it, but empty.
        continue

    for sub_category in sub_categories.find_all('div', {'class': 'child-category'}):
        # Look the anchor up once; it carries both the label and the URI.
        link = sub_category.find('a', {'class': 'sub-category-item'})
        if link is None or not link.has_attr('href'):
            continue
        data[name][link.text] = link['href']

In [10]:
len(data)

22

This function extracts the company URLs listed on the sub-category page currently loaded in the browser:

In [11]:
def extract_company_urls_form_page():
    """Collect the company URLs shown on the page currently loaded in ``driver``.

    Reads the ``href`` of every ``<a>`` element whose class attribute is
    exactly ``"category-business-card card"``.

    Returns
    -------
    list of str
        The distinct ``href`` values, in order of first appearance on the
        page. Anchors without an ``href`` are ignored.
    """
    # find_elements(By.XPATH, ...) works on Selenium 3 and 4, whereas the
    # find_elements_by_xpath shortcut was removed in Selenium 4.
    cards = driver.find_elements(By.XPATH, '//a[@class="category-business-card card"]')
    hrefs = (card.get_attribute('href') for card in cards)
    # dict.fromkeys removes duplicates while keeping page order, so repeated
    # runs produce the same ordering (list(set(...)) did not guarantee that).
    return list(dict.fromkeys(href for href in hrefs if href))

This function indicates whether a "next page" button exists on the current page:

In [12]:
def go_next_page():
    """Look for the "next page" button on the page currently loaded in ``driver``.

    The function only locates the button; it does not click it or navigate.

    Returns
    -------
    tuple
        ``(True, element)`` when an ``<a>`` whose class attribute is exactly
        ``"button button--primary next-page"`` is found, ``(False, None)``
        otherwise.
    """
    try:
        # find_element(By.XPATH, ...) works on Selenium 3 and 4, whereas the
        # find_element_by_xpath shortcut was removed in Selenium 4.
        button = driver.find_element(By.XPATH, '//a[@class="button button--primary next-page"]')
    except NoSuchElementException:
        return False, None
    return True, button

We start by initializing Selenium with a headless Chromedriver:

In [13]:
# Command-line switches handed to Chrome, applied in this order.
options = Options()
for chrome_flag in (
    '--headless',
    '--no-sandbox',
    'start-maximized',
    'disable-infobars',
    '--disable-extensions',
):
    options.add_argument(chrome_flag)

# NOTE(review): value 2 presumably tells Chrome not to load images, which
# would speed up page loads - confirm against the Chrome preference docs.
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)

# Maximum time given to WebDriverWait in the scraping loop.
timeout = 3

# The chromedriver binary is expected at ./driver/chromedriver.
driver = webdriver.Chrome('./driver/chromedriver', options=options)

We launch the scraping (this takes about 50 minutes):

In [14]:
# Query string appended to every sub-category listing URL.
LISTING_QUERY = "?numberofreviews=0&timeperiod=0&status=all"
# Same locator as the one used to extract the company links from a page.
COMPANY_CARD_XPATH = '//a[@class="category-business-card card"]'


def _wait_for_company_cards():
    """Wait up to ``timeout`` seconds for a company card to appear in ``driver``.

    A timeout is not an error: a listing page may legitimately contain no
    company, in which case the extraction simply yields nothing.
    """
    try:
        # By.CLASS_NAME cannot take a compound class name such as
        # 'category-business-card card', so the wait could never succeed with
        # it; an XPath on the full class attribute matches the extraction step.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, COMPANY_CARD_XPATH)))
    except TimeoutException:
        # Only the expected timeout is ignored; anything else (including a
        # manual interrupt of this long-running cell) still propagates.
        pass


# Maps each sub-category name to the list of company URLs found in it.
company_urls = {}
for category in tqdm_notebook(data):
    for sub_category in tqdm_notebook(data[category], leave=False):
        company_urls[sub_category] = []
        listing_url = base_url + data[category][sub_category] + LISTING_QUERY

        driver.get(listing_url)
        _wait_for_company_cards()

        page = 1
        while True:
            company_urls[sub_category] += extract_company_urls_form_page()

            # The button itself is not needed: the next page is reached by
            # URL, so only its presence matters.
            has_next_page, _ = go_next_page()
            if not has_next_page:
                break

            page += 1
            driver.get(f'{listing_url}&page={page}')
            _wait_for_company_cards()
                    

HBox(children=(IntProgress(value=0, max=22), HTML(value='')))

HBox(children=(IntProgress(value=0, max=23), HTML(value='')))

HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=155), HTML(value='')))

HBox(children=(IntProgress(value=0, max=168), HTML(value='')))

HBox(children=(IntProgress(value=0, max=86), HTML(value='')))

HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=68), HTML(value='')))

HBox(children=(IntProgress(value=0, max=81), HTML(value='')))

HBox(children=(IntProgress(value=0, max=142), HTML(value='')))

HBox(children=(IntProgress(value=0, max=56), HTML(value='')))

HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

HBox(children=(IntProgress(value=0, max=58), HTML(value='')))

HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

HBox(children=(IntProgress(value=0, max=52), HTML(value='')))

HBox(children=(IntProgress(value=0, max=91), HTML(value='')))

HBox(children=(IntProgress(value=0, max=53), HTML(value='')))

HBox(children=(IntProgress(value=0, max=23), HTML(value='')))

HBox(children=(IntProgress(value=0, max=101), HTML(value='')))

HBox(children=(IntProgress(value=0, max=80), HTML(value='')))

HBox(children=(IntProgress(value=0, max=65), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))

HBox(children=(IntProgress(value=0, max=116), HTML(value='')))




And finally we export everything:

In [41]:
# Serialise the sub-category -> company URLs mapping to a JSON file.
with open('./exports/company_urls_en', mode='w') as export_file:
    export_file.write(json.dumps(company_urls))

In [15]:
# Flatten the nested mappings into (category, sub_category, url) rows,
# one row per company URL.
consolidated_data = [
    (category, sub_category, url)
    for category, sub_categories in data.items()
    for sub_category in sub_categories
    for url in company_urls[sub_category]
]

df_consolidated_data = pd.DataFrame(
    consolidated_data,
    columns=['category', 'sub_category', 'company_url'],
)

# Written without the index so the CSV holds only the three named columns.
df_consolidated_data.to_csv('./exports/consolidate_company_urls.csv', index=False)

In [16]:
df_consolidated_data.head()

Unnamed: 0,category,sub_category,company_url
0,Animals & Pets,Agistment Service,https://www.trustpilot.com/review/nomuggle.com
1,Animals & Pets,Animal Control Service,https://www.trustpilot.com/review/zooeasy.com
2,Animals & Pets,Animal Control Service,https://www.trustpilot.com/review/rentokil.com
3,Animals & Pets,Animal Feed Store,https://www.trustpilot.com/review/www.topdogra...
4,Animals & Pets,Animal Feed Store,https://www.trustpilot.com/review/agrizoo-shop...
