In [45]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

In [56]:
# Starting page for the crawl: the TripAdvisor attraction-products listing whose URL names Barcelona, Catalonia.
initial_url = 'https://www.tripadvisor.com/Attraction_Products-g187497-a_sort.-Barcelona_Catalonia.html#ATTRACTION_LIST'

## Combining all the pages in one list:

In [57]:
# Generating a list "urls" of all the pages of the website: 

# Walk the pagination starting at initial_url, following each page's "next"
# button until a page has none. `urls` ends up holding the final (post-redirect)
# URL of every page visited, in visiting order.
next_url = initial_url
urls = []

while next_url:
    # Fetch each page exactly once and reuse the response for both the
    # parsed HTML and the resolved URL.
    response = requests.get(next_url)
    soup = BeautifulSoup(response.content, "lxml")
    urls.append(response.url)

    # The "next" button is missing on the last page; test for that explicitly
    # instead of relying on a bare except that would also hide real errors
    # (including KeyboardInterrupt).
    next_links = soup.select("a.nav.next.rndBtn.ui_button.primary.taLnk")
    if next_links and next_links[0].get("href"):
        next_url = "https://www.tripadvisor.com" + next_links[0]["href"]
    else:
        # Keep the original end-of-crawl sentinel value.
        next_url = 0
      

In [58]:
# Number of page URLs collected into `urls`.
len(urls)

43

In [59]:
def get_info(soup):
    """Collect [title, price] pairs for the tours found on one parsed page.

    Looks up every <div class="listing_title"> and every <div class="from">
    in `soup` and pairs them positionally, taking the `.text` of each.

    Parameters:
        soup: a parsed page exposing `find_all(name, attrs)`.

    Returns:
        A list of two-element lists `[title_text, price_text]`. Pairing is
        done with zip, so if the two lookups differ in length the extra
        elements of the longer one are dropped.
    """
    title_divs = soup.find_all("div", {"class":"listing_title"})
    price_divs = soup.find_all("div", {"class":"from"})
    return [
        [title_div.text, price_div.text]
        for title_div, price_div in zip(title_divs, price_divs)
    ]
    
    

def parse_tours(url, timeout=30):
    """Fetch one listing page and extract its tour titles and prices.

    Parameters:
        url: address of the page to download.
        timeout: seconds passed to `requests.get` as its `timeout` argument.
            Without it a stalled connection would block the caller (or a
            pool worker) indefinitely.

    Returns:
        Whatever `get_info` returns for the parsed page.

    Raises:
        Any exception raised by `requests.get`, including a timeout error
        when the server does not answer within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)

    # Parsing the HTML:
    soup = BeautifulSoup(response.content, "lxml")

    # Using our get_info function to extract the information we want:
    return get_info(soup)
    

## Scraping Synchronously:

In [60]:
# We loop through the urls list, processing one at a time, and extracting our information:

# Wall-clock timing around the whole sequential run.
start = time.time()
results = []
for u in urls:
    # One page is fetched and parsed to completion before the next request starts.
    results.append(parse_tours(u))
end = time.time()
print("Time Taken: {:.6f}s".format(end-start))    

# On my PC this usually takes between 60 and 80 seconds.

Time Taken: 63.567342s


In [61]:
# Prints the entire nested list (one inner list per page); the output is long.
print(results)

[[['Basilica of the Sagrada Familia Admission Ticket', 'from €15.00* '], ["Gaudi's Casa Batlló Admission Ticket with Video Tour", 'from €24.50* '], ['City Sightseeing Barcelona Hop-On Hop-Off Tour', 'from €30.00* '], ['Priority Access: Barcelona Sagrada Familia Tour', 'from €38.00* '], ['Montserrat Tour from Barcelona Including Lunch and Wine Tasting in...', 'from €94.95* '], ['Montserrat and Oller del Mas Winery Tour from Barcelona', 'from €67.95* '], ['Park Guell Admission Ticket', 'from €9.00* '], ['Barcelona City Tour Hop-On Hop-Off', 'from €30.00* '], ['Flamenco Night at Tablao Cordobes', 'from €44.00* '], ['Camp Nou Experience and Museum Admission Ticket', 'from €28.50* '], ['Skip the Line: Gaudi’s La Pedrera Audio Tour in Barcelona', 'from €22.00* '], ['Tapas and Wine Experience Small-Group Walking Tour', 'from €75.00* '], ['Flamenco Show at Los Tarantos Barcelona', 'from €15.00* '], ['Sagrada Familia Official Guided Tour - Priority Access', 'from €35.00* '], ['Interactive Spani

## Scraping Asynchronously using ProcessPool:

In [62]:
# Here we have 4 workers, they are our "executor". Each of them will execute the parse_tours function 
# against a url.
# All tasks will be executed at some time in the future (i.e. not immediately).
# The as_completed function watches our future_results for completion,
# after which we'll be able to fetch each result via the result method.

# This method is fast, because we can submit a task while other tasks are still in progress,
# there is no waiting. This is why this process is asynchronous.


from concurrent.futures import ProcessPoolExecutor
import concurrent.futures
start = time.time()

results2 = []
with ProcessPoolExecutor(max_workers = 4) as executor:
    # Submit every page up front; each future is mapped to the URL it is fetching.
    future_results = {executor.submit(parse_tours, url): url for url in urls}

    # Collect inside the with-block: leaving the block waits for every task to
    # finish, so iterating as_completed afterwards would only ever see futures
    # that are already done. Here each result is picked up as soon as its
    # worker finishes, while the remaining tasks are still running.
    # Results arrive in completion order, not in the order of `urls`.
    for future in concurrent.futures.as_completed(future_results):
        results2.append(future.result())
end = time.time()
print("Time Taken: {:.6f}s".format(end-start))

# Usually takes about 17 seconds on my PC - between 4 and 5 times faster compared to the synchronous code.

Time Taken: 16.728733s


In [54]:
# NOTE(review): this cell's execution count (54) is lower than that of the In [62] cell
# that builds results2, so the output shown may come from an earlier run - re-run in order to confirm.
results2

[[['Exclusive sailing boat 2h', 'from €299.00* '],
  ['Shore Excursion: Girona Pals and Peratallada Medieval Towns from...',
   'from €117.00* '],
  ['Private Full Day City Tour in Barcelona', 'from €79.00* '],
  ['Shore Excursion: Montserrat Abbey and Salnitre Caverns from Barcelona',
   'from €117.00* '],
  ['Poble Espanyol Private Tour in Barcelona', 'from €95.00* '],
  ['Barcelona Shore Excursion: Medieval Costa Brava from Barcelona',
   'from €117.00* '],
  ['4 hour Shore Excursion to Santa Severa Castle from Civitavecchia',
   'from €109.00* '],
  ['La Roca Village Private Experience with Hotel Pick-up', 'from €91.00* '],
  ['Barcelona City Tapas Tour by Vespa Scooter', 'from €125.00* '],
  ['Chocolate Walking Private Tour in Barcelona', 'from €29.00* '],
  ['Sitges Private Tour from Barcelona', 'from €59.00* '],
  ['Private Half Day Walking Tour in Barcelona', 'from €31.00* '],
  ['Private Half Day City Tour in Barcelona', 'from €70.00* '],
  ['Private Las Ramblas Walking Tour a