### Obtaining all cities with hostels from countries I've backpacked in
This code scrapes hostel information from the Hostelworld website for a list of South American countries I backpacked in 2022 and 2023. The information was scrapped in **September of 2023**.

The hostelworld website does not contain webpages with all the hostels per country listed, however it does contain webpages with all the hostels per city. For this reason, the first step of the web scrapping was to obtain a list of all cities containing hostels available on the main page of each country in hostelworld. A dictionary containing city data for each country was created. 

In [1]:
import requests
from bs4 import BeautifulSoup

# List of countries I've backpacked in
countries_i_backpacked = ["colombia", "peru", "bolivia", "chile", "argentina"]

# Create an empty dictionary to store cities for each country
cities_dictionary = {"colombia": [], "peru": [], "bolivia": [], "chile": [], "argentina": []}

# Loop through each country in your backpacking list
for country in countries_i_backpacked:

    # Define the URL for the search results page for hostels in the current country
    url = f"https://www.hostelworld.com/st/hostels/south-america/{country}/"

    # Send an HTTP GET request to the URL and store the response
    response = requests.get(url)

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all containers with class "destination-card-info" which represent cities
    city_containers = soup.findAll(class_="destination-card-info")

    # Create an empty list to store city names
    city_names = []

    # Loop through the city containers
    for i in range(len(city_containers)):
        city = city_containers[i]
        # Extract the city name, convert it to lowercase, and replace spaces with hyphens to match the format in the url
        city_name = city.find('strong').text.strip().lower().replace(" ", "-")
        # Append the city name to the list for the current country
        city_names.append(city_name)

    # Add the list of city names to the dictionary for the current country
    cities_dictionary[country] = city_names

# Print the final dictionary containing cities for each country
print(cities_dictionary)

{'colombia': ['medellin', 'bogota', 'santa-marta', 'cartagena', 'minca', 'minca', 'salento', 'cali', 'guachaca', 'palomino', 'jardin', 'guatape', 'san-andres-island', 'san-gil', 'taganga', 'barranquilla', 'villa-de-leyva', 'leticia', 'buenaventura', 'rosario-island', 'riohacha', 'filandia', 'pereira', 'rincon-del-mar', 'barichara', 'manizales', 'bucaramanga', 'bahia-solano', 'san-agustin', 'pasto', 'neiva', 'popayan', 'capurgana', 'buritaca', 'tintipan-island', 'jerico', 'baru-island', 'isla-fuerte', 'mompos', 'ipiales', 'villavieja', 'sapzurro', 'mongui', 'san-rafael', 'villavicencio', 'mocoa', 'zipaquira', 'santa-rosa-de-cabal', 'valledupar', 'doradal', 'tolu', 'isla-palma', 'acacias', 'mayapo', 'san-bernardo-del-viento', 'carmen-de-viboral', 'santa-fe-de-antioquia', 'sogamoso', 'nuqui', 'turbo', 'los-santos', 'don-diego', 'paicol', 'puerto-narino', 'monitos', 'guaduas', 'juan-de-acosta', 'valle-del-cauca', 'la-vega', 'suesca', 'nocaima', 'ibague', 'guasca', 'salgar', 'palmira', 'gir

## Extracting all hostels information for each city

This Python script scrapes hostel information from the Hostelworld website for a list of cities in South American countries. It collects data such as country, city, hostel names, descriptions, ratings, reviews, distance to the city center, and minimum prices for private and dormitory rooms. The script iterates through the cities, pages within each city, and countries specified in the cities_dictionary. It stores the collected data in a pandas DataFrame, final_df, which can be further analyzed or saved for future reference.

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def hostel_scrapper(soup):

    hostel_containers = soup.findAll(class_="property-info-container")

    # Initialize lists to store hostel data
    list1, list2, list3, list4, km_to_centre, min_private_price, min_dorm_price = [], [], [], [], [], [], []
    lists = [list1, list2, list3, list4]

    # Loop through hostel containers to extract data
    for i in range(len(hostel_containers)):
        hostel = hostel_containers[i]

        # Define a list of property classes to search for
        property_classes = ['property-name', 'property-description', 'rating-score', 'review']

        # Iterate through the property classes and append data to the corresponding list
        for count, property_class in enumerate(property_classes):
            if hostel.find('div', class_=property_class):
                value = hostel.find('div', class_=property_class).find('span').text.strip()
            else:
                value = None
            lists[count].append(value)

        # Find the distance to centre element
        distance_element = hostel.find('span', class_='distance-description')
        distance_element = distance_element.text.strip()
        km_to_centre.append(distance_element)

        # Find the prices for "Privates From" and "Dorms From" and ad to the list
        private_price_div = hostel.find('div', class_='accommodation-label', text='Privates From')
        if private_price_div:
            private_price_element = private_price_div.find_next('strong', class_='current')
            private_price = private_price_element.text.strip()
            min_private_price.append(private_price)
        else:
            min_private_price.append(None)

        dorm_price_div = hostel.find('div', class_='accommodation-label', text='Dorms From')
        if dorm_price_div:
            dorm_price_element = dorm_price_div.find_next('strong', class_='current')
            dorm_price = dorm_price_element.text.strip()
            min_dorm_price.append(dorm_price)
        else:
            min_dorm_price.append(None)

    # Create a DataFrame for the current city's data
    df = pd.DataFrame({
        "country": country, 
        "city": city,
        'name': list1, 
        'description': list2, 
        "rating": list3, 
        "reviews": list4, 
        "km_to_centre": km_to_centre, 
        "min_private_price": min_private_price, 
        "min_dorm_price": min_dorm_price
    })
    return df
    

# Initialize an empty list to store DataFrames
df_list = []

# Iterate through countries and cities to scrape hostel data
for country, cities in cities_dictionary.items():
    for city in cities:
        # Define the URL for the search results page
        url = f"https://www.hostelworld.com/st/hostels/south-america/{country}/{city}/"
        
        # Send an HTTP GET request to the URL
        response = requests.get(url)

        # Create soup and find hostel containers
        soup = BeautifulSoup(response.text, 'html.parser')
        
        data = hostel_scrapper(soup)
        
        df = pd.DataFrame(data)
        df_list.append(df)
        
         # Count the number of pages for each city
        page_wrappers = soup.find_all('div', class_='page-wrapper')
        num_page_wrappers = len(page_wrappers)       
    
        # If there is more than one page, which is returned as 0, iterate through the other pages and extract hostel information in the same way as before
        if len(page_wrappers) > 0: 
            for i in range(2, num_page_wrappers+1):
                # Define the URL for the search results page
                url = f"https://www.hostelworld.com/st/hostels/south-america/{country}/{city}/p/{i}/"

                # Send an HTTP GET request to the URL
                response = requests.get(url)

                # Create soup and find hostel containers
                soup = BeautifulSoup(response.text, 'html.parser')
                data = hostel_scrapper(soup)    
    
                df = pd.DataFrame(data)
                df_list.append(df)    
            
# Concatenate all DataFrames into a final DataFrame
final_df = pd.concat(df_list, ignore_index=True)

# Export the final dataframe as a csv
final_df.to_csv('backpacking_hostel_data.csv', index=False)