In [1]:
import unicodedata
from typing import Dict, List, Optional, Tuple

import pandas
import requests
from bs4 import BeautifulSoup


CITIES_FILE_NAME = 'destinations_LP_crawler_Ex5.csv'
OUTPUT_FILE_NAME = 'LP_destinations.csv'

GOOGLE_API_KEY = ''
GOOGLE_PLACES_API_URL = f'https://maps.googleapis.com/maps/api/place/autocomplete/json?key={GOOGLE_API_KEY}&input='

LONELY_PLANET_BASE_URL = 'https://www.lonelyplanet.com'
TARGET_HTML_ELEMENT_ID = 'introduction'


def strip_csv_column_value(value: str) -> str:
    return value.strip('\r\n"').encode('ASCII', 'ignore').decode()


def load_cities(cities_file_path: str) -> Dict[str, str]:
    with open(cities_file_path, 'r') as of:
        cities_data: List[Tuple[str, str]] = []
        for line in of.readlines():
            city, lp_city = line.split(',')
            cities_data.append((strip_csv_column_value(city), strip_csv_column_value(lp_city)))
        
        cities_data.pop(0)  # Remove csv header line
        return dict(cities_data)


def normalize_city_name(city_name: str) -> str:
    # Replace latin letters returned from google's API with ASCII-only characters as our CSV is normalized that way
    latin_stripped_text = unicodedata.normalize('NFKD', city_name).encode('ASCII', 'ignore').decode()
    return latin_stripped_text.replace(' ', '-').lower()


def retrieve_country_predictions_for_city(city: str) -> List[str]:
    url = GOOGLE_PLACES_API_URL + city
    response_data = requests.get(url).json()

    country_predictions: List[str] = []
    predictions = response_data['predictions']
    for prediction in predictions:
        normalized_main_text = normalize_city_name(prediction['structured_formatting']['main_text'])
        if normalized_main_text == normalize_city_name(city):
            country = prediction['terms'][-1]['value']
            country_predictions.append(country)

    # If we can't find an exact match prediction, the first one is our second-best option
    country_predictions.append(predictions[0]['terms'][-1]['value'])

    return country_predictions


def scrape_introduction_text(country: str, city: str) -> Optional[str]:
    url = f'{LONELY_PLANET_BASE_URL}/{country}/{city}'
    res = requests.get(url)

    if res.status_code in (403, 404):
        return None

    soup = BeautifulSoup(res.text, 'html.parser')
    introduction_text_element = soup.select_one(f'#{TARGET_HTML_ELEMENT_ID}')

    if not introduction_text_element:
        return None

    introduction_text = introduction_text_element.text
    canonized_text = introduction_text.removesuffix('Read more').replace('\n', ' ')
    return canonized_text


def main():
    lp_path_per_city = load_cities(CITIES_FILE_NAME)

    results: List[Tuple[str, str, str]] = []
    for city, city_lp_path in lp_path_per_city.items():
        city = normalize_city_name(city)
        print(city, '...')
        introduction_text = 'Not Available'
        country_predictions = retrieve_country_predictions_for_city(city)
        for country in country_predictions:
            introduction_text = scrape_introduction_text(country, city_lp_path)
            if introduction_text:
                break

        results.append((country.title(), city.title(), introduction_text))

    df = pandas.DataFrame(results)
    df.to_csv(OUTPUT_FILE_NAME, header=['Country', 'City', 'Description'], index=False, line_terminator='\n')


if __name__ == '__main__':
    main()


antalya ...


IndexError: list index out of range