# Data Science SS 2022
## Task 1 - Data Preparation and Modeling
### Task 1a - Crawl and parse

In [4]:
# Import Libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
import pandas as pd



# Function for read in csv with web-links to scrape
def read_links(source):
    datatable = pd.read_csv(source)
    raw_links = datatable['roughguide link'].tolist()
    return raw_links


# Function for skipping cookie banner
def skip_cookie():
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "cf2Lf6")))
    driver.find_element(By.CLASS_NAME, 'cf2Lf6').click()
    print('>> Cookie-Banner successful skipped!')


# Function for extracting place name of webpage
def get_place():
    raw_place = driver.find_element(By.XPATH, '/html/body/div[1]/div/div/div[4]/div/section[1]/div/div/main/div/section/h1').text
    return raw_place


# Function for extracting place description of webpage
def get_content():
    content_text = ''
    content_element = driver.find_element(By.CLASS_NAME, 'DestinationPageContent')
    raw_text = content_element.find_elements(By.TAG_NAME, 'p')
    for paragraph in raw_text:
        content_text += str(' ' + paragraph.text)
    content_text = ' '.join(content_text.splitlines())
    content_text = content_text.replace(';', ' -')
    content_text = content_text.replace("In-depth, easy-to-use travel guides filled with expert advice.", '')
    content_text = content_text.replace("Use Rough Guides' trusted partners for great rates", '')
    content_text = content_text.strip()
    return content_text


# Function for saving scraped data to csv
def save_to_csv(lst, destination_file_path):
    data = pd.DataFrame(lst, columns=['Link', 'Place', 'Content'])
    data.to_csv(destination_file_path, header=['Link', 'Place', 'Content'], encoding='utf-8-sig')



#### MAIN ####

# Set variables
source_path = 'C:/Users/Henryy/Downloads/DataScience2022_RoughGuides.csv'       ## Set ##
destination_file_path = 'C:/Users/Henryy/Downloads/ds_test.csv'                 ## Set ##
count = 1
lst = []
start_time = time.time()

try:

    # Read in csv with page links
    print('Reading links from csv..')
    links = read_links(source_path)

    # Start driver
    print('Setting up chrome webdriver..')
    driver = webdriver.Chrome()
    WebDriverWait(driver, 10)

    # Iterate over links
    print('Starting scraping data from pages..')
    for link in links:

        # Open page
        print('Processing scraping of page {0} of {1}'.format(count, len(links)))
        driver.get(link)

        # Check if first run, if yes then skip cookie banner
        if count == 0:
            print('>> Cookie-Banner must be skipped!')
            skip_cookie()

        # Get header text (place name) of page
        place = get_place()

        # Get content text (place description) of page
        content = get_content()

        # Append place and content to list
        lst.append([link, place, content])

        # Wait 1 sec and repeat procedure for next page
        time.sleep(1)
        count += 1

    # Save list as dataframe and export to csv
    print('Exporting scraped data to csv')
    save_to_csv(lst, destination_file_path)

    # Exit driver
    print('Finished scraping! Took %s min to scrape' % round(((time.time() - start_time)/60)), 2)
    time.sleep(2)
    driver.quit()


except KeyboardInterrupt:

    # Catch Keyboard Interrupt
    print('The programm was interrupted by keyboard')
    exit()

Reading links from csv..
Setting up chrome webdriver..
Starting scraping data from pages..
Processing scraping of page 1 of 100
Processing scraping of page 2 of 100
Processing scraping of page 3 of 100
Processing scraping of page 4 of 100
Processing scraping of page 5 of 100
Processing scraping of page 6 of 100
Processing scraping of page 7 of 100
Processing scraping of page 8 of 100
Processing scraping of page 9 of 100
Processing scraping of page 10 of 100
Processing scraping of page 11 of 100
Processing scraping of page 12 of 100
Processing scraping of page 13 of 100
Processing scraping of page 14 of 100
Processing scraping of page 15 of 100
Processing scraping of page 16 of 100
Processing scraping of page 17 of 100
Processing scraping of page 18 of 100
Processing scraping of page 19 of 100
Processing scraping of page 20 of 100
Processing scraping of page 21 of 100
Processing scraping of page 22 of 100
Processing scraping of page 23 of 100
Processing scraping of page 24 of 100
Proces