# Data Science SS 2022
## Task 1 - Data Preparation and Modeling
### Task 1a - Crawl and parse

In [8]:
# Import Libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
import pandas as pd

In [9]:
# Function for read in csv with web-links to scrape
def read_links(source):
    datatable = pd.read_csv(source)
    raw_links = datatable['roughguide link'].tolist()
    return raw_links


# Function for skipping cookie banner
def skip_cookie():
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "cf2Lf6")))
    driver.find_element(By.CLASS_NAME, 'cf2Lf6').click()
    print('>> Cookie-Banner successful skipped!')


# Function for extracting place name of webpage
def get_place():
    raw_place = driver.find_element(By.XPATH, '/html/body/div[1]/div/div/div[4]/div/section[1]/div/div/main/div/section/h1').text
    return raw_place


def get_place_h2(title_id):
    raw_place = driver.find_element(By.ID, title_id).text
    return raw_place

# Function for extracting place description of webpage
def get_full_content():
    content_text = ''
    content_element = driver.find_element(By.CLASS_NAME, 'DestinationPageContent')
    raw_text = content_element.find_elements(By.TAG_NAME, 'p')
    for paragraph in raw_text:
        content_text += str(' ' + paragraph.text)
    return content_text


def get_content_between_headlines(title_id):
    # h2 element with id that contains title (all lower case and - instead of ' ')
    keep_text = False
    content_text = ''
    xpath = '//*[@id="{0}"]'.format(title_id)
    container = driver.find_element(By.XPATH, '/html/body/div[1]/div/div/div[4]/div/main/div[2]/div/div/div/div/div[3]')
    #start_element = driver.find_element(By.XPATH, xpath)
    #current_element = start_element.find_element(By.XPATH, '//following-sibling::*')
    # for element in container:
    #     if element.tag_name == 'h2':
    #         # break while loop if next element is a h2
    #         break
    #     content_text += str(' ' + current_element.text)
    #     # continue to next element
    #     current_element = current_element.find_element(By.XPATH, '//following-sibling::*')

    for element in container:
        if element.getAttribute("id") == title_id:
            keep_text = True
        tag_value= element.get_attribute('outerHTML ').split('',1)[0]  # gets html tag of current element
        if keep_text and tag_value != 'h2':
            content_text += str(' ' + element.text)
    return content_text


def clean_content_text(content_text):
    content_text = ' '.join(content_text.splitlines())
    content_text = content_text.replace(';', ' -')
    content_text = content_text.replace("In-depth, easy-to-use travel guides filled with expert advice.", '')
    content_text = content_text.replace("Use Rough Guides' trusted partners for great rates", '')
    content_text = content_text.strip()
    return content_text


# Function for saving scraped data to csv
def save_to_csv(lst, destination_file_path):
    data = pd.DataFrame(lst, columns=['Link', 'Place', 'Content'])
    data.to_csv(destination_file_path, header=['Link', 'Place', 'Content'], encoding='utf-8-sig')

NameError: name 'html' is not defined

In [4]:
# Set variables
source_path = './data/link_test.csv'       ## Set ##
destination_file_path = './data/ds_test.csv'                 ## Set ##

count = 1
lst = []
start_time = time.time()

In [10]:
try:
    # Read in csv with page links
    print('Reading links from csv..')
    links = read_links(source_path)

    # Start driver
    print('Setting up chrome webdriver..')
    driver = webdriver.Chrome()
    WebDriverWait(driver, 10)

    # Iterate over links
    print('Starting scraping data from pages..')
    for link in links:

        # Open page
        print('Processing scraping of page {0} of {1}'.format(count, len(links)))
        driver.get(link)

        # Check if first run, if yes then skip cookie banner
        # if count == 0:
        #     print('>> Cookie-Banner must be skipped!')
        #     skip_cookie()

        if '#' in link:
            print('link to h2:')
            # This link only refers to a part of the page.
            # Only content between the h2 Headline and the next h2 should be extracted
            title_id = link.split('#')[1]

            # get place name
            place = get_place_h2(title_id)

            # get content until next h2
            content_raw = get_content_between_headlines(title_id)
            content = clean_content_text(content_raw)

            print(place)
            print(content)

        else:
            # whole page refers to the place and should be extracted
            # Get header text (place name) of page
            place = get_place()

            # Get content text (place description) of page
            content_raw = get_full_content()
            content = clean_content_text(content_raw)

        # Append place and content to list
        lst.append([link, place, content])

        # Wait 1 sec and repeat procedure for next page
        time.sleep(1)
        count += 1

    # Save list as dataframe and export to csv
    print('Exporting scraped data to csv')
    save_to_csv(lst, destination_file_path)

    # Exit driver
    print('Finished scraping! Took %s min to scrape' % round(((time.time() - start_time)/60)), 2)
    time.sleep(2)
    driver.quit()

except KeyboardInterrupt:
    # Catch Keyboard Interrupt
    print('The programm was interrupted by keyboard')
    exit()

except Exception as e:
    print(e)
    # try saving current process in dataframe
    print('trying to Export scraped data to csv')
    save_to_csv(lst, destination_file_path)
    print('done')

Reading links from csv..
Setting up chrome webdriver..
Starting scraping data from pages..
Processing scraping of page 2 of 3
Processing scraping of page 3 of 3
link to h2:
The programm was interrupted by keyboard
