In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from geopy.geocoders import Nominatim
import json
import requests

### 1. Use webdriver to put the website into a state for scraping

In [5]:
#Page that lists the appointments; it is opened in a Chrome instance started with the 'headless' argument
#NOTE(review): the find_element_by_* helpers and the positional driver path used below are
#deprecated/removed in Selenium 4 - confirm which Selenium version this notebook is pinned to
baseUrl = "https://www.oeps.at/de/termine"
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('headless')

#Create the webdriver instance from the chromedriver binary stored next to the notebook
browser = webdriver.Chrome("./Chromedriver/chromedriver", options=chrome_options)

#Open the url
browser.get(baseUrl)

#Find the first <select> of the filter form and open it (used to filter for the whole year data)
dropdown = browser.find_element_by_xpath('/html/body/main/div[1]/div/div/div/div[2]/form/span[1]/select[1]')
dropdown.click()

#Pick the first <option> of that <select>
#presumably this is the "whole year" entry - TODO confirm against the live page
option = browser.find_element_by_xpath("/html/body/main/div[1]/div/div/div/div[2]/form/span[1]/select[1]/option[1]")
option.click()

#Submit the filter form through its <input> element
submit = browser.find_element_by_xpath("/html/body/main/div[1]/div/div/div/div[2]/form/input")
submit.click()

#Get the visible text of the appointment list
#NOTE(review): all_data is not read again in the visible cells - confirm whether it is still needed
all_data = browser.find_element_by_class_name("appointmentslist").text

### 2. Scrape data from the website and assemble it into a dataframe

In [6]:
#Lookup table from the month names to their month number (1-12).
#The names are listed in calendar order, so the number is simply the position in the list.
months_conversion = {
    month_name: month_number
    for month_number, month_name in enumerate(
        [
            "Jänner", "Februar", "März", "April", "Mai", "Juni",
            "Juli", "August", "September", "Oktober", "November", "Dezember",
        ],
        start=1,
    )
}

In [7]:
#Connect bs4 to selenium: take the HTML the webdriver currently holds (after the filter form was submitted)
html = browser.page_source

#The page source is captured, so the browser is not needed any more.
#quit() ends the whole WebDriver session (browser and driver process), whereas close() only closes
#the current window. Doing it here also releases the browser when the parsing below raises.
browser.quit()

#Instantiate soup instance
soup = BeautifulSoup(html, 'lxml')

#Get main appointment list
main_div = soup.find('div', class_='appointmentslist')

#Get all appointments for competitions
all_divs = main_div.find_all("div", class_="appointment")

#One dictionary per show jumping competition; turned into the dataframe at the end
records = []

for appointment in all_divs:

    #Month & date: the first token is the month name, the second one the start of the competition.
    #Only the start is kept, it is enough to determine the week.
    #An unknown month name raises a KeyError on purpose instead of being skipped silently.
    date_parts = appointment.find("div", class_="date").text.split()
    month = months_conversion[date_parts[0]]
    start_date = date_parts[1]

    #Tournament category: we only want showjumping competitions, i.e. category codes containing an "S"
    categories = [
        code
        for code in appointment.find("div", class_="category").text.split()
        if "S" in code
    ]

    #If no category is left (as we know that S stands for jumping) we drop this competition
    if not categories:
        continue

    #Location: only want the city or village, so the first token is dropped
    #presumably the first token is a region/postal prefix - TODO confirm against the page
    location_parts = appointment.find("div", class_="location").text.split()

    #Assemble dictionary and append it to the results
    records.append(
        {
            "Month": month,
            "Date": start_date,
            "Destination": " ".join(location_parts[1:]),
            "Category": ", ".join(categories),
        }
    )

#Explicit columns keep the schema stable even if no competition was found
data = pd.DataFrame(records, columns=["Month", "Date", "Destination", "Category"])
    

In [8]:
#Final data after webscraping: one row per show jumping competition
#with the columns Month, Date, Destination and Category
data

Unnamed: 0,Month,Date,Destination,Category
0,1,28.01.2022,Ebreichsdorf,CSN-B*
1,2,11.02.2022,Stadl Paura,CSN-B*
2,2,19.02.2022,Tulln an der Donau,CSN-C
3,2,25.02.2022,Ebreichsdorf,CSN-B*
4,2,25.02.2022,Stadl Paura,CSN-B*
...,...,...,...,...
180,11,04.11.2022,Stadl Paura,CSN-B*
181,11,11.11.2022,Stadl Paura,CSN-B*
182,11,25.11.2022,Ebreichsdorf,"CSN-B*, CSNP-B"
183,12,02.12.2022,Stadl Paura,"CSN-A*, CSN-B"


### 3. Use Nominatim (geocoding) and OSRM (routing) to find the driving distance between the home base and each tournament, to estimate the transportation cost

In [9]:
#From previous runs: some locations, as they are given by the website, cannot be mapped to a
#latitude/longitude. For these the actual address of the location, or a larger location close by,
#is used for the lookup instead.
#Key: location name as scraped from the website, value: the replacement search string.
bad_loc = {
    "St. Margarethen-Stückler":
        "Reiterhofstraße 38, 9412 Wolfsberg, Austria",
    "Auhof Neufeld":
        "Neufeld an Leitha",
    "Treffen - GHPC":
        "Gaston Glock Straße 8, 9521 Treffen am Ossiacher See, Austria",
    "Wien-Freudenau/ASKÖ":
        "Freudenau 555, 1020 Wien, Austria",
}

In [10]:
#The base location (my old horse riding stable) is the origin of every trip.
#It is stored on each row and geocoded once so its coordinates can be reused later.
home_base = "Zwentendorf"
data["Origin"] = home_base
geolocator = Nominatim(user_agent="Leo")
home = geolocator.geocode(home_base)

#Coordinates that were already resolved, keyed by the address that was asked for.
#Many tournaments share a destination, so this avoids repeating the same web lookup.
_coords_cache = {}

#Add lat and long to all destinations
def lat_long(address):
    """Return the coordinates of ``address`` as ``[longitude, latitude]``.

    The address is looked up with Nominatim. If that gives no result, the
    replacement search string stored for it in ``bad_loc`` is looked up instead.
    Every distinct address is only resolved once; repeated calls are answered
    from ``_coords_cache``.

    Note the order of the result: longitude first, latitude second.

    Parameters
    ----------
    address : str
        Location name to resolve.

    Returns
    -------
    list
        ``[longitude, latitude]`` (a fresh list on every call).

    Raises
    ------
    KeyError
        If the address cannot be resolved and ``bad_loc`` has no replacement for it.
    ValueError
        If neither the address nor its replacement from ``bad_loc`` can be resolved.
    """
    if address not in _coords_cache:
        geolocator = Nominatim(user_agent="Leo")
        location = geolocator.geocode(address)

        if not location:
            if address not in bad_loc:
                raise KeyError(
                    f"Could not geocode {address!r} and bad_loc has no replacement for it"
                )

            location = geolocator.geocode(bad_loc[address])

            if not location:
                raise ValueError(
                    f"Could not geocode {address!r} nor its replacement {bad_loc[address]!r}"
                )

        _coords_cache[address] = (location.longitude, location.latitude)

    #Hand out a copy so callers cannot change the cached value
    return list(_coords_cache[address])

#Many tournaments take place at the same destination, so every distinct destination is
#geocoded exactly once and the result is mapped back onto the rows. This limits the runtime,
#as each lookup is a web request.
coords_by_destination = {
    destination: lat_long(destination)
    for destination in data["Destination"].unique()
}

#lat_long returns [longitude, latitude], so index 0 is the longitude and index 1 the latitude
data["Destination_Coords"] = data["Destination"].map(coords_by_destination.get)
data["Destination_Long"] = data["Destination_Coords"].apply(lambda coords: coords[0])
data["Destination_Lat"] = data["Destination_Coords"].apply(lambda coords: coords[1])

#The origin is the same for every row
data["Origin_Lat"] = home.latitude
data["Origin_Long"] = home.longitude

KeyboardInterrupt: 

In [None]:
#Inspect the data after the origin and destination coordinates were added
data

In [None]:
def get_drive_distance(long_o, lat_o, long_d, lat_d):
    """Return the driving distance between an origin and a destination.

    The route is requested from the public OSRM server and the distance of the
    first leg of the first route in the answer is returned (OSRM documents this
    value as being in meters).

    Parameters
    ----------
    long_o, lat_o : float
        Longitude and latitude of the origin.
    long_d, lat_d : float
        Longitude and latitude of the destination.

    Returns
    -------
    float
        The distance reported by OSRM, or ``nan`` if the request failed or the
        answer did not contain a route. In that case a message is printed.
    """
    #OSRM expects the coordinates as "longitude,latitude" pairs separated by ";"
    url = (
        "http://router.project-osrm.org/route/v1/car/"
        f"{long_o},{lat_o};{long_d},{lat_d}?overview=false"
    )

    try:
        #The timeout keeps a hanging server from blocking the whole notebook
        r = requests.get(url, timeout=30)
        routes = json.loads(r.content)
        return routes["routes"][0]["legs"][0]["distance"]

    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as error:
        #ValueError covers a body that is not JSON; the lookup errors cover an answer without a route
        print(f"No driving distance for {url}: {error!r}")
        return float("nan")

In [None]:
#Driving distance per tournament. With axis=1 the lambda receives one row at a time, so the
#coordinates have to be read from that row and not from the whole columns of the dataframe.
data["Distance"] = data.apply(
    lambda row: get_drive_distance(
        row.Origin_Long, row.Origin_Lat, row.Destination_Long, row.Destination_Lat
    ),
    axis=1,
)

In [None]:
#Persist the intermediate result as CSV so the scraping and lookups do not have to be repeated
interim_csv_path = "./data/interim.csv"
data.to_csv(interim_csv_path)

In [None]:
#Reload the intermediate result. An empty path can never be read, so this points at the file
#written by the save cell; index_col=0 restores the index that to_csv stored as the first column.
#NOTE(review): assumes the interim file is the intended input - confirm
data = pd.read_csv("./data/interim.csv", index_col=0)