In [None]:
### Download, Inspect and Upload Permit Data
import pandas as pd
import geopandas as gpd
import psycopg2
import os
import numpy as np
from dotenv import load_dotenv
from pandas.api.types import is_datetime64_any_dtype
from datetime import datetime
import requests
import time
import random
from bs4 import BeautifulSoup

### Call for bids:
Call for bids are opened since 2017 every three months in different intervals (3-7 times a year). Bids must be submitted before the bid date. After the bid-date, the BNetzA publishes .xlsx tables listing the bid winners. The bid winners have a mastr_nummer (linking to the units)

### Source-page: 
https://www.bundesnetzagentur.de/DE/Fachthemen/ElektrizitaetundGas/Ausschreibungen/Wind_Onshore/BeendeteAusschreibungen/start.html

Stores links to the single pages, which hold the links to the .xlsx with the results (units) of the bids

### Link leading to the bid-date pages:

- Xpath-Expression (Selector-Gadget): `//*[contains(concat( " ", @class, " " ), concat( " ", "NavNode", " " ))]`
- Xpath (Inspect-Browser-Plugin): `/html/body/div[1]/div/main/div/div[2]/div/div/div/table[1]/tbody/tr[1]/th[2]/p/a`
- CSS-Selector (Selector-Gadget): `.NavNode`
- CSS-Selector (Inspect-Browser-Plugin): `.bodyText > table:nth-child(6) > tbody:nth-child(3) > tr:nth-child(1) > th:nth-child(2) > p:nth-child(1) > a:nth-child(1)`
- CSS-Path: `html body.gsb.js-off.main.ElektrizitaetUndGas div#wrapperOuter div#wrapperInner main#wrapperContentDivision.fwo.dapadding div.wrapperOuterContent div#wrapperContent.row div#content.col-lg-12.col-sm-12 div.wrapperText div.bodyText table tbody tr.odd th p.center a.RichTextIntLink.NavNode`

- HTML-Tag: `<a class="RichTextIntLink NavNode" href="DE/Fachthemen/ElektrizitaetundGas/Ausschreibungen/Wind_Onshore/BeendeteAusschreibungen/Ausschreibungen2023/Gebotstermin1022023/start.html" title="Gebotstermin 1. Februar 2023">Februar</a>`

In [None]:
url_bids = "https://www.bundesnetzagentur.de/DE/Fachthemen/ElektrizitaetundGas/Ausschreibungen/Wind_Onshore/BeendeteAusschreibungen/start.html"

### Response object holds all information sent by after making an http request to the url, 
# not only the html behind the visible page but also headers, status codes
response_bids = requests.get(url_bids)

### BeautifulSoup Object is a tree like structure for holding html
soup_bids = BeautifulSoup(response_bids.content, 'html.parser')

### Define what beautiful soup is supposed to look for
# css-selector only and XPath-Expression did not work
# Links are hardcoded into the site, so search for common html-tag:
# Link starts with: <a class="RichTextIntLink NavNode" - 
# <a> Tag with class-attribute RichText... and NavMode

html_element = 'a'
css_class = 'RichTextIntLink NavNode'

# Find all <a> elements with the specified class attribute
# Holds the whole html like '<a class=... href=... title=...>Displayed-Text</a>'
links_html_bids = soup_bids.find_all(html_element, class_= css_class)

# Extract the href attribute from each link
urls_bid_date = [link['href'] for link in links_html_bids]

### link to XLSX:

- Xpath-Expression (Selector-Gadget): `//*[contains(concat( " ", @class, " " ), concat( " ", "FTxlsx", " " ))]`
- Xpath (Inspect-Browser-Plugin): `/html/body/div[1]/div/main/div/div[2]/div/div[1]/div[1]/div[2]/a`

- CSS-Selector (Selector-Gadget): `.FTxlsx`
- CSS-Selector (Inspect-Browser-Plugin): `a.downloadLink:nth-child(4)`
- CSS-Path: `html body.gsb.js-off.main.ElektrizitaetUndGas div#wrapperOuter div#wrapperInner main#wrapperContentDivision.fwo.dapadding div.wrapperOuterContent div#wrapperContent.row div#content.col-lg-12.col-sm-12 div.wrapperText div.bodyText div.MsoNormal.box2 a.downloadLink.Publication.FTxlsx`

- HTML-Tag: `<a href="/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/Ausschreibungen/Onshore/Zuschlagslisten/ListeZuschlaege0112_2019.xlsx?__blob=publicationFile&amp;v=1" class="downloadLink Publication FTxlsx" title="zum&nbsp;Download:&nbsp;Liste der Zuschläge zum Gebotstermin 1. Dezember 2019&nbsp;(xlsx) (öffnet neues Fenster)" target="_blank">Liste der Zuschläge zum Gebotstermin 1. Dezember 2019&nbsp;<span>(xlsx / 26&nbsp;KB)  </span></a>`

In [None]:
# 'DE/Fachthemen/ElektrizitaetundGas/Ausschreibungen/Wind_Onshore/BeendeteAusschreibungen/Ausschreibungen2023/Gebotstermin1022023/start.html
# hrefs are internal/relative
# When clicking the pointer from the source-page above the url looks like:
# 'https://www.bundesnetzagentur.de/DE/Fachthemen/...'
# So the stored hrefs must be headed by 'https://www.bundesnetzagentur.de/'
source_url = 'https://www.bundesnetzagentur.de/'

html_element = 'a'
css_class = 'downloadLink Publication FTxlsx'

css_selector = '.FTxlsx'
xpath_expression = '//*[contains(concat( " ", @class, " " ), concat( " ", "FTxlsx", " " ))]'

urls_xlsx = []
### Open each link
for href in urls_bid_date:
    # add source_url
    full_url = f"{source_url}{href}"
    
    # Find the .xlsx link
    response_bid_date = requests.get(full_url)
    
    # extract the href
    soup_bid_date = BeautifulSoup(response_bid_date.content, 'html.parser')
    link_xslx_bid_date = soup_bid_date.select(css_selector)
    
    # Try Except for [0] out of range -> no .xlsx link
    href_xlsx = link_xslx_bid_date[0]['href']
    print(href_xlsx)
    
    # Download the .xlsx into ./data/mastr_bids
    urls_xlsx.append(href_xlsx)
    
    # Pause to prevent security issues with the server
    sleep_duration = random.randint(1,5)
    time.sleep(sleep_duration)
    # test wether the .blob url works for downloading
    
    # Do old and new links (change of link-syntax after)



In [49]:
urls_xlsx
### Abortion in 2017 -> no downloadable .xlsx present


['/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/Ausschreibungen/Onshore/Zuschlagslisten/ListeZuschlaege1022023.xlsx?__blob=publicationFile&v=1',
 '/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/Ausschreibungen/Onshore/Zuschlagslisten/ListeZuschlaege1052023.xlsx?__blob=publicationFile&v=1',
 '/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/Ausschreibungen/Onshore/Zuschlagslisten/ListeZuschlaege0102_2022.xlsx?__blob=publicationFile&v=1',
 '/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/Ausschreibungen/Onshore/Zuschlagslisten/ListeZuschlaege0105_2022.xlsx?__blob=publicationFile&v=1',
 '/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/Ausschreibungen/Onshore/Zuschlagslisten/ListeZuschlaege0109_2022.xlsx?__blob=publicationFile&v=1',
 '/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/Ausschreibungen/Onshore/Zuschlagslisten/ListeZuschlaege0112_2022.xlsx?_