In [None]:
### Download, Inspect and Upload Permit Data
import pandas as pd
import geopandas as gpd
import psycopg2
import os
import numpy as np
from dotenv import load_dotenv
from pandas.api.types import is_datetime64_any_dtype
from datetime import datetime
import requests
import time
import random
import re
import pickle
from bs4 import BeautifulSoup

### Call for bids:
Calls for bids have been opened since 2017, roughly every three months but at varying intervals (3-7 times a year). Bids must be submitted before the bid date. After the bid date, the BNetzA publishes .xlsx tables listing the winning bids. Each winning bid has a mastr_nummer (linking it to the units).

### Source-page: 
https://www.bundesnetzagentur.de/DE/Fachthemen/ElektrizitaetundGas/Ausschreibungen/Wind_Onshore/BeendeteAusschreibungen/start.html

The source page lists links to the individual bid-date pages, each of which holds the link to the .xlsx file with the results (units) of that call for bids.

### Link leading to the bid-date pages:

- Xpath-Expression (Selector-Gadget): `//*[contains(concat( " ", @class, " " ), concat( " ", "NavNode", " " ))]`
- Xpath (Inspect-Browser-Plugin): `/html/body/div[1]/div/main/div/div[2]/div/div/div/table[1]/tbody/tr[1]/th[2]/p/a`
- CSS-Selector (Selector-Gadget): `.NavNode`
- CSS-Selector (Inspect-Browser-Plugin): `.bodyText > table:nth-child(6) > tbody:nth-child(3) > tr:nth-child(1) > th:nth-child(2) > p:nth-child(1) > a:nth-child(1)`
- CSS-Path: `html body.gsb.js-off.main.ElektrizitaetUndGas div#wrapperOuter div#wrapperInner main#wrapperContentDivision.fwo.dapadding div.wrapperOuterContent div#wrapperContent.row div#content.col-lg-12.col-sm-12 div.wrapperText div.bodyText table tbody tr.odd th p.center a.RichTextIntLink.NavNode`

- HTML-Tag: `<a class="RichTextIntLink NavNode" href="DE/Fachthemen/ElektrizitaetundGas/Ausschreibungen/Wind_Onshore/BeendeteAusschreibungen/Ausschreibungen2023/Gebotstermin1022023/start.html" title="Gebotstermin 1. Februar 2023">Februar</a>`

In [None]:
url_bids = "https://www.bundesnetzagentur.de/DE/Fachthemen/ElektrizitaetundGas/Ausschreibungen/Wind_Onshore/BeendeteAusschreibungen/start.html"

### Fetch the overview page that links to every finished call for bids.
# The Response object holds everything the server sent back for the request:
# not only the html behind the visible page but also headers and status code.
# The timeout keeps the cell from hanging forever on a stalled connection.
response_bids = requests.get(url_bids, timeout=30)

# Fail loudly on 4xx/5xx instead of parsing an error page, which would
# silently produce an empty link list.
response_bids.raise_for_status()

### The BeautifulSoup object is a tree-like structure holding the parsed html
soup_bids = BeautifulSoup(response_bids.content, 'html.parser')

### Define what BeautifulSoup is supposed to look for
# Plain css-selectors and the XPath-expression did not work.
# The links are hardcoded into the site, so search for the common html-tag:
# every link starts with <a class="RichTextIntLink NavNode" ...
# NOTE: passing a string with a space to class_ matches the class attribute
# as an exact string, so this relies on the classes appearing in this order.
html_element = 'a'
css_class = 'RichTextIntLink NavNode'

# Find all <a> elements with the specified class attribute.
# Each hit holds the whole tag: '<a class=... href=... title=...>Displayed-Text</a>'
# href=True skips anchors without an href, which would raise a KeyError below.
links_html_bids = soup_bids.find_all(html_element, class_=css_class, href=True)

# Extract the (site-relative) href attribute from each link
urls_bid_date = [link['href'] for link in links_html_bids]

### link to XLSX:

- Xpath-Expression (Selector-Gadget): `//*[contains(concat( " ", @class, " " ), concat( " ", "FTxlsx", " " ))]`
- Xpath (Inspect-Browser-Plugin): `/html/body/div[1]/div/main/div/div[2]/div/div[1]/div[1]/div[2]/a`

- CSS-Selector (Selector-Gadget): `.FTxlsx`
- CSS-Selector (Inspect-Browser-Plugin): `a.downloadLink:nth-child(4)`
- CSS-Path: `html body.gsb.js-off.main.ElektrizitaetUndGas div#wrapperOuter div#wrapperInner main#wrapperContentDivision.fwo.dapadding div.wrapperOuterContent div#wrapperContent.row div#content.col-lg-12.col-sm-12 div.wrapperText div.bodyText div.MsoNormal.box2 a.downloadLink.Publication.FTxlsx`

- HTML-Tag: `<a href="/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/Ausschreibungen/Onshore/Zuschlagslisten/ListeZuschlaege0112_2019.xlsx?__blob=publicationFile&amp;v=1" class="downloadLink Publication FTxlsx" title="zum&nbsp;Download:&nbsp;Liste der Zuschläge zum Gebotstermin 1. Dezember 2019&nbsp;(xlsx) (öffnet neues Fenster)" target="_blank">Liste der Zuschläge zum Gebotstermin 1. Dezember 2019&nbsp;<span>(xlsx / 26&nbsp;KB)  </span></a>`

In [None]:
# Example href collected from the overview page:
# 'DE/Fachthemen/ElektrizitaetundGas/Ausschreibungen/Wind_Onshore/BeendeteAusschreibungen/Ausschreibungen2023/Gebotstermin1022023/start.html
# The hrefs are internal/relative.
# When following such a link from the source page the url looks like:
# 'https://www.bundesnetzagentur.de/DE/Fachthemen/...'
# So the stored hrefs must be prefixed with 'https://www.bundesnetzagentur.de/'
source_url = 'https://www.bundesnetzagentur.de/'

html_element = 'a'
css_class = 'downloadLink Publication FTxlsx'

css_selector = '.FTxlsx'
xpath_expression = '//*[contains(concat( " ", @class, " " ), concat( " ", "FTxlsx", " " ))]'

urls_xlsx = []
### Open each bid-date page and collect the href of its .xlsx download link
for href in urls_bid_date:
    # Prefix the relative href with the site root; stripping the slashes on
    # both sides avoids a doubled '/' if the href ever starts with one.
    full_url = f"{source_url.rstrip('/')}/{href.lstrip('/')}"

    # Load the bid-date page (timeout so a stalled request cannot hang the loop)
    response_bid_date = requests.get(full_url, timeout=30)

    # Look for the .xlsx link on the page
    soup_bid_date = BeautifulSoup(response_bid_date.content, 'html.parser')
    link_xslx_bid_date = soup_bid_date.select(css_selector)

    if response_bid_date.status_code != 200:
        print(f"Error: HTTP {response_bid_date.status_code} for {full_url}")
    elif not link_xslx_bid_date:
        # Page without an .xlsx link: skip it instead of failing with an
        # IndexError on [0] and aborting the whole loop.
        print(f"No .xlsx link found on {full_url}")
    else:
        # The first match is taken as the result list of this bid date
        href_xlsx = link_xslx_bid_date[0]['href']
        print(href_xlsx)
        urls_xlsx.append(href_xlsx)

    # Pause between requests to go easy on the server
    sleep_duration = random.randint(1,5)
    time.sleep(sleep_duration)
    # TODO: test whether the ?__blob url works for downloading

    # TODO: handle old and new links (the link syntax changed at some point)

# Remove the loop helpers from the notebook namespace. pop() with a default
# tolerates names that were never bound (empty link list, page without .xlsx);
# the previous `del` listed href_xlsx twice and therefore raised a NameError.
for _loop_name in ("sleep_duration", "href", "href_xlsx", "full_url",
                   "soup_bid_date", "link_xslx_bid_date"):
    globals().pop(_loop_name, None)
del _loop_name

In [None]:
def _parse_bid_filename(href_xlsx):
    """Derive a normalised file name and the bid date from an .xlsx href.

    Example:
        '/.../ListeZuschlaege0112_2019.xlsx?__blob=publicationFile&v=1'
        -> ('ListeZuschlaege_01122019.xlsx', datetime.date(2019, 12, 1))

    Parameters
    ----------
    href_xlsx : str
        href of the download link, optionally with a query string.

    Returns
    -------
    tuple[str, datetime.date]
        File name of the form 'ListeZuschlaege_DDMMYYYY.xlsx' and the bid date.

    Raises
    ------
    IndexError
        If the file name does not contain 'ListeZuschlaege'.
    ValueError
        If the digits after the underscore are not a DDMMYYYY date.
    """
    # Keep only the last path segment and cut off the query string.
    # (str.rstrip takes a *set of characters*, not a suffix, so it only
    # worked by accident for '&v=1' and left e.g. '&v=2' untouched.)
    filename = href_xlsx.split("/")[-1].split("?", 1)[0]

    # Bring all file names into the same format: drop every underscore,
    # then put exactly one between the prefix and the date digits.
    filename = filename.replace("_", "")
    filename = filename.replace("ListeZuschlaege", "ListeZuschlaege_")

    # A 7 digit date is missing the leading zero of the day -> insert it
    # before the captured group (\1).
    filename = re.sub(r"_(\d{7}\.xlsx)$", r"_0\1", filename)

    date_digits = filename.split("_")[1].split(".")[0]

    # strptime converts the string to a datetime object,
    # .date() stores it without the time of the day
    return filename, datetime.strptime(date_digits, "%d%m%Y").date()


dict_xlsx = {}
for xlsx in urls_xlsx:
    # Prefix the site root; the hrefs start with '/', so strip the slashes on
    # both sides to avoid 'https://www.bundesnetzagentur.de//SharedDocs/...'
    full_url = f"{source_url.rstrip('/')}/{xlsx.lstrip('/')}"

    filename, bid_date = _parse_bid_filename(xlsx)

    response_xlsx = requests.get(full_url, timeout=30)

    full_path = f"../data/mastr_bids/{filename}"

    if response_xlsx.status_code == 200:
        with open(file = full_path, mode = "wb") as xlsx_file:
            xlsx_file.write(response_xlsx.content)
        print(f"Downloaded .xlsx for bid-date {bid_date}")
        # Only register files that were actually written, so the mapping
        # never points to a path that does not exist.
        dict_xlsx[bid_date] = full_path
    else:
        print(f"Error: No Download .xlsx for bid-date {bid_date}")

    # Pause between requests to go easy on the server
    sleep_duration = random.randint(1,5)
    time.sleep(sleep_duration)

# Remove the loop helpers from the notebook namespace. pop() with a default
# tolerates names that were never bound (e.g. xlsx_file if no download succeeded).
for _loop_name in ("xlsx_file", "full_path", "full_url", "response_xlsx",
                   "bid_date", "filename"):
    globals().pop(_loop_name, None)
del _loop_name

In [None]:
# Persist the {bid date: local .xlsx path} mapping next to the downloaded files
with open("../data/mastr_bids/bids_xlsx.pkl", "wb") as pkl_file:
    pickle.dump(dict_xlsx, pkl_file)

In [None]:
### Scrape the bid data for 2017 - published as html tables on the bid-date pages
source_url = 'https://www.bundesnetzagentur.de/'
# NOTE(review): assumes the last three collected links are the 2017 bid
# dates - confirm against the ordering of the overview page.
urls_2017 = urls_bid_date[-3:]

dict_html_tables_2017 = {}
for url_html_table in urls_2017:

    # The bid date is taken from the url itself, written as dd_mm_yyyy
    pattern = r"\d{2}_\d{2}_\d{4}"

    match_bid_date = re.search(pattern, url_html_table)
    if match_bid_date is None:
        raise ValueError(f"No bid date of the form dd_mm_yyyy found in url: {url_html_table}")

    # strptime converts the string to a datetime object, .date() drops the
    # time of the day; the dict key is the ISO-formatted string YYYY-MM-DD
    bid_date = datetime.strptime(match_bid_date.group(), "%d_%m_%Y").date()
    bid_date = bid_date.strftime("%Y-%m-%d")

    # Full url: source_url already ends with '/', so strip the slashes on both
    # sides before joining to avoid a doubled '/' after the host name
    full_url = f"{source_url.rstrip('/')}/{url_html_table.lstrip('/')}"

    # One DataFrame per <table class="MsoNormalTable">, first row used as header
    list_dfs = pd.read_html(full_url, attrs={'class':'MsoNormalTable'}, header=0)

    dict_html_tables_2017[bid_date] = list_dfs

# Remove the loop helpers from the notebook namespace. pop() with a default
# tolerates names that were never bound (no 2017 urls collected).
for _loop_name in ("full_url", "bid_date", "list_dfs", "pattern", "match_bid_date"):
    globals().pop(_loop_name, None)
del _loop_name

In [None]:
columns_all = ["Name des Bieters",  "Gebots-Nr", "Zuschlags-Nr", "Standort"]
pattern_remove = "*sofern vom Bieter vergeben"


def _process_tables(list_dfs, pattern_remove, columns_all):
    """Clean the html tables of one bid date and bind them into one DataFrame.

    Parameters
    ----------
    list_dfs : list of pandas.DataFrame
        Tables as returned by pd.read_html for one bid-date page.
    pattern_remove : str
        Literal prefix; rows whose first cell starts with it are dropped (footer).
    columns_all : list of str
        Common column names, assigned by position.

    Returns
    -------
    pandas.DataFrame
        All remaining rows of all tables with a fresh 0..n-1 index.
        Empty (but with the columns of columns_all) if nothing is left.
    """
    cleaned_tables = []
    for df in list_dfs:
        # pd.read_html retrieved an empty last column in some cases
        # -> keep only as many columns as there are common names
        df_copy = df.iloc[:, 0:len(columns_all)].copy()
        if df_copy.shape[1] == 0 or df_copy.empty:
            continue

        # Rename by position to the common pattern so the tables can be bound
        # together (a rename dict would mis-handle duplicated headers)
        df_copy.columns = columns_all[:df_copy.shape[1]]

        # Detect rows without actual information (footer). astype(str) makes
        # the check safe for missing cells, which would otherwise yield NaN
        # and break the boolean mask.
        is_footer = df_copy.iloc[:, 0].astype(str).str.startswith(pattern_remove)

        cleaned_tables.append(df_copy.loc[~is_footer])

    if not cleaned_tables:
        return pd.DataFrame(columns=columns_all)

    # Row-bind the tables of one date in a single concat
    return pd.concat(cleaned_tables, ignore_index=True)


# One cleaned DataFrame per bid date
dict_dfs_2017 = {
    bid_date: _process_tables(list_dfs, pattern_remove, columns_all)
    for bid_date, list_dfs in dict_html_tables_2017.items()
}

dict_dfs_2017

In [49]:
### Attach the bid date and the award date (Zuschlagsdatum) as columns
# Award dates: 26. Mai 2017, 22. August 2017, 22. November 2017
# Mapping: bid date -> award date, both as YYYY-MM-DD strings
dict_bid_award_dates = {
    "2017-05-01": "2017-05-26",
    "2017-08-01": "2017-08-22",
    "2017-11-01": "2017-11-22",
}

### Add columns:
# Only values of existing keys are replaced, so iterating the dict while
# assigning to it is safe.
for bid_date in dict_dfs_2017:
    df = dict_dfs_2017[bid_date]

    # assign() works on a copy, the frame stored so far is left untouched
    df_copy = df.assign(
        Gebotsdatum=bid_date,
        Zuschlagsdatum=dict_bid_award_dates[bid_date],
    )

    dict_dfs_2017[bid_date] = df_copy


In [50]:
### Persist the {bid date: DataFrame} dict of the 2017 html tables as pickle
with open("../data/mastr_bids/bids_2017_html.pkl", "wb") as pkl_file:
    pickle.dump(dict_dfs_2017, pkl_file)

{'2017-05-01':                                      Name des Bieters Gebots-Nr Zuschlags-Nr  \
 0   1. BEB Bürgerenergie Behrenhoff UG (haftungsbe...       NaN  WIN17-1-096   
 1           Bad Bentheim Bürgerwindpark GmbH & Co. KG       NaN  WIN17-1-228   
 2                 Beckum Bürgerwindpark GmbH & Co. KG       NaN  WIN17-1-179   
 3                        BMR Windenergie GmbH & Co KG  BMR 1701  WIN17-1-160   
 4            Bürger Windpark Königsmoor GmbH & Co. KG       NaN  WIN17-1-004   
 ..                                                ...       ...          ...   
 65        Windpark Stemwede GmbH & Co. Mühlenheide KG       NaN  WIN17-1-244   
 66               Windpark Stoffeng Nord GmbH & Co. KG   Gebot 4  WIN17-1-245   
 67               Windpark Stoffeng Nord GmbH & Co. KG   Gebot 1  WIN17-1-247   
 68               Windpark Stoffeng Nord GmbH & Co. KG   Gebot 5  WIN17-1-249   
 69         Wölsickendorf Bürgerwindpark GmbH & Co. KG       NaN  WIN17-1-178   
 
            