In [None]:
### Download, Inspect and Upload Permit Data
import pandas as pd
import psycopg2
import os
import numpy as np
from dotenv import load_dotenv
from pandas.api.types import is_datetime64_any_dtype
from datetime import datetime
import requests
import time
import random
import re
import pickle
import openpyxl

In [None]:
### Load the pickled mapping {bid date -> path of the .xlsx file with the bid results}
# NOTE: unpickling can execute arbitrary code - only load a pickle file created by this project.
with open("../data/mastr_bids/bids_xlsx.pkl", "rb") as bids_pickle:
    dict_xlsx = pickle.load(bids_pickle)

In [None]:
### 1. Locate the real header row of every workbook and read the table below it
# The table does not start in the first spreadsheet row, so every sheet is first read
# without a header to find the row whose first cell holds the first column name.
HEADER_MARKER = "Name des Bieters"

dict_dfs = {}
for bid_day, path_xlsx in dict_xlsx.items():
    key = bid_day.strftime("%Y-%m-%d")
    print(key)

    # Open the workbook once and reuse the handle for both reads; the context manager
    # closes the file again (the former openpyxl.load_workbook handle was never closed).
    with pd.ExcelFile(path_xlsx) as xlsx_file:
        num_sheets = len(xlsx_file.sheet_names)

        # Workbooks with more than one sheet: read the second sheet (presumably the
        # detailed data - TODO confirm), otherwise read the only sheet.
        sheet_to_read = 1 if num_sheets > 1 else 0

        raw_df = pd.read_excel(xlsx_file, sheet_name=sheet_to_read, header=None)

        # Index labels of all rows whose first cell equals the first column name;
        # the first hit is the header row of the table.
        header_hits = raw_df.index[raw_df[0].eq(HEADER_MARKER)]
        if len(header_hits) == 0:
            raise ValueError(
                f"No header row starting with '{HEADER_MARKER}' found in {path_xlsx} "
                f"(sheet index {sheet_to_read})"
            )
        header_row = int(header_hits[0])

        dict_dfs[key] = pd.read_excel(xlsx_file, sheet_name=sheet_to_read, header=header_row)

In [46]:
### Inspect the parsed tables, one bid date after the other
separator = """
          
          =========================================
          
          """
for bid_key in dict_dfs:
    print(bid_key)
    print(dict_dfs[bid_key])
    print(separator)

2023-02-01
                                   Name des Bieters  Gebots-Nr Zuschlags-Nr  \
0           ABO Wind WP Drensteinfurt GmbH & Co. KG        NaN  WIN23-1/124   
1    ABO Wind WP Herscheid-Lennestadt GmbH & Co. KG        NaN  WIN23-1/122   
2    ABO Wind WP Herscheid-Lennestadt GmbH & Co. KG        NaN  WIN23-1/122   
3         ABO Wind WP Kevelaer-Wetten GmbH & Co. KG        NaN  WIN23-1/121   
4      ABO Wind WP Meschede-Freienohl GmbH & Co. KG        NaN  WIN23-1/123   
..                                              ...        ...          ...   
284         WIND-projekt GmbH & Co. 36. Betriebs-KG        1.0  WIN23-1/103   
285                   WKA Schauerberg GmbH & Co. KG        NaN  WIN23-1/095   
286                           WP A 33 GmbH & Co. KG        1.0  WIN23-1/015   
287           wpd Windpark Laichingen GmbH & Co. KG        NaN  WIN23-1/077   
288           wpd Windpark Laichingen GmbH & Co. KG        NaN  WIN23-1/077   

                 Bundesland           La

### Data is still messy:

#### Merged location cells in the 2018 tables (at least 2018-02-01 and 2018-05-01)
- These workbooks do not contain two sheets (compact and detailed data); they only contain the compact data.
- The column "Angegebener Standort der Anlage" packs the values for Bundesland (BLD), Landkreis, PLZ, Gemeinde, Gemarkung, Flur/Flurstück and the MaStR register number into a single string, for example:

    BLD Niedersachsen, Landkreis Salzgitter, Stadt, PLZ 38239, Gemeinde Salzgitter, Gemarkung Barum:
    Registernummer A4497640206941: Flur3: 1/1. 
    BLD Niedersachsen, Landkreis Salzgitter, Stadt, PLZ 38239, Gemeinde Salzgitter, Gemarkung Watenstedt:
    Registernummer A9617510206917: Flur5: 1/22. Registernummer A3273890206938: Flur5: 1/23. Registernummer A5669430206922: Flur5: 2/10. 

 Or:

    Niedersachsen, Landkreis Stade, PLZ 21698, Gemeinde Brest, Gemarkung Brest:
    Flur 2: 66/1; 66/2; 66/3 (SEE919421623876) 
    Flur 2: 71; 72 (SEE923510311766) 
    Gemarkung Wohlerst:
    Flur 2: 157/5; 5/7; 1/6 (SEE964469396954) 
    Flur 2: 5/7 (SEE974053806455) 
    Flur 2: 7/5; 170/5 (SEE968430555418)

- In these tables a single Zuschlags-Nr can cover several power units (one register number per unit).

In [None]:
### Number of columns per bid date (reveals which tables have a deviating layout)
bid_date = list(dict_dfs.keys())
ncol = [frame.shape[1] for frame in dict_dfs.values()]

pd.DataFrame({"bid_date": bid_date, "ncol": ncol})

In [None]:
### Helper function: parses ONE administrative group of an 'Angegebener Standort der Anlage' item,
### i.e. one "<Bundesland> ... Gemarkung <name>:" header plus the "Registernummer" fragments below it.

def extract_mastr_nr_location(split_units, split_administrative):
    """Split one location group into administrative names and per-unit details.

    Parameters
    ----------
    split_units : list of str
        The text fragments that followed each "Registernummer" keyword, e.g.
        " A4497640206941: Flur3: 1/1. ". Each fragment is split at its first ": "
        into the register number and the Flur/Flurstück text.
    split_administrative : str
        The header of the group (text after "BLD" and before the first
        "Registernummer"), e.g.
        " Niedersachsen, Landkreis Salzgitter, Stadt, PLZ 38239, Gemeinde Salzgitter, Gemarkung Barum:".

    Returns
    -------
    dict of list
        Keys 'Bundesland', 'Landkreis', 'Postleitzahl', 'Gemeinde', 'Gemarkung',
        'Register_Anlagennr' and 'Flur / Flurstück'. Every list has one entry per
        element of ``split_units``; the administrative names are repeated for each unit.

    Raises
    ------
    IndexError
        If the header has fewer than three comma-separated fields, if units are
        given but the header has fewer than five fields, or if a unit fragment
        contains no ": ".
    """
    # Field labels inside the header that are not part of the names themselves
    patterns_rem = ["Landkreis", r"Stadt|kreisfreie Stadt", "Gemeinde", "PLZ", "Gemarkung"]

    # Lists of infos to be filled (one entry per unit)
    flur_list = []
    mastr_nr = []
    bld_list = []
    landkreis_list = []
    plz_list = []
    gemeinde_list = []
    gemarkung_list = []

    # Remove the labels. The negative lookahead makes sure a label is only removed when it
    # is NOT directly followed by a lowercase letter, so place names that merely start with
    # a label word (e.g. "Stadthagen", "Stadtlohn") are no longer mutilated.
    for pattern in patterns_rem:
        split_administrative = re.sub("(?:" + pattern + ")(?![a-zäöüß])", "", split_administrative)

    # One cleaned name per comma-separated field: drop surrounding whitespace, the Excel
    # carriage-return artefact "_x000D_" and the trailing colon of the last field
    names_administrative = [
        info.strip().rstrip("\n").replace("_x000D_", "").rstrip(":")
        for info in split_administrative.split(", ")
    ]

    # "Landkreis X, Stadt, PLZ ..." leaves an empty third field once the label "Stadt" is removed
    if names_administrative[2] == "":
        del names_administrative[2]

    # names_administrative holds one name per administrative level
    # ["name bundesland", ... , "name gemarkung"]; these are repeated for every unit.
    for unit in split_units:
        # Remove leading and trailing spaces and dots
        unit = unit.strip(" ").strip(".")

        # Split at the first occurrence of ": " -> [register number, Flur/Flurstück text]
        unit_flurst = unit.split(": ", maxsplit=1)
        flur_list.append(unit_flurst[1].replace("_x000D_\n", "").rstrip(". "))
        mastr_nr.append(unit_flurst[0])

        bld_list.append(names_administrative[0])
        landkreis_list.append(names_administrative[1])
        plz_list.append(names_administrative[2])
        gemeinde_list.append(names_administrative[3])
        gemarkung_list.append(names_administrative[4])

    dict_row = {'Bundesland': bld_list,
                'Landkreis': landkreis_list,
                'Postleitzahl': plz_list,
                'Gemeinde': gemeinde_list,
                'Gemarkung': gemarkung_list,
                'Register_Anlagennr': mastr_nr,
                'Flur / Flurstück': flur_list}

    return dict_row

In [None]:
### Final function to apply to the items of "Angegebener Standort der Anlage"
def extract_info_standort(item_standort, item_zuschlags_nr):
    """Expand one packed location string into a long DataFrame (one row per unit).

    Parameters
    ----------
    item_standort : str
        One cell of "Angegebener Standort der Anlage". It consists of one or more
        groups that each start with "BLD" (administrative header up to the Gemarkung)
        followed by one or more "Registernummer <nr>: <Flur/Flurstück>" fragments.
    item_zuschlags_nr : object
        The Zuschlags-Nr of the row the cell belongs to; repeated on every output row.

    Returns
    -------
    pandas.DataFrame
        The columns produced by ``extract_mastr_nr_location`` plus "Zuschlags-Nr".
        If the string contains no "BLD", the frame is empty and only has the
        "Zuschlags-Nr" column.
    """
    # First split: one piece per "BLD ... Gemarkung" group, so that all units within a piece
    # lie in the same administrative borders. Text before the first "BLD" is discarded.
    split_list_top = item_standort.split("BLD")[1:]

    # Collect one frame per group and concatenate once at the end instead of growing
    # the result with pd.concat inside the loop (which copies all rows every time).
    frames = []
    for split_top in split_list_top:
        # Second split: [0]  administrative info BLD -> Gemarkung
        #               [1:] one fragment per register number with its Flur/Flurstück
        split_administrative, *split_units = split_top.split("Registernummer")

        dict_row = extract_mastr_nr_location(split_units=split_units,
                                             split_administrative=split_administrative)
        frames.append(pd.DataFrame(dict_row))

    # pd.concat raises on an empty list, so fall back to an empty frame
    df_result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    df_result["Zuschlags-Nr"] = [item_zuschlags_nr] * len(df_result)

    return df_result

In [50]:
### The packed location column only occurs for two bid dates:
# 2018-02-01    5 columns
# 2018-05-01    4 columns

# Try the parser on the first row of the 2018-02-01 table
test_df = dict_dfs["2018-02-01"]
test_item = test_df.loc[0, "Angegebener Standort der Anlage"]
test_zuschlags_nr = test_df.loc[0, "Zuschlags-Nr"]

# Content of the test item:
# BLD Niedersachsen, Landkreis Salzgitter, Stadt, PLZ 38239, Gemeinde Salzgitter, Gemarkung Barum:
# Registernummer A4497640206941: Flur3: 1/1. 
# BLD Niedersachsen, Landkreis Salzgitter, Stadt, PLZ 38239, Gemeinde Salzgitter, Gemarkung Watenstedt:
# Registernummer A9617510206917: Flur5: 1/22. Registernummer A3273890206938: Flur5: 1/23. Registernummer A5669430206922: Flur5: 2/10.

# Can the string be divided at "BLD"?
# As soon as at least the Gemarkung changes (lowest administrative level above the Flurstück),
# the whole sequence Bundesland / Landkreis / Postleitzahl / Gemeinde / Gemarkung seems to repeat.

extract_info_standort(test_item, test_zuschlags_nr)

Unnamed: 0,Bundesland,Landkreis,Postleitzahl,Gemeinde,Gemarkung,Register_Anlagennr,Flur / Flurstück,Zuschlags-Nr
0,Niedersachsen,Salzgitter,38239,Salzgitter,Barum,A4497640206941,Flur3: 1/1,WIN18-1-001
1,Niedersachsen,Salzgitter,38239,Salzgitter,Watenstedt,A9617510206917,Flur5: 1/22,WIN18-1-001
2,Niedersachsen,Salzgitter,38239,Salzgitter,Watenstedt,A3273890206938,Flur5: 1/23,WIN18-1-001
3,Niedersachsen,Salzgitter,38239,Salzgitter,Watenstedt,A5669430206922,Flur5: 2/10,WIN18-1-001


In [54]:
### Expand the packed location column of the 2018-02-01 table into one row per unit
messy_column = "Angegebener Standort der Anlage"
bid_nr = "Zuschlags-Nr"
bid_date = "2018-02-01"

dict_cleaned_dfs = {}
df_2018_02_01 = dict_dfs[bid_date]

# Parse the packed location string of every row into its own long frame, then concatenate
# all of them once (concatenating inside the loop copies the growing frame on every pass).
frames_long = [
    extract_info_standort(row[messy_column], row[bid_nr])
    for _, row in df_2018_02_01.iterrows()
]
df_2018_long = pd.concat(frames_long, ignore_index=True) if frames_long else pd.DataFrame()

### Visually inspected and compared with the downloaded .xlsx -> seems fine

### Merge with the columns not present in df_2018_long:
# keep every original column that the long frame does not already provide, add the
# merge key back (it exists in both frames) and drop the packed location column.
cols_keep = [col for col in df_2018_02_01.columns if col not in df_2018_long.columns]
cols_keep.append(bid_nr)
cols_keep.remove(messy_column)

dict_cleaned_dfs[bid_date] = pd.merge(df_2018_02_01[cols_keep], df_2018_long, on=bid_nr)

In [None]:
### dict_dfs["2018-05-01"] - second table with the packed location column

# Try the parser on row 108 of the 2018-05-01 table
test_df = dict_dfs["2018-05-01"]
test_item = test_df.loc[108, "Angegebener Standort der Anlage"]
test_zuschlags_nr = test_df.loc[108, "Zuschlags-Nr"]

extract_info_standort(test_item, test_zuschlags_nr)

# Content of the test item:
# BLD Mecklenburg-Vorpommern, Landkreis Rostock, PLZ 18198, Gemeinde Stäbelow, Gemarkung Bliesekow:_x000D_ 
# Registernummer A2692250180724: Flur 1: 66. Registernummer A7961750180738: Flur 1: 94.

# The function seems to work for this table too

In [58]:
### Expand the packed location column of the 2018-05-01 table into one row per unit
messy_column = "Angegebener Standort der Anlage"
bid_nr = "Zuschlags-Nr"
bid_date = "2018-05-01"

# dict_cleaned_dfs is created by the 2018-02-01 cleaning cell. It must NOT be re-initialised
# here, otherwise the cleaned 2018-02-01 table stored in it is thrown away.
df_2018_05_01 = dict_dfs[bid_date]

# Parse the packed location string of every row into its own long frame, then concatenate
# all of them once (concatenating inside the loop copies the growing frame on every pass).
frames_long = [
    extract_info_standort(row[messy_column], row[bid_nr])
    for _, row in df_2018_05_01.iterrows()
]
df_2018_long = pd.concat(frames_long, ignore_index=True) if frames_long else pd.DataFrame()

### Visually inspected and compared with the downloaded .xlsx -> seems fine

### Merge with the columns not present in df_2018_long:
# keep every original column that the long frame does not already provide, add the
# merge key back (it exists in both frames) and drop the packed location column.
cols_keep = [col for col in df_2018_05_01.columns if col not in df_2018_long.columns]
cols_keep.append(bid_nr)
cols_keep.remove(messy_column)

dict_cleaned_dfs[bid_date] = pd.merge(df_2018_05_01[cols_keep], df_2018_long, on=bid_nr)

dict_cleaned_dfs[bid_date]

Unnamed: 0,Name des Bieters,Gebots-Nr,Zuschlags-Nr,Bundesland,Landkreis,Postleitzahl,Gemeinde,Gemarkung,Register_Anlagennr,Flur / Flurstück
0,ABO Wind WP Adorf-Ost GmbH & Co. KG,,WIN18-2-071,Hessen,Waldeck-Frankenberg,34519,Diemelsee,Adorf,A3573560217432,Flur 14: 12/3
1,ABO Wind WP Adorf-Ost GmbH & Co. KG,,WIN18-2-071,Hessen,Waldeck-Frankenberg,34519,Diemelsee,Adorf,A9255860217445,Flur 14: 12/3; 11/2
2,AgrarVolt Keppeln GmbH & Co. KG,,WIN18-2-050,Nordrhein-Westfalen,Kleve,47589,Uedem,Keppeln,A3792360153133,Flur 4: 62
3,Asselner Windkraft GmbH & Co. KG,1,WIN18-2-019,Nordrhein-Westfalen,Paderborn,33165,Lichtenau,Asseln,A1493910202241,Flur 6: 15; 16
4,BayWa r.e. Wind GmbH,1,WIN18-2-110,Nordrhein-Westfalen,Kleve,47638,Straelen,Herongen,A1341330215160,Flur 4: 12
...,...,...,...,...,...,...,...,...,...,...
182,WP Vier Berge ApS & Co. KG,,WIN18-2-001,Sachsen-Anhalt,Burgenlandkreis,06682,Teuchern,Prittitz,A4688210216426,Flur 6: 8/1
183,Zweite Bökingharder Windpark GmbH & Co. KG,2,WIN18-2-072,Schleswig-Holstein,Nordfriesland,25899,Dagebüll,Fahretoft,A3577190176640,Flur 6: 16
184,Zweite Bökingharder Windpark GmbH & Co. KG,2,WIN18-2-072,Schleswig-Holstein,Nordfriesland,25899,Dagebüll,Fahretoft,A3462450176419,Flur 6: 19
185,Zweite Bökingharder Windpark GmbH & Co. KG,2,WIN18-2-072,Schleswig-Holstein,Nordfriesland,25899,Dagebüll,Fahretoft,A6811420176428,Flur 6: 5/1


In [None]:
### Data_Cleaning After 2018:

# 2020-03-01 - Actually two tables with two different Zuschlagsdaten
# Gebotsdatum (first line of raw-data/date in table name and dict-key) and Zuschlagsdatum/Bekanntgabedatum (4th line of raw data)

# Before extracting data from raw format:
# Split by Zuschlagsdatum if necessary
# extract Zuschlagsdatum from header-info in raw data
# Append Zuschlags and Gebotsdatum to the df

# in / before third cell