In [None]:
### Download, Inspect and Upload Permit Data
import pandas as pd
import psycopg2
import os
import numpy as np
from dotenv import load_dotenv
from pandas.api.types import is_datetime64_any_dtype
from datetime import datetime
import requests
import time
import random
import re
import pickle
import openpyxl

In [None]:
### Load the pickled dict that maps each bid date to the path of its downloaded .xlsx file
# NOTE(review): unpickling can execute arbitrary code - only load files written by this project
with open("../data/mastr_bids/bids_xlsx.pkl", "rb") as pkl_file:
    dict_xlsx = pickle.load(pkl_file)

In [None]:
### Data_Cleaning After 2018 (2 Tables)

# Example 2020-03-01: the sheet actually holds two tables with two different Zuschlagsdaten (award dates).
# Gebotsdatum (bid date): first line of the raw data; also the date in the table name and the dict key.
# Zuschlagsdatum/Bekanntgabedatum (award/announcement date): 4th line of the raw data.

# But not all files contain the line with the Zuschlagsdatum.

# Before extracting the data from the raw format:
# - split by Zuschlagsdatum if necessary
# - extract the Zuschlagsdatum from the header info in the raw data
# - append Zuschlagsdatum and Gebotsdatum to the df

# in / before third cell

# The pattern which starts the row above the actual header can appear multiple times in the second sheet
# of the full data and thus distinguishes tables with two different Zuschlags-/Bekanntgabedaten.


def extract_df_bids_xlsx(bid_date, path_xlsx):
    """Read one downloaded bid-result workbook into a single DataFrame.

    The second sheet is read when the workbook has more than one sheet,
    otherwise the first. Within that sheet every table has a header row whose
    first cell equals "Name des Bieters". A table may be preceded by a sentence
    starting with "Die Zuschläge gelten eine Woche" that contains the award
    date as "Bekanntgabe am <date>". One table is read per such sentence; if
    the sheet has no such sentence, a single table is read from the first
    header row.

    Parameters
    ----------
    bid_date
        Value written unchanged into the added column "Gebotsdatum".
    path_xlsx
        Path of the workbook, passed on to pandas.

    Returns
    -------
    pandas.DataFrame
        All tables of the sheet stacked on top of each other, with the added
        columns "Gebotsdatum" and "Zuschlagsdatum" ("Zuschlagsdatum" is None
        when the sheet carries no award-date sentence).

    Raises
    ------
    ValueError
        If no header row is found, or fewer header rows than award-date
        sentences.
    """
    # Start of the sentence in the raw sheet that carries the award date (Zuschlagsdatum)
    pattern_award_sentence = "Die Zuschläge gelten eine Woche"
    # Content of the first cell of every table header row
    pattern_header = "Name des Bieters"
    # Award date inside that sentence. The separators are matched with "." (any
    # character), exactly as the two former patterns did, so no file that parsed
    # before stops parsing.
    pattern_award_date = r"Bekanntgabe am (\d{1,2}.\d{2}.\d{4})"

    # Open the workbook once and parse all tables from the same handle
    with pd.ExcelFile(path_xlsx) as xlsx_file:
        sheet_to_read = 1 if len(xlsx_file.sheet_names) > 1 else 0

        # Raw sheet including the messy free-text lines above and between the tables
        raw_df = xlsx_file.parse(sheet_name=sheet_to_read, header=None)

        # Rows holding the award-date sentence / rows holding a table header
        is_award_row = raw_df[0].str.startswith(pattern_award_sentence, na=False)
        header_rows = raw_df.index[raw_df[0].eq(pattern_header)]

        if len(header_rows) == 0:
            raise ValueError(
                f"No header row starting with '{pattern_header}' found in {path_xlsx}"
            )

        # No award-date sentence at all: one table, award date unknown
        if not is_award_row.any():
            df_clean = xlsx_file.parse(sheet_name=sheet_to_read, header=int(header_rows[0]))
            df_clean["Gebotsdatum"] = [bid_date] * len(df_clean)
            df_clean["Zuschlagsdatum"] = [None] * len(df_clean)
            return df_clean

        # One award date per sentence row (NaN where the date could not be found)
        award_dates = raw_df.loc[is_award_row, 0].str.extract(pattern_award_date)[0].tolist()

        if len(header_rows) < len(award_dates):
            raise ValueError(
                f"{path_xlsx}: found {len(award_dates)} award-date rows "
                f"but only {len(header_rows)} header rows"
            )

        # The tables are kept in a list instead of a dict keyed by award date, so
        # two tables with the same (or an unparsable) award date are both kept.
        tables = []
        last_pos = len(award_dates) - 1
        for pos, award_date in enumerate(award_dates):
            if pos < last_pos:
                # Stop before the next table; the offset of 4 rows between the end of
                # the data and the next header row is taken over unchanged.
                nrows = int(header_rows[pos + 1] - 4 - header_rows[pos])
            else:
                # The last table runs to the end of the sheet
                nrows = None

            df_table = xlsx_file.parse(sheet_name=sheet_to_read,
                                       header=int(header_rows[pos]),
                                       nrows=nrows)
            df_table["Gebotsdatum"] = [bid_date] * len(df_table)
            df_table["Zuschlagsdatum"] = [award_date] * len(df_table)
            tables.append(df_table)

    # Stack all tables in one go; empty tables contribute no rows
    non_empty = [table for table in tables if not table.empty]
    if not non_empty:
        return tables[-1]

    return pd.concat(non_empty, ignore_index=True)

In [None]:
### Parse every workbook; the result is keyed by the bid date formatted as YYYY-MM-DD
dict_dfs = {}
for bid_day, path_xlsx in dict_xlsx.items():
    # Call strftime on the key itself: works for date, datetime and pandas Timestamp keys
    # and does not overwrite the loop variable
    bid_date = bid_day.strftime("%Y-%m-%d")
    dict_dfs[bid_date] = extract_df_bids_xlsx(bid_date=bid_date, path_xlsx=path_xlsx)

In [None]:
### Inspect the parsed tables, one bid date after the other
for key in dict_dfs:
    print(key)
    print(dict_dfs[key])
    print("""
          
          =========================================
          
          """)

### Data looks good. Data is complete for tables where the info on the Zuschlagsdatum was hidden in additional columns
# and also for those where multiple tables for multiple Zuschlagsdaten were hidden in sheet 2

### Data is still messy:

#### merged rows in dfs in at least 2018. 
- the data does not have two sheets with compact and detailed data but only compact data
- The feature "Angegebener Standort der Anlage" holds the values for BLD, Landkreis, PLZ, Gemeinde, Gemarkung, Flurstück and MaStR number like:

    BLD Niedersachsen, Landkreis Salzgitter, Stadt, PLZ 38239, Gemeinde Salzgitter, Gemarkung Barum:
    Registernummer A4497640206941: Flur3: 1/1. 
    BLD Niedersachsen, Landkreis Salzgitter, Stadt, PLZ 38239, Gemeinde Salzgitter, Gemarkung Watenstedt:
    Registernummer A9617510206917: Flur5: 1/22. Registernummer A3273890206938: Flur5: 1/23. Registernummer A5669430206922: Flur5: 2/10. 

 Or:

    Niedersachsen, Landkreis Stade, PLZ 21698, Gemeinde Brest, Gemarkung Brest:
    Flur 2: 66/1; 66/2; 66/3 (SEE919421623876) 
    Flur 2: 71; 72 (SEE923510311766) 
    Gemarkung Wohlerst:
    Flur 2: 157/5; 5/7; 1/6 (SEE964469396954) 
    Flur 2: 5/7 (SEE974053806455) 
    Flur 2: 7/5; 170/5 (SEE968430555418)

- Here one Zuschlags-Nr encapsulates several power-units

In [None]:
### Number of columns per bid date - shows which tables deviate from the common layout
bid_date = list(dict_dfs.keys())
ncol = [df.shape[1] for df in dict_dfs.values()]

pd.DataFrame({"bid_date": bid_date, "ncol": ncol})

In [None]:
### Helper function: when an item of 'Angegebener Standort der Anlage' is split into multiple groups
### of administrative info plus MaStR number and Flur/Flurstück, this function extracts the info of one group separately.
### It returns a dictionary of lists, where every list has the length of the number of MaStR numbers in this split.

def extract_mastr_nr_location(split_units, split_administrative):
    """Combine one administrative header with all units listed beneath it.

    Parameters
    ----------
    split_units : list of str
        One string per unit, each of the form "<register nr>: <Flur/Flurstück>".
    split_administrative : str
        Comma separated administrative header (Bundesland ... Gemarkung) that
        still contains the field labels.

    Returns
    -------
    dict of lists
        Keys 'Bundesland', 'Landkreis', 'Postleitzahl', 'Gemeinde',
        'Gemarkung', 'Register_Anlagennr' and 'Flur / Flurstück'. Every list
        has one entry per unit; the administrative names are repeated.
    """
    # Remove the field labels so that only the names/values remain
    admin_text = split_administrative
    for label in ("Landkreis", r"Stadt|kreisfreie Stadt", "Gemeinde", "PLZ", "Gemarkung"):
        admin_text = re.sub(label, "", admin_text)

    # Clean every field: surrounding whitespace, Excel line-break residue, trailing colon
    admin_names = []
    for part in admin_text.split(", "):
        part = part.strip().rstrip("\n").replace("_x000D_", "")
        admin_names.append(part.rstrip(":"))

    # A removed label without a value (e.g. a bare "Stadt") leaves an empty third field
    if admin_names[2] == "":
        admin_names.pop(2)

    # Split every unit at the first ": " into [register nr, Flur/Flurstück]
    unit_parts = [unit.strip(" ").strip(".").split(": ", maxsplit=1)
                  for unit in split_units]

    # The administrative names do not vary within this split, so they are
    # simply repeated once per unit
    return {
        'Bundesland': [admin_names[0] for _ in unit_parts],
        'Landkreis': [admin_names[1] for _ in unit_parts],
        'Postleitzahl': [admin_names[2] for _ in unit_parts],
        'Gemeinde': [admin_names[3] for _ in unit_parts],
        'Gemarkung': [admin_names[4] for _ in unit_parts],
        'Register_Anlagennr': [parts[0] for parts in unit_parts],
        'Flur / Flurstück': [parts[1].replace("_x000D_\n", "").rstrip(". ")
                             for parts in unit_parts],
    }

In [None]:
### final function to apply onto the items of "Angegebener Standort der Anlage"
def extract_info_standort(item_standort, item_zuschlags_nr):
    """Turn one cell of "Angegebener Standort der Anlage" into a long-format frame.

    The cell is split at every "BLD", so that each part holds one
    administrative header (Bundesland ... Gemarkung) followed by the units
    inside these borders. Each part is split again at "Registernummer" into the
    header and the units, and handed to `extract_mastr_nr_location`.

    Parameters
    ----------
    item_standort : str
        Raw cell content.
    item_zuschlags_nr
        Award number of the row; repeated in the column "Zuschlags-Nr".

    Returns
    -------
    pandas.DataFrame
        One row per unit. Empty (only the column "Zuschlags-Nr") when the cell
        contains no "BLD".
    """
    frames = []

    # Text before the first "BLD" carries no header and is dropped
    for split_top in item_standort.split("BLD")[1:]:
        # First element: administrative info, remaining elements: one unit each
        split_administrative, *split_units = split_top.split("Registernummer")

        dict_row = extract_mastr_nr_location(split_units=split_units,
                                             split_administrative=split_administrative)
        frames.append(pd.DataFrame(dict_row))

    # Concatenate once instead of growing the frame inside the loop.
    # Parts without units contribute no rows and are left out.
    non_empty = [frame for frame in frames if not frame.empty]
    if non_empty:
        df_result = pd.concat(non_empty, ignore_index=True)
    elif frames:
        df_result = frames[-1]
    else:
        df_result = pd.DataFrame()

    df_result["Zuschlags-Nr"] = [item_zuschlags_nr] * len(df_result)

    return df_result

In [None]:
### Problem only present for two dates:
# 2018-02-01    5 columns
# 2018-05-01    4 columns

df = dict_dfs["2018-02-01"]

test_item = df["Angegebener Standort der Anlage"][0]

test_zuschlags_nr = df["Zuschlags-Nr"][0]

# BLD Niedersachsen, Landkreis Salzgitter, Stadt, PLZ 38239, Gemeinde Salzgitter, Gemarkung Barum:
# Registernummer A4497640206941: Flur3: 1/1.
# BLD Niedersachsen, Landkreis Salzgitter, Stadt, PLZ 38239, Gemeinde Salzgitter, Gemarkung Watenstedt:
# Registernummer A9617510206917: Flur5: 1/22. Registernummer A3273890206938: Flur5: 1/23. Registernummer A5669430206922: Flur5: 2/10.

# Can the string be divided by BLD?
# If at least the Gemarkung changes (lowest administrative level above Flurstück)
# the whole sequence of Bundesland, Landkreis, Postleitzahl, Gemeinde, Gemarkung
# seems to repeat

# Only the last expression of a cell is rendered; the `del` below would hide the
# result, so it is displayed explicitly (display() is provided by the Jupyter kernel)
display(extract_info_standort(test_item, test_zuschlags_nr))

del df, test_item, test_zuschlags_nr

In [None]:
### Clean the table of 2018-02-01: one row per unit instead of one row per award
messy_column = "Angegebener Standort der Anlage"
bid_nr = "Zuschlags-Nr"
bid_date = "2018-02-01"

dict_cleaned_dfs = {}
df_messy = dict_dfs[bid_date]

# Parse the messy cell of every row; the per-award frames are collected in a list
# and concatenated once (awards without any parsed unit contribute no rows)
list_dfs_units = []
for index, row in df_messy.iterrows():
    df_bid_nr = extract_info_standort(row[messy_column], row[bid_nr])
    if not df_bid_nr.empty:
        list_dfs_units.append(df_bid_nr)

### Visually inspected and compared with downloaded .xlsx -> seems fine
if list_dfs_units:
    df_clean_long = pd.concat(list_dfs_units, ignore_index=True)
else:
    df_clean_long = pd.DataFrame(columns=[bid_nr])

### Merge with the columns not present in df_clean_long
# bid_nr is the join key; the messy raw column is replaced by the parsed columns
cols_keep = [col for col in df_messy.columns if col not in df_clean_long.columns]
cols_keep.append(bid_nr)
cols_keep.remove(messy_column)

dict_cleaned_dfs[bid_date] = pd.merge(df_messy[cols_keep], df_clean_long, on=bid_nr)

In [None]:
# Only the last expression of a cell is rendered; because of the `del` below the
# cleaned table is displayed explicitly (display() is provided by the Jupyter kernel)
display(dict_cleaned_dfs[bid_date])

# Remove the temporary names of the 2018-02-01 clean-up
del cols_keep, df_clean_long, df_bid_nr, df_messy, bid_date

In [None]:
### dict_dfs["2018-05-01"] - second messy table
bid_date = "2018-05-01"
df = dict_dfs[bid_date]

# Try the parser on one row of this table
row_label = 108
test_item = df.loc[row_label, "Angegebener Standort der Anlage"]
test_zuschlags_nr = df.loc[row_label, "Zuschlags-Nr"]

# Raw content of that cell:
# BLD Mecklenburg-Vorpommern, Landkreis Rostock, PLZ 18198, Gemeinde Stäbelow, Gemarkung Bliesekow:_x000D_
# Registernummer A2692250180724: Flur 1: 66. Registernummer A7961750180738: Flur 1: 94.

extract_info_standort(test_item, test_zuschlags_nr)

# Function seems to work for this DF too

In [None]:
### Clean the table of 2018-05-01: one row per unit instead of one row per award
# messy_column is set here explicitly so that the cell does not depend on the value
# left behind by another cell (the name is rebound to "Standort" further below)
messy_column = "Angegebener Standort der Anlage"
bid_nr = "Zuschlags-Nr"
bid_date = "2018-05-01"

df_messy = dict_dfs[bid_date]

# Parse the messy cell of every row; the per-award frames are collected in a list
# and concatenated once (awards without any parsed unit contribute no rows)
list_dfs_units = []
for index, row in df_messy.iterrows():
    df_bid_nr = extract_info_standort(row[messy_column], row[bid_nr])
    if not df_bid_nr.empty:
        list_dfs_units.append(df_bid_nr)

### Visually inspected and compared with downloaded .xlsx -> seems fine
if list_dfs_units:
    df_clean_long = pd.concat(list_dfs_units, ignore_index=True)
else:
    df_clean_long = pd.DataFrame(columns=[bid_nr])

### Merge with the columns not present in df_clean_long
# bid_nr is the join key; the messy raw column is replaced by the parsed columns
cols_keep = [col for col in df_messy.columns if col not in df_clean_long.columns]
cols_keep.append(bid_nr)
cols_keep.remove(messy_column)

dict_cleaned_dfs[bid_date] = pd.merge(df_messy[cols_keep], df_clean_long, on=bid_nr)

dict_cleaned_dfs[bid_date]

In [None]:
### Take over the tables that needed no clean-up into dict_cleaned_dfs
# setdefault only adds bid dates that are not in dict_cleaned_dfs yet,
# so the two tables cleaned above are not overwritten
for bid_date, df_already_clean in dict_dfs.items():
    dict_cleaned_dfs.setdefault(bid_date, df_already_clean)
    

In [None]:
### Compare number and names of the columns across all bid dates
for key in dict_cleaned_dfs:
    columns = dict_cleaned_dfs[key].columns
    print(key)
    print(len(columns))
    print(columns)
    print("""
          
          =========================================
          
          """)

In [None]:
# Changes to columns: "Anlagennummer" or "Registernummer" to "Register_Anlagennr"

# Common set of columns (12): columns not in this list are removed,
# columns missing in a table are added empty
columns_all = ['Name des Bieters', 'Gebots-Nr', 'Zuschlags-Nr', 'Bundesland',
       'Landkreis', 'Postleitzahl', 'Gemeinde', 'Gemarkung',
       'Flur / Flurstück', 'Register_Anlagennr', 'Gebotsdatum',
       'Zuschlagsdatum']

# Add empty column Bundesland to 2022-12-01, 2022-02-01, 2022-05-01
# Add Postleitzahl to 2022-02-01, 2022-05-01
# remove Kassenzeichen from 2018-02-01
# remove Bemerung from 2023-05-01

dict_final_dfs = {}

for key, df in dict_cleaned_dfs.items():
    # Remove trailing dots from the column names in one step.
    # rename() returns a new frame, so the tables in dict_cleaned_dfs stay untouched.
    df_copy = df.rename(
        columns=lambda name: name.rstrip(".") if isinstance(name, str) else name
    )

    # Unified column for the unit number: only the first of the two names found is renamed
    for name in ["Anlagennummer", "Registernummer"]:
        if name in df_copy.columns:
            df_copy = df_copy.rename(columns={name: "Register_Anlagennr"})
            break

    # Keep only the common columns (no in-place drop while looping over the columns)
    df_copy = df_copy.loc[:, df_copy.columns.isin(columns_all)].copy()

    # Add the missing columns, filled with None
    for col in columns_all:
        if col not in df_copy.columns:
            df_copy[col] = None

    # Same column order for every table
    dict_final_dfs[key] = df_copy.reindex(columns=columns_all)

In [None]:
### Preview the first rows of every harmonised table
for key in dict_final_dfs:
    print(key)
    print(dict_final_dfs[key].head())
    print("""
          
          =========================================
          
          """)

# Data looks good -> concat to one df



In [None]:
### Stack all harmonised tables (2018-2023) into one frame
# Concatenate once instead of growing the frame inside a loop; empty tables add no rows
list_dfs_final = [df for df in dict_final_dfs.values() if not df.empty]

if list_dfs_final:
    df_bids = pd.concat(list_dfs_final, ignore_index=True)
else:
    df_bids = pd.DataFrame(columns=columns_all)

In [None]:
### Save the cleaned bids of 2018-2023 as pickle
with open("../data/mastr_bids/bids_cleaned_2018_2023.pkl", "wb") as pkl_file:
    pickle.dump(df_bids, pkl_file)

In [None]:
### Load the pickled dict with the tables of 2017 (keyed by bid date) to clean them
# NOTE(review): unpickling can execute arbitrary code - only load files written by this project
with open("../data/mastr_bids/bids_2017_html.pkl", "rb") as pkl_file:
    dict_dfs_2017 = pickle.load(pkl_file)

### Messy Data in 2017:

- No columns/data for Anlagen/Registernr
- Data on administrative units and location is similarly structured as above in one cell of "Standort":

```
Landkreis Diepholz, PLZ 27211,                                                     
Gemeinde Bassum, Gemarkung Apelstedt: Anlage 1: Flur 6: 025.
Gemeinde Bassum, Gemarkung Nienstedt: Anlage 2: Flur 7: 011. Anlage 3: Flur 7: 015/1.
Gemeinde Bassum, Gemarkung Schorlingborstel: Anlage 4: Flur 6: 032.
```
- For 2017-08 and 2017-11 the first row of the cell holds (BLD,) Landkreis and PLZ; it is not repeated for the following rows. The BLD is only present for 2017-11.
- For the data of May 2017 this first line is missing.

In [None]:
def extract_location(split_units, split_administrative):
    """Combine one "Gemeinde, Gemarkung" header with all units listed beneath it.

    Parameters
    ----------
    split_units : list of str
        One Flur/Flurstück string per unit.
    split_administrative : str
        Comma separated header that still contains the field labels; after
        removing the labels its first field is used as Gemeinde, its second as
        Gemarkung.

    Returns
    -------
    dict of lists
        Keys 'Gemeinde', 'Gemarkung' and 'Flur / Flurstück', one entry per unit.
    """
    # Remove the field labels so that only the names remain
    admin_text = split_administrative
    for label in ("Landkreis", "PLZ", r"Stadt|kreisfreie Stadt", "Gemeinde", "Gemarkung"):
        admin_text = re.sub(label, "", admin_text)

    # [Gemeinde, Gemarkung] without surrounding whitespace and trailing colon
    admin_names = [part.strip().rstrip(":") for part in admin_text.split(", ")]

    # Units without leading/trailing spaces and dots
    parcels = [unit.strip(" ").strip(".") for unit in split_units]

    # The administrative names are repeated once per unit
    return {'Gemeinde': [admin_names[0] for _ in parcels],
            'Gemarkung': [admin_names[1] for _ in parcels],
            'Flur / Flurstück': parcels}

In [None]:
### final stripped down function without extraction of Mastr/Anlagennr 
#   to apply onto the items of "Angegebener Standort der Anlage"

def extract_info_standort_2(item_standort, item_zuschlags_nr):
    """Turn one cell of "Standort" (layout of 2017-05) into a long-format frame.

    The cell is split at every "Gemeinde", so that each part holds all units
    within the same Gemarkung. Each part is split again at "Anlage <nr>:" into
    the administrative header and the units, and handed to `extract_location`.

    Parameters
    ----------
    item_standort : str
        Raw cell content.
    item_zuschlags_nr
        Award number of the row; repeated in the column "Zuschlags-Nr".

    Returns
    -------
    pandas.DataFrame
        One row per unit. Empty (only the column "Zuschlags-Nr") when the cell
        contains no "Gemeinde".
    """
    frames = []

    # Text before the first "Gemeinde" is dropped
    for split_top in item_standort.split("Gemeinde")[1:]:
        # First element: Gemeinde/Gemarkung info, remaining elements: one Flur/Flurstück each
        split_administrative, *split_units = re.split(r"Anlage \d{1,2}:", split_top)

        dict_row = extract_location(split_units=split_units,
                                    split_administrative=split_administrative)
        frames.append(pd.DataFrame(dict_row))

    # Concatenate once instead of growing the frame inside the loop.
    # Parts without units contribute no rows and are left out.
    non_empty = [frame for frame in frames if not frame.empty]
    if non_empty:
        df_result = pd.concat(non_empty, ignore_index=True)
    elif frames:
        df_result = frames[-1]
    else:
        df_result = pd.DataFrame()

    df_result["Zuschlags-Nr"] = [item_zuschlags_nr] * len(df_result)
    return df_result

In [None]:
### Messy columns are also present in the 2017 data
df = dict_dfs_2017["2017-05-01"]

### Again structured differently - Registernummer is missing - no Landkreis and Bundesland
### Units are distinguished by "Anlage:"
row_label = 65
item_standort = df.loc[row_label, "Standort"]
item_zuschlags_nr = df.loc[row_label, "Zuschlags-Nr"]

print(item_standort)
extract_info_standort_2(item_standort=item_standort, item_zuschlags_nr=item_zuschlags_nr)

In [None]:
### Clean for 2017-05: one row per unit instead of one row per award
messy_column = "Standort"

bid_nr = "Zuschlags-Nr"
bid_date = "2017-05-01"

df_messy = dict_dfs_2017[bid_date]

# Parse the messy cell of every row; the per-award frames are collected in a list
# and concatenated once (awards without any parsed unit contribute no rows)
list_dfs_units = []
for index, row in df_messy.iterrows():
    df_bid_nr = extract_info_standort_2(row[messy_column], row[bid_nr])
    if not df_bid_nr.empty:
        list_dfs_units.append(df_bid_nr)

if list_dfs_units:
    df_clean_long = pd.concat(list_dfs_units, ignore_index=True)
else:
    df_clean_long = pd.DataFrame(columns=[bid_nr])

### Visually inspected and compared with downloaded .xlsx -> seems fine
df_clean_long


In [None]:
### Merge the parsed units with the columns of the raw table that are not in df_clean_long
# The raw 'Standort' column is replaced by the parsed columns; bid_nr is the join key
cols_keep = [col for col in df_messy.columns if col not in df_clean_long.columns]
cols_keep.remove('Standort')
cols_keep.append(bid_nr)

dict_cleaned_2017_dfs = {bid_date: df_messy[cols_keep].merge(df_clean_long, on=bid_nr)}

### 2017-11 and 2017-08

- column for Standort is even more variable in these dfs
- function to extract data needs a more dynamic approach

In [None]:
def split_standort(item_standort):
    """Split one cell of "Standort" into administrative items and unit items.

    The cell is split at every "Anlage <nr>:". A part that starts with " Flur"
    may hold several Flur/Flurstück entries and is split again at ". "; any
    other part is kept as a whole.

    Parameters
    ----------
    item_standort : str
        Raw cell content.

    Returns
    -------
    list of str
        The parts in their original order, stripped of surrounding whitespace.
        Parts that are empty or consist of whitespace only are dropped.
    """
    pieces = []
    for split_top in re.split(r"Anlage \d{1,2}:", item_standort):
        if split_top.startswith(" Flur"):
            # Several Flur/Flurstück entries in one string -> one item each
            pieces.extend(split_top.split(". "))
        else:
            pieces.append(split_top)

    # Strip first and filter afterwards: otherwise whitespace-only fragments would
    # survive as "" and be taken for administrative info by the following step
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece != ""]

In [None]:
def units_into_dict(split_2):
    """Group unit items under the administrative item that precedes them.

    Parameters
    ----------
    split_2 : list of str
        Items in their original order. Items starting with "Flur" are unit
        items, every other item is administrative info.

    Returns
    -------
    dict
        Keys are the administrative items (in order of first appearance),
        values are the lists of unit items belonging to them. Unit items that
        appear before any administrative item are attributed to the first
        administrative item, or to the key None if there is none at all.
        An administrative item that occurs more than once collects the units
        of all its occurrences.
    """
    dict_admin_units = {}
    current_key = None
    current_group = []

    for item in split_2:
        if item.startswith("Flur"):
            # Unit info: belongs to the current administrative info
            current_group.append(item)
            continue

        # Administrative info: close the previous group (if any) and start a new one
        if current_key is not None:
            # setdefault/extend instead of plain assignment, so that a repeated
            # header does not overwrite the units collected under it before
            dict_admin_units.setdefault(current_key, []).extend(current_group)
            current_group = []
        current_key = item

    # Loop ended: the last open group still has to be written
    dict_admin_units.setdefault(current_key, []).extend(current_group)

    return dict_admin_units

In [None]:
def combine_admin_unit(administrative_info, split_units):
    """Combine one administrative item with the unit items belonging to it.

    The administrative item is split at ", ", the field labels are removed and
    the remaining names are assigned to the administrative levels from the
    lowest level (Gemarkung) upwards. Levels for which no name is given are
    filled with None.

    Parameters
    ----------
    administrative_info : str
        Comma separated administrative info with at most five fields.
    split_units : list of str
        Flur/Flurstück strings of the units within this administrative unit.

    Returns
    -------
    dict of lists
        Keys "Bundesland", "Landkreis", "PLZ", "Gemeinde", "Gemarkung" and
        "Flur / Flurstück"; every list has one entry per unit.

    Raises
    ------
    IndexError
        If the administrative info has more fields than there are levels.
    """
    # Whole range of administrative levels, highest first
    levels_admin = ["Bundesland", "Landkreis", "PLZ", "Gemeinde", "Gemarkung"]
    # Field labels to be removed from the names
    labels = ["Bundesland", "Landkreis", r"Stadt|kreisfreie Stadt",
              "Gemeinde", r"[Pp][Ll][Zz]", "Postleitzahl", "Gemarkung"]

    names_administrative = []
    for part in administrative_info.split(", "):
        for label in labels:
            part = re.sub(label, "", part)
        names_administrative.append(part.strip().rstrip(":"))

    n_units = len(split_units)

    # Units without surrounding whitespace and dots
    flur_list = [unit.strip().strip(".") for unit in split_units]

    dict_row = {level: [] for level in levels_admin}

    # Given names: the last name is the Gemarkung, the one before the Gemeinde, ...
    for offset, name in enumerate(reversed(names_administrative), start=1):
        dict_row[levels_admin[-offset]] = [name] * n_units

    # Levels above the highest given one are not present in this item
    for level in levels_admin[:len(levels_admin) - len(names_administrative)]:
        dict_row[level] = [None] * n_units

    dict_row["Flur / Flurstück"] = flur_list

    return dict_row

In [None]:
# Function for one cell of Standort
def extract_info_standort_3(item_standort, item_zuschlags_nr):
    """Turn one cell of "Standort" (layouts of 2017-08 / 2017-11) into a long-format frame.

    The cell is split into administrative and unit items (`split_standort`),
    the units are grouped by administrative item (`units_into_dict`) and every
    group is expanded to one row per unit (`combine_admin_unit`). Missing
    values are finally forward-filled from the rows above.

    Parameters
    ----------
    item_standort : str
        Raw cell content.
    item_zuschlags_nr
        Award number of the row; repeated in the column "Zuschlags-Nr".

    Returns
    -------
    pandas.DataFrame
        One row per unit.
    """
    dict_admin_units = units_into_dict(split_standort(item_standort))

    frames = [pd.DataFrame(combine_admin_unit(administrative_info, split_units))
              for administrative_info, split_units in dict_admin_units.items()]

    # Concatenate once instead of growing the frame inside the loop.
    # Groups without units contribute no rows and are left out.
    non_empty = [frame for frame in frames if not frame.empty]
    if non_empty:
        df_result = pd.concat(non_empty, ignore_index=True)
    elif frames:
        df_result = frames[-1]
    else:
        df_result = pd.DataFrame()

    df_result["Zuschlags-Nr"] = [item_zuschlags_nr] * len(df_result)

    # Administrative levels that are only given for the first group of the cell
    # are carried forward to the following rows
    return df_result.ffill()

In [None]:
### Parse the "Standort" cells of 2017-08
messy_column = "Standort"
bid_nr = "Zuschlags-Nr"
bid_date = "2017-08-01"

df_messy = dict_dfs_2017[bid_date]

# Parsed long-format frames, one per award
list_dfs = []
# Award numbers of the rows that could not be parsed (again a different formatting)
bid_nr_errors = []

for index, row in df_messy.iterrows():
    try:
        df_bid_nr = extract_info_standort_3(row[messy_column], row[bid_nr])
    except Exception as e:
        # Deliberately broad: report the row, remember its award number and go on
        print(f"Row {index} raised error: {e}")
        print(row[messy_column])
        bid_nr_errors.append(df_messy.loc[index, bid_nr])
    else:
        list_dfs.append(df_bid_nr)

### Visually inspected and compared with downloaded .xlsx -> seems fine
df_clean_long = pd.concat(list_dfs, ignore_index=True)

In [None]:
### Still dirty data for some bid-nrs that were parsed without raising an error
bid_nr_messy = ["WIN17-2-209", "WIN17-2-177", "WIN17-2-202"]

### Add messy bid nrs to the list to later save the messy remaining raw data
# Numbers already in the list are skipped, so re-running the cell adds no duplicates
bid_nr_errors.extend(nr for nr in bid_nr_messy if nr not in bid_nr_errors)

### Remove the rows resulting from messy bid nrs from the long df
ind = df_clean_long[bid_nr].isin(bid_nr_messy)
df_clean_long = df_clean_long[~ind]

### Raw rows of all bid nrs that could not be cleaned go into a dict
dict_messy_2017_dfs = {}
dict_messy_2017_dfs[bid_date] = df_messy[df_messy[bid_nr].isin(bid_nr_errors)]


In [None]:
### Merge the parsed units with the columns of the raw table that are not in df_clean_long
# The raw 'Standort' column is replaced by the parsed columns; bid_nr is the join key
cols_keep = [col for col in df_messy.columns if col not in df_clean_long.columns]
cols_keep.remove('Standort')
cols_keep.append(bid_nr)

dict_cleaned_2017_dfs[bid_date] = df_messy[cols_keep].merge(df_clean_long, on=bid_nr)

In [None]:
### Parse the "Standort" cells of 2017-11
messy_column = "Standort"
bid_nr = "Zuschlags-Nr"
bid_date = "2017-11-01"

df_messy = dict_dfs_2017[bid_date]

# Parsed long-format frames, one per award
list_dfs = []
# Award numbers of the rows that could not be parsed (again a different formatting)
bid_nr_errors = []

for index, row in df_messy.iterrows():
    try:
        df_bid_nr = extract_info_standort_3(row[messy_column], row[bid_nr])
    except Exception as e:
        # Deliberately broad: report the row, remember its award number and go on
        print(f"Row {index} raised error: {e}")
        print(row[messy_column])
        bid_nr_errors.append(df_messy.loc[index, bid_nr])
    else:
        list_dfs.append(df_bid_nr)

### Visually inspected and compared with downloaded .xlsx -> seems fine
df_clean_long = pd.concat(list_dfs, ignore_index=True)

In [None]:
### Still dirty data for one bid-nr of 2017-11 that was parsed without raising an error
# The list has to be assigned: as a bare expression it left bid_nr_messy at the
# numbers of 2017-08, so the wrong awards were flagged and this one was kept
bid_nr_messy = ["WIN17-3-165"]

### Add messy bid nrs to the list to later save the messy remaining raw data
# Numbers already in the list are skipped, so re-running the cell adds no duplicates
bid_nr_errors.extend(nr for nr in bid_nr_messy if nr not in bid_nr_errors)

### Remove the rows resulting from messy bid nrs from the long df
ind = df_clean_long[bid_nr].isin(bid_nr_messy)
df_clean_long = df_clean_long[~ind]

### Raw rows of all bid nrs that could not be cleaned go into the dict
dict_messy_2017_dfs[bid_date] = df_messy[df_messy[bid_nr].isin(bid_nr_errors)]

In [None]:
### Merge the parsed units with the columns of the raw table that are not in df_clean_long
# The raw 'Standort' column is replaced by the parsed columns; bid_nr is the join key
cols_keep = [col for col in df_messy.columns if col not in df_clean_long.columns]
cols_keep.remove('Standort')
cols_keep.append(bid_nr)

dict_cleaned_2017_dfs[bid_date] = df_messy[cols_keep].merge(df_clean_long, on=bid_nr)

In [None]:
### Preview the first rows of every cleaned 2017 table
for bid_date in dict_cleaned_2017_dfs:
    print(bid_date)
    print(dict_cleaned_2017_dfs[bid_date].head())
    print("===============================================")

In [None]:
# Changes to columns: "PLZ" to "Postleitzahl"

# Common set of columns (12): columns not in this list are removed,
# columns missing in a table are added empty
columns_all = ['Name des Bieters', 'Gebots-Nr', 'Zuschlags-Nr', 'Bundesland',
       'Landkreis', 'Postleitzahl', 'Gemeinde', 'Gemarkung',
       'Flur / Flurstück', 'Register_Anlagennr', 'Gebotsdatum',
       'Zuschlagsdatum']

dict_final_2017_dfs = {}

for key, df in dict_cleaned_2017_dfs.items():
    # rename() returns a new frame, so the tables in dict_cleaned_2017_dfs stay untouched
    df_copy = df.rename(columns={"PLZ": "Postleitzahl"})

    # Keep only the common columns (no in-place drop while looping over the columns)
    df_copy = df_copy.loc[:, df_copy.columns.isin(columns_all)].copy()

    # Add the missing columns, filled with None
    for col in columns_all:
        if col not in df_copy.columns:
            df_copy[col] = None

    # Same column order for every table
    dict_final_2017_dfs[key] = df_copy.reindex(columns=columns_all)
    

In [None]:
### Stack all harmonised tables of 2017 into one frame
# Concatenate once instead of growing the frame inside a loop; empty tables add no rows
list_dfs_final_2017 = [df for df in dict_final_2017_dfs.values() if not df.empty]

if list_dfs_final_2017:
    df_bids_2017 = pd.concat(list_dfs_final_2017, ignore_index=True)
else:
    df_bids_2017 = pd.DataFrame(columns=columns_all)

df_bids_2017

In [None]:
### Stack the bids of 2018-2023 (df_bids) and of 2017 (df_bids_2017) with a fresh running index
df_bids_all = pd.concat(objs=[df_bids, df_bids_2017], axis=0, ignore_index=True)

In [43]:
### Save the cleaned bids of 2017-2023 as pickle
with open("../data/mastr_bids/bids_cleaned_2017_2023.pkl", "wb") as pkl_file:
    pickle.dump(df_bids_all, pkl_file)