### (Short) EDA of bids data and Upload to DB

- download units and permits
- extract Anlagen EEG from open mastr dump
- units-table: How do the mastr nrs start? Are units starting with A among the units, as they are in earlier years of the bid-data?
- can all bid units with a `Register_Anlagennr` be linked to units of the units-table?
- can any columns be omitted? Following database design rules, administrative/locational data should not be kept in the bid data if it can be linked via foreign key to the units data
- If they can be linked, delete the data in the bids-table -> since this will not be updated

In [None]:
### Download, Inspect and Upload Permit Data
import pandas as pd
import psycopg2
import os
import numpy as np
from dotenv import load_dotenv
from pandas.api.types import is_datetime64_any_dtype
from datetime import datetime
import time
import random
import re
import pickle

In [None]:
### Get units data

# Load environment variables from the .env file
load_dotenv()

# Connection parameters come from environment variables so that no
# credentials are stored in the notebook itself.
dbname = os.getenv("DB_NAME")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
ssl_cert_path = os.getenv("SSL_CERT_PATH")

# Build the libpq connection string with make_dsn instead of an f-string:
# make_dsn escapes values that contain spaces, quotes or backslashes (e.g. in
# the password or the certificate path) and leaves out parameters that are
# None, so an unset variable is not sent as the literal text "None".
conn_str = psycopg2.extensions.make_dsn(
    dbname=dbname,
    user=user,
    password=password,
    host=host,
    port=port,
    sslmode="require",
    sslrootcert=ssl_cert_path,
)

# Establish the connection object.
# NOTE(review): the connection is left open here, presumably because later
# cells reuse it for the upload - call conn.close() once it is no longer needed.
conn = psycopg2.connect(conn_str)

# Load the complete wind units table into a DataFrame
sql_select = "SELECT * FROM public.wind_extended;"
df_wind = pd.read_sql(sql_select, conn)

In [None]:
### Get bid data
# The cleaned bids (2017-2023) are stored as a pickled object.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
bids_pickle_path = "../data/mastr_bids/bids_cleaned_2017_2023.pkl"
with open(bids_pickle_path, mode="rb") as bids_file:
    df_bids_all = pickle.load(bids_file)

In [24]:
def find_unique_starting_letters(x):
    """Return the distinct leading letter prefixes found in a Series.

    For every non-missing value, the run of ASCII letters at the very start of
    its string representation is extracted (e.g. "SEE" for "SEE123"). Values
    that are missing or do not start with a letter contribute nothing.

    Parameters
    ----------
    x : pd.Series
        Series of identifiers. Non-string values are converted with ``str``
        instead of raising a ``TypeError``.

    Returns
    -------
    list of str
        Unique prefixes in order of first appearance.
    """
    # Drop missing values before casting: astype(str) would otherwise turn NaN
    # into the text "nan", which would be reported as a prefix.
    as_text = x.dropna().astype(str)
    prefixes = as_text.str.extract(r"^([A-Za-z]+)", expand=False).dropna()
    # Series.unique keeps the order of appearance and avoids the deprecated
    # call of pd.unique on a plain Python list.
    return prefixes.unique().tolist()

In [26]:
### Which letter prefixes do the mastr_nummer columns of df_wind use?
# (reference for comparing against Register_Anlagennr in df_bids_all)

# All identifier columns of the units table
cols_nr = [name for name in df_wind.columns if re.search("mastr_nummer", name)]

# Column name -> list of unique starting letters found in that column
cols_nr_start = {
    name: find_unique_starting_letters(df_wind[name]) for name in cols_nr
}

# Long format: one (column, prefix) tuple per row of the overview table
rows_list = [
    (name, prefix)
    for name, prefixes in cols_nr_start.items()
    for prefix in prefixes
]

pd.DataFrame(rows_list, columns=["mastr-column", "starting-letters"])

  return pd.unique(vals).tolist()


Unnamed: 0,mastr-column,starting-letters
0,eeg_mastr_nummer,EEG
1,einheit_mastr_nummer,SEE
2,lokation_mastr_nummer,SEL
3,anlagenbetreiber_mastr_nummer,ABR
4,gen_mastr_nummer,SGE


In [27]:
# Unique starting letters of the identifiers in the Register_Anlagennr column of the bid data
find_unique_starting_letters(df_bids_all["Register_Anlagennr"])

  return pd.unique(vals).tolist()


['A', 'SEE']

### Discrepancy in df_bids identifiers

- No foreign-key number in wind_extended starts with only A
- Try: Can they be linked to the eeg_anlagen table?
- Try: When the leading A in df_bids_all.Register_Anlagennr is exchanged for SEE -> can it be linked to the units table?

In [63]:
# Try: when the leading "A" of Register_Anlagennr is exchanged for SEE (or EEG),
# can the bid be linked to the units table?

bid_columns = ['Name des Bieters',
       'Landkreis', 'Postleitzahl', 'Gemeinde', 'Gemarkung',
       'Flur / Flurstück', 'Register_Anlagennr', 'Gebotsdatum']

# .str is an accessor that applies string methods element-wise to a Series;
# na=False treats missing identifiers as "does not start with A".
ind = df_bids_all["Register_Anlagennr"].str.startswith("A", na=False)

# Select rows and columns in one .loc call and copy the result, so the new
# columns below are written to an independent frame (no SettingWithCopyWarning).
df_bids_a = df_bids_all.loc[ind, bid_columns].copy()

register_nr = df_bids_a["Register_Anlagennr"]
for prefix in ("SEE", "EEG"):
    # Leading "A" -> prefix. The pattern is anchored so that only the first
    # character is exchanged, not every "A" in the identifier.
    df_bids_a[f"A_to_{prefix}"] = register_nr.str.replace(r"^A", prefix, regex=True)
    # First three characters -> prefix. regex=True is required: since
    # pandas 2.0 str.replace treats the pattern as a literal string by
    # default, which would leave the identifiers unchanged.
    df_bids_a[f"pos_3_to_{prefix}"] = register_nr.str.replace(r"^.{3}", prefix, regex=True)

In [64]:
# For every unit in wind_extended, flag whether its identifier occurs among
# the candidate identifiers derived from the "A..." bid numbers.
# Series.isin performs the membership test in one vectorised pass instead of
# rebuilding four Python lists for every row of df_wind.
einheit_nr = df_wind["einheit_mastr_nummer"]
eeg_nr = df_wind["eeg_mastr_nummer"]

# .to_numpy() drops the index of df_wind so that df_test gets a fresh 0..n-1 index.
df_test = pd.DataFrame({
    "einheit_mastr_nr": einheit_nr.to_numpy(),
    "A_to_SEE": einheit_nr.isin(df_bids_a["A_to_SEE"]).to_numpy(),
    "pos_3_to_SEE": einheit_nr.isin(df_bids_a["pos_3_to_SEE"]).to_numpy(),
    "A_to_EEG": eeg_nr.isin(df_bids_a["A_to_EEG"]).to_numpy(),
    "pos_3_to_EEG": eeg_nr.isin(df_bids_a["pos_3_to_EEG"]).to_numpy(),
})

In [65]:
# Number of wind units matched by each candidate identifier variant
candidate_columns = ("A_to_SEE", "pos_3_to_SEE", "A_to_EEG", "pos_3_to_EEG")
for candidate in candidate_columns:
    n_matches = df_test[candidate].sum()
    print(candidate, n_matches)

A_to_SEE 0
pos_3_to_SEE 0
A_to_EEG 0
pos_3_to_EEG 0


In [68]:
### Can the Register_Anlagennr values starting with SEE at least be found in the units table?

bid_columns = ['Name des Bieters',
       'Landkreis', 'Postleitzahl', 'Gemeinde', 'Gemarkung',
       'Flur / Flurstück', 'Register_Anlagennr', 'Gebotsdatum']

# .str is an accessor that applies string methods element-wise to a Series;
# na=False treats missing identifiers as "does not start with SEE".
ind = df_bids_all["Register_Anlagennr"].str.startswith("SEE", na=False)

# One .loc call plus .copy() gives an independent frame of the SEE bids.
df_bids_see = df_bids_all.loc[ind, bid_columns].copy()

# Flag every unit of wind_extended whose identifier appears in the SEE bids.
# Series.isin replaces the row loop that rebuilt a Python list per row.
einheit_nr = df_wind["einheit_mastr_nummer"]
df_test = pd.DataFrame({
    "einheit_mastr_nr": einheit_nr.to_numpy(),
    "Register_Anlagennr": einheit_nr.isin(df_bids_see["Register_Anlagennr"]).to_numpy(),
})
print(df_test["Register_Anlagennr"].sum(), len(df_bids_see), len(df_bids_all))

### 2854 unit_mastr_nrs from wind_extended can be found in the 2868 mastr_nrs starting with SEE from df_bids_see of all 4418 bid-units

2854 2868 4418
